#### Ideal Quatum SVM Implementation - Lung Cancer 

In [1]:
# To ensure reproducibility of results
from qiskit_machine_learning.utils import algorithm_globals
algorithm_globals.random_seed = 12345

In [2]:
# --- Libnray Imports ---
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import chi2_contingency
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Qiskit Imports
# Definine quantum kernel
# Use the FidelityQuantumKernel class 

from qiskit.circuit.library import ZZFeatureMap
from qiskit.primitives import StatevectorSampler as Sampler
from qiskit_machine_learning.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel

In [4]:
# Load data first
lung_cancer_column_names = ['label'] + [f'attr_{i}' for i in range(1, 57)]
file_path_lung = r'C:\Users\User\Documents\MyProjects\FYP_ResearchProject\data\lung+cancer\lung-cancer.data'

# reads the data, treating "?" as missing values
df_lung = pd.read_csv(file_path_lung, header=None, names=lung_cancer_column_names, na_values=['?'])

print(f"Original shape of Lung Cancer data: {df_lung.shape}")

Original shape of Lung Cancer data: (32, 57)


In [5]:
# Mode imputation for missing values
modes = df_lung.mode().iloc[0]
df_lung.fillna(modes, inplace=True)

# Then check if all Nan are gone
print(f"Total missing values after imputation: {df_lung.isnull().sum().sum()}\n")

Total missing values after imputation: 0



In [6]:
# Target Binarization
df_lung['label_binary'] = df_lung['label'].apply(lambda x: 0 if x == 1 else 1)

In [7]:
# Separate Features & Target and Split Data
X_lung = df_lung.drop(['label', 'label_binary'], axis=1)
y_lung_binary = df_lung['label_binary']

In [8]:
X_train_lc, X_test_lc, y_train_lc, y_test_lc = train_test_split(
    X_lung, y_lung_binary, test_size=0.3, random_state=42, stratify=y_lung_binary
)

In [9]:
# One-Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_lc_encoded = pd.DataFrame(encoder.fit_transform(X_train_lc),
columns=encoder.get_feature_names_out())
X_test_lc_encoded = pd.DataFrame(encoder.transform(X_test_lc),
columns=encoder.get_feature_names_out())

In [10]:
# Feature Selection - Cramer's V
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    if min((kcorr-1), (rcorr-1)) == 0: return 0
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

cramers_scores = {col: cramers_v(X_train_lc_encoded[col], y_train_lc) for col in X_train_lc_encoded.columns}
cramers_series = pd.Series(cramers_scores).sort_values(ascending=False)

N_FEATURES_TO_SELECT = 10 
top_features = cramers_series.head(N_FEATURES_TO_SELECT).index.tolist()

X_train_lc_final = X_train_lc_encoded[top_features]
X_test_lc_final = X_test_lc_encoded[top_features]

print("--- Data Preprocessing Complete ---")
print(f"Final training data shape: {X_train_lc_final.shape}")
print(f"Final testing data shape: {X_test_lc_final.shape}\n")


--- Data Preprocessing Complete ---
Final training data shape: (22, 10)
Final testing data shape: (10, 10)



##### Quantum Kernel Implementation

In [11]:
# setup feature map
feature_dim = N_FEATURES_TO_SELECT
fm = ZZFeatureMap(feature_dimension=feature_dim, reps=2, entanglement='linear')

sampler = Sampler()
fidelity = ComputeUncompute(sampler=sampler)
qkernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=fm)

In [12]:
# Compute kernel matrices
print("Calculating training kernel matrix...")
start_time = time.time()
matrix_train_lc = qkernel.evaluate(x_vec=X_train_lc_final.to_numpy())
end_time = time.time()
print(f"Training kernel matrix calculated in {end_time - start_time:.4f} seconds.\n")

print("Calculating testing kernel matrix...")
start_time = time.time()
matrix_test_lc = qkernel.evaluate(x_vec=X_test_lc_final.to_numpy(), y_vec=X_train_lc_final.to_numpy())
end_time = time.time()
print(f"Testing kernel matrix calculated in {end_time - start_time:.4f} seconds.\n")

Calculating training kernel matrix...
Training kernel matrix calculated in 3.1907 seconds.

Calculating testing kernel matrix...
Testing kernel matrix calculated in 2.8678 seconds.



In [13]:
# --- QSVM Training with Precomputed Kernel ---
print("--- Training QSVM with Precomputed Kernel (Lung Cancer) ---")
qsvm = SVC(kernel='precomputed', class_weight='balanced')

--- Training QSVM with Precomputed Kernel (Lung Cancer) ---


In [14]:
# Grid search for hyperparameter tuning implementation

# Define the cross-validation strategy
# n_splits=3 is correct for this small dataset
# shuffle=True and random_state=42 ensure the split is random but reproducible
param_grid = {'C': [0.1, 1, 10, 100, 1000]}
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_lc = GridSearchCV(qsvm, param_grid, cv=stratified_kfold, verbose=0)

grid_search_lc.fit(matrix_train_lc, y_train_lc)

print(f"Best parameters found: {grid_search_lc.best_params_}\n")
best_qsvm = grid_search_lc.best_estimator_

# QUestion is , is gamma or not - RESEARCH this over

Best parameters found: {'C': 1}



##### Model Evaluation

In [15]:
y_train_pred_lc = best_qsvm.predict(matrix_train_lc)
train_accuracy_lc = accuracy_score(y_train_lc, y_train_pred_lc)

y_test_pred_lc = best_qsvm.predict(matrix_test_lc)
test_accuracy_lc = accuracy_score(y_test_lc, y_test_pred_lc)

generalization_gap = abs(train_accuracy_lc - test_accuracy_lc)

print(f"--- Ideal QSVM Evaluation (Lung Cancer) ---")
print(f"Training Accuracy: {train_accuracy_lc:.4f}")
print(f"Test Accuracy:     {test_accuracy_lc:.4f}")
print(f"Generalization Gap: {generalization_gap:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test_lc, y_test_pred_lc, zero_division=0))

--- Ideal QSVM Evaluation (Lung Cancer) ---
Training Accuracy: 0.9091
Test Accuracy:     0.8000
Generalization Gap: 0.1091

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.78      1.00      0.88         7

    accuracy                           0.80        10
   macro avg       0.89      0.67      0.69        10
weighted avg       0.84      0.80      0.76        10

