#### Ideal Quantum SVM Implementation - Lung Cancer

In [1]:
# To ensure reproducibility of results: pin the seed stored on
# qiskit_machine_learning's shared `algorithm_globals` object.
# NOTE(review): this presumably only affects components that read
# `algorithm_globals.random_seed`; objects that accept their own `seed`
# argument may still need to be seeded explicitly - confirm.
from qiskit_machine_learning.utils import algorithm_globals
algorithm_globals.random_seed = 12345

In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Imports
# General-purpose and scikit-learn imports
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Qiskit Imports
# Define the quantum kernel
# Use the FidelityQuantumKernel class 

from qiskit.circuit.library import ZZFeatureMap
from qiskit.primitives import StatevectorSampler as Sampler
from qiskit_machine_learning.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel

In [4]:
# Load the raw Lung Cancer data set.
# The file is read without a header row, so the column names are supplied
# here: the first column is named 'label', the remaining 56 are 'attr_1'..'attr_56'.
import os

lung_cancer_column_names = ['label'] + [f'attr_{i}' for i in range(1, 57)]

# The original machine-specific location stays the default, so existing runs
# are unchanged. Set the LUNG_CANCER_DATA_PATH environment variable to run the
# notebook on another machine without editing this cell.
file_path_lung = os.environ.get(
    'LUNG_CANCER_DATA_PATH',
    r'C:\Users\User\Documents\MyProjects\FYP_ResearchProject\data\lung+cancer\lung-cancer.data',
)

# reads the data, treating "?" as missing values
df_lung = pd.read_csv(file_path_lung, header=None, names=lung_cancer_column_names, na_values=['?'])

print(f"Original shape of Lung Cancer data: {df_lung.shape}")

Original shape of Lung Cancer data: (32, 57)


In [5]:
# Mode imputation: every missing entry is replaced by the most frequent value
# of its column (the first mode is taken when several values tie).
# NOTE(review): the modes are computed on the full data set, i.e. before the
# train/test split performed later in the notebook, so rows that end up in the
# test set also contribute to the imputed values.
modes = df_lung.mode().iloc[0]
df_lung = df_lung.fillna(value=modes)

# Sanity check: the total NaN count should now be zero
print(f"Total missing values after imputation: {df_lung.isnull().sum().sum()}\n")

Total missing values after imputation: 0



In [6]:
# Split the frame into the target vector (the 'label' column) and the
# feature matrix (every remaining column).
y_lung = df_lung['label']
X_lung = df_lung.drop(columns=['label'])

In [7]:
# Target binarization: label 1 becomes class 0, every other label becomes class 1.
y_lung_binary = y_lung.ne(1).astype('int64')

In [8]:
# Hold-out split: 30% of the rows go to the test set, stratified on the binary
# target; random_state=42 keeps the split repeatable.
split_lc = train_test_split(
    X_lung,
    y_lung_binary,
    test_size=0.3,
    random_state=42,
    stratify=y_lung_binary,
)
X_train_lc, X_test_lc, y_train_lc, y_test_lc = split_lc

In [9]:
# Show the dimensions of the training and test feature sets, one per line
print(X_train_lc.shape, X_test_lc.shape, sep="\n")

(22, 56)
(10, 56)


In [10]:
# Preprocessing: standardise the features, then project them onto
# `n_components` principal components.
# Both transformers are fitted on the training split only; the test split is
# merely transformed with the already-fitted objects.
n_components = 4
scaler_lc = StandardScaler()
pca = PCA(n_components=n_components)

# Training split: fit and transform
X_train_lc_scaled = scaler_lc.fit_transform(X_train_lc)
X_train_lc_pca = pca.fit_transform(X_train_lc_scaled)

# Test split: transform only (never refit on test data)
X_test_lc_scaled = scaler_lc.transform(X_test_lc)
X_test_lc_pca = pca.transform(X_test_lc_scaled)

print(f"Data preprocessed. Training set shape: {X_train_lc_pca.shape}")
print(f"Test set shape: {X_test_lc_pca.shape}\n")


Data preprocessed. Training set shape: (22, 4)
Test set shape: (10, 4)



##### Quantum Kernel Implementation

In [11]:
# Build the fidelity-based quantum kernel.
# Feature map: ZZFeatureMap whose feature dimension equals the number of PCA
# components, with 2 repetitions and linear entanglement.
fm = ZZFeatureMap(feature_dimension=n_components, reps=2, entanglement='linear')

# StatevectorSampler draws a finite number of shots and takes its own `seed`
# argument. Without it the sampled fidelities (and therefore the kernel
# matrices and everything trained on them) change from run to run, even though
# `algorithm_globals.random_seed` is set at the top of the notebook. Reuse that
# seed here so a Restart & Run All reproduces the same kernel.
# NOTE(review): the kernel entries are still shot-based estimates (the
# sampler's default shot count applies) - confirm this is what "ideal" is
# meant to cover.
sampler = Sampler(seed=algorithm_globals.random_seed)

# Fidelity between two feature-map states is estimated with the
# compute-uncompute method and exposed as a kernel.
fidelity = ComputeUncompute(sampler=sampler)
qkernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=fm)

In [12]:
# Compute kernel matrices
# Timings use time.perf_counter(): it is a monotonic, high-resolution clock
# meant for measuring intervals, whereas time.time() is the wall clock and can
# jump if the system time is adjusted.
print("Calculating training kernel matrix...")
start_time = time.perf_counter()
# Train-vs-train kernel (only x_vec is given, so it is evaluated against itself)
matrix_train_lc = qkernel.evaluate(x_vec=X_train_lc_pca)
end_time = time.perf_counter()
print(f"Training kernel matrix calculated in {end_time - start_time:.4f} seconds.")

print("\nCalculating testing kernel matrix...")
start_time = time.perf_counter()
# Test-vs-train kernel: rows are test samples, columns are training samples
matrix_test_lc = qkernel.evaluate(x_vec=X_test_lc_pca, y_vec=X_train_lc_pca)
end_time = time.perf_counter()
print(f"Testing kernel matrix calculated in {end_time - start_time:.4f} seconds.\n")

# A precomputed-kernel SVC needs an (n_train, n_train) matrix for fitting and
# an (n_test, n_train) matrix for prediction; fail early if the orientation is off.
assert matrix_train_lc.shape == (len(X_train_lc_pca), len(X_train_lc_pca))
assert matrix_test_lc.shape == (len(X_test_lc_pca), len(X_train_lc_pca))

Calculating training kernel matrix...
Training kernel matrix calculated in 3.2208 seconds.

Calculating testing kernel matrix...
Testing kernel matrix calculated in 3.5607 seconds.



In [13]:
# Classical SVC that consumes the kernel matrices computed above instead of
# evaluating a kernel function itself.
qsvm = SVC(kernel="precomputed")
print("--- Training QSVM with Precomputed Kernel (Lung Cancer) ---")

--- Training QSVM with Precomputed Kernel (Lung Cancer) ---


In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [15]:
# Hyperparameter tuning: grid search over the SVC parameter C on the
# precomputed training kernel.

# Candidate values for C
param_grid = {'C': [0.1, 1, 10, 100, 1000]}

# Cross-validation scheme: 3 stratified folds (kept small for this small
# dataset); shuffle=True with random_state=42 makes the fold assignment
# random but reproducible.
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search_lc = GridSearchCV(
    estimator=qsvm,
    param_grid=param_grid,
    cv=stratified_kfold,
    verbose=0,
)
grid_search_lc.fit(matrix_train_lc, y_train_lc)

# Keep the best estimator found by the search and report its setting
best_qsvm = grid_search_lc.best_estimator_
print(f"Best parameters found: {grid_search_lc.best_params_}")

# TODO: research whether gamma also needs to be tuned here.
# NOTE(review): gamma is presumably ignored when kernel='precomputed' - confirm.

Best parameters found: {'C': 10}


##### Model Evaluation

In [16]:
# Predict with the tuned model: the train-vs-train kernel gives the training
# predictions, the test-vs-train kernel gives the test predictions.
y_train_pred_lc = best_qsvm.predict(matrix_train_lc)
y_test_pred_lc = best_qsvm.predict(matrix_test_lc)

# Accuracy on each split
train_accuracy_lc = accuracy_score(y_true=y_train_lc, y_pred=y_train_pred_lc)
test_accuracy_lc = accuracy_score(y_true=y_test_lc, y_pred=y_test_pred_lc)

# Generalization gap: absolute difference between training and test accuracy
generalization_gap = abs(train_accuracy_lc - test_accuracy_lc)

In [17]:
# Summarise the evaluation: accuracies, generalization gap and the
# per-class report on the test set (zero_division=0 is passed to the report).
test_report_lc = classification_report(y_test_lc, y_test_pred_lc, zero_division=0)

print(f"\n--- Ideal QSVM Evaluation (Lung Cancer) ---")
print(f"Training Accuracy: {train_accuracy_lc:.4f}")
print(f"Test Accuracy:     {test_accuracy_lc:.4f}")
print(f"Generalization Gap: {generalization_gap:.4f}")
print("\nClassification Report (Test Set):")
print(test_report_lc)


--- Ideal QSVM Evaluation (Lung Cancer) ---
Training Accuracy: 1.0000
Test Accuracy:     0.8000
Generalization Gap: 0.2000

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.86      0.86      0.86         7

    accuracy                           0.80        10
   macro avg       0.76      0.76      0.76        10
weighted avg       0.80      0.80      0.80        10

