#### Noisy Quantum SVM - Spambase

In [None]:
import qiskit, qiskit_aer, qiskit_machine_learning
# Report the installed versions of the quantum stack so results can be
# tied to a specific environment.
for package_label, package in (
    ("Qiskit", qiskit),
    ("Aer", qiskit_aer),
    ("QML", qiskit_machine_learning),
):
    print(f"{package_label}:", package.__version__)

In [None]:
# Fix the global random seed used by qiskit-machine-learning so that
# repeated runs of this notebook start from the same state.
from qiskit_machine_learning.utils import algorithm_globals

QML_RANDOM_SEED = 12345
algorithm_globals.random_seed = QML_RANDOM_SEED

In [None]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
# from imblearn.over_sampling import RandomOverSampler  # For optional balancing

In [None]:
# --- Qiskit Imports ---
from qiskit.circuit.library import ZZFeatureMap
from qiskit_aer import AerSimulator
from qiskit_aer.noise import NoiseModel, depolarizing_error
from qiskit_aer.primitives import SamplerV2 as AerSampler
from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
from qiskit_machine_learning.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC

In [None]:
# --- Define Spambase Column Names ---
# The list is ordered: 48 word-frequency features, 6 character-frequency
# features, 3 capital-run-length statistics, and finally the target label.
# It is meant to be applied positionally to a header-less data file, so the
# order of the tokens below must not be changed.
_word_tokens = (
    "make address all 3d our over remove internet order mail receive will "
    "people report addresses free business email you credit your font 000 "
    "money hp hpl george 650 lab labs telnet 857 data 415 85 technology 1999 "
    "parts pm direct cs meeting original project re edu table conference"
).split()
_char_tokens = [";", "(", "[", "!", "$", "#"]
_capital_run_stats = ["average", "longest", "total"]

spambase_columns = (
    [f"word_freq_{token}" for token in _word_tokens]
    + [f"char_freq_{token}" for token in _char_tokens]
    + [f"capital_run_length_{stat}" for stat in _capital_run_stats]
    # finally the target label column:
    + ["label"]
)

# --- 1. Load the Spambase Dataset ---
# The raw file has no header row, so column names are supplied explicitly.
# Exact duplicate rows are removed; the surviving rows keep their original
# row labels (the index is not reset).
# NOTE(review): this is an absolute, machine-specific path — consider moving
# it to a configurable data directory.
file_path = r'C:\Users\User\Documents\MyProjects\FYP_ResearchProject\data\spambase\spambase.data'
df = (
    pd.read_csv(file_path, header=None, names=spambase_columns)
    .drop_duplicates()
)

In [None]:
# Split the frame into the target vector and the feature matrix.
y = df['label']
X = df.drop(columns='label')

In [None]:
# Hold out 30% of the rows for testing; stratify on the label so both
# partitions keep the same class proportions, and fix the seed so the
# split is repeatable.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Preprocessing Steps
# Standardization: the scaler is fitted on the training split only and then
# applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into frames with the original feature names.
# Note: no index is passed, so these frames get a fresh 0..n-1 index rather
# than the row labels carried by X_train / X_test.
feature_names = X.columns
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

In [None]:
# Feature Selection
# For every pair of features whose absolute training-set correlation exceeds
# THRESH, drop the member of the pair that is less correlated (in absolute
# value) with the target. On a tie, the partner (row feature) is dropped.
THRESH = 0.9
corr_matrix_train = X_train_scaled_df.corr().abs()
# Keep only the strict upper triangle so each feature pair is examined once.
upper_triangle = corr_matrix_train.where(np.triu(np.ones(corr_matrix_train.shape), k=1).astype(bool))

# Series.corr aligns its operands on index labels. y_train still carries the
# row labels of the original frame, whereas X_train_scaled_df was rebuilt
# from a NumPy array and has its own index. Re-index the target positionally
# (row i of y_train belongs to row i of the scaled frame) so the correlation
# is computed on matching rows instead of on accidentally overlapping labels.
y_train_aligned = pd.Series(np.asarray(y_train), index=X_train_scaled_df.index)

# Absolute feature-vs-target correlation, computed once per feature.
target_corr = {
    feature: abs(y_train_aligned.corr(X_train_scaled_df[feature]))
    for feature in X_train_scaled_df.columns
}

columns_to_drop = set()
for column in upper_triangle.columns:
    high_corr_partners = upper_triangle.index[upper_triangle[column] > THRESH].tolist()
    for partner in high_corr_partners:
        if target_corr[column] < target_corr[partner]:
            columns_to_drop.add(column)
        else:
            columns_to_drop.add(partner)

# Apply the same column removal to both splits (decided on training data only).
to_drop_final = sorted(columns_to_drop)
X_train_selected = X_train_scaled_df.drop(columns=to_drop_final)
X_test_selected = X_test_scaled_df.drop(columns=to_drop_final)

In [None]:
# Principal Component Analysis
# Project the selected features onto a handful of components; this number
# is also used later as the feature-map dimension (one value per qubit).
n_components = 4
pca = PCA(n_components=n_components, random_state=42)

# Fit on the training split only, then reuse the fitted projection on test.
X_train_pca = pca.fit_transform(X_train_selected)
X_test_pca = pca.transform(X_test_selected)

print("--- Data Preprocessing Complete ---")
print(f"Final training data shape: {X_train_pca.shape}")
print(f"Final testing data shape: {X_test_pca.shape}\n")

##### Noise Simulation Setup

In [None]:
# Quantum Kernel Implementation with Noise
# Build a depolarizing noise model: one error rate for single-qubit gates
# and twice that rate for two-qubit gates.
p_error = 0.04  # 4% depolarizing error for 1-qubit gates
depolarizing_prob_2q = 2 * p_error  # 8% for 2-qubit gates

single_qubit_gates = ['u1', 'u2', 'u3', 'rx', 'ry', 'rz', 'id']
two_qubit_gates = ['cx', 'cz', 'ecr']

noise_model = NoiseModel()
# Attach each error channel to every qubit for the listed gate names.
for error_channel, gate_names in (
    (depolarizing_error(p_error, 1), single_qubit_gates),
    (depolarizing_error(depolarizing_prob_2q, 2), two_qubit_gates),
):
    noise_model.add_all_qubit_quantum_error(error_channel, gate_names)

print(f"Depolarizing noise: {p_error*100}% (1q), {p_error*200}% (2q)\n")

In [None]:
# Noisy backend
# Aer simulator configured with the depolarizing noise model and a fixed
# simulator seed.
backend_options = {"noise_model": noise_model, "seed_simulator": 12345}
noisy_backend = AerSimulator(**backend_options)

In [None]:
# Noisy Sampler with high shots
# More shots per circuit reduce the sampling error of each fidelity estimate.
shots_per_circuit = 8192
noise_sampler = AerSampler.from_backend(backend=noisy_backend, default_shots=shots_per_circuit)
print("Noisy Sampler created !")

In [None]:
# Transpilation pass manager
# Circuits are compiled against the noisy backend before being sampled.
pm = generate_preset_pass_manager(backend=noisy_backend, optimization_level=1)
print("Noisy Sampler and pass manager ready!")

##### Quantum Kernel Implementation

In [None]:
# Feature map setup
# One feature per PCA component; two repetitions with linear entanglement.
feature_dim = n_components
fm = ZZFeatureMap(
    feature_dimension=feature_dim,
    reps=2,
    entanglement='linear',
)

# Fidelity with noisy sampler and transpilation
fidelity = ComputeUncompute(pass_manager=pm, sampler=noise_sampler)

# Noisy quantum kernel
quantum_kernel_noisy = FidelityQuantumKernel(feature_map=fm, fidelity=fidelity)

In [None]:
# Plotting noisy kernel matrix
# Only the first 50 training points are used to keep the evaluation small.
kernel_plot_points = X_train_pca[:50]
matrix_train_noisy = quantum_kernel_noisy.evaluate(x_vec=kernel_plot_points)

plt.figure(figsize=(8, 6))
plt.imshow(matrix_train_noisy, cmap='viridis')
plt.title("Noisy Kernel Matrix (Subset)")
plt.colorbar()
plt.show()

In [None]:
# Noisy QSVC with tuned parameters
# Sets up a cross-validated grid search over the SVM regularisation strength
# C for a QSVC that uses the noisy quantum kernel. Nothing is fitted here.
print("--- Training Noisy QSVC (Spambase) ---")
start_time = time.time()

param_grid = {
    'C' : [0.1, 1, 10, 100],
    'class_weight': ['balanced']
}

# Stratified folds keep the class ratio stable in every fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search on QSVC
grid_search = GridSearchCV(
    QSVC(quantum_kernel=quantum_kernel_noisy),  # estimator built on the noisy kernel
    param_grid,
    cv=cv,
    scoring='accuracy',  # Or 'f1_macro' for imbalanced classes
    n_jobs=-1,
    verbose=1  # For progress output
)

In [None]:
# Run the grid search and keep the best refitted estimator.
# The timer is (re)started here, right before fitting, so the reported
# duration covers the fit itself and not any idle time since an earlier cell.
start_time = time.time()
grid_search.fit(X_train_pca, y_train)
qsvc_noisy = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
end_time = time.time()
print(f"QSVC training finished in {end_time - start_time:.2f} seconds.")

##### Model Evaluation

In [None]:
# Predict on both splits first (training split, then test split) ...
y_train_pred = qsvc_noisy.predict(X_train_pca)
y_test_pred = qsvc_noisy.predict(X_test_pca)

# ... then score them and measure how far apart the two accuracies are.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
generalization_gap = abs(train_accuracy - test_accuracy)

print("\n--- Noisy QSVM Evaluation (Spambase) ---")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy:     {test_accuracy:.4f}")
print(f"Generalization Gap: {generalization_gap:.4f}")

# Per-class precision/recall/F1 on the held-out data; undefined ratios are
# reported as 0 instead of raising a warning.
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, zero_division=0))