In [None]:
# =============================================================================
# Phase 1: Setup and Preprocessing Pipeline
# =============================================================================
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# --- Qiskit Imports for Manual Kernel Construction ---
from qiskit.circuit.library import zz_feature_map
from qiskit.circuit.library import unitary_overlap

# Import StatevectorSampler as our sampler
from qiskit.quantum_info import Statevector
from qiskit.primitives import StatevectorSampler 

print("--- Preparing Spambase Dataset ---")

# --- 1a. Load and Preprocess Spambase Data ---
# Column names applied to the headerless CSV; the last one ('label') is the
# target column that is split off from the features below.
spambase_columns = [
    "word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d",
    "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet",
    "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will",
    "word_freq_people", "word_freq_report", "word_freq_addresses", "word_freq_free",
    "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit",
    "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money",
    "word_freq_hp", "word_freq_hpl", "word_freq_george", "word_freq_650",
    "word_freq_lab", "word_freq_labs", "word_freq_telnet", "word_freq_857",
    "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology",
    "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct",
    "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project",
    "word_freq_re", "word_freq_edu", "word_freq_table", "word_freq_conference",
    "char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!", "char_freq_$",
    "char_freq_#", "capital_run_length_average", "capital_run_length_longest",
    "capital_run_length_total", "label",
]
file_path = r'C:\Users\User\Documents\MyProjects\FYP_ResearchProject\data\spambase\spambase.data'

# Read the raw file with explicit column names and discard duplicate rows.
df = pd.read_csv(file_path, names=spambase_columns, header=None).drop_duplicates()

# Separate the target column from the feature columns.
y = df['label']
X = df.drop(columns='label')

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reduce to n_components principal components; this value is also passed as
# the feature dimension of the quantum feature map later in the script.
n_components = 4
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("Preprocessing complete.\n")

# =============================================================================
# !! IMPORTANT: For faster development, use a smaller subset of the data !!
# =============================================================================
# The kernel is computed one pair of samples at a time, so the full dataset
# takes a very long time. Work with the reduced `_run` variables while coding
# and debugging, then flip the switch below to use the full `_pca` data.

use_subset = True  # Set to False to run on the full dataset

if not use_subset:
    # Full mode: the `_run` names simply alias the complete PCA-reduced splits.
    X_train_run, y_train_run = X_train_pca, y_train
    X_test_run, y_test_run = X_test_pca, y_test
    print("--- RUNNING IN FULL DATASET MODE ---")
    print(f"Using {len(y_train_run)} training samples and {len(y_test_run)} test samples.\n")
else:
    num_train_subset = 100  # Number of training samples to use
    num_test_subset = 30   # Number of test samples to use

    # Subset mode: keep only the leading rows of each split.
    X_train_run, y_train_run = X_train_pca[:num_train_subset], y_train[:num_train_subset]
    X_test_run, y_test_run = X_test_pca[:num_test_subset], y_test[:num_test_subset]

    print("--- RUNNING IN SUBSET MODE ---")
    print(f"Using {num_train_subset} training samples and {num_test_subset} test samples.\n")

# =============================================================================
# Phase 2: Manual Quantum Kernel Computation (Ideal/Noiseless)
# =============================================================================
print("--- Manually Constructing Ideal Quantum Kernel Matrix ---")

feature_map = zz_feature_map(feature_dimension=n_components, reps=2, entanglement='linear')

# Evaluate the problem using state vector primitives
sampler = StatevectorSampler()

# Shots used for every overlap circuit. Each kernel entry is therefore a
# finite-shot estimate; the same value is used to normalise the counts, so
# changing it here keeps the sampling and the normalisation in agreement.
num_shots = 128


def _bind_feature_circuits(samples):
    """Return one bound copy of ``feature_map`` for each row of ``samples``.

    Binding once per sample (instead of once per pair inside the kernel loops)
    avoids repeating the same parameter assignment for every pair.
    """
    return [feature_map.assign_parameters(row) for row in samples]


def _estimate_kernel_entry(circuit_a, circuit_b):
    """Estimate one kernel entry from two bound feature-map circuits.

    Builds the overlap circuit of ``circuit_a`` and ``circuit_b``, measures
    all qubits, runs it on ``sampler`` with ``num_shots`` shots and returns
    the fraction of shots that gave the all-zero outcome.
    """
    overlap_circuit = unitary_overlap(circuit_a, circuit_b)
    # The counts below are read from the `meas` register, which only exists
    # once measure_all() has been called on the circuit.
    overlap_circuit.measure_all()

    counts = (
        sampler.run([overlap_circuit], shots=num_shots)
        .result()[0]
        .data.meas.get_int_counts()
    )

    # The kernel entry is the probability of measuring the all-zero state
    return counts.get(0, 0) / num_shots


# Initialize empty kernel matrices based on the size of the data we are running on
num_train_run = X_train_run.shape[0]
num_test_run = X_test_run.shape[0]
matrix_train_ideal = np.zeros((num_train_run, num_train_run))
matrix_test_ideal = np.zeros((num_test_run, num_train_run))

# --- Compute the Training Kernel Matrix (Pair-by-Pair) ---
print("Calculating training kernel matrix...")
start_time_kernel_train = time.time()

train_circuits = _bind_feature_circuits(X_train_run)

# The overlap of a sample with itself is set to 1 directly, without sampling.
np.fill_diagonal(matrix_train_ideal, 1.0)

# Only the strict upper triangle is sampled; each value is mirrored.
for i in range(num_train_run):
    for j in range(i + 1, num_train_run):
        kernel_value = _estimate_kernel_entry(train_circuits[i], train_circuits[j])
        matrix_train_ideal[i, j] = kernel_value
        matrix_train_ideal[j, i] = kernel_value  # The matrix is symmetric

end_time_kernel_train = time.time()
print(f"Training kernel matrix calculated in {end_time_kernel_train - start_time_kernel_train:.2f} seconds.\n")

# --- Compute the Test Kernel Matrix (Pair-by-Pair) ---
print("Calculating test kernel matrix...")
start_time_kernel_test = time.time()

test_circuits = _bind_feature_circuits(X_test_run)

# Rows index test samples and columns index training samples, matching the
# (num_test_run, num_train_run) shape allocated above. This matrix is
# rectangular, so nothing is mirrored and the training matrix is not touched.
for i, test_circuit in enumerate(test_circuits):
    for j, train_circuit in enumerate(train_circuits):
        matrix_test_ideal[i, j] = _estimate_kernel_entry(test_circuit, train_circuit)

end_time_kernel_test = time.time()
print(f"Test kernel matrix calculated in {end_time_kernel_test - start_time_kernel_test:.2f} seconds.\n")

# =============================================================================
# Phase 3: Train and Evaluate the Ideal QSVM
# =============================================================================
print("--- Training QSVM with Manually Computed Ideal Kernel ---")
start_time_qsvm_train = time.time()

# The subset is small, so it gets 3 cross-validation folds instead of 5.
if use_subset:
    cv_folds = 3
else:
    cv_folds = 5

# Grid-search the regularisation strength C of an SVC that takes the kernel
# matrix directly as its input.
param_grid_qsvm = {'C': [0.1, 1, 10, 100]}
qsvm_ideal = SVC(kernel='precomputed')
grid_qsvm_ideal = GridSearchCV(
    estimator=qsvm_ideal,
    param_grid=param_grid_qsvm,
    cv=cv_folds,
    verbose=0,
)

# Fit on the kernel matrix built from the same samples as y_train_run.
grid_qsvm_ideal.fit(matrix_train_ideal, y_train_run)

best_qsvm_ideal = grid_qsvm_ideal.best_estimator_
end_time_qsvm_train = time.time()

print(f"Best parameters for Ideal QSVM: {grid_qsvm_ideal.best_params_}")
print(f"Training time for Ideal QSVM: {end_time_qsvm_train - start_time_qsvm_train:.2f} seconds\n")

# --- Evaluation for Ideal QSVM ---
# Predict from the precomputed train and test kernel matrices, then score.
y_train_pred_qsvm = best_qsvm_ideal.predict(matrix_train_ideal)
train_accuracy_qsvm = accuracy_score(y_train_run, y_train_pred_qsvm)

y_test_pred_qsvm = best_qsvm_ideal.predict(matrix_test_ideal)
test_accuracy_qsvm = accuracy_score(y_test_run, y_test_pred_qsvm)

# Absolute difference between train and test accuracy.
gen_gap_qsvm = abs(train_accuracy_qsvm - test_accuracy_qsvm)

print(
    "--- Ideal QSVM Evaluation (Spambase) ---",
    f"Training Accuracy: {train_accuracy_qsvm:.4f}",
    f"Test Accuracy:     {test_accuracy_qsvm:.4f}",
    f"Generalization Gap: {gen_gap_qsvm:.4f}",
    sep="\n",
)
print("\nClassification Report (Test Set):")
print(classification_report(y_test_run, y_test_pred_qsvm, zero_division=0))

ImportError: cannot import name 'Sampler' from 'qiskit.primitives' (c:\Users\User\Documents\MyProjects\FYP_ResearchProject\fypproj\Lib\site-packages\qiskit\primitives\__init__.py)