#### Classical SVM Implementation - Spambase 

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# --- Import Spambase Column Names ---
spambase_columns = [
    "word_freq_make",
    "word_freq_address",
    "word_freq_all",
    "word_freq_3d",
    "word_freq_our",
    "word_freq_over",
    "word_freq_remove",
    "word_freq_internet",
    "word_freq_order",
    "word_freq_mail",
    "word_freq_receive",
    "word_freq_will",
    "word_freq_people",
    "word_freq_report",
    "word_freq_addresses",
    "word_freq_free",
    "word_freq_business",
    "word_freq_email",
    "word_freq_you",
    "word_freq_credit",
    "word_freq_your",
    "word_freq_font",
    "word_freq_000",
    "word_freq_money",
    "word_freq_hp",
    "word_freq_hpl",
    "word_freq_george",
    "word_freq_650",
    "word_freq_lab",
    "word_freq_labs",
    "word_freq_telnet",
    "word_freq_857",
    "word_freq_data",
    "word_freq_415",
    "word_freq_85",
    "word_freq_technology",
    "word_freq_1999",
    "word_freq_parts",
    "word_freq_pm",
    "word_freq_direct",
    "word_freq_cs",
    "word_freq_meeting",
    "word_freq_original",
    "word_freq_project",
    "word_freq_re",
    "word_freq_edu",
    "word_freq_table",
    "word_freq_conference",
    "char_freq_;",
    "char_freq_(",
    "char_freq_[",
    "char_freq_!",
    "char_freq_$",
    "char_freq_#",
    "capital_run_length_average",
    "capital_run_length_longest",
    "capital_run_length_total",
    # finally the target label column:
    "label"
]

# Load data
file_path = r'C:\Users\User\Documents\MyProjects\FYP_ResearchProject\data\spambase\spambase.data'
df = pd.read_csv(file_path, header=None, names=spambase_columns)


In [2]:
# 2. Some basic processing
print(f"Original shape of Spambase data: {df.shape}") # Prints original dataset shape
df.drop_duplicates(inplace=True) # Remove duplicates
print(f"Shape after dropping duplicates: {df.shape}\n") # Then print again the new shape

Original shape of Spambase data: (4601, 58)
Shape after dropping duplicates: (4210, 58)



In [3]:
# 3. Separate features and target
X = df.drop('label', axis=1)
y = df['label']

# 4. Data Splitting
# Using stratify=y to ensure that class distribution is same in train and test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

""" 

Uses the 70/30 split with stratify y
random state - reproducitbility

""" 

' \n\nUses the 70/30 split with stratify y\nrandom state - reproducitbility\n\n'

In [4]:
# Scaling and PCA

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # use .transform not fit_transform

n_components = 4
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)



"""
StandardScaler - as mentioned before, it normalizes the feature to mean 0, std 1
PCA - reduces dimensionality to 4 principal components, 4 also because to match with the 4 qubit feature map
PCA - also why choose change into 4 components is to match 4 qubits of ZZFeatureMap
Also (Binary Classification) is just the classification between two classes. It does nothing to amount of qubits

Data leakage - information from test sets sneaks into the training process, makes model look better because it looks like it seen some information

""" 

'\nStandardScaler - as mentioned before, it normalizes the feature to mean 0, std 1\nPCA - reduces dimensionality to 4 principal components, 4 also because to match with the 4 qubit feature map\nPCA - also why choose change into 4 components is to match 4 qubits of ZZFeatureMap\nAlso (Binary Classification) is just the classification between two classes. It does nothing to amount of qubits\n\nData leakage - information from test sets sneaks into the training process, makes model look better because it looks like it seen some information\n\n'

#### First Model - Linear SVM

In [None]:
# First SVM Model - Classical Baseline (Linear SVM)

print("--- Training Classical Linear SVM ---")
start_time_linear = time.time() # To mark the start time for training

# Grid C 
param_grid_linear = {'C' : [0.01, 0.1, 1, 10, 100]}
grid_linear = GridSearchCV(SVC(kernel='linear', random_state=42), param_grid_linear, cv = 5, verbose = 0)
grid_linear.fit(X_train_pca, y_train)
linear_svm = grid_linear.best_estimator_
end_time_linear = time.time()

"""
Defines the grid of values for C (regularization strength).
Uses GridSearchCV with 5-fold cross-validation → finds best C.
Fits the Linear SVM.
Finds the best estimator.
Records end time → training duration.

"""

--- Training Classical Linear SVM ---


'\nDefines a grid of values for C (regularization strength).\nUses GridSearchCV with 5-fold cross-validation → finds best C.\nFits the Linear SVM.\nFinds the best estimator.\nRecords end time → training duration.\n\n'

In [6]:
# Evaluation for Linear SVM

# Predicts labels for both train and test sets
# This is where the actual prediction works
# linear_svm.predict - takes the features after all the scaling and PCA and output the predicted labels
# X_train_pca → the training data features after preprocessing.
# X_test_pca → the test data features after preprocessing.
# y_train_pred_linear → model’s predicted labels for the training set.
# y_test_pred_linear → model’s predicted labels for the test set.

y_train_pred_linear = linear_svm.predict(X_train_pca)
y_test_pred_linear = linear_svm.predict(X_test_pca)

# Computes the accuracy for training and testing, also the generalization gap
train_accuracy_linear = accuracy_score(y_train, y_train_pred_linear)
test_accuracy_linear = accuracy_score(y_test, y_test_pred_linear)
gen_gap_linear = abs(train_accuracy_linear - test_accuracy_linear)

print("--- Linear SVM Evaluation (Spambase) ---")
print(f"Training Accuracy: {train_accuracy_linear:.4f}")
print(f"Test Accuracy:     {test_accuracy_linear:.4f}")
print(f"Generalization Gap: {gen_gap_linear:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred_linear))


--- Linear SVM Evaluation (Spambase) ---
Training Accuracy: 0.8744
Test Accuracy:     0.8812
Generalization Gap: 0.0068

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.88      0.93      0.90       759
           1       0.89      0.81      0.84       504

    accuracy                           0.88      1263
   macro avg       0.88      0.87      0.87      1263
weighted avg       0.88      0.88      0.88      1263



In [None]:
# Confusion matrix ?

#### Second Model - RBF SVM

In [8]:
print("\n--- Training Classical RBF SVM ---")
start_time_rbf = time.time()


--- Training Classical RBF SVM ---


In [9]:
param_grid_rbf = {
    'C' : [0.1, 1, 10, 100],
    'gamma' : [1, 0.1, 0.01, 0.001]
}

grid_rbf = GridSearchCV(SVC(kernel='rbf', random_state=42), param_grid_rbf, cv=5, verbose=0)
grid_rbf.fit(X_train_pca, y_train)
rbf_svm = grid_rbf.best_estimator_
end_time_rbf = time.time()

# Defines parameter grid for C regularization and gamma (influence of each training point)
# GridSearchCV finds best (C, gamma)
# Fits RBF SVM
# Then record the training time

# C - controls the tradeoff between margin size and classification errors (Small C - big margin, Large C - Smaller margin) - Similar like the previous svm
# Gamma - decides how far the influence of a single training point reaches (Small gamma - wide influence, decision boundary smoother, less flexible ,,,, Large Gamma - narrow influence)

In [10]:
# Evaluation for RBF SVM
y_train_pred_rbf = rbf_svm.predict(X_train_pca)
y_test_pred_rbf = rbf_svm.predict(X_test_pca)

# Predict train/test labels
train_accuracy_rbf = accuracy_score(y_train, y_train_pred_rbf)
test_accuracy_rbf = accuracy_score(y_test, y_test_pred_rbf)
gen_gap_rbf = abs(train_accuracy_rbf - test_accuracy_rbf)

# Print out the precision, recall, F1-score.
print("--- RBF SVM Evaluation (Spambase) ---")
print(f"Training Accuracy: {train_accuracy_rbf:.4f}")
print(f"Test Accuracy:     {test_accuracy_rbf:.4f}")
print(f"Generalization Gap: {gen_gap_rbf:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred_rbf))

--- RBF SVM Evaluation (Spambase) ---
Training Accuracy: 0.9094
Test Accuracy:     0.8915
Generalization Gap: 0.0179

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       759
           1       0.87      0.85      0.86       504

    accuracy                           0.89      1263
   macro avg       0.89      0.88      0.89      1263
weighted avg       0.89      0.89      0.89      1263

