#### Classical SVM Implementation - Lung Cancer

In [2]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [3]:
# Load data first
# The raw file has no header row, so column names are supplied here:
# the first column is named 'label', the remaining 56 are 'attr_1' ... 'attr_56'.
lung_cancer_column_names = ['label'] + [f'attr_{i}' for i in range(1, 57)]

# The data location defaults to the original local path, but can be overridden
# through the LUNG_CANCER_DATA_PATH environment variable so that the notebook
# also runs on machines where that absolute path does not exist.
file_path_lung = os.environ.get(
    'LUNG_CANCER_DATA_PATH',
    r'C:\Users\User\Documents\MyProjects\FYP_ResearchProject\data\lung+cancer\lung-cancer.data',
)

# reads the data, treating "?" as missing values
df_lung = pd.read_csv(file_path_lung, header=None, names=lung_cancer_column_names, na_values=['?'])

print(f"Original shape of Lung Cancer data: {df_lung.shape}")

Original shape of Lung Cancer data: (32, 57)


In [4]:
# Mode imputation: every missing entry is replaced by the most frequent
# value of its own column (first row of DataFrame.mode()).
# NOTE(review): the modes are computed on the full dataset, i.e. before the
# train/test split performed later, so test rows contribute to the imputed
# values - consider fitting the imputation on the training split only.
modes = df_lung.mode().iloc[0]
df_lung = df_lung.fillna(modes)

# Sanity check: the total NaN count over all columns should now be zero
print(f"Total missing values after imputation: {df_lung.isnull().sum().sum()}\n")

Total missing values after imputation: 0



In [60]:
# Pull out the ORIGINAL (multi-class) target, then keep every other column as a feature
y_lung = df_lung['label']
X_lung = df_lung.drop(columns='label')

In [61]:
# Then handle the missing values
# for col in ['attr_4', 'attr_38']:
    #if df_lung[col].isnull().any():
       # mode_val = df_lung[col].mode()[0]
        # This line is the potential issue
       # df_lung[col].fillna(mode_val, inplace=True)

# Double check that no missing values remain
# print("Missing values left after imputation:\n", df_lung.isnull().sum().sum())

# In short, the commented-out block above fills the missing values of a column with mode imputation (the most frequent value in that column).

In [62]:
# Target binarization: original class 1 -> 0, every other class -> 1
y_lung_binary = y_lung.ne(1).astype('int64')

# y_lung - original labels from dataset
# .ne(1) - element-wise "label != 1", giving False for class 1 and True otherwise
# .astype('int64') - turns False into 0 (x == 1) and True into 1 (x == 2 or x == 3)

# One print call, one item per line, followed by an empty line
print(
    "Target variable converted from 3 classes to binary:",
    y_lung_binary.value_counts(),
    "",
    sep="\n",
)


Target variable converted from 3 classes to binary:
label
1    23
0     9
Name: count, dtype: int64



In [63]:
# Data splitting (the "lc" suffix stands for lung cancer)
# 30% of the rows are held out for testing; stratifying on the binary target
# keeps the class proportions similar in both parts, and the fixed seed makes
# the split repeatable.
lc_split = train_test_split(
    X_lung,
    y_lung_binary,
    test_size=0.3,
    random_state=42,
    stratify=y_lung_binary,
)
X_train_lc, X_test_lc, y_train_lc, y_test_lc = lc_split

In [64]:
# Scaling (first half of the scaling + PCA dimension-reduction step)
# The scaler learns its statistics from the training split only ...
scaler_lc = StandardScaler().fit(X_train_lc)

# ... and the same fitted transformation is then applied to both splits.
X_train_lc_scaled = scaler_lc.transform(X_train_lc)
X_test_lc_scaled = scaler_lc.transform(X_test_lc)

In [65]:
# Then reduce it to 4 principal components for consistency
# NOTE(review): "consistency" presumably refers to other experiments in this
# project that also use 4 components - confirm.
n_components = 4
pca_lc = PCA(n_components=n_components)
# The projection is fitted on the scaled training split only ...
X_train_lc_pca = pca_lc.fit_transform(X_train_lc_scaled)
# ... and the already-fitted projection is reused for the scaled test split.
X_test_lc_pca = pca_lc.transform(X_test_lc_scaled)

In [66]:
# Summarise the preprocessing result: component count and the resulting array shapes
print(
    f"Data preprocessed and reduced into {n_components} principal components.",
    f"The Training set shape after PCA: {X_train_lc_pca.shape}",
    f"The Test set shape after PCA: {X_test_lc_pca.shape}\n",
    sep="\n",
)

Data preprocessed and reduced into 4 principal components.
The Training set shape after PCA: (22, 4)
The Test set shape after PCA: (10, 4)



#### Linear SVM Implementation

In [67]:
# Announce the linear-SVM run and record the wall-clock start time.
# The matching end timestamp (end_time_lc_linear) is taken in a later cell,
# after the grid search has been fitted, so the reported duration covers
# everything executed between the two cells.
print("--- Training Classical Linear SVM on Lung Cancer Data ---")
start_time_lc_linear = time.time()

--- Training Classical Linear SVM on Lung Cancer Data ---


In [68]:
# Hyper-parameter search for the linear-kernel SVM: only the regularisation
# strength C is tuned.
param_grid_lc_linear = dict(C=[0.01, 0.1, 1, 10, 100])

# cv=3 because n_splits cannot be greater than the number of members in each class;
# the dataset is very small, so it cannot be split into many folds.
# NOTE(review): scaling and PCA were fitted on the whole training split before
# this search, so the CV folds share preprocessing statistics; no `scoring` is
# given, so model selection uses the estimator's default score (accuracy for SVC).
grid_lc_linear = GridSearchCV(
    estimator=SVC(kernel='linear', random_state=42),
    param_grid=param_grid_lc_linear,
    cv=3,
)
grid_lc_linear.fit(X_train_lc_pca, y_train_lc)

0,1,2
,estimator,SVC(kernel='l...ndom_state=42)
,param_grid,"{'C': [0.01, 0.1, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,0.1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [69]:
# Keep the refitted best model from the grid search, then stop the timer
# that was started in the earlier "Training Classical Linear SVM" cell.
linear_svm_lc = grid_lc_linear.best_estimator_
end_time_lc_linear = time.time()

# Report the selected hyper-parameters and the elapsed wall-clock time
print(
    f"Best parameters for Linear SVM: {grid_lc_linear.best_params_}",
    f"Training time for Linear SVM: {end_time_lc_linear - start_time_lc_linear:.2f} seconds\n",
    sep="\n",
)

Best parameters for Linear SVM: {'C': 0.1}
Training time for Linear SVM: 0.14 seconds



In [70]:
# Evaluation for the Linear SVM

# Training split: predictions and accuracy
y_train_pred_lc_linear = linear_svm_lc.predict(X_train_lc_pca)
train_accuracy_lc_linear = accuracy_score(y_train_lc, y_train_pred_lc_linear)

# Test split: predictions and accuracy
y_test_pred_lc_linear = linear_svm_lc.predict(X_test_lc_pca)
test_accuracy_lc_linear = accuracy_score(y_test_lc, y_test_pred_lc_linear)

# Generalization gap = absolute difference between training and test accuracy
gen_gap_lc_linear = abs(train_accuracy_lc_linear - test_accuracy_lc_linear)

In [71]:
# Print the linear-SVM summary followed by the per-class test-set report.
# zero_division=0 reports 0 (instead of warning) for a metric whose denominator is zero.
print(
    "--- Linear SVM Evaluation (Lung Cancer) ---",
    f"Training Accuracy: {train_accuracy_lc_linear:.4f}",
    f"Test Accuracy:     {test_accuracy_lc_linear:.4f}",
    f"Generalization Gap: {gen_gap_lc_linear:.4f}",
    "\nClassification Report (Test Set):",
    classification_report(y_test_lc, y_test_pred_lc_linear, zero_division=0),
    sep="\n",
)

--- Linear SVM Evaluation (Lung Cancer) ---
Training Accuracy: 0.8636
Test Accuracy:     0.7000
Generalization Gap: 0.1636

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.70      1.00      0.82         7

    accuracy                           0.70        10
   macro avg       0.35      0.50      0.41        10
weighted avg       0.49      0.70      0.58        10



#### RBF kernel SVM

In [72]:
# Announce the RBF-SVM run and record the wall-clock start time.
# The matching end timestamp (end_time_lc_rbf) is taken in a later cell,
# after the grid search has been fitted, so the reported duration covers
# everything executed between the two cells.
print("\n--- Training Classical RBF SVM on Lung Cancer Data ---")
start_time_lc_rbf = time.time()



--- Training Classical RBF SVM on Lung Cancer Data ---


In [73]:
# RBF
# Hyper-parameter search for the RBF-kernel SVM over the regularisation
# strength C and the kernel coefficient gamma (4 x 4 = 16 combinations).
param_grid_lc_rbf = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
)

# Using cv=3, the same number of folds as the linear-SVM search
grid_lc_rbf = GridSearchCV(
    estimator=SVC(kernel='rbf', random_state=42),
    param_grid=param_grid_lc_rbf,
    cv=3,
)
grid_lc_rbf.fit(X_train_lc_pca, y_train_lc)

0,1,2
,estimator,SVC(random_state=42)
,param_grid,"{'C': [0.1, 1, ...], 'gamma': [1, 0.1, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,0.01
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [74]:
# Keep the refitted best model from the grid search, then stop the timer
# that was started in the earlier "Training Classical RBF SVM" cell.
rbf_svm_lc = grid_lc_rbf.best_estimator_
end_time_lc_rbf = time.time()

# Report the selected hyper-parameters and the elapsed wall-clock time
print(
    f"Best parameters for RBF SVM: {grid_lc_rbf.best_params_}",
    f"Training time for RBF SVM: {end_time_lc_rbf - start_time_lc_rbf:.2f} seconds\n",
    sep="\n",
)

Best parameters for RBF SVM: {'C': 10, 'gamma': 0.01}
Training time for RBF SVM: 0.16 seconds



In [75]:
# Evaluate the RBF SVM

# Training split: predictions and accuracy
y_train_pred_lc_rbf = rbf_svm_lc.predict(X_train_lc_pca)
train_accuracy_lc_rbf = accuracy_score(y_train_lc, y_train_pred_lc_rbf)

# Test split: predictions and accuracy
y_test_pred_lc_rbf = rbf_svm_lc.predict(X_test_lc_pca)
test_accuracy_lc_rbf = accuracy_score(y_test_lc, y_test_pred_lc_rbf)

# Generalization gap = absolute difference between training and test accuracy
gen_gap_lc_rbf = abs(train_accuracy_lc_rbf - test_accuracy_lc_rbf)


In [76]:
# Print the RBF-SVM summary followed by the per-class test-set report.
# zero_division=0 reports 0 (instead of warning) for a metric whose denominator is zero.
print(
    "--- RBF SVM Evaluation (Lung Cancer) ---",
    f"Training Accuracy: {train_accuracy_lc_rbf:.4f}",
    f"Test Accuracy:     {test_accuracy_lc_rbf:.4f}",
    f"Generalization Gap: {gen_gap_lc_rbf:.4f}",
    "\nClassification Report (Test Set):",
    classification_report(y_test_lc, y_test_pred_lc_rbf, zero_division=0),
    sep="\n",
)

--- RBF SVM Evaluation (Lung Cancer) ---
Training Accuracy: 0.9545
Test Accuracy:     0.9000
Generalization Gap: 0.0545

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.88      1.00      0.93         7

    accuracy                           0.90        10
   macro avg       0.94      0.83      0.87        10
weighted avg       0.91      0.90      0.89        10



In [77]:
# The code now includes mode imputation.
# Target binarization - the 'label' column is converted to a binary variable.

# Why are only 3 cross-validation folds used?
# The Spambase dataset is large enough for 5-fold cross-validation,
# but the lung cancer dataset is very small: only 32 instances, of which only 22 remain for training after the split.
# Overall, that leaves only roughly 6 minority-class samples for training.
# 5-fold CV cannot be done if a class has fewer than 5 samples, so the number of folds was reduced to 3.
