#### Classical SVM Implementation - Lung Cancer

In [17]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2_contingency

##### Data Preprocessing

In [18]:
# Load data first
lung_cancer_column_names = ['label'] + [f'attr_{i}' for i in range(1, 57)]
file_path_lung = r'C:\Users\User\Documents\MyProjects\FYP_ResearchProject\data\lung+cancer\lung-cancer.data'

# reads the data, treating "?" as missing values
df_lung = pd.read_csv(file_path_lung, header=None, names=lung_cancer_column_names, na_values=['?'])

print(f"Original shape of Lung Cancer data: {df_lung.shape}")

Original shape of Lung Cancer data: (32, 57)


In [19]:
# Impute missing values first
# modes = df_lung.mode().iloc[0]
# df_lung.fillna(modes, inplace=True)

# Then check if all Nan are gone
print(f"Total missing values after imputation: {df_lung.isnull().sum().sum()}\n")

print("--- Handling Missing Values ---")
missing_before = df_lung.isnull().sum().sum()
print(f"Total missing values before imputation: {missing_before}")

# Find columns with missing values
missing_cols = df_lung.columns[df_lung.isnull().any()].tolist()
if missing_cols:
    print(f"Columns with missing values: {missing_cols}")
    
    # Impute with the mode using a more robust method
    for col in missing_cols:
        mode_val = df_lung[col].mode()[0]
        # This is the key change: Assign the result back to the column
        df_lung[col] = df_lung[col].fillna(mode_val)
        print(f"Missing values in '{col}' imputed with mode value: {mode_val}")

missing_after = df_lung.isnull().sum().sum()
print(f"\nTotal missing values after imputation: {missing_after}")

Total missing values after imputation: 5

--- Handling Missing Values ---
Total missing values before imputation: 5
Columns with missing values: ['attr_4', 'attr_38']
Missing values in 'attr_4' imputed with mode value: 1.0
Missing values in 'attr_38' imputed with mode value: 2.0

Total missing values after imputation: 0


In [20]:
# Target Binarization
df_lung['label_binary'] = df_lung['label'].apply(lambda x: 0 if x == 1 else 1)

In [21]:
# Separate Features and Target Data
X_lung = df_lung.drop(['label', 'label_binary'], axis=1)
y_lung_binary = df_lung['label_binary']

# Split Data
X_train_lc, X_test_lc, y_train_lc, y_test_lc = train_test_split(
    X_lung, y_lung_binary, test_size=0.3, random_state=42, stratify=y_lung_binary
)

In [22]:
# One-Hot Encoding
print("--- One-Hot Encoding Implementation ---")
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

--- One-Hot Encoding Implementation ---


In [23]:
# Then fit on the training data and transform both train and test
X_train_lc_encoded = pd.DataFrame(encoder.fit_transform(X_train_lc),
columns=encoder.get_feature_names_out())
X_test_lc_encoded = pd.DataFrame(encoder.transform(X_test_lc),
columns=encoder.get_feature_names_out())

print(f"Training set shape after encoding: {X_train_lc_encoded.shape}")
print(f"Test set shape after encoding: {X_test_lc_encoded.shape}\n")

Training set shape after encoding: (22, 153)
Test set shape after encoding: (10, 153)



In [24]:
print("--- Feature Selection ---")

def cramers_v(x, y):
    # (Your Cramer's V function here)
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    if min((kcorr-1), (rcorr-1)) == 0: return 0
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Calculate scores on the ENCODED training data
cramers_scores = {col: cramers_v(X_train_lc_encoded[col], y_train_lc) for col in X_train_lc_encoded.columns}
cramers_series = pd.Series(cramers_scores).sort_values(ascending=False)

# Select top 10 features
N_FEATURES_TO_SELECT = 10 
top_features = cramers_series.head(N_FEATURES_TO_SELECT).index.tolist()
print(f"Top {N_FEATURES_TO_SELECT} features selected:")
print(top_features)

# Create the final datasets with only the selected features
X_train_lc_final = X_train_lc_encoded[top_features]
X_test_lc_final = X_test_lc_encoded[top_features]

print(f"\nFinal training data shape: {X_train_lc_final.shape}")
print(f"Final testing data shape: {X_test_lc_final.shape}\n")

--- Feature Selection ---
Top 10 features selected:
['attr_13_3', 'attr_26_2', 'attr_13_2', 'attr_1_1', 'attr_3_0', 'attr_3_1', 'attr_2_2', 'attr_2_3', 'attr_1_0', 'attr_4_2.0']

Final training data shape: (22, 10)
Final testing data shape: (10, 10)



#### Linear SVM Implementation

In [25]:
print("--- Training Classical Linear SVM on Lung Cancer Data ---")
start_time_lc_linear = time.time()

param_grid_lc_linear = {
    'C': [0.1, 1, 10, 100],
    'class_weight': ['balanced']
}
grid_lc_linear = GridSearchCV(SVC(kernel='linear', random_state=42), param_grid_lc_linear, cv=3) # cv=3 because n_splits cannot be greater than the number of members in each class.
grid_lc_linear.fit(X_train_lc_final, y_train_lc)

# cv=3 is used because the dataset is very small (cannot split into many2 folds)
# Get the best model estimator
linear_svm_lc = grid_lc_linear.best_estimator_
end_time_lc_linear = time.time()

print(f"Best parameters for Linear SVM: {grid_lc_linear.best_params_}")
print(f"Training time for Linear SVM: {end_time_lc_linear - start_time_lc_linear:.2f} seconds\n")

--- Training Classical Linear SVM on Lung Cancer Data ---
Best parameters for Linear SVM: {'C': 1, 'class_weight': 'balanced'}
Training time for Linear SVM: 0.44 seconds



In [26]:
# Evaluation for the Linear SVM
y_train_pred_lc_linear = linear_svm_lc.predict(X_train_lc_final)
y_test_pred_lc_linear = linear_svm_lc.predict(X_test_lc_final)

train_accuracy_lc_linear = accuracy_score(y_train_lc, y_train_pred_lc_linear)
test_accuracy_lc_linear = accuracy_score(y_test_lc, y_test_pred_lc_linear)
gen_gap_lc_linear = abs(train_accuracy_lc_linear - test_accuracy_lc_linear)

print("--- Linear SVM Evaluation (Lung Cancer) ---")
print(f"Training Accuracy: {train_accuracy_lc_linear:.4f}")
print(f"Test Accuracy:     {test_accuracy_lc_linear:.4f}")
print(f"Generalization Gap: {gen_gap_lc_linear:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test_lc, y_test_pred_lc_linear, zero_division=0))

--- Linear SVM Evaluation (Lung Cancer) ---
Training Accuracy: 0.7727
Test Accuracy:     0.7000
Generalization Gap: 0.0727

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.50      0.67      0.57         3
           1       0.83      0.71      0.77         7

    accuracy                           0.70        10
   macro avg       0.67      0.69      0.67        10
weighted avg       0.73      0.70      0.71        10



#### RBF kernel SVM

In [27]:
print("\n--- Training Classical RBF SVM on Lung Cancer Data ---")
start_time_lc_rbf = time.time()

# RBF
param_grid_lc_rbf = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'class_weight': ['balanced']
}
grid_lc_rbf = GridSearchCV(SVC(kernel='rbf', random_state=42), param_grid_lc_rbf, cv=3) # Using cv=3
grid_lc_rbf.fit(X_train_lc_final, y_train_lc)

# Then get the best estimator
rbf_svm_lc = grid_lc_rbf.best_estimator_
end_time_lc_rbf = time.time()
print(f"Best parameters for RBF SVM: {grid_lc_rbf.best_params_}")
print(f"Training time for RBF SVM: {end_time_lc_rbf - start_time_lc_rbf:.2f} seconds\n")


--- Training Classical RBF SVM on Lung Cancer Data ---
Best parameters for RBF SVM: {'C': 10, 'class_weight': 'balanced', 'gamma': 0.1}
Training time for RBF SVM: 1.72 seconds



In [28]:
# Evaluate the RBF SVM
y_train_pred_lc_rbf = rbf_svm_lc.predict(X_train_lc_final)
y_test_pred_lc_rbf = rbf_svm_lc.predict(X_test_lc_final)

train_accuracy_lc_rbf = accuracy_score(y_train_lc, y_train_pred_lc_rbf)
test_accuracy_lc_rbf = accuracy_score(y_test_lc, y_test_pred_lc_rbf)
gen_gap_lc_rbf = abs(train_accuracy_lc_rbf - test_accuracy_lc_rbf)

print("--- RBF SVM Evaluation (Lung Cancer) ---")
print(f"Training Accuracy: {train_accuracy_lc_rbf:.4f}")
print(f"Test Accuracy:     {test_accuracy_lc_rbf:.4f}")
print(f"Generalization Gap: {gen_gap_lc_rbf:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test_lc, y_test_pred_lc_rbf, zero_division=0))

--- RBF SVM Evaluation (Lung Cancer) ---
Training Accuracy: 0.8636
Test Accuracy:     0.8000
Generalization Gap: 0.0636

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.86      0.86      0.86         7

    accuracy                           0.80        10
   macro avg       0.76      0.76      0.76        10
weighted avg       0.80      0.80      0.80        10



In [29]:
# Code now includes mode imputation
# Target binarization - column label is convreted to a binary variable

# Cross validation folds is 3. Why ?
# The Spambase dataset is large enough to do 5-fold cross-validation
# But the lung cancer is very small, only 32 instances. after splitting the training will only have 22 instances
# Overall roughly 6 samples. 
# Cannot do 5-cv if class is fewer than 5 samples. Therefore reduced
