In [None]:
## Support Vector Machine Script

# Script Name: SVMScript.ipynb
# Purpose: Provide an example of an SVM model
# Author: K21014303
# Date: Last edited 20/11/24

In [None]:
## Importing libraries and functions

# Core Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing 
from sklearn.preprocessing import (
    StandardScaler, 
    MinMaxScaler, 
    RobustScaler, 
    LabelEncoder, 
    PowerTransformer
)
from sklearn.utils import class_weight

# Model Selection and evaluation
from sklearn.model_selection import (
    train_test_split, 
    StratifiedKFold, 
    StratifiedShuffleSplit, 
    KFold, 
    cross_val_score, 
    cross_val_predict, 
    GridSearchCV
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Modelling 
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, NuSVC
from tensorflow.keras.utils import to_categorical

# Pipeline 
from sklearn.pipeline import Pipeline


In [None]:
## Loading data

# Read the study dataset from the working directory.
# The CSV itself is not distributed on GitHub because it contains confidential data.
data = pd.read_csv(filepath_or_buffer='data.csv')

# Report the table dimensions as (rows, columns)
print(data.shape)

# Preview the first five rows to sanity-check the load
print(data.head(n=5))

# Expected layout (per the author's notes): participant ID, Age, Diagnosis, MRI Field Strength,
# followed by 292 columns of MRI-derived features

In [None]:
## Preparing data

# Build the feature matrix by removing the identifier, age, label and scanner columns,
# leaving only the MRI-derived feature columns
number_data = data.drop(['ID', 'Age', 'Diagnosis', 'FieldStrength'], axis=1)

# Keep the feature column names so they can be referred to later
feature_names = list(number_data.columns)

# Map the diagnosis strings to integer class labels.
# The fitted encoder is kept so that class names can be recovered via label_encoder.classes_
label_encoder = LabelEncoder()
label_encoder.fit(data['Diagnosis'])
y_encoded = label_encoder.transform(data['Diagnosis'])



In [None]:
## Exploratory data analysis

# Pairwise correlation between all feature columns
corr_matrix = number_data.corr()

# Draw the correlation matrix as a heatmap (cells are not annotated, so fmt has no visible effect)
plt.figure(figsize=(20, 15))
sns.heatmap(
    data=corr_matrix,
    cmap='coolwarm',
    annot=False,
    fmt=".2f",
    linewidths=0.5,
)
plt.gca().set_title('Correlation Heatmap')
plt.show()

# Author's observation: most MRI features are correlated with each other, except the asymmetry features.
# Multicollinearity among the highly correlated features should therefore be kept in mind.

In [None]:
## Hold out 20% of the samples as a test set (80% training) so the model is evaluated on unseen data.
## The split is stratified on the encoded diagnosis labels and seeded for reproducibility.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional index arrays
train_index, test_index = next(sss.split(number_data, y_encoded))

# Slice features (DataFrame, positional indexing) and labels (array) with the same indices
X_train = number_data.iloc[train_index]
X_test = number_data.iloc[test_index]
y_train = y_encoded[train_index]
y_test = y_encoded[test_index]

In [None]:
## Support Vector Machine

# Proposed pipeline 1: 
# Scaling -> Dimensionality Reduction (PCA) -> kernel SVM (linear/rbf) -> Cross Validation (k = 5) -> Model Evaluation 

# Proposed pipeline 2:
# Scaling -> Dimensionality Reduction (PCA) -> kernel nuSVM (linear/rbf) -> Cross Validation (k = 5) -> Model Evaluation 


In [1]:
## Proposed pipeline 1: SVC with a linear OR radial basis function (RBF) kernel, tuned with GridSearchCV
# Rationale for the kernels (author's notes):
#   - linear: the data is high-dimensional, even though it is not expected to be easily linearly separable
#   - rbf:    the relationships within the data are complex and unknown

# SVC with balanced class weights; max_iter caps the solver iterations per fit
svcmodel = SVC(
    class_weight='balanced',
    probability=True,
    max_iter=1000,
    random_state=42,
)

# Three-stage pipeline: scaler -> PCA -> classifier.
# The scaler step starts as 'passthrough' and is filled in by the grid search.
svcpipeline = Pipeline(steps=[
    ('scaler', 'passthrough'),
    ('pca', PCA()),
    ('classifier', svcmodel),
])

# Search space: which scaler to use, how many principal components to keep, and which kernel
svcparam_grid = {
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca__n_components': range(1, 292),   # 1..291 components (the data has 292 MRI features)
    'classifier__kernel': ['linear', 'rbf'],
}

# Exhaustive search scored on accuracy with 5-fold cross-validation,
# run on all available cores with verbose progress output
svcgrid_search = GridSearchCV(
    svcpipeline,
    svcparam_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
svcgrid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy
svcbest_params = svcgrid_search.best_params_
svcbest_accuracy = svcgrid_search.best_score_

print(f'Best parameters: {svcbest_params}')
print(f'Best cross-validated accuracy: {svcbest_accuracy}')

# Pipeline refit with the best parameters, used to predict the held-out test set
svcbest_model = svcgrid_search.best_estimator_
y_pred = svcbest_model.predict(X_test)

# Confusion matrix heatmap with the original diagnosis names on both axes.
# With the 'hot' colormap, the diagonal cells should be the closest to white/yellow
# when the true labels are classified well.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='hot',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.gca().set(title='Confusion Matrix', xlabel='Predicted Label', ylabel='True Label')
plt.show()

# Per-class precision/recall/F1, to be read alongside the confusion matrix
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Recorded output from the author's run:
# Best parameters: {'classifier__kernel': 'rbf', 'pca__n_components': 72, 'scaler': StandardScaler()}
# Best cross-validated accuracy: 0.5304
#               precision    recall  f1-score   support
#
#           AD       0.50      0.65      0.57        52
#           CN       0.45      0.72      0.56        79
#          MCI       0.61      0.33      0.43       137
#
#     accuracy                           0.51       268
#    macro avg       0.52      0.57      0.52       268
# weighted avg       0.54      0.51      0.49       268
NameError: name 'SVC' is not defined

In [None]:
## Proposed pipeline 2: NuSVC with a linear or radial kernel, tuned with GridSearchCV
# Rationale (author's notes): NuSVC can be used for imbalanced datasets where managing errors and
# ensuring sufficient support vectors is critical. The nu parameter controls misclassification and
# margin errors, which could benefit this imbalanced and noisy data set.

# NuSVC with balanced class weights; max_iter caps the solver iterations per fit
nusvc_model = NuSVC(
    class_weight='balanced',
    probability=True,
    max_iter=2000,
    random_state=42,
)

# Three-stage pipeline: scaler -> PCA -> classifier.
# The scaler step starts as 'passthrough' and is filled in by the grid search.
nusvc_pipeline = Pipeline(steps=[
    ('scaler', 'passthrough'),
    ('pca', PCA()),
    ('classifier', nusvc_model),
])

# Search space: scaler, number of principal components, kernel and nu
nusvc_param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca__n_components': range(1, 292),       # 1..291 components
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__nu': [0.1, 0.3, 0.5, 0.7],   # candidate nu values, all between 0 and 1
}

# Exhaustive search scored on accuracy with 5-fold cross-validation,
# run on all available cores with verbose progress output
nusvc_grid_search = GridSearchCV(
    nusvc_pipeline,
    nusvc_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
nusvc_grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy
nusvc_best_params = nusvc_grid_search.best_params_
nusvc_best_accuracy = nusvc_grid_search.best_score_

print(f'Best parameters: {nusvc_best_params}')
print(f'Best cross-validated accuracy: {nusvc_best_accuracy}')

# Pipeline refit with the best parameters, used to predict the held-out test set
nusvc_best_model = nusvc_grid_search.best_estimator_
y_pred = nusvc_best_model.predict(X_test)

# Confusion matrix heatmap with the original diagnosis names on both axes
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='hot',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.gca().set(title='Confusion Matrix', xlabel='Predicted Label', ylabel='True Label')
plt.show()

# Per-class precision/recall/F1
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Recorded output from the author's run:
# Best parameters: {'classifier__kernel': 'linear', 'classifier__nu': 0.5, 'pca__n_components': 93, 'scaler': StandardScaler()}
# Best cross-validated accuracy: 0.5901
#               precision    recall  f1-score   support

#           AD       0.52      0.31      0.39        52
#           CN       0.47      0.47      0.47        79
#          MCI       0.53      0.62      0.57       137

#     accuracy                           0.51       268
#    macro avg       0.51      0.47      0.48       268
# weighted avg       0.51      0.51      0.51       268

In [None]:
## Proposed pipeline 2 with stacking and nuSVC
# A two-layer stacked ensemble: NuSVC and RandomForest pipelines form the first layer and a
# Logistic Regression meta-model combines their outputs. Only the meta-model is tuned here.

# --- First layer: base models ---

# NuSVC classifier (balanced class weights; max_iter caps the solver iterations per fit)
nusvc_model = NuSVC(probability=True, class_weight='balanced', max_iter=2000, random_state=42)

# RandomForest classifier with fixed hyperparameters
rf_model = RandomForestClassifier(
    bootstrap = False,
    max_depth = 30, 
    min_samples_leaf = 4, 
    min_samples_split = 2,
    n_estimators = 100, 
    random_state=42
)
 
# Each base model gets its own scaler + PCA so preprocessing is fitted inside every CV fold
nusvc_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler for NuSVC
    ('pca', PCA(n_components=93)), # 93 components: the best value recorded for the NuSVC grid search
    ('classifier', nusvc_model)    # NuSVC classifier
])

rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler for RandomForest
    ('pca', PCA(n_components=50)), # 50 components: stated by the author as best for RandomForest (tuned elsewhere - TODO confirm)
    ('classifier', rf_model)       # RandomForest classifier
])

# --- Second layer: meta-classifier ---

# Stacking classifier combining the two base pipelines through a Logistic Regression meta-model
stacking_model = StackingClassifier(
    estimators=[('nusvc', nusvc_pipeline), ('rf', rf_pipeline)],
    final_estimator=LogisticRegression() # Logistic Regression as the meta-model
)

# Search space for the meta-model only (the base pipelines keep their fixed settings)
param_grid = {
    'final_estimator__C': [0.1, 1, 10],                # Hyperparameter for Logistic Regression meta-model
    'final_estimator__solver': ['liblinear', 'saga'],  # Solver for Logistic Regression
}

stacking_grid_search = GridSearchCV(estimator=stacking_model,
    param_grid=param_grid,
    scoring='accuracy',    # Scoring for accuracy
    cv=5,                  # 5-fold cross-validation
    n_jobs=-1,             # Use all processors 
    verbose=10)            # Progress output

# Fitting the stacking classifier to the training data
stacking_grid_search.fit(X_train, y_train)

# Best parameters and cross-validated accuracy
best_params = stacking_grid_search.best_params_
best_accuracy = stacking_grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best cross-validated accuracy: {best_accuracy}')

# Stacking model refit with the best meta-model parameters
best_stacking_model = stacking_grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_stacking_model.predict(X_test)

# Evaluating the model on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy}")

# Confusion matrix heatmap with the original diagnosis names on both axes
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='hot', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Classification report. target_names is passed so the rows show the diagnosis names
# (as in the SVC and NuSVC reports) instead of the encoded integers 0/1/2.
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Recorded output from the author's run. It was produced before target_names was passed, so the
# rows are labelled with the encoded integers; the support counts match AD/CN/MCI (52/79/137) in
# the SVC and NuSVC reports, i.e. 0 = AD, 1 = CN, 2 = MCI.
# Best parameters: {'final_estimator__C': 10, 'final_estimator__solver': 'liblinear'}
# Best cross-validated accuracy: 0.5518148228645946
# Test Set Accuracy: 0.5186567164179104
# Classification Report:
#                precision    recall  f1-score   support

#            0       0.60      0.29      0.39        52
#            1       0.46      0.23      0.31        79
#            2       0.52      0.77      0.62       137

#     accuracy                           0.52       268
#    macro avg       0.53      0.43      0.44       268
# weighted avg       0.52      0.52      0.48       268