In [1]:
## Ensemble Model Script

# Script Name: EnsembleScript.ipynb
# Purpose: Provide an example of an ensemble model
# Author: K21014303
# Date: Last edited 20/11/24

In [2]:
## Importing libraries and functions

# Core Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    LabelEncoder,
    PowerTransformer
)
from sklearn.utils import class_weight
# Must stay above the model_selection import: it enables HalvingGridSearchCV
from sklearn.experimental import enable_halving_search_cv

# Model Selection and evaluation
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    StratifiedShuffleSplit,
    KFold,
    cross_val_score,
    cross_val_predict,
    GridSearchCV,
    HalvingGridSearchCV
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Modelling
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.svm import SVC, NuSVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Input
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import set_random_seed

# Pipeline
from sklearn.pipeline import Pipeline

# TensorFlow and Keras optimisers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay


In [None]:
## Loading data

# Reading the dataset from the .CSV file
# (the file cannot be provided on github as it has confidential data)
data = pd.read_csv('data.csv')

# Printing the data shape, then the first few rows for a quick inspection
# NOTE(review): the preview shows participant-level rows of a confidential dataset -
# clear this cell's output before sharing the notebook
print(data.shape, data.head(), sep='\n')

# Data layout: participant ID, Age, Diagnosis, MRI Field Strength, and the remaining
# 292 columns are MRI-derived features

In [3]:
## Preparing data

# Removing the identifier, demographic, label and scanner columns so that only the
# remaining feature columns are used as model inputs
non_feature_columns = ['ID', 'Age', 'Diagnosis', 'FieldStrength']
number_data = data.drop(columns=non_feature_columns)
feature_names = number_data.columns.to_list()  # Keeping the feature names for later use

# Turning the diagnostic labels into integer class codes
label_encoder = LabelEncoder().fit(data['Diagnosis'])
y_encoded = label_encoder.transform(data['Diagnosis'])


NameError: name 'data' is not defined

In [None]:
## Exploratory data analysis
# Pairwise correlations between all feature columns
corr_matrix = number_data.corr()

# Drawing the correlation matrix as a heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Observation: most of the MRI features are correlated with one another, except the
# asymmetry features. We should therefore be aware of multicollinearity among the
# highly correlated features.

In [None]:
## Splitting the dataset into training (80%) and testing (20%) sets to evaluate the model performance on unseen data
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices
train_index, test_index = next(sss.split(number_data, y_encoded))

# Selecting the feature rows and the encoded labels for each partition
X_train = number_data.iloc[train_index]
X_test = number_data.iloc[test_index]
y_train = y_encoded[train_index]
y_test = y_encoded[test_index]

In [None]:
## Creating an ensemble model with weighting

# Seeding Python, NumPy and the Keras backend so that weight initialisation, dropout and
# batch shuffling are repeatable between runs (some GPU ops may still be non-deterministic)
set_random_seed(42)

# Defining MLP model
n_classes = len(np.unique(y_encoded))                 # One softmax unit per diagnostic class
mlp_model = Sequential([
    Input(shape=(X_train.shape[1],)),                 # One input per feature column of X_train
    Dense(256, activation=None),                      # First dense layer (activation applied after batch norm)
    BatchNormalization(),                             # Normalise activations
    LeakyReLU(negative_slope=0.2),                    # LeakyReLU activation function
    Dropout(0.4),                                     # Dropout for regularisation

    Dense(128, activation=None),                      # Second dense layer
    BatchNormalization(),                             # Batch normalisation
    LeakyReLU(negative_slope=0.2),                    # LeakyReLU activation
    Dropout(0.4),                                     # Dropout for regularisation

    Dense(n_classes, activation='softmax')            # Output layer with softmax for multi-class classification
])

# Compiling the MLP model with Adam optimizer, sparse categorical cross-entropy loss
# (labels are integer codes, not one-hot vectors), and accuracy metric
mlp_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Calculating class weights for balancing.
# The weights are keyed by the actual class labels rather than by position, so the
# mapping stays correct even if the labels are not exactly 0..k-1.
train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weight_dict = {int(label): float(weight) for label, weight in zip(train_classes, class_weights)}

# Halving the learning rate whenever the validation loss has not improved for 5 epochs
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

# Fitting the MLP model on our training data.
# validation_split=0.2 holds out the last 20% of the training rows (taken before
# shuffling) to compute val_loss; that slice is not stratified.
# NOTE(review): the inputs are fed unscaled, whereas the NuSVC pipeline standardises
# them - confirm this is intended. EarlyStopping is imported but unused; training
# always runs the full 350 epochs.
mlp_history = mlp_model.fit(X_train, y_train,
                            validation_split=0.2, epochs=350,
                            batch_size=32, callbacks=[reduce_lr],
                            class_weight=class_weight_dict)

# Defining the NuSVC model with RBF kernel and class weight balancing
# probability=True is required so that predict_proba is available for the ensemble
nusvc_model = NuSVC(kernel='rbf', nu=0.5, probability=True, class_weight='balanced', max_iter=2000, random_state=42)

# Creating a pipeline for the NuSVC model: standardise -> reduce to 93 principal components -> classify.
# PCA is given a fixed random_state because the default svd_solver='auto' can select the
# randomized solver, which would otherwise make the projection vary between runs.
nusvc_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=93, random_state=42)),
    ('classifier', nusvc_model)
])

# Training the NuSVC pipeline using our training data
# (the scaler and PCA are fitted on the training rows only)
nusvc_pipeline.fit(X_train, y_train)

# Getting the predicted probabilities from both models
# Both matrices have one row per test sample and one column per class, with the columns
# in encoded-label order (i.e. the order of label_encoder.classes_)
mlp_probs = mlp_model.predict(X_test)                 # MLP prediction probabilities
stacked_probs = nusvc_pipeline.predict_proba(X_test)  # NuSVC pipeline prediction probabilities

if mlp_probs.shape != stacked_probs.shape:
    raise ValueError(
        f"Probability matrices disagree in shape: MLP {mlp_probs.shape} vs NuSVC {stacked_probs.shape}"
    )

# Defining class-specific weights for combining the models as (MLP weight, NuSVC weight).
# Keying by class name ties each pair to its class explicitly instead of relying on row order.
class_model_weights = {
    'AD':  (0.8, 0.2),    # AD: Higher weight for MLP
    'CN':  (0.75, 0.25),  # CN: Higher weight for MLP
    'MCI': (0.2, 0.8),    # MCI: Higher weight for SVM
}
# Experimented with different weighting to find optimal proportions with insight from previous model results
# NOTE(review): if these proportions were tuned against the test set, the accuracy reported
# below is optimistic - confirm they were chosen on validation data.

# One row per class, ordered like the probability columns; raises KeyError if a class has no weights
weights = np.array([class_model_weights[name] for name in label_encoder.classes_])

# Combining probabilities using class-specific weights.
# Broadcasting scales column i of each matrix by that class's weight for the model.
# Rows of the result need not sum to 1; only the relative ordering is used below.
ensemble_probs = weights[:, 0] * mlp_probs + weights[:, 1] * stacked_probs

# Final predictions by selecting the class with the highest combined score
ensemble_preds = np.argmax(ensemble_probs, axis=1)

# Evaluate the ensemble model
class_names = label_encoder.classes_  # Original diagnosis strings, in encoded-label order
accuracy = accuracy_score(y_test, ensemble_preds)
print(f"Weighted Ensemble Accuracy: {accuracy}")

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, ensemble_preds)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='hot', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Classification Report (per-class precision, recall and F1)
report = classification_report(y_test, ensemble_preds, target_names=class_names)
print("\nClassification Report:\n", report)

# Weighted Ensemble Accuracy: 0.5634328358208955
# Classification Report:
#                precision    recall  f1-score   support

#           AD       0.55      0.69      0.61        52
#           CN       0.52      0.58      0.55        79
#          MCI       0.61      0.50      0.55       137

#     accuracy                           0.56       268
#    macro avg       0.56      0.59      0.57       268
# weighted avg       0.57      0.56      0.56       268