In [None]:
## Random Forest Model Script

# Script Name: RFMScript.ipynb
# Purpose: Provide an example of a Random Forest model
# Author: K21014303
# Date: Last edited 20/11/24

In [1]:
## Importing libraries and functions

# Core Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    LabelEncoder,
    PowerTransformer
)
from sklearn.utils import class_weight

# Model Selection and evaluation
# HalvingGridSearchCV is experimental: the enabling import must run before it is imported
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    StratifiedShuffleSplit,
    KFold,
    cross_val_score,
    cross_val_predict,
    GridSearchCV,
    HalvingGridSearchCV
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Modelling
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.svm import SVC, NuSVC
from tensorflow.keras.utils import to_categorical

# Pipeline
from sklearn.pipeline import Pipeline


In [2]:
## Loading data

# Reading the dataset from the .CSV file
data = pd.read_csv('data.csv')  # Cannot be provided on github as has confidential data

# Printing the (rows, columns) shape first, then a preview of the leading rows
for summary in (data.shape, data.head()):
    print(summary)

# Column layout: participant ID, Age, Diagnosis, MRI Field Strength, followed by 292 MRI-derived feature columns

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [None]:
## Preparing data

# Columns that are identifiers/metadata/labels rather than MRI-derived features
non_feature_columns = ['ID', 'Age', 'Diagnosis', 'FieldStrength']

# Keeping only the MRI feature columns and remembering their names for later use
number_data = data.drop(columns=non_feature_columns)
feature_names = list(number_data.columns)

# Turning the diagnostic labels into integer class codes;
# the fitted encoder is kept so the codes can be mapped back to the original labels
label_encoder = LabelEncoder().fit(data['Diagnosis'])
y_encoded = label_encoder.transform(data['Diagnosis'])


In [None]:
## Exploratory data analysis
# Pairwise correlations between all feature columns
corr_matrix = number_data.corr()

# Drawing the correlation matrix as a heatmap on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Recorded observation: most of the MRI features are correlated with each other, with the
# asymmetry features being the exception. Multicollinearity among the highly correlated
# features should therefore be kept in mind.

In [None]:
## Splitting the dataset into training (80%) and testing (20%) sets to evaluate the model performance on unseen data
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices
train_index, test_index = next(sss.split(number_data, y_encoded))

# Selecting the feature rows and the encoded labels for each side of the split
X_train = number_data.iloc[train_index]
X_test = number_data.iloc[test_index]
y_train = y_encoded[train_index]
y_test = y_encoded[test_index]

In [None]:
## Random Forest Model 

# Proposed pipeline 1: 
# Scaling -> Dimensionality Reduction (PCA) -> RFM -> Cross Validation (k = 5) -> Model Evaluation 

# Proposed pipeline 2 with stacking:
# First layer (two base pipelines): Scaling -> Dimensionality Reduction (PCA) -> SVM/NuSVM, and Scaling -> Dimensionality Reduction (PCA) -> RFM
# Second layer:  Meta classifier (logistic regression) 
# Evaluation: Cross Validation (k = 5) -> Model Evaluation 

In [None]:
## Proposed pipeline 1 with a successive-halving grid search (HalvingGridSearchCV)

# Defining the hyperparameter grid
# Keys use the '<step>__<parameter>' convention; the bare 'scaler' key replaces the whole step
param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],  # Scalers swapped into the 'scaler' step
    'pca__n_components': range(1, 292),        # Candidate numbers of PCA components (1 to 291)
    'rf__n_estimators': [100, 200],            # Number of trees in the RF model
    'rf__max_depth': [None, 10, 20, 30],       # Maximum depth of each tree (None = unlimited)
    'rf__min_samples_split': [2, 5, 10],       # Minimum number of samples required to split a node
    'rf__min_samples_leaf': [1, 2, 4],         # Minimum number of samples required at a leaf node
    'rf__bootstrap': [True, False]             # Option to bootstrap or not
}

# Creating a pipeline that includes scaling, PCA and Random Forest
pipeline = Pipeline([
    ('scaler', 'passthrough'),                       # Placeholder, filled by the 'scaler' grid entry
    ('pca', PCA()),                                  # PCA
    ('rf', RandomForestClassifier(random_state=42))  # Random Forest model
])

# Setting up the search with 5-fold cross-validation.
# HalvingGridSearchCV is experimental in scikit-learn: it can only be imported after
# `from sklearn.experimental import enable_halving_search_cv`, so both imports must be
# present in the imports cell (referencing it without them raises NameError).
# random_state fixes the subsampling of training rows between halving rounds so the
# search is reproducible, matching the seed used everywhere else in this notebook.
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=10,
    random_state=42
)

# Fit the model using the halving search with cross-validation
grid_search.fit(X_train, y_train)

# Best parameters and cross-validated accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best cross-validated accuracy: {best_accuracy}')

# Final model with the best parameters
best_model = grid_search.best_estimator_

# Predicting the held-out test set with the tuned pipeline
y_pred = best_model.predict(X_test)

# Overall accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy}")

# Confusion matrix, drawn with the original diagnosis labels on both axes
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='hot',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Per-class precision / recall / F1 summary
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Recorded output from a previous run.
# NOTE(review): the best parameters below contain no 'scaler' entry although the grid
# searches over scalers, so this output appears to come from an earlier version of the
# grid -- re-run to confirm.

# Best parameters found:  {'pca__n_components': 50, 'rf__bootstrap': False, 'rf__max_depth': 30, 
# 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
# Test Set Accuracy: 0.5410

# Classification Report:
#                precision    recall  f1-score   support

#            0       0.75      0.23      0.35        52
#            1       0.52      0.16      0.25        80
#            2       0.53      0.88      0.66       136

#     accuracy                           0.54       268
#    macro avg       0.60      0.43      0.42       268
# weighted avg       0.57      0.54      0.48       268

In [None]:
## Proposed pipeline 2 with stacking and normal SVC

# Defining the base models for the first layer
# SVC classifier (probability=True so it can provide class probabilities to the stacker)
svc_model = SVC(probability=True, class_weight='balanced', max_iter=2000, random_state=42)

# RF model classifier, using the best hyperparameters recorded for pipeline 1
rf_model = RandomForestClassifier(
    bootstrap = False,
    max_depth = 30, 
    min_samples_leaf = 4, 
    min_samples_split = 2,
    n_estimators = 100, 
    random_state=42
)
 
# Create pipelines for SVC and RandomForest
# The SVC pipeline is named `svc_pipeline` and wraps `svc_model`; the previous
# `nusvc_pipeline` / `nvc_model` names did not match what was defined and raised NameError.
svc_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler for SVC
    ('pca', PCA(n_components=93)), # n_components kept from the original script (its tuning is not shown here)
    ('classifier', svc_model)      # SVC classifier
])

rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler for RandomForest
    ('pca', PCA(n_components=50)), # PCA with the best n_components recorded for RandomForest
    ('classifier', rf_model)       # RandomForest classifier
])

# Defining the meta-classifier for the second layer
# Create a Stacking Classifier with the two base models
# (LogisticRegression comes from sklearn.linear_model and must be imported in the imports cell)
stacking_model = StackingClassifier(
    estimators=[('svc', svc_pipeline), ('rf', rf_pipeline)],
    final_estimator=LogisticRegression() # Logistic Regression as the meta-model
)

# Set up GridSearchCV to optimize the stacking classifier
# Only the meta-model is tuned; the base pipelines keep the fixed settings above
param_grid = {
    'final_estimator__C': [0.1, 1, 10],                # Hyperparameter for Logistic Regression meta-model
    'final_estimator__solver': ['liblinear', 'saga'],  # Solver for Logistic Regression
}

stacking_grid_search = GridSearchCV(estimator=stacking_model,
    param_grid=param_grid,
    scoring='accuracy',    # Scoring for accuracy
    cv=5,                  # 5-fold cross-validation
    n_jobs=-1,             # Use all processors 
    verbose=10)            # Return output 

# Fitting the stacking classifier to the training data
stacking_grid_search.fit(X_train, y_train)

# Best parameters and cross-validated accuracy
best_params = stacking_grid_search.best_params_
best_accuracy = stacking_grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best cross-validated accuracy: {best_accuracy}')

# Getting the best stacking model
best_stacking_model = stacking_grid_search.best_estimator_

# Predicting the held-out test set with the tuned stacking model
y_pred = best_stacking_model.predict(X_test)

# Overall accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy}")

# Confusion matrix, drawn with the original diagnosis labels on both axes
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='hot',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Per-class precision / recall / F1 summary
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Recorded output from a previous run:

# Best parameters: {'final_estimator__C': 10, 'final_estimator__solver': 'liblinear'}
# Best cross-validated accuracy: 0.5760878069984786
# Test Set Accuracy: 0.5261194029850746
# Classification Report:
#                precision    recall  f1-score   support

#            0       0.50      0.27      0.35        52
#            1       0.52      0.33      0.40        79
#            2       0.53      0.74      0.62       137

#     accuracy                           0.53       268
#    macro avg       0.52      0.45      0.46       268
# weighted avg       0.52      0.53      0.50       268