In [4]:
## Deep Neural Network Model Script

# Script Name: DNNScript.ipynb
# Purpose: Provide an example of a Deep Neural Network model
# Author: K21014303
# Date: Last edited 20/11/24

In [2]:
## Importing libraries and functions

# Core Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing 
from sklearn.preprocessing import (
    StandardScaler, 
    MinMaxScaler, 
    RobustScaler, 
    LabelEncoder, 
    PowerTransformer
)
from sklearn.utils import class_weight

# Model Selection and evaluation
from sklearn.model_selection import (
    train_test_split, 
    StratifiedKFold, 
    StratifiedShuffleSplit, 
    KFold, 
    cross_val_score, 
    cross_val_predict, 
    GridSearchCV
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Modelling 
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.svm import SVC, NuSVC
from tensorflow.keras.utils import to_categorical

# Pipeline 
from sklearn.pipeline import Pipeline


In [3]:
## Loading data

# Read the dataset from CSV (the file holds confidential data, so it cannot be provided on GitHub)
data = pd.read_csv('data.csv')

# Show the table dimensions, then a preview of the first few rows
print(data.shape, data.head(), sep='\n')

# Column layout: participant ID, Age, Diagnosis and MRI Field Strength; the remaining 292 columns are MRI-derived features

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [None]:
## Preparing data

# Columns that are not used as model inputs: the participant identifier, age,
# the diagnosis label (the prediction target) and the scanner field strength
non_feature_columns = ['ID', 'Age', 'Diagnosis', 'FieldStrength']

# Feature table: every column except the ones listed above
number_data = data.drop(columns=non_feature_columns)
feature_names = list(number_data.columns)  # Feature names, kept for later use

# Integer-encode the diagnostic labels; the fitted encoder is kept so the
# integer codes can be mapped back to the original class names
label_encoder = LabelEncoder().fit(data['Diagnosis'])
y_encoded = label_encoder.transform(data['Diagnosis'])


In [None]:
## Exploratory data analysis
# Pairwise correlations between all feature columns
corr_matrix = number_data.corr()

# Draw the correlations as a heatmap on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Correlation matrix shows most of our MRI features are correlated except the asymmetry features. This suggests we should be aware of 
# multicolinearity with our highly correlated features. 

In [None]:
## Holding out a stratified 20% testing set (the other 80% is used for training) to evaluate the model on unseen data
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional index arrays
train_index, test_index = next(sss.split(number_data, y_encoded))

# Features are selected by position from the DataFrame, labels by position from the encoded array
X_train = number_data.iloc[train_index]
X_test = number_data.iloc[test_index]
y_train = y_encoded[train_index]
y_test = y_encoded[test_index]

In [None]:
## DNN Model 

# Proposed pipeline: 
# Scaling -> Dropout Layer 1 (LeakyReLU, 256 neurons) -> Dropout Layer 2 (LeakyReLU, 128 neurons) -> Output Layer (softmax) 
# -> Model Training (early stopping, learning rate schedule, batch normalization, class weighting) -> Model Evaluation 


In [None]:
## Proposed pipeline 
# Scales the features, builds and trains the MLP, then evaluates it on the held-out test set.
# Inputs from earlier cells: X_train, X_test, y_train, y_test, label_encoder.

# Seeding Python, NumPy and TensorFlow so that weight initialisation, dropout and batch
# shuffling are as repeatable as possible between runs (42 matches the train/test split seed)
tf.keras.utils.set_random_seed(42)

# Scaling: standardising each feature to zero mean and unit variance.
# The scaler is fitted on the training rows only and then applied unchanged to the test rows,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_features = X_train_scaled.shape[1]                          # Number of input features
n_classes = len(np.unique(y_train))                           # Number of diagnostic classes seen in training

# Defining the MLP model
model = Sequential([
    Input(shape=(n_features,)),                               # Explicit input layer (preferred over input_dim on Dense)

    Dense(256, activation=None),                              # First hidden layer with 256 neurons
    BatchNormalization(),                                     # Normalises layer outputs to speed up convergence
    LeakyReLU(negative_slope=0.2),                            # LeakyReLU activation to handle non-linearity
    Dropout(0.4),                                             # Dropout with rate of 40% for regularization

    Dense(128, activation=None),                              # Second hidden layer
    BatchNormalization(),                                     # Normalises layer outputs to speed up convergence
    LeakyReLU(negative_slope=0.2),                            # LeakyReLU activation to handle non-linearity
    Dropout(0.4),                                             # Dropout with rate of 40% for regularization

    Dense(n_classes, activation='softmax')                    # Output layer for multi-class classification with softmax 
])

# Early stopping to avoid overfitting 
early_stopping = EarlyStopping(
    monitor='val_loss',                                       # Monitoring validation loss to determine stopping
    patience=50,                                              # Stop if no improvement after 50 epochs
    restore_best_weights=True                                 # Go back to the best model weights
)

# Learning rate scheduler 
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1) # Halves the learning rate after 5 epochs without val_loss improvement

# Compiling the MLP model
model.compile(optimizer=Adam(learning_rate=0.001),    # Optimising with Adam and initial specified learning rate of 0.001
              loss='sparse_categorical_crossentropy', # Loss for multi-class classification with integer-encoded labels
              metrics=['accuracy']) 

# Calculating class weights for balancing 
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    'balanced',                                    # Balancing classes based on sample distribution
    classes=classes, 
    y=y_train  
)
# Keying each weight by its actual class label (rather than by position) and converting to
# plain Python types for use during the training stage
class_weight_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

# Training the MLP model
# Note: Keras takes the validation rows from the end of the arrays passed in (before shuffling);
# this split is not stratified by class.
history = model.fit(X_train_scaled, y_train,
                    validation_split=0.2,                  # Reserving 20% of the training data for validation
                    epochs=350,                            # Maximum number of training epochs (tuned through experimentation)
                    batch_size=32,                         # Using standard batch size
                    callbacks=[early_stopping, reduce_lr], # Adding in early stopping and learning rate scheduler
                    class_weight=class_weight_dict)        # Including class weight to account for imbalance in data 

# Evaluating the model: predicted class = index of the highest softmax probability
y_pred = model.predict(X_test_scaled).argmax(axis=1) 

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"MLP Model Test Set Accuracy: {accuracy}")

# Fixing the label order so matrix rows/columns and report rows always line up with the class names
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='hot', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Classification Report (rows labelled with the original diagnosis names)
print("\nClassification Report:\n",
      classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

# Output recorded from an earlier run of this cell, made before feature scaling and the fixed
# seed were added, so a fresh run will not reproduce these exact figures:
#
# MLP Model Test Set Accuracy: 0.5223880597014925
# Classification Report:
#                precision    recall  f1-score   support
#
#            0       0.51      0.73      0.60        52
#            1       0.47      0.58      0.52        79
#            2       0.58      0.41      0.48       137
#
#     accuracy                           0.52       268
#    macro avg       0.52      0.57      0.53       268
# weighted avg       0.54      0.52      0.52       268