In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer

# Import dataset
dataset = pd.read_csv("C:/Users/steve/Desktop/Notebooks/Thesis-Project/ADNI(Rawdata).csv")
dataset.head()

Unnamed: 0,RID,Gender,Ageatscreening,Diagnosis,MMSE0m,HipsASMbaseline,HipsContrastbaseline,HipsCorelationbaseline,HipsVariancebaseline,HipsSumAveragebaseline,...,ERCsContrastbaseline,ERCsCorelationbaseline,ERCsVariancebaseline,ERCsSumAveragebaseline,ERCsSumVariancebaseline,ERCsEntropybaseline,ERCsClusterShadebaseline,ERCs_thicknessbaseline,ERCsVolumebaseline,HipposcampusVolumebaseline
0,3,0,81.3479,3,20.0,,158.27,0.63,218.3,28.37,...,253.1,0.4,208.65,23.39,581.5,,-2568.19,2.31,1176.0,3047.0
1,4,0,67.6904,1,27.0,0.06,147.64,0.55,173.64,44.72,...,220.88,0.48,215.7,33.74,641.9,3.33,4113.01,2.76,1942.0,3449.0
2,5,0,73.8027,0,29.0,0.1,199.66,0.55,222.27,41.18,...,220.37,0.54,232.18,29.18,708.36,2.87,-1388.41,3.18,2044.0,3441.0
3,8,1,84.5945,0,28.0,0.08,184.21,0.53,201.55,43.04,...,198.42,0.54,220.48,26.68,683.5,2.77,-2506.55,2.68,1959.0,2875.0
4,10,1,73.9726,3,24.0,0.11,233.02,0.48,229.88,39.46,...,196.55,0.53,210.63,26.6,645.95,2.72,-1164.02,2.64,1397.0,2700.0


In [3]:
# Group one
group_one = dataset[dataset["Diagnosis"] == 0]
group_two = dataset[dataset["Diagnosis"] == 3]

combined_groups_one = pd.concat([group_one, group_two], ignore_index = True)
combined_groups_one["Diagnosis"].unique()

array([0, 3], dtype=int64)

In [4]:
# let's separate into training and testing set
combined_groups_one.drop(labels = "RID", axis = 1, inplace = True)

X_train, X_test, y_train, y_test = train_test_split(
    combined_groups_one.drop("Diagnosis", axis=1),  
    combined_groups_one["Diagnosis"],  
    test_size=0.3,  
    random_state=0,  
)

X_train.shape, X_test.shape

((226, 22), (98, 22))

In [5]:
pipe = Pipeline([
    ("imputer", MeanMedianImputer(
        imputation_method="mean", 
        variables=[
            'MMSE0m', 'HipsASMbaseline', 'HipsContrastbaseline',
            'HipsCorelationbaseline', 'HipsVariancebaseline',
            'HipsSumAveragebaseline', 'HipsSumVariancebaseline',
            'HipsEntropybaseline', 'HipsClusterShadebaseline', 
            'ERCsASMbaseline', 'ERCsContrastbaseline', 
            'ERCsCorelationbaseline', 'ERCsVariancebaseline', 
            'ERCsSumAveragebaseline', 'ERCsSumVariancebaseline',
            'ERCsEntropybaseline', 'ERCsClusterShadebaseline', 
            'ERCs_thicknessbaseline', 'ERCsVolumebaseline', 
            'HipposcampusVolumebaseline'
        ]
    )),
    ("scaler", StandardScaler().set_output(transform="pandas")),
])

pipe.fit(X_train)

# let's transform the data with the pipeline
X_train_scaled = pipe.transform(X_train)
X_test_scaled = pipe.transform(X_test)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

lg = LogisticRegression(multi_class = "multinomial", 
                        solver = "lbfgs",
                        max_iter = 1000,
                        random_state = 42)

svm = SVC(kernel ='rbf', 
          decision_function_shape ='ovo',
          probability = True,
          random_state = 42)

dt = decision_tree_model = DecisionTreeClassifier(
     criterion ='gini',      
     max_depth = 5,           
     min_samples_split = 10,  
     min_samples_leaf = 5,    
     max_features = 'sqrt',    
     random_state = 42)

rf = RandomForestClassifier(
     n_estimators = 100,     
     criterion = 'gini',     
     max_depth = 5,           
     min_samples_split = 10, 
     min_samples_leaf = 5,   
     max_features = 'sqrt',   
     bootstrap = True,        
     random_state = 42)

In [7]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold, cross_validate

kf = KFold(n_splits = 5, 
           shuffle = True, 
           random_state = 42)

# Define metrics to evaluate
scoring_metrics = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average = 'weighted', zero_division=0),
    'recall': make_scorer(recall_score, average = 'weighted', zero_division=0),
    'f1': make_scorer(f1_score, average = 'weighted', zero_division=0),
    'roc_auc': make_scorer(roc_auc_score, multi_class='ovr', response_method = "predict_proba")
}

lg_results = cross_validate(
    lg,
    X_train_scaled,
    y_train,
    scoring = scoring_metrics,
    return_train_score = True,
    cv = kf)

svm_results = cross_validate(
    svm,
    X_train_scaled,
    y_train,
    scoring = scoring_metrics,
    return_train_score = True,
    cv = kf)

dt_results = cross_validate(
    dt,
    X_train,
    y_train,
    scoring = scoring_metrics,
    return_train_score = True,
    cv = kf)

rf_results = cross_validate(
    rf,
    X_train,
    y_train,
    scoring = scoring_metrics,
    return_train_score = True,
    cv = kf)

In [8]:
# Print results for Logistic Regression
print("------------------------------------------------------")
print("Logistic Regression")
print('Mean train set accuracy:', np.mean(lg_results['train_accuracy']), '±', np.std(lg_results['train_accuracy']))
print('Mean test set accuracy:', np.mean(lg_results['test_accuracy']), '±', np.std(lg_results['test_accuracy']))
print('Mean train precision:', np.mean(lg_results['train_precision']), '±', np.std(lg_results['train_precision']))
print('Mean test precision:', np.mean(lg_results['test_precision']), '±', np.std(lg_results['test_precision']))
print('Mean train recall:', np.mean(lg_results['train_recall']), '±', np.std(lg_results['train_recall']))
print('Mean test recall:', np.mean(lg_results['test_recall']), '±', np.std(lg_results['test_recall']))
print('Mean train F1 score:', np.mean(lg_results['train_f1']), '±', np.std(lg_results['train_f1']))
print('Mean test F1 score:', np.mean(lg_results['test_f1']), '±', np.std(lg_results['test_f1']))
print('Mean train ROC AUC:', np.mean(lg_results['train_roc_auc']), '±', np.std(lg_results['train_roc_auc']))
print('Mean test ROC AUC:', np.mean(lg_results['test_roc_auc']), '±', np.std(lg_results['test_roc_auc']), "\n")

# Print results for Support Vector Machine
print("------------------------------------------------------")
print("Support Vector Machine")
print('Mean train set accuracy:', np.mean(svm_results['train_accuracy']), '±', np.std(svm_results['train_accuracy']))
print('Mean test set accuracy:', np.mean(svm_results['test_accuracy']), '±', np.std(svm_results['test_accuracy']))
print('Mean train precision:', np.mean(svm_results['train_precision']), '±', np.std(svm_results['train_precision']))
print('Mean test precision:', np.mean(svm_results['test_precision']), '±', np.std(svm_results['test_precision']))
print('Mean train recall:', np.mean(svm_results['train_recall']), '±', np.std(svm_results['train_recall']))
print('Mean test recall:', np.mean(svm_results['test_recall']), '±', np.std(svm_results['test_recall']))
print('Mean train F1 score:', np.mean(svm_results['train_f1']), '±', np.std(svm_results['train_f1']))
print('Mean test F1 score:', np.mean(svm_results['test_f1']), '±', np.std(svm_results['test_f1']))
print('Mean train ROC AUC:', np.mean(svm_results['train_roc_auc']), '±', np.std(svm_results['train_roc_auc']))
print('Mean test ROC AUC:', np.mean(svm_results['test_roc_auc']), '±', np.std(svm_results['test_roc_auc']), "\n")

# Print results for Decision Tree
print("------------------------------------------------------")
print("Decision Tree")
print('Mean train set accuracy:', np.mean(dt_results['train_accuracy']), '±', np.std(dt_results['train_accuracy']))
print('Mean test set accuracy:', np.mean(dt_results['test_accuracy']), '±', np.std(dt_results['test_accuracy']))
print('Mean train precision:', np.mean(dt_results['train_precision']), '±', np.std(dt_results['train_precision']))
print('Mean test precision:', np.mean(dt_results['test_precision']), '±', np.std(dt_results['test_precision']))
print('Mean train recall:', np.mean(dt_results['train_recall']), '±', np.std(dt_results['train_recall']))
print('Mean test recall:', np.mean(dt_results['test_recall']), '±', np.std(dt_results['test_recall']))
print('Mean train F1 score:', np.mean(dt_results['train_f1']), '±', np.std(dt_results['train_f1']))
print('Mean test F1 score:', np.mean(dt_results['test_f1']), '±', np.std(dt_results['test_f1']))
print('Mean train ROC AUC:', np.mean(dt_results['train_roc_auc']), '±', np.std(dt_results['train_roc_auc']))
print('Mean test ROC AUC:', np.mean(dt_results['test_roc_auc']), '±', np.std(dt_results['test_roc_auc']), "\n")

# Print results for Random Forest
print("------------------------------------------------------")
print("Random Forest")
print('Mean train set accuracy:', np.mean(rf_results['train_accuracy']), '±', np.std(rf_results['train_accuracy']))
print('Mean test set accuracy:', np.mean(rf_results['test_accuracy']), '±', np.std(rf_results['test_accuracy']))
print('Mean train precision:', np.mean(rf_results['train_precision']), '±', np.std(rf_results['train_precision']))
print('Mean test precision:', np.mean(rf_results['test_precision']), '±', np.std(rf_results['test_precision']))
print('Mean train recall:', np.mean(rf_results['train_recall']), '±', np.std(rf_results['train_recall']))
print('Mean test recall:', np.mean(rf_results['test_recall']), '±', np.std(rf_results['test_recall']))
print('Mean train F1 score:', np.mean(rf_results['train_f1']), '±', np.std(rf_results['train_f1']))
print('Mean test F1 score:', np.mean(rf_results['test_f1']), '±', np.std(rf_results['test_f1']))
print('Mean train ROC AUC:', np.mean(rf_results['train_roc_auc']), '±', np.std(rf_results['train_roc_auc']))
print('Mean test ROC AUC:', np.mean(rf_results['test_roc_auc']), '±', np.std(rf_results['test_roc_auc']), "\n")

------------------------------------------------------
Logistic Regression
Mean train set accuracy: 0.9878207489257212 ± 0.00648101625490923
Mean test set accuracy: 0.9733333333333334 ± 0.025915341754867992
Mean train precision: 0.9878909868918562 ± 0.006505511027715961
Mean test precision: 0.9742192118226601 ± 0.025498790619537207
Mean train recall: 0.9878207489257212 ± 0.00648101625490923
Mean test recall: 0.9733333333333334 ± 0.025915341754867992
Mean train F1 score: 0.9878124699527777 ± 0.0064801572374334
Mean test F1 score: 0.9731793747411223 ± 0.02610560531948763
Mean train ROC AUC: 0.9996731189686431 ± 0.00015291827895117462
Mean test ROC AUC: 0.9954393903310311 ± 0.005536149089445782 

------------------------------------------------------
Support Vector Machine
Mean train set accuracy: 0.9822958870472682 ± 0.004160885404953817
Mean test set accuracy: 0.9556521739130435 ± 0.0344893704615623
Mean train precision: 0.9828497876300352 ± 0.003930861218971975
Mean test precision: 0.9