In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer

# Import dataset
dataset = pd.read_csv("C:/Users/steve/Desktop/Notebooks/Thesis-Project/Datasets/Raw/ADNI(Rawdata).csv")
dataset.head()

Unnamed: 0,RID,Gender,Ageatscreening,Diagnosis,MMSE0m,HipsASMbaseline,HipsContrastbaseline,HipsCorelationbaseline,HipsVariancebaseline,HipsSumAveragebaseline,...,ERCsContrastbaseline,ERCsCorelationbaseline,ERCsVariancebaseline,ERCsSumAveragebaseline,ERCsSumVariancebaseline,ERCsEntropybaseline,ERCsClusterShadebaseline,ERCs_thicknessbaseline,ERCsVolumebaseline,HipposcampusVolumebaseline
0,3,0,81.3479,3,20.0,,158.27,0.63,218.3,28.37,...,253.1,0.4,208.65,23.39,581.5,,-2568.19,2.31,1176.0,3047.0
1,4,0,67.6904,1,27.0,0.06,147.64,0.55,173.64,44.72,...,220.88,0.48,215.7,33.74,641.9,3.33,4113.01,2.76,1942.0,3449.0
2,5,0,73.8027,0,29.0,0.1,199.66,0.55,222.27,41.18,...,220.37,0.54,232.18,29.18,708.36,2.87,-1388.41,3.18,2044.0,3441.0
3,8,1,84.5945,0,28.0,0.08,184.21,0.53,201.55,43.04,...,198.42,0.54,220.48,26.68,683.5,2.77,-2506.55,2.68,1959.0,2875.0
4,10,1,73.9726,3,24.0,0.11,233.02,0.48,229.88,39.46,...,196.55,0.53,210.63,26.6,645.95,2.72,-1164.02,2.64,1397.0,2700.0


In [3]:
# Group one
group_one = dataset[dataset["Diagnosis"] == 0]
group_two = dataset[dataset["Diagnosis"] == 3]

combined_groups_one = pd.concat([group_one, group_two], ignore_index = True)
combined_groups_one["Diagnosis"].unique()

array([0, 3], dtype=int64)

Is our target imbalanced for group one?

In [4]:
# Check percentage of each class
combined_groups_one["Diagnosis"].value_counts(normalize=True) * 100

Diagnosis
0    59.876543
3    40.123457
Name: proportion, dtype: float64

Not as much!

In [5]:
# let's separate into training and testing set
combined_groups_one.drop(labels = "RID", axis = 1, inplace = True)

X_train, X_test, y_train, y_test = train_test_split(
    combined_groups_one.drop("Diagnosis", axis=1),  
    combined_groups_one["Diagnosis"],  
    test_size=0.3,  
    random_state=0,  
)

X_train.shape, X_test.shape

((226, 22), (98, 22))

In [6]:
pipe = Pipeline([
    ("imputer", MeanMedianImputer(
        imputation_method="mean", 
        variables=[
            'MMSE0m', 'HipsASMbaseline', 'HipsContrastbaseline',
            'HipsCorelationbaseline', 'HipsVariancebaseline',
            'HipsSumAveragebaseline', 'HipsSumVariancebaseline',
            'HipsEntropybaseline', 'HipsClusterShadebaseline', 
            'ERCsASMbaseline', 'ERCsContrastbaseline', 
            'ERCsCorelationbaseline', 'ERCsVariancebaseline', 
            'ERCsSumAveragebaseline', 'ERCsSumVariancebaseline',
            'ERCsEntropybaseline', 'ERCsClusterShadebaseline', 
            'ERCs_thicknessbaseline', 'ERCsVolumebaseline', 
            'HipposcampusVolumebaseline'
        ]
    )),
    ("scaler", StandardScaler().set_output(transform="pandas")),
])

pipe.fit(X_train)

# let's transform the data with the pipeline
X_train_scaled = pipe.transform(X_train)
X_test_scaled = pipe.transform(X_test)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

lg = LogisticRegression(multi_class = "auto", solver = "lbfgs", max_iter = 1000, random_state = 42)

svm = SVC(kernel ='rbf', decision_function_shape ='ovr', probability = True, random_state = 42)

dt = decision_tree_model = DecisionTreeClassifier(criterion ='gini', max_depth = 5, min_samples_split = 10,
                                                  min_samples_leaf = 5, max_features = 'sqrt', random_state = 42)

rf = RandomForestClassifier(n_estimators = 100, criterion = 'gini', max_depth = 5, min_samples_split = 10, 
                            min_samples_leaf = 5, max_features = 'sqrt', bootstrap = True, random_state = 42)

In [8]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold, cross_validate

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Define metrics to evaluate
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average = 'weighted', zero_division=0),
    'recall': make_scorer(recall_score, average = 'weighted', zero_division=0),
    'f1': make_scorer(f1_score, average = 'weighted', zero_division=0),
    'roc_auc': make_scorer(roc_auc_score)
}

models  = {"Logistic Regression": lg, 
           "Support Vector Machine": svm, 
           "Decision Tree": dt, 
           "Random Forest": rf
}

model_data_mapping = {
    'Logistic Regression': X_train_scaled,
    'Support Vector Machine': X_train_scaled,
    'Decision Tree': X_train,
    'Random Forest': X_train
}

In [9]:
for model_name, model in models.items():

    X_train_to_use = model_data_mapping[model_name]
    
    results = cross_validate(model, 
                             X_train_to_use, 
                             y_train, 
                             scoring = scoring_metrics,
                             return_train_score = True,
                             cv = kf)
    print("------------------------------------------------------")
    print(model_name)
    for metric in scoring_metrics.keys():
            print(f'Mean train {metric}:', np.mean(results[f'train_{metric}']), '±', np.std(results[f'train_{metric}']))
            print(f'Mean test {metric}:', np.mean(results[f'test_{metric}']), '±', np.std(results[f'test_{metric}']))

------------------------------------------------------
Logistic Regression
Mean train accuracy: 0.9845058317986494 ± 0.00544344899788108
Mean test accuracy: 0.9733333333333334 ± 0.025915341754867992
Mean train precision: 0.9845900278069568 ± 0.005512728998822873
Mean test precision: 0.9742192118226601 ± 0.025498790619537207
Mean train recall: 0.9845058317986494 ± 0.00544344899788108
Mean test recall: 0.9733333333333334 ± 0.025915341754867992
Mean train f1: 0.984498377809512 ± 0.005437367330516009
Mean test f1: 0.9731793747411223 ± 0.02610560531948763
Mean train roc_auc: 0.9834886525597619 ± 0.005174403938045408
Mean test roc_auc: 0.970291395910591 ± 0.030353995791676323
------------------------------------------------------
Support Vector Machine
Mean train accuracy: 0.9822958870472682 ± 0.004160885404953817
Mean test accuracy: 0.9556521739130435 ± 0.0344893704615623
Mean train precision: 0.9828497876300352 ± 0.003930861218971975
Mean test precision: 0.9604295544468862 ± 0.027952734104