In [2]:
# Import basic libraries for EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from dotenv import load_dotenv
import os
import pingouin as pg
from sklearn.preprocessing import StandardScaler 


In [3]:
# Import dataset
# The CSV location is read from the DATASET_PATH environment variable;
# load_dotenv() is called first so the variable can also come from a .env file.
load_dotenv()
dataset_path = os.getenv("DATASET_PATH")
if not dataset_path:
    # Fail early with an actionable message instead of handing None to pd.read_csv.
    raise RuntimeError(
        "DATASET_PATH is not set; define it in the environment or in a .env file."
    )
dataset = pd.read_csv(dataset_path)
dataset.head()

Unnamed: 0,RID,Gender,Ageatscreening,Diagnosis,MMSE0m,HipsASMbaseline,HipsContrastbaseline,HipsCorelationbaseline,HipsVariancebaseline,HipsSumAveragebaseline,...,ERCsContrastbaseline,ERCsCorelationbaseline,ERCsVariancebaseline,ERCsSumAveragebaseline,ERCsSumVariancebaseline,ERCsEntropybaseline,ERCsClusterShadebaseline,ERCs_thicknessbaseline,ERCsVolumebaseline,HipposcampusVolumebaseline
0,3,0,81.3479,3,20.0,,158.27,0.63,218.3,28.37,...,253.1,0.4,208.65,23.39,581.5,,-2568.19,2.31,1176.0,3047.0
1,4,0,67.6904,1,27.0,0.06,147.64,0.55,173.64,44.72,...,220.88,0.48,215.7,33.74,641.9,3.33,4113.01,2.76,1942.0,3449.0
2,5,0,73.8027,0,29.0,0.1,199.66,0.55,222.27,41.18,...,220.37,0.54,232.18,29.18,708.36,2.87,-1388.41,3.18,2044.0,3441.0
3,8,1,84.5945,0,28.0,0.08,184.21,0.53,201.55,43.04,...,198.42,0.54,220.48,26.68,683.5,2.77,-2506.55,2.68,1959.0,2875.0
4,10,1,73.9726,3,24.0,0.11,233.02,0.48,229.88,39.46,...,196.55,0.53,210.63,26.6,645.95,2.72,-1164.02,2.64,1397.0,2700.0


In [4]:
# Sanity check of the loaded table: (number of rows, number of columns).
dataset.shape

(608, 24)

### Multiclass

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer

# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): only "Diagnosis" is removed from the predictors, so every other
# column of `dataset` (including "Unnamed: 0" and "RID") is used as a feature —
# confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.drop(columns="Diagnosis"),
    dataset.loc[:, "Diagnosis"],
    random_state=0,
    test_size=0.3,
)

X_train.shape, X_test.shape

((425, 23), (183, 23))

In [4]:
# Columns whose missing values are replaced by the mean learned during fit.
imputed_columns = [
    'MMSE0m',
    'HipsASMbaseline',
    'HipsContrastbaseline',
    'HipsCorelationbaseline',
    'HipsVariancebaseline',
    'HipsSumAveragebaseline',
    'HipsSumVariancebaseline',
    'HipsEntropybaseline',
    'HipsClusterShadebaseline',
    'ERCsASMbaseline',
    'ERCsContrastbaseline',
    'ERCsCorelationbaseline',
    'ERCsVariancebaseline',
    'ERCsSumAveragebaseline',
    'ERCsSumVariancebaseline',
    'ERCsEntropybaseline',
    'ERCsClusterShadebaseline',
    'ERCs_thicknessbaseline',
    'ERCsVolumebaseline',
    'HipposcampusVolumebaseline',
]

mean_imputer = MeanMedianImputer(imputation_method="mean", variables=imputed_columns)
# set_output(transform="pandas") makes the scaler return a DataFrame rather than an array.
standardizer = StandardScaler().set_output(transform="pandas")

pipe = Pipeline(steps=[
    ("imputer", mean_imputer),
    ("scaler", standardizer),
])

# Fit on the training split only, then apply the fitted steps to both splits.
pipe.fit(X_train)

X_train_scaled = pipe.transform(X_train)
X_test_scaled = pipe.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, balanced_accuracy_score, make_scorer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def run_model(model, classifier_name, X_train, y_train):
    """Cross-validate ``model`` on the training data, then fit it on all of it.

    A 10-fold stratified, shuffled split (seed 42) is scored with accuracy,
    balanced accuracy, weighted precision/recall/F1 and weighted one-vs-rest
    ROC AUC (computed from ``predict_proba``).

    Parameters
    ----------
    model : estimator passed to ``cross_validate`` and afterwards fitted.
    classifier_name : label written to the "Classifier" column of the report.
    X_train, y_train : training features and labels.

    Returns
    -------
    tuple
        ``(fitted_model, cv_metrics_df)``. ``fitted_model`` is the return value
        of ``model.fit(X_train, y_train)``. ``cv_metrics_df`` has one row per
        metric, holding the mean and standard deviation of the train and test
        fold scores plus the mean fit time, all rounded to 3 decimals.
    """
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    weighted = {"average": "weighted"}
    scorers = {
        "accuracy": make_scorer(accuracy_score),
        "balanced_accuracy": make_scorer(balanced_accuracy_score),
        "precision": make_scorer(precision_score, **weighted),
        "recall": make_scorer(recall_score, **weighted),
        "f1_weighted": make_scorer(f1_score, **weighted),
        "roc_auc_ovr_weighted": make_scorer(
            roc_auc_score,
            response_method="predict_proba",
            multi_class="ovr",
            **weighted,
        ),
    }

    fold_scores = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring=scorers,
        cv=splitter,
        return_train_score=True,
    )

    def _aggregate(split, reducer):
        # One rounded value per metric, in the same order as `scorers`.
        return [
            round(reducer(fold_scores[f"{split}_{name}"]), 3)
            for name in scorers
        ]

    cv_metrics_df = pd.DataFrame({
        "Classifier": classifier_name,
        "Fit Time": round(np.mean(fold_scores["fit_time"]), 3),
        "Metric": list(scorers),
        "Mean Train": _aggregate("train", np.mean),
        "Std Train": _aggregate("train", np.std),
        "Mean Test": _aggregate("test", np.mean),
        "Std Test": _aggregate("test", np.std),
    })

    # Final fit on the full training data so the caller gets a usable model.
    fitted = model.fit(X_train, y_train)
    return fitted, cv_metrics_df


In [6]:
# Candidate classifiers; each one is seeded with random_state=42.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
    class_weight="balanced",
)

# `multi_class` is deliberately not passed: "auto" was already the default, and
# the parameter is deprecated in recent scikit-learn releases, where passing it
# triggers a FutureWarning.
lg = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)

# probability=True enables predict_proba on the SVC.
svm = SVC(kernel="rbf", decision_function_shape="ovr", probability=True, random_state=42)

dt = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42,
)

In [7]:
# Cross-validate and fit every classifier. The random forest and decision tree
# get the raw training features; logistic regression and the SVM get the
# imputed + standardized features.
model_rf, metrics_rf = run_model(rf, "Random Forest", X_train, y_train)
model_dt, metrics_dt = run_model(dt, "Decision Tree", X_train, y_train)
# Each estimator is paired with its own label and result names:
# `lg` -> "Logistic Regression", `svm` -> "Support Vector Machine".
model_lg, metrics_lg = run_model(lg, "Logistic Regression", X_train_scaled, y_train)
model_svm, metrics_svm = run_model(svm, "Support Vector Machine", X_train_scaled, y_train)



In [8]:
# Stack the per-classifier cross-validation tables and index them for display.
cv_tables = [metrics_rf, metrics_dt, metrics_lg, metrics_svm]
validation_df = pd.concat(cv_tables)
validation_df_report = validation_df.set_index(keys=["Classifier", "Fit Time", "Metric"])
validation_df_report

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean Train,Std Train,Mean Test,Std Test
Classifier,Fit Time,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Random Forest,0.482,accuracy,0.904,0.013,0.605,0.063
Random Forest,0.482,balanced_accuracy,0.915,0.011,0.584,0.04
Random Forest,0.482,precision,0.907,0.013,0.595,0.057
Random Forest,0.482,recall,0.904,0.013,0.605,0.063
Random Forest,0.482,f1_weighted,0.902,0.014,0.592,0.063
Random Forest,0.482,roc_auc_ovr_weighted,0.985,0.003,0.835,0.047
Decision Tree,0.006,accuracy,0.661,0.027,0.494,0.037
Decision Tree,0.006,balanced_accuracy,0.628,0.032,0.441,0.031
Decision Tree,0.006,precision,0.67,0.025,0.479,0.037
Decision Tree,0.006,recall,0.661,0.027,0.494,0.037


In [11]:
import time

def _score_split(y_true, y_pred, y_proba):
    """Return the rounded multiclass metrics for one data split as a dict."""
    return {
        "accuracy": round(accuracy_score(y_true, y_pred), 3),
        "balanced_accuracy": round(balanced_accuracy_score(y_true, y_pred), 3),
        "precision": round(precision_score(y_true, y_pred, average="weighted"), 3),
        "recall": round(recall_score(y_true, y_pred, average="weighted"), 3),
        "f1_weighted": round(f1_score(y_true, y_pred, average="weighted"), 3),
        "roc_auc_ovr_weighted": round(
            roc_auc_score(y_true, y_proba, average="weighted", multi_class="ovr"), 3
        ),
    }


def model_eval(model, classifier_name, X_train, X_test, y_train, y_test):
    """Score an already fitted classifier on the train and test splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    classifier_name : label written to the "Classifier" column.
    X_train, X_test : feature matrices for each split.
    y_train, y_test : true labels for each split; flattened to 1D internally.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns "Classifier", "Classification Time"
        (seconds spent predicting and scoring, rounded to 3 decimals),
        "Metric", "Train data" and "Test data".
    """
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Ensure that the label inputs are 1D arrays.
    y_train = np.array(y_train).ravel()
    y_test = np.array(y_test).ravel()

    # Class probabilities feed the ROC AUC; hard predictions feed the rest.
    pred_train_proba = model.predict_proba(X_train)
    pred_test_proba = model.predict_proba(X_test)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    metrics_train = _score_split(y_train, pred_train, pred_train_proba)
    metrics_test = _score_split(y_test, pred_test, pred_test_proba)

    elapsed_time = time.perf_counter() - start_time

    # Both dicts come from the same helper, so their key order matches row for row.
    pred_metrics_df = pd.DataFrame({
        "Classifier": classifier_name,
        "Classification Time": round(elapsed_time, 3),
        "Metric": list(metrics_train.keys()),
        "Train data": list(metrics_train.values()),
        "Test data": list(metrics_test.values()),
    })

    return pred_metrics_df

In [12]:
# Models that were fitted on the raw feature matrices.
pred_rf = model_eval(
    model_rf, "Random Forest",
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
)
pred_dt = model_eval(
    model_dt, "Decision Tree",
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
)

# Models that were fitted on the imputed + standardized feature matrices.
pred_lg = model_eval(
    model_lg, "Logistic Regression",
    X_train=X_train_scaled, X_test=X_test_scaled, y_train=y_train, y_test=y_test,
)
pred_svm = model_eval(
    model_svm, "Support Vector Machine",
    X_train=X_train_scaled, X_test=X_test_scaled, y_train=y_train, y_test=y_test,
)

In [13]:
# Stack the per-classifier hold-out tables and index them for display.
prediction_tables = [pred_rf, pred_dt, pred_lg, pred_svm]
prediction_df = pd.concat(prediction_tables)
prediction_df_report = prediction_df.set_index(
    keys=["Classifier", "Classification Time", "Metric"]
)
prediction_df_report

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Train data,Test data
Classifier,Classification Time,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,0.128,accuracy,0.868,0.645
Random Forest,0.128,balanced_accuracy,0.881,0.597
Random Forest,0.128,precision,0.873,0.637
Random Forest,0.128,recall,0.868,0.645
Random Forest,0.128,f1_weighted,0.866,0.638
Random Forest,0.128,roc_auc_ovr_weighted,0.977,0.826
Decision Tree,0.047,accuracy,0.675,0.568
Decision Tree,0.047,balanced_accuracy,0.632,0.523
Decision Tree,0.047,precision,0.671,0.555
Decision Tree,0.047,recall,0.675,0.568
