In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier
)

In [2]:
# Read the preprocessed dataset; this CSV already carries the diabetes_status label column
df = pd.read_csv(
    '../data/preprocessed/with_diabetes_status/dataset_with_diabetes_status.csv'
)

In [3]:
# Remove the 'diabetes' column (diabetes_status is used as the target further down).
# Only this one column is dropped here.
# NOTE(review): this comment previously also mentioned an hba1c column, but no such column
# has ever been dropped by this cell — confirm whether an HbA1c feature should be removed too.
df = df.drop(columns=['diabetes'])

In [4]:
# Keep only the rows whose gender is something other than 'Other'
df = df.loc[df['gender'].ne('Other')]

In [5]:
# Expand the categorical columns into one indicator column per category
# (drop_first=False keeps an indicator for every category level)
df = pd.get_dummies(
    data=df,
    columns=['gender', 'smoking_history'],
    drop_first=False,
)

In [6]:
# Ordered labels for diabetes_status; a label's position in this list is its integer code
status_order = ['non diabetic', 'stress induced prediabetic', 'stress induced type 2 diabetic', 'prediabetic', 'diabetic']

# Label -> integer code lookup that follows the order above
status_mapping = {status: i for i, status in enumerate(status_order)}

# Encode into a temporary Series first so that a failed check leaves df untouched
encoded_status = df['diabetes_status'].map(status_mapping)

# Series.map yields NaN for every value that is not a key of the mapping. That includes
# unexpected labels and the already-encoded integers if this cell is run a second time,
# so fail loudly instead of silently handing NaN targets to the following cells.
unmapped = df.loc[encoded_status.isna(), 'diabetes_status'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"diabetes_status contains values outside status_order: {unmapped.tolist()}"
    )

# Replace the text labels with their numeric codes
df['diabetes_status'] = encoded_status

In [7]:
# Reorder the columns so the encoded target 'diabetes_status' becomes the last one
cols = list(df.columns)
cols.remove('diabetes_status')
cols.append('diabetes_status')
df = df[cols]

In [8]:
# Split the frame: y is the encoded target, X is every remaining column
y = df['diabetes_status']
X = df.drop('diabetes_status', axis=1)

In [9]:
# Oversample with SMOTE (fixed seed) to produce the resampled X_res / y_res
# NOTE(review): the whole dataset is resampled here, before any cross-validation split, and
# the resampled data is what gets cross-validated later. Synthetic rows interpolated from a
# fold's held-out samples can therefore end up in that fold's training data and inflate the
# reported scores; consider resampling only the training portion of each fold instead.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [10]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed before splitting
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [11]:
# Candidate models (classification only, multi-class target): six stacking ensembles that
# share the same five base learners and differ only in the final (meta) estimator.
RANDOM_STATE = 42  # same seed as the SMOTE and KFold cells, so reruns are repeatable


def _base_estimators(seed=RANDOM_STATE):
    """Return a fresh list of (name, estimator) pairs for one StackingClassifier.

    A new list with new estimator instances is built on every call, so no two
    stacks share estimator objects. Every learner receives ``seed`` as its
    ``random_state``.
    """
    return [
        ('bagging', BaggingClassifier(random_state=seed)),
        ('extra_trees', ExtraTreesClassifier(random_state=seed)),
        ('grad_boost', GradientBoostingClassifier(random_state=seed)),
        ('hist_grad', HistGradientBoostingClassifier(random_state=seed)),
        ('rf', RandomForestClassifier(random_state=seed)),
    ]


# Meta-learners, in the order their stacks appear in `models`
# (index 5 is the stack whose final estimator is a random forest).
final_estimators = [
    LogisticRegression(),
    RidgeClassifier(),
    SVC(probability=True, random_state=RANDOM_STATE),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(random_state=RANDOM_STATE),
]

models = [
    StackingClassifier(
        estimators=_base_estimators(),
        final_estimator=final_estimator,
    )
    for final_estimator in final_estimators
]

In [12]:
def _model_label(model, taken):
    """Build a results key for ``model`` that is not already present in ``taken``."""
    label = type(model).__name__
    # Stacking ensembles all share one class name; add the meta-learner to tell them apart.
    final_estimator = getattr(model, 'final_estimator', None)
    if final_estimator is not None:
        label = f"{label}({type(final_estimator).__name__})"
    # Last resort for models that still collide: append a running number.
    candidate, suffix = label, 2
    while candidate in taken:
        candidate = f"{label}#{suffix}"
        suffix += 1
    return candidate


def evaluate_models(models, X, y, kf, sampler=None):
    """Cross-validate each model and collect per-model accuracy / weighted-F1 statistics.

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``. Each model is fitted in
        place once per fold, so after this function returns it holds the fit from
        the last fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned with ``X``; also indexed with ``.iloc``.
    kf : cross-validation splitter
        Object whose ``split(X)`` yields ``(train_index, test_index)`` pairs.
    sampler : object with ``fit_resample(X, y)``, optional
        When given, it is applied to the training portion of every fold only, so
        the held-out portion is never resampled. ``None`` (the default) trains on
        the fold data as-is.

    Returns
    -------
    dict
        One entry per model. The key is the model's class name, extended with the
        final estimator's class name when the model has a ``final_estimator``
        attribute, and with a ``#n`` suffix if the key would otherwise repeat.
        Each value holds ``mean_accuracy``, ``std_accuracy``, ``mean_f1``,
        ``std_f1`` and ``reports`` (the per-fold classification report strings).
    """
    results = {}

    for model in models:
        # A bare class name would be identical for every stacking ensemble, making each
        # one overwrite the previous entry in `results`; use a unique label instead.
        model_name = _model_label(model, results)
        print(f"Evaluating {model_name}...")

        fold_accuracies = []
        fold_f1_scores = []
        fold_reports = []

        for fold, (train_index, test_index) in enumerate(kf.split(X)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Resample the training rows only; the test rows stay exactly as split.
            if sampler is not None:
                X_train, y_train = sampler.fit_resample(X_train, y_train)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            report = classification_report(y_test, y_pred)

            fold_accuracies.append(accuracy)
            fold_f1_scores.append(f1)
            fold_reports.append(report)

            print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
            print(f"Classification Report for Fold {fold + 1}:\n{report}\n")

        # Aggregate the per-fold scores for this model
        mean_accuracy = np.mean(fold_accuracies)
        std_accuracy = np.std(fold_accuracies)
        mean_f1 = np.mean(fold_f1_scores)
        std_f1 = np.std(fold_f1_scores)

        print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} (± {std_accuracy:.4f}), Mean F1 Score: {mean_f1:.4f} (± {std_f1:.4f})\n")

        results[model_name] = {
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'mean_f1': mean_f1,
            'std_f1': std_f1,
            'reports': fold_reports
        }

    return results

In [13]:
# Cross-validate every candidate model on the SMOTE-resampled features and target
results = evaluate_models(models=models, X=X_res, y=y_res, kf=kf)

Evaluating StackingClassifier...
Fold 1 - Accuracy: 0.9803, F1 Score: 0.9803
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7583
           1       0.96      0.98      0.97      7703
           2       0.97      0.99      0.98      7362
           3       0.98      0.96      0.97      7626
           4       0.99      0.97      0.98      7755

    accuracy                           0.98     38029
   macro avg       0.98      0.98      0.98     38029
weighted avg       0.98      0.98      0.98     38029


Fold 2 - Accuracy: 0.9824, F1 Score: 0.9824
Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7650
           1       0.96      0.98      0.97      7541
           2       0.98      0.99      0.98      7678
           3       0.98      0.96      0.97      7631
           4       0.99      0.98      0.98      7

In [31]:
# Persist the stack at index 5 of `models` (the one whose final estimator is a
# RandomForestClassifier) to disk with joblib.
# NOTE(review): evaluate_models fits each model in place once per fold, so unless the model
# was refit in a cell not shown here, the object saved below carries the fit from the last
# cross-validation fold's training split rather than a fit on all of the data — confirm.
import joblib

stacking_clf = models[5]
joblib.dump(value=stacking_clf, filename='stacking_classifier.pkl')

['stacking_classifier.pkl']

In [32]:
# Print the installed scikit-learn version.
# Presumably recorded so the pickled model saved above can be loaded with a matching
# scikit-learn version — confirm.
import sklearn
print(sklearn.__version__)

1.5.1
