In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier
)

In [2]:
# Load the processed dataset
df = pd.read_csv('../data/preprocessed/with_diabetes_status/dataset_with_diabetes_status.csv')

In [3]:
# Drop duplicate records
df = df.drop_duplicates()

In [4]:
# Drop the diabetes and hba1c columns
df = df.drop('diabetes', axis=1)

In [5]:
# Drop records where gender is 'Other'
df = df[df['gender'] != 'Other']

In [6]:
# List of columns to transform
columns_to_transform = ['blood_glucose_level']

# Apply log transformation and create new columns with a '_log' suffix
for col in columns_to_transform:
    # Check for zero or negative values
    if (df[col] <= 0).any():
        df[col + '_log'] = np.log1p(df[col])
    else:
        df[col + '_log'] = np.log(df[col])

# Drop the original columns
df.drop(columns=columns_to_transform)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,diabetes_status,blood_glucose_level_log
0,Female,80.0,0,1,never,25.19,6.6,stress induced type 2 diabetic,4.941642
1,Female,54.0,0,0,No Info,27.32,6.6,stress induced type 2 diabetic,4.382027
2,Male,28.0,0,0,never,27.32,5.7,stress induced prediabetic,5.062595
3,Female,36.0,0,0,current,23.45,5.0,non diabetic,5.043425
4,Male,76.0,1,1,current,20.14,4.8,non diabetic,5.043425
...,...,...,...,...,...,...,...,...,...
99994,Female,36.0,0,0,No Info,24.60,4.8,non diabetic,4.976734
99996,Female,2.0,0,0,No Info,17.37,6.5,stress induced type 2 diabetic,4.605170
99997,Male,66.0,0,0,former,27.83,5.7,stress induced prediabetic,5.043425
99998,Female,24.0,0,0,never,35.42,4.0,non diabetic,4.605170


In [7]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=False)

In [8]:
# Define the order of categories for diabetes_status
status_order = ['non diabetic', 'stress induced prediabetic', 'stress induced type 2 diabetic', 'prediabetic', 'diabetic']

# Create a mapping for the specified order
status_mapping = {status: i for i, status in enumerate(status_order)}

# Map 'diabetes_status' to the numeric encoding
df['diabetes_status'] = df['diabetes_status'].map(status_mapping)

In [9]:
# Move the encoded 'diabetes_status' to the right
cols = df.columns.tolist()
cols.append(cols.pop(cols.index('diabetes_status')))
df = df[cols]

In [10]:
# Separate features and target
X = df.drop(columns=['diabetes_status'])
y = df['diabetes_status']

In [11]:
# Initialize SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [12]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
# List of models to evaluate (only classification models for multi-class target)
models = [
    StackingClassifier(
        estimators=[
            ('bagging', BaggingClassifier()),
            ('hist_grad', HistGradientBoostingClassifier()),
            ('rf', RandomForestClassifier())
        ],
        final_estimator=LogisticRegression()
    ),
    StackingClassifier(
        estimators=[
            ('bagging', BaggingClassifier()),
            ('hist_grad', HistGradientBoostingClassifier()),
            ('rf', RandomForestClassifier())
        ],
        final_estimator=RidgeClassifier()
    ),
    StackingClassifier(
        estimators=[
            ('bagging', BaggingClassifier()),
            ('hist_grad', HistGradientBoostingClassifier()),
            ('rf', RandomForestClassifier())
        ],
        final_estimator=SVC(probability=True)
    ),
    StackingClassifier(
        estimators=[
            ('bagging', BaggingClassifier()),
            ('hist_grad', HistGradientBoostingClassifier()),
            ('rf', RandomForestClassifier())
        ],
        final_estimator=KNeighborsClassifier()
    ),
    StackingClassifier(
        estimators=[
            ('bagging', BaggingClassifier()),
            ('hist_grad', HistGradientBoostingClassifier()),
            ('rf', RandomForestClassifier())
        ],
        final_estimator=GaussianNB()
    ),
    StackingClassifier(
        estimators=[
            ('bagging', BaggingClassifier()),
            ('hist_grad', HistGradientBoostingClassifier()),
            ('rf', RandomForestClassifier())
        ],
        final_estimator=RandomForestClassifier()
    ),
]

In [14]:
def evaluate_models(models, X, y, kf):
    results = {}
    
    for model in models:
        model_name = type(model).__name__
        print(f"Evaluating {model_name}...")
        
        fold_accuracies = []
        fold_f1_scores = []
        fold_reports = []
        
        for fold, (train_index, test_index) in enumerate(kf.split(X)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            report = classification_report(y_test, y_pred)
            
            fold_accuracies.append(accuracy)
            fold_f1_scores.append(f1)
            fold_reports.append(report)
            
            print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
            print(f"Classification Report for Fold {fold + 1}:\n{report}\n")
        
        mean_accuracy = np.mean(fold_accuracies)
        std_accuracy = np.std(fold_accuracies)
        mean_f1 = np.mean(fold_f1_scores)
        std_f1 = np.std(fold_f1_scores)
        
        print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} (± {std_accuracy:.4f}), Mean F1 Score: {mean_f1:.4f} (± {std_f1:.4f})\n")
        
        results[model_name] = {
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'mean_f1': mean_f1,
            'std_f1': std_f1,
            'reports': fold_reports
        }
    
    return results

In [15]:
# Call the function with the updated list of models, features, target, and KFold object
results = evaluate_models(models, X_res, y_res, kf)

Evaluating StackingClassifier...
Fold 1 - Accuracy: 0.9799, F1 Score: 0.9799
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7407
           1       0.96      0.98      0.97      7227
           2       0.97      0.99      0.98      7174
           3       0.98      0.96      0.97      7335
           4       0.99      0.97      0.98      7324

    accuracy                           0.98     36467
   macro avg       0.98      0.98      0.98     36467
weighted avg       0.98      0.98      0.98     36467


Fold 2 - Accuracy: 0.9792, F1 Score: 0.9792
Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7175
           1       0.96      0.97      0.97      7402
           2       0.98      0.99      0.98      7231
           3       0.97      0.96      0.97      7322
           4       0.99      0.98      0.98      7

In [16]:
# Save trained model
import joblib

stacking_clf = models[5]
joblib.dump(stacking_clf, 'stacking_classifier.pkl')

['stacking_classifier.pkl']

In [17]:
import sklearn
print(sklearn.__version__)

1.5.1
