In [3]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import (
    BaggingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier
)

# Load the processed dataset
df = pd.read_csv('../data/preprocessed/with_diabetes_status/dataset_with_diabetes_status.csv')

# Drop the diabetes and hba1c columns
df = df.drop('diabetes', axis=1)

# Drop records where gender is 'Other'
df = df[df['gender'] != 'Other']

# One-hot encode categorical features
df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=False)

# Define the order of categories for diabetes_status
status_order = ['non diabetic', 'stress induced prediabetic', 'stress induced type 2 diabetic', 'prediabetic', 'diabetic']
status_mapping = {status: i for i, status in enumerate(status_order)}
df['diabetes_status'] = df['diabetes_status'].map(status_mapping)

# Move the encoded 'diabetes_status' to the right
cols = df.columns.tolist()
cols.append(cols.pop(cols.index('diabetes_status')))
df = df[cols]

# Separate features and target
X = df.drop(columns=['diabetes_status'])
y = df['diabetes_status']

# Initialize SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Define the base estimators and stacking classifier
estimators = [
    ('bagging', BaggingClassifier()),
    ('hist_grad', HistGradientBoostingClassifier()),
    ('rf', RandomForestClassifier())
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier()
)

# Set up the parameter grid for hyperparameter tuning.
# The keys follow the format <estimator_name>__<parameter_name>
param_grid = {
    'bagging__n_estimators': [10, 50],
    'hist_grad__max_iter': [100, 200],
    'rf__n_estimators': [50, 100],
    'final_estimator__n_estimators': [50, 100]
}

# Initialize GridSearchCV with 3-fold cross validation and f1 weighted scoring.
# The verbose parameter is set to 3 to show progress.
grid_search = GridSearchCV(
    estimator=stacking_clf,
    param_grid=param_grid,
    cv=3,
    scoring='f1_weighted',
    verbose=3,
    n_jobs=-1
)

# Fit the grid search
grid_search.fit(X_res, y_res)

# Output the best parameters and best F1 score found
print("Best parameters found:", grid_search.best_params_)
print("Best weighted F1 score:", grid_search.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters found: {'bagging__n_estimators': 50, 'final_estimator__n_estimators': 50, 'hist_grad__max_iter': 200, 'rf__n_estimators': 100}
Best weighted F1 score: 0.9810273038217424


In [7]:
import joblib

joblib.dump(grid_search.best_estimator_, 'stacking_classifier_tuned.pkl')

['stacking_classifier_tuned.pkl']

In [13]:
import sklearn
print(sklearn.__version__)

1.5.1
