In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC

from sklearn.ensemble import (
    BaggingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier
)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Load the processed dataset
df = pd.read_csv('../data/preprocessed/with_diabetes_status/dataset_with_diabetes_status.csv')

# Drop duplicate records
df = df.drop_duplicates()

# Drop the diabetes and hba1c columns
df = df.drop('diabetes', axis=1)

# Drop records where gender is 'Other'
df = df[df['gender'] != 'Other']

# List of columns to transform
columns_to_transform = ['blood_glucose_level']

# Apply log transformation and create new columns with a '_log' suffix
for col in columns_to_transform:
    # Check for zero or negative values
    if (df[col] <= 0).any():
        df[col + '_log'] = np.log1p(df[col])
    else:
        df[col + '_log'] = np.log(df[col])

# Drop the original columns
df.drop(columns=columns_to_transform)

# One-hot encode categorical features
df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=False)

# Define the order of categories for diabetes_status
status_order = ['non diabetic', 'stress induced prediabetic', 'stress induced type 2 diabetic', 'prediabetic', 'diabetic']

# Create a mapping for the specified order
status_mapping = {status: i for i, status in enumerate(status_order)}

# Map 'diabetes_status' to the numeric encoding
df['diabetes_status'] = df['diabetes_status'].map(status_mapping)

# Move the encoded 'diabetes_status' to the right
cols = df.columns.tolist()
cols.append(cols.pop(cols.index('diabetes_status')))
df = df[cols]

# Separate features and target
X = df.drop(columns=['diabetes_status'])
y = df['diabetes_status']

# Initialize SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Initialize KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# defining stacking classifier
stack = StackingClassifier(
    estimators=[
        ('bagging', BaggingClassifier(random_state=42)),
        ('hist_grad', HistGradientBoostingClassifier(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42))
    ],
    final_estimator=SVC(probability=True, random_state=42),
    cv=3,          # internal cv for stacking folds
    n_jobs=-1
)

# defining hyperparameter distributions
param_dist = {
    # BaggingClassifier params
    'bagging__n_estimators': randint(10, 200),
    'bagging__max_samples': uniform(0.5, 0.5),
    # HistGradientBoostingClassifier params
    'hist_grad__max_iter': randint(50, 300),
    'hist_grad__learning_rate': uniform(0.01, 0.3),
    # RandomForestClassifier params
    'rf__n_estimators': randint(50, 300),
    'rf__max_depth': randint(3, 20),
    # final SVC meta窶親stimator params
    'final_estimator__C': uniform(0.1, 10),
    'final_estimator__kernel': ['rbf', 'poly'],
    'final_estimator__degree': randint(2,5)
}

# setting up RandomizedSearchCV
search = RandomizedSearchCV(
    estimator=stack,
    param_distributions=param_dist,
    n_iter=10,                    # number of random draws
    scoring=['accuracy','f1_weighted'],
    refit='f1_weighted',          # final refit uses weighted窶色1
    cv=kf,                        # outer KFold
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# run the search on resampled data
search.fit(X_res, y_res)

# print out the best parameters and corresponding scores
print("Best parameters found:")
for param, val in search.best_params_.items():
    print(f"  {param}: {val!r}")

print(f"\nBest weighted窶色1: {search.best_score_:.4f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters found:
  bagging__max_samples: 0.7475884550556351
  bagging__n_estimators: 184
  final_estimator__C: 1.833646535077721
  final_estimator__degree: 2
  final_estimator__kernel: 'poly'
  hist_grad__learning_rate: 0.0646708263364187
  hist_grad__max_iter: 181
  rf__max_depth: 4
  rf__n_estimators: 183

Best weighted窶色1: 0.9796
