In [9]:
import gc

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Load Data

In [3]:
# Read the dataset and discard the 'Unnamed: 0' column in one chained
# expression, so `data` is never left in a half-cleaned state.
data = (
    pd.read_csv('/kaggle/input/diabetes-dataset/diabetes_final.csv')
    .drop(columns='Unnamed: 0')
)
data.head()

Unnamed: 0,age,physical_activity_minutes_per_week,diet_score,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,waist_to_hip_ratio,systolic_bp,...,ethnicity_Other,ethnicity_White,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed,smoking_status_Current,smoking_status_Former,smoking_status_Never,diagnosed_diabetes
0,0.504956,1.138363,-0.165523,0.771162,-0.530172,-0.578582,-0.293278,1.362636,0.724256,1.27418,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,-0.135884,0.285376,0.395977,1.09526,-0.530172,-0.578582,-0.293278,-0.70055,-1.1973,0.924138,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,0.633124,-0.73347,0.227527,0.852187,1.886181,-0.578582,-0.293278,-0.951478,-0.983794,-0.055979,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,1.530299,-0.828246,-1.456972,-0.322667,-0.530172,-0.578582,-0.293278,0.331043,0.51075,0.294063,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,-0.264052,-0.117424,0.676727,-0.403691,-0.530172,-0.578582,-0.293278,-1.230287,-1.624313,-1.666172,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


In [4]:
# `y` is the 'diagnosed_diabetes' label column; `X` is every other column.
y = data['diagnosed_diabetes']
X = data.drop(columns='diagnosed_diabetes')

In [5]:
# 5-fold stratified splitter; shuffling is seeded so the folds are reproducible.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [10]:
# Base learners for the stack as (name, estimator) pairs. All three are
# configured to train on the GPU and share random_state=42.
base_models = [
    (
        'lgbm',
        LGBMClassifier(
            n_estimators=1000,
            learning_rate=0.01,
            num_leaves=31,
            device='gpu',
            random_state=42,
            verbose=-1,
        ),
    ),
    (
        'xgb',
        XGBClassifier(
            n_estimators=1000,
            learning_rate=0.01,
            max_depth=6,
            tree_method='hist',
            device='cuda',
            random_state=42,
        ),
    ),
    (
        'cat',
        CatBoostClassifier(
            iterations=1000,
            learning_rate=0.03,
            depth=8,
            task_type="GPU",
            random_state=42,
            verbose=False,
        ),
    ),
]



In [11]:
# Stacked ensemble: a logistic regression is the final estimator on top of the
# base models' `predict_proba` outputs, with `skf` supplying the CV splits.
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    stack_method='predict_proba',
    cv=skf,
    # NOTE(review): presumably kept serial so the GPU-backed base models do not
    # compete for the same device — confirm.
    n_jobs=1,
)

In [12]:
# Announce the long-running step, run a garbage-collection pass, then fit the
# stack on the full dataset. `fit` stays last so the cell output is the
# fitted estimator.
print("Starting Stacking Training (This might take some time due to GPU processing)...")
gc.collect()
stack_model.fit(X, y)

Starting Stacking Training (This might take some time due to GPU processing)...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [13]:
# In-sample ROC AUC: the model is scored on the same X / y it was fitted on,
# so this is a training score rather than a held-out estimate.
final_auc = roc_auc_score(
    y_true=y,
    y_score=stack_model.predict_proba(X)[:, 1],  # probability of class 1
)
print(f"Final Stacking Training AUC: {final_auc:.4f}")

Final Stacking Training AUC: 0.9705


In [14]:
# Persist the fitted stacking model to disk with joblib (compression level 3).
# `joblib` is imported with the other dependencies in the notebook's import cell.
# `joblib.dump` is left as the last expression so the cell shows the list of
# written file names.
model_filename = 'final_stacking_diabetes_model.pkl'
joblib.dump(stack_model, model_filename, compress=3)

['final_stacking_diabetes_model.pkl']