In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

df = pd.read_csv('diabetes_prediction_dataset.csv')

#Preprocess the data
numeric_col=[]
non_numeric_col=[]
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        if(df[column].nunique()<5):
            non_numeric_col.append(column)
        else:
            numeric_col.append(column)
    else:
        non_numeric_col.append(column)
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['smoking_history'] = df['smoking_history'].replace({'not_current':'former','ever':'never'})
for col in non_numeric_col:
    df[col]=le.fit_transform(df[col])




In [2]:
y = df['diabetes']
X = df.drop('diabetes', axis = 1)

In [3]:
#Apply the SMOTE technique to account for the class imbalance
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy = 'minority')
X, y= smote.fit_resample(X,y)
y.value_counts()

diabetes
0    91500
1    91500
Name: count, dtype: int64

In [4]:
#Normalize the data
scaler = MinMaxScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
#Optimize the model using Bayesian Optimization
from bayes_opt import BayesianOptimization
import xgboost as xgb

dtrain = xgb.DMatrix(X_train_scaled, label = y_train)

def bo_tune_xgb(max_depth, gamma, learning_rate, subsample):
    params = {'max_depth': int(max_depth),
              'gamma': gamma,
              'learning_rate':learning_rate,
              'subsample': subsample,
              'eval_metric': 'auc',
             'booster': 'dart',
             'device':'cuda'}
    cv_result = xgb.cv(params, dtrain, num_boost_round=50, nfold=5)
    return cv_result['test-auc-mean'].iloc[-1]

hyperparameter_space = {'max_depth': (3, 20),
                        'gamma': (0, 1),
                        'learning_rate':(0.01,1),
                       'subsample': (0.5, 0.8),
                       }

optimizer = BayesianOptimization(f=bo_tune_xgb, pbounds=hyperparameter_space, random_state=42, verbose=2)
optimizer.maximize(init_points=5, n_iter=15)

In [44]:
#Train the model using the optimal hyperparameters
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
params = {
    'tree_method': 'hist',
    'booster':'dart',
    'objective':'binary:logistic',
    'subsample':0.6952,
    'eval_metric':'auc',
    'learning_rate':0.2005,
    'max_depth': 10,
    'device':'cuda',
    'gamma': 0.2125,
    'max_depth': 12,
    'lambda': 0,
}
dtrain = xgb.DMatrix(X_train_scaled, label = y_train)
dtest = xgb.DMatrix(X_test_scaled, label = y_test)
model = xgb.train(params, dtrain, num_boost_round = 100)
y_pred = model.predict(dtest)
y_pred = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred))
print(f' AUC score is : {roc_auc_score(y_test, y_pred)}')

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     27567
           1       0.99      0.97      0.98     27333

    accuracy                           0.98     54900
   macro avg       0.98      0.98      0.98     54900
weighted avg       0.98      0.98      0.98     54900

 AUC score is : 0.9799492012629235


In [38]:
#train the random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion='log_loss', n_estimators = 600, max_features = 'log2')
rf.fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(f' AUC score is : {roc_auc_score(y_test, y_pred)}')

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     27567
           1       0.98      0.97      0.98     27333

    accuracy                           0.98     54900
   macro avg       0.98      0.98      0.98     54900
weighted avg       0.98      0.98      0.98     54900

 AUC score is : 0.9764560923254374
