# Modeling - Hyperparameter tuning (Diabetes)

    1. Gradient Boosting Trees
    2. XGBoost
    3. LightGBM
    4. CatBoost


In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix 
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

In [13]:
df = pd.read_csv('../data/diabetes_data_encoded.csv').drop('Unnamed: 0', axis=1)
df

Unnamed: 0,encounter id,patient nbr,age,time in hospital,num lab procedures,num procedures,num medications,number outpatient,number emergency,number inpatient,...,insulin_Steady,insulin_Up,glyburidemetformin_Down,glyburidemetformin_No,glyburidemetformin_Steady,glyburidemetformin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,2278392.0,8222157.0,5.0,1.0,41.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,1,1,0
1,149190.0,55629189.0,15.0,3.0,59.0,0.0,18.0,0.0,0.0,0.0,...,0,1,0,1,0,0,1,0,0,1
2,64410.0,86047875.0,25.0,2.0,11.0,5.0,13.0,2.0,0.0,1.0,...,0,0,0,1,0,0,0,1,0,1
3,500364.0,82442376.0,35.0,2.0,44.0,1.0,16.0,0.0,0.0,0.0,...,0,1,0,1,0,0,1,0,0,1
4,16680.0,42519267.0,45.0,1.0,51.0,0.0,8.0,0.0,0.0,0.0,...,1,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101758,443847548.0,100162476.0,75.0,3.0,51.0,0.0,16.0,0.0,0.0,0.0,...,0,0,0,1,0,0,1,0,0,1
101759,443847782.0,74694222.0,85.0,5.0,33.0,3.0,18.0,0.0,0.0,1.0,...,1,0,0,1,0,0,0,1,0,1
101760,443854148.0,41088789.0,75.0,1.0,53.0,0.0,9.0,1.0,0.0,0.0,...,0,0,0,1,0,0,1,0,0,1
101761,443857166.0,31693671.0,85.0,10.0,45.0,2.0,21.0,0.0,0.0,1.0,...,0,1,0,1,0,0,1,0,0,1


In [14]:
readmit = df['readmitted'].value_counts()

# Print the counts
print("Occurrences of Readmitted:")
print("Value 0:", readmit[0])
print("Value 1:", readmit[1])
print('%:',readmit[1]/readmit[0])

X = df.drop('readmitted', axis=1)
y = df['readmitted'].values

Occurrences of Readmitted:
Value 0: 90406
Value 1: 11357
%: 0.12562219321726434


In [15]:
### Random Forest Classifier

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
param_grid_rfc = { 
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}


In [None]:
rfc = RandomForestClassifier()
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid_rfc, n_iter=10, cv=5, random_state=42)
rfcCV.fit(X_train, y_train)
grid_predictions = rfcCV.predict(X_val) 
rfcCV.best_params_
print(classification_report(y_val, grid_predictions)) 
print(confusion_matrix(y_val, grid_predictions))

In [7]:
### Gradient Boosting
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
param_grid_gbt = {
    'n_estimators': [25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
    'max_features': ['auto', 'sqrt']
}


In [11]:
gbt = GradientBoostingClassifier()
rfc_random = RandomizedSearchCV(estimator=gbt, param_distributions=param_grid_gbt, n_iter=10, cv=5, random_state=42)
gbt.fit(X_train, y_train)
grid_predictions = gbt.predict(X_val) 
print(classification_report(y_val, grid_predictions)) 
print(confusion_matrix(y_val, grid_predictions))
print("Best Parameters for GBT:", gbt.best_params_)

KeyboardInterrupt: 

In [None]:
### XGBoost
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
param_grid_xgb = {
    'min_child_weight': [40, 50, 60],
    'max_delta_step': [0, 1, 2],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'n_estimators': [25, 50, 100],
}


In [None]:
xgb = XGBClassifier()
rfc_random = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid_xgb, n_iter=10, cv=5, random_state=42)
grid_search_xgb.fit(X_train, y_train)
best_params_xgb = grid_search_xgb.best_params_
best_estimator_xgb = grid_search_xgb.best_estimator_
grid_predictions_xgb = best_estimator_xgb.predict(X_val)
print(classification_report(y_temp, grid_predictions, zero_division=0)) 
print(confusion_matrix(y_temp, grid_predictions))
print("Best Parameters for XGBoost:", best_params_xgb)

In [None]:
### CatBoost
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
param_grid_catboost = {
    'iterations': [25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [5, 10, 15],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

In [None]:
cat = CatBoostClassifier()
cat_random = RandomizedSearchCV(estimator=cat, param_distributions=param_grid_catboost, n_iter=10, cv=5, random_state=42)
grid_search_cat.fit(X_train, y_train)
best_params_cat = grid_search_cat.best_params_
best_estimator_cat = grid_search_cat.best_estimator_
grid_predictions_cat = best_estimator_cat.predict(X_val)
print(classification_report(y_temp, grid_predictions, zero_division=0)) 
print(confusion_matrix(y_temp, grid_predictions))
print("Best Parameters for XGBoost:", best_params_cat)