In [5]:
#Loading libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler 
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score
import re
import warnings
warnings.filterwarnings("ignore")

In [21]:
data = pd.read_csv('../data/processed_diabetes_data.csv').drop(columns=["Unnamed: 0"], axis=1)

In [22]:
class0 = data[data["readmitted"] == 0]
class1 = data[data["readmitted"] == 1]
majorityCount = class0.shape[0]
minorityCount = class1.shape[0]
class0_sampled = class0.sample(frac=0.3, random_state=42)
print("Class 0 (reamitted = 0): ", majorityCount)
print("Resampled Class 0 (readmitted = 0): ", class0_sampled.shape[0])
print("Class 1 (readmitted = 1): ", minorityCount)
class0_under_sampled = class0.sample(frac=0.1, random_state=42)
sampled_df = pd.concat([class0_under_sampled, class1])
sampled_df.describe()

Class 0 (reamitted = 0):  90406
Resampled Class 0 (readmitted = 0):  27122
Class 1 (readmitted = 1):  11357


Unnamed: 0,encounter_id,patient_nbr,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted
count,20398.0,20398.0,20398.0,20398.0,20398.0,20398.0,20398.0,20398.0,20398.0,20398.0,20398.0,20398.0
mean,163848700.0,54612740.0,66.358957,4.567261,43.65075,1.304049,16.461908,0.39901,0.27439,0.930679,7.547701,0.55677
std,102150700.0,38156920.0,15.842489,2.991721,19.391822,1.675263,8.098719,1.269366,1.112869,1.677715,1.864628,0.496779
min,36900.0,135.0,5.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,83722080.0,23484060.0,55.0,2.0,32.0,0.0,11.0,0.0,0.0,0.0,6.0,0.0
50%,150856500.0,45785510.0,65.0,4.0,44.5,1.0,15.0,0.0,0.0,0.0,9.0,1.0
75%,226993000.0,87767420.0,75.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,1.0
max,443867200.0,188970200.0,95.0,14.0,132.0,6.0,81.0,40.0,64.0,21.0,16.0,1.0


In [23]:
numerical_columns = sampled_df.select_dtypes(include=['float64']).astype(int).columns.tolist()
object_columns = sampled_df.select_dtypes(include=['object']).columns.tolist()
print("Numerical Columns:")
print(numerical_columns)

print("\nObject (Categorical) Columns:")
print(object_columns)
object_columns = sampled_df.select_dtypes(include=['object']).columns.tolist()
object_columns
sampled_df = pd.get_dummies(sampled_df, columns=object_columns, dtype=int)

Numerical Columns:
['encounter_id', 'patient_nbr', 'age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmitted']

Object (Categorical) Columns:
['race', 'gender', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'change', 'diabetesMed']


In [24]:
sampled_df.columns = [re.sub(r'[^\w\s]', '', col) for col in sampled_df.columns]
sampled_df.columns = [re.sub(r'[_]', ' ', col) for col in sampled_df.columns]
sampled_df.columns

Index(['encounter id', 'patient nbr', 'age', 'time in hospital',
       'num lab procedures', 'num procedures', 'num medications',
       'number outpatient', 'number emergency', 'number inpatient',
       ...
       'insulin Steady', 'insulin Up', 'glyburidemetformin Down',
       'glyburidemetformin No', 'glyburidemetformin Steady',
       'glyburidemetformin Up', 'change Ch', 'change No', 'diabetesMed No',
       'diabetesMed Yes'],
      dtype='object', length=1664)

In [27]:
X = sampled_df.drop(['readmitted'],axis=1)
y = sampled_df['readmitted']
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Train:', X_train.shape[0])
print('Test:', X_test.shape[0])

Train: 16318
Test: 4080


In [28]:
# Define LightGBM parameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'num_leaves': [31, 50, 100],
}

# Create and fit a LightGBM model
lgb_classifier = lgb.LGBMClassifier()
lgb_classifier.fit(X_train, y_train)

grid_search = GridSearchCV(lgb_classifier, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'num_leaves': 31}


In [29]:
model_selected_attributes = lgb.LGBMClassifier(**best_params)
model_selected_attributes.fit(X_train, y_train)

# Predict on the test set
y_pred = best_estimator.predict(X_test)

# Evaluate the model with selected attributes
accuracy_selected_attributes = accuracy_score(y_test, y_pred)
roc_auc_selected_attributes = roc_auc_score(y_test, model_selected_attributes.predict_proba(X_test)[:, 1])
precision_selected_attributes = precision_score(y_test, y_pred)
recall_selected_attributes = recall_score(y_test, y_pred)
f1_selected_attributes = f1_score(y_test, y_pred)

# Compute confusion matrix for the model with selected attributes
confusion_matrix_selected_attributes = confusion_matrix(y_test, y_pred)

print("Model tuned:")
print("Accuracy:", accuracy_selected_attributes)
print("ROC AUC:", roc_auc_selected_attributes)
print("Precision:", precision_selected_attributes)
print("Recall:", recall_selected_attributes)
print("F1 Score:", f1_selected_attributes)
cm = confusion_matrix_selected_attributes
print("Confusion Matrix:\n", cm)


Model tuned:
Accuracy: 0.6061274509803921
ROC AUC: 0.6509770737255391
Precision: 0.6145366861867034
Recall: 0.7852112676056338
F1 Score: 0.6894685990338164
Confusion Matrix:
 [[ 689 1119]
 [ 488 1784]]


In [30]:
#LGBM with real features

In [33]:
X = sampled_df.drop(['readmitted'],axis=1)
y = sampled_df['readmitted']
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Train:', X_train.shape[0])
print('Test:', X_test.shape[0])

Train: 16318
Test: 4080


In [34]:
model_selected_attributes = lgb.LGBMClassifier(**best_params)
model_selected_attributes.fit(X_train, y_train)

# Predict on the test set
y_pred = best_estimator.predict(X_test)

# Evaluate the model with selected attributes
accuracy_selected_attributes = accuracy_score(y_test, y_pred)
roc_auc_selected_attributes = roc_auc_score(y_test, model_selected_attributes.predict_proba(X_test)[:, 1])
precision_selected_attributes = precision_score(y_test, y_pred)
recall_selected_attributes = recall_score(y_test, y_pred)
f1_selected_attributes = f1_score(y_test, y_pred)

# Compute confusion matrix for the model with selected attributes
confusion_matrix_selected_attributes = confusion_matrix(y_test, y_pred)

print("\nModel tuned:")
print("Accuracy:", accuracy_selected_attributes)
print("ROC AUC:", roc_auc_selected_attributes)
print("Precision:", precision_selected_attributes)
print("Recall:", recall_selected_attributes)
print("F1 Score:", f1_selected_attributes)
cm = confusion_matrix_selected_attributes
print("Confusion Matrix:\n", cm)



Model tuned:
Accuracy: 0.6061274509803921
ROC AUC: 0.6509770737255391
Precision: 0.6145366861867034
Recall: 0.7852112676056338
F1 Score: 0.6894685990338164
Confusion Matrix:
 [[ 689 1119]
 [ 488 1784]]


In [None]:
#LGBM with scaled features

In [38]:
X = sampled_df.drop('readmitted',axis=1)
y = sampled_df['readmitted']

# Define and fit a MinMaxScaler
scaler = MinMaxScaler()
model = scaler.fit(X)
scaled_X = model.transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42, stratify=y)
print('Train:', X_train.shape[0])
print('Test:', X_test.shape[0])


Train: 16318
Test: 4080


In [39]:
# Define LightGBM parameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'num_leaves': [31, 50, 100],
}

# Create and fit a LightGBM model
lgb_classifier = lgb.LGBMClassifier()
lgb_classifier.fit(X_train, y_train)

grid_search = GridSearchCV(lgb_classifier, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'num_leaves': 31}


In [40]:
model_selected_attributes = lgb.LGBMClassifier(**best_params)
model_selected_attributes.fit(X_train, y_train)

# Predict on the test set
y_pred = best_estimator.predict(X_test)

# Evaluate the model with selected attributes
accuracy_selected_attributes = accuracy_score(y_test, y_pred)
roc_auc_selected_attributes = roc_auc_score(y_test, model_selected_attributes.predict_proba(X_test)[:, 1])
precision_selected_attributes = precision_score(y_test, y_pred)
recall_selected_attributes = recall_score(y_test, y_pred)
f1_selected_attributes = f1_score(y_test, y_pred)

# Compute confusion matrix for the model with selected attributes
confusion_matrix_selected_attributes = confusion_matrix(y_test, y_pred)

print("\nModel tuned:")
print("Accuracy:", accuracy_selected_attributes)
print("ROC AUC:", roc_auc_selected_attributes)
print("Precision:", precision_selected_attributes)
print("Recall:", recall_selected_attributes)
print("F1 Score:", f1_selected_attributes)
cm = confusion_matrix_selected_attributes
print("Confusion Matrix:\n", cm)



Model tuned:
Accuracy: 0.6061274509803921
ROC AUC: 0.6509770737255391
Precision: 0.6145366861867034
Recall: 0.7852112676056338
F1 Score: 0.6894685990338164
Confusion Matrix:
 [[ 689 1119]
 [ 488 1784]]
