In [1]:
#Loading libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score


In [2]:
data = pd.read_csv('../data/processed_diabetes_data.csv', header=0).drop('Unnamed: 0', axis=1)

In [3]:
class0 = data[data["readmitted"] == 0]
class1 = data[data["readmitted"] == 1]
majorityCount = class0.shape[0]
minorityCount = class1.shape[0]
class01, class02 = class0.sample(frac=0.1, random_state=42), class0.sample(frac=0.9, random_state=42)
print("Class 0 (readmitted = 0): ", majorityCount)
print("Class 1 (readmitted = 1): ", minorityCount)
print("20% Resampled Class 0 (readmitted = 0):", class01.shape[0])
print("Class 1 (readmitted= 1): ", minorityCount)

oversampled_class1 = class1.sample(frac=0.1, replace=True, random_state=42)
sampled_df = pd.concat([class01, oversampled_class1])
sampled_df.describe()

Class 0 (readmitted = 0):  90406
Class 1 (readmitted = 1):  11357
20% Resampled Class 0 (readmitted = 0): 9041
Class 1 (readmitted= 1):  11357


Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
count,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,...,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0
mean,65.985556,4.367004,43.134126,1.331041,16.036651,0.364253,0.192002,0.631424,7.395991,0.111624,...,0.0,0.0,1.415054,1.007271,9.8e-05,0.0,0.0,0.0,0.531984,0.770954
std,15.900063,2.94319,19.474766,1.723683,8.120029,1.229179,0.720927,1.253708,1.943385,0.314919,...,0.0,0.0,0.846098,0.086114,0.009913,0.0,0.0,0.0,0.499,0.420239
min,5.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,55.0,2.0,32.0,0.0,10.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,65.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,75.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,0.0,...,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,95.0,14.0,102.0,6.0,69.0,38.0,12.0,16.0,16.0,1.0,...,0.0,0.0,3.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0


In [4]:
data = sampled_df

In [5]:
X = data.drop('readmitted',axis=1)

y = data['readmitted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### LGBM feature selection

In [6]:
# Define LightGBM parameters
params = {
    'objective': 'binary',
    'verbosity': -1,
    'is_unbalance': True,
    'learning_rate': 0.01
}

# Create and fit a LightGBM model
light = lgb.LGBMClassifier(**params)
light.fit(X_train, y_train)

# Feature selection using SelectFromModel
threshold = 'median'
feature_selector = SelectFromModel(light, threshold=threshold)
model = feature_selector.fit(X_train, y_train)

# Get the selected features
X_train_selected = model.transform(X_train)
selected_features_mask = feature_selector.get_support()

# Get feature importances
feature_importance = light.feature_importances_
feature_names = X.columns
# Create a DataFrame for feature importances
print(feature_names, feature_importance)

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'race', 'gender',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed'],
      dtype='object') [104 161 274 100 259  68  26 181  76  34  24  15  97  35 454 378 278   9
  77  30   0   0   0  22   0   7  27   0  10  16   0   0   0   0   0   0
  70  29   0   0   0

### with tuned hyperparameters? does feature selection improve scores

In [7]:
X = data.drop('readmitted',axis=1)

y = data['readmitted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

hyper  = {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300, 'num_leaves': 50}

# Create and fit a LightGBM model with all attributes
model_all_attributes = lgb.LGBMClassifier(**hyper)
model_all_attributes.fit(X_train, y_train)

# Predict on the test set
y_pred_all_attributes = model_all_attributes.predict(X_test)

# Evaluate the model with all attributes
accuracy_all_attributes = accuracy_score(y_test, y_pred_all_attributes)
roc_auc_all_attributes = roc_auc_score(y_test, model_all_attributes.predict_proba(X_test)[:, 1])
precision_all_attributes = precision_score(y_test, y_pred_all_attributes)
recall_all_attributes = recall_score(y_test, y_pred_all_attributes)
f1_all_attributes = f1_score(y_test, y_pred_all_attributes)

# Compute confusion matrix for the model with all attributes
confusion_matrix_all_attributes = confusion_matrix(y_test, y_pred_all_attributes)

print("Model with all attributes:")
print("Accuracy:", accuracy_all_attributes)
print("ROC AUC:", roc_auc_all_attributes)
print("Precision:", precision_all_attributes)
print("Recall:", recall_all_attributes)
print("F1 Score:", f1_all_attributes)
print("Confusion Matrix:\n", confusion_matrix_all_attributes)

Model with all attributes:
Accuracy: 0.8850687622789783
ROC AUC: 0.6406270166543688
Precision: 0.43636363636363634
Recall: 0.10572687224669604
F1 Score: 0.1702127659574468
Confusion Matrix:
 [[1778   31]
 [ 203   24]]


In [9]:
# Create and fit a LightGBM model with the selected attributes
feature_selector = SelectFromModel(model_all_attributes, threshold='median')
feature_selector.fit(X_train, y_train)

# Transform the data
X_train_selected = feature_selector.transform(X_train)
X_test_selected = feature_selector.transform(X_test)

model_selected_attributes = lgb.LGBMClassifier(**hyper)
model_selected_attributes.fit(X_train_selected, y_train)

# Predict on the test set
y_pred_selected_attributes = model_selected_attributes.predict(X_test_selected)

# Evaluate the model with selected attributes
accuracy_selected_attributes = accuracy_score(y_test, y_pred_selected_attributes)
roc_auc_selected_attributes = roc_auc_score(y_test, model_selected_attributes.predict_proba(X_test_selected)[:, 1])
precision_selected_attributes = precision_score(y_test, y_pred_selected_attributes)
recall_selected_attributes = recall_score(y_test, y_pred_selected_attributes)
f1_selected_attributes = f1_score(y_test, y_pred_selected_attributes)

# Compute confusion matrix for the model with selected attributes
confusion_matrix_selected_attributes = confusion_matrix(y_test, y_pred_selected_attributes)

print("\nModel with selected attributes:")
print("Accuracy:", accuracy_selected_attributes)
print("ROC AUC:", roc_auc_selected_attributes)
print("Precision:", precision_selected_attributes)
print("Recall:", recall_selected_attributes)
print("F1 Score:", f1_selected_attributes)
cm = confusion_matrix_selected_attributes
print("Confusion Matrix:\n", cm)



Model with selected attributes:
Accuracy: 0.8870333988212181
ROC AUC: 0.6333262712380341
Precision: 0.47540983606557374
Recall: 0.1277533039647577
F1 Score: 0.2013888888888889
Confusion Matrix:
 [[1777   32]
 [ 198   29]]
