In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

In [23]:
# Load the cleaned MIMIC CSV and preview its first five rows.
df1 = pd.read_csv(filepath_or_buffer="/content/cleaned_mimic_data.csv")
df1.head(5)

Unnamed: 0,stay_id,gender,age,los_icu_days,apsiii,cci,creatinine_mean,platelets_mean,albumin_mean,readmit_30d,los_icu_log
0,39194905,M,90,0.021725,24,6,0.85,266.888889,2.7,1,0.021492
1,31699045,F,96,14.080509,31,5,0.981013,241.757576,2.5,1,2.713403
2,37310192,F,93,1.251343,20,5,0.884615,134.181818,3.2,1,0.811527
3,33539374,M,32,0.148264,21,0,0.452727,304.386555,3.292308,1,0.138251
4,30803171,F,31,0.767801,31,1,0.7,388.0,4.3,0,0.569736


In [24]:
# Print the column dtypes and non-null counts of the loaded frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21603 entries, 0 to 21602
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   stay_id          21603 non-null  int64  
 1   gender           21603 non-null  object 
 2   age              21603 non-null  int64  
 3   los_icu_days     21603 non-null  float64
 4   apsiii           21603 non-null  int64  
 5   cci              21603 non-null  int64  
 6   creatinine_mean  21603 non-null  float64
 7   platelets_mean   21603 non-null  float64
 8   albumin_mean     21603 non-null  float64
 9   readmit_30d      21603 non-null  int64  
 10  los_icu_log      21603 non-null  float64
dtypes: float64(5), int64(5), object(1)
memory usage: 1.8+ MB


In [25]:
# Remove the stay_id column from the frame before modelling.
df1 = df1.drop(labels="stay_id", axis="columns")

In [26]:
# One-hot encode gender; drop_first=True leaves a single gender_M indicator,
# which is cast to int before previewing the frame.
df1 = (
    pd.get_dummies(df1, columns=["gender"], drop_first=True)
      .astype({"gender_M": int})
)
df1.head()

Unnamed: 0,age,los_icu_days,apsiii,cci,creatinine_mean,platelets_mean,albumin_mean,readmit_30d,los_icu_log,gender_M
0,90,0.021725,24,6,0.85,266.888889,2.7,1,0.021492,1
1,96,14.080509,31,5,0.981013,241.757576,2.5,1,2.713403,0
2,93,1.251343,20,5,0.884615,134.181818,3.2,1,0.811527,0
3,32,0.148264,21,0,0.452727,304.386555,3.292308,1,0.138251,1
4,31,0.767801,31,1,0.7,388.0,4.3,0,0.569736,0


In [27]:
# Two feature views over the same rows:
#   df_tree   keeps the raw los_icu_days (for tree-based models),
#   df_linear keeps los_icu_log          (for logistic regression).
df_tree = df1.drop(columns="los_icu_log")
df_linear = df1.drop(columns="los_icu_days")

Logistic Regression model as baseline

In [28]:
# Separate the readmit_30d label from the predictors of the linear feature view.
y_linear = df_linear["readmit_30d"]
X_linear = df_linear.drop(columns=["readmit_30d"])

In [29]:
# Two-stage split: 15% of the rows are held out as the test set, then 17.65% of
# the remaining 85% (about 15% of all rows) becomes the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_linear,
    y_linear,
    test_size=0.15,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1765,
    random_state=42,
)
X_train.head()

Unnamed: 0,age,apsiii,cci,creatinine_mean,platelets_mean,albumin_mean,los_icu_log,gender_M
8689,62,39,8,0.833333,189.0,3.7,0.964155,1
11662,68,41,8,0.675,417.25,3.3,0.798581,1
3003,46,54,4,0.446154,290.166667,3.2,2.0502,1
12442,69,82,4,3.33125,533.393939,2.041935,1.927405,1
512,90,66,10,0.968421,172.166667,3.6,1.543209,0


***Scaling***

In [30]:
# Numeric predictors that are standardized before the linear model sees them.
num_cols = [
    "age",
    "los_icu_log",
    "apsiii",
    "cci",
    "creatinine_mean",
    "platelets_mean",
    "albumin_mean",
]
all_features = [*num_cols, "gender_M"]

# Scale only the numeric columns; the remaining column (gender_M) is passed through as is.
preprocessor = ColumnTransformer(
    [("num", StandardScaler(), num_cols)],
    remainder="passthrough",
)

# Chain preprocessing and the classifier so the scaler is fitted on the training split only.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)
pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [31]:
# Score the logistic-regression baseline on the validation split.
y_val_pred = pipeline.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
cl_report = classification_report(y_val, y_val_pred)
cf_matrix = confusion_matrix(y_val, y_val_pred)

print("Accuracy:", accuracy)
# Start the report on its own line so the table header lines up with its columns
# (the same layout the tree-model report cells use).
print("Classification report:\n", cl_report)
print(cf_matrix)

Accuracy: 0.6177105831533477
Classification report:               precision    recall  f1-score   support

           0       0.63      0.84      0.72      1895
           1       0.57      0.31      0.40      1346

    accuracy                           0.62      3241
   macro avg       0.60      0.57      0.56      3241
weighted avg       0.61      0.62      0.59      3241

[[1590  305]
 [ 934  412]]


# ***Interpretation of Result:***
Our baseline model (logistic regression) achieved an accuracy of approximately 62%.
For the majority class (not readmitted), 63% of the patients the model predicted as non-readmissions were truly not readmitted (precision), and the model successfully identified 84% of all non-readmitted patients (recall).

However, in healthcare prediction tasks like this, recall is often more critical since missing a patient who will be readmitted can have serious consequences.
For the readmitted class, 57% of the patients the model flagged as readmissions were actually readmitted (precision), but it found only 31% of all actual readmissions (recall), indicating room for improvement in sensitivity.

**MODEL 2: TREE-BASED MODELS**

In [32]:
# Separate the readmit_30d label from the predictors of the tree feature view.
y_tree = df_tree["readmit_30d"]
X_tree = df_tree.drop(columns=["readmit_30d"])

In [33]:
# Same two-stage split and seed as the linear section, applied to the tree
# feature view. NOTE: this rebinds X_train / X_val / X_test and the y_* splits.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_tree,
    y_tree,
    test_size=0.15,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1765,
    random_state=42,
)
X_train.head()

Unnamed: 0,age,los_icu_days,apsiii,cci,creatinine_mean,platelets_mean,albumin_mean,gender_M
8689,62,1.622569,39,8,0.833333,189.0,3.7,1
11662,68,1.222384,41,8,0.675,417.25,3.3,1
3003,46,6.769456,54,4,0.446154,290.166667,3.2,1
12442,69,5.871655,82,4,3.33125,533.393939,2.041935,1
512,90,3.679583,66,10,0.968421,172.166667,3.6,0


In [34]:
# Fit a default random forest on the training split and score it on validation.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_val_pred_rf = rf_model.predict(X_val)

rf_accuracy, rf_cl_report, rf_cf_matrix = (
    accuracy_score(y_val, y_val_pred_rf),
    classification_report(y_val, y_val_pred_rf),
    confusion_matrix(y_val, y_val_pred_rf),
)

print("Accuracy:", rf_accuracy)
print("Random Forest Classification report:\n", rf_cl_report)
print(rf_cf_matrix)

Accuracy: 0.6531934588090096
Random Forest Classification report:
               precision    recall  f1-score   support

           0       0.68      0.76      0.72      1895
           1       0.60      0.50      0.55      1346

    accuracy                           0.65      3241
   macro avg       0.64      0.63      0.63      3241
weighted avg       0.65      0.65      0.65      3241

[[1439  456]
 [ 668  678]]


LightGBM

In [35]:
# Fit a default LightGBM classifier on the training split.
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train, y_train)

# Validation-set predictions and the three summaries reported below.
y_val_pred_lgbm = lgbm_model.predict(X_val)
lgbm_accuracy = accuracy_score(y_val, y_val_pred_lgbm)
lgbm_cl_report = classification_report(y_val, y_val_pred_lgbm)
lgbm_cf_matrix = confusion_matrix(y_val, y_val_pred_lgbm)

print("Accuracy:", lgbm_accuracy)
print("LGBM Classification report:\n", lgbm_cl_report)
print(lgbm_cf_matrix)

[LightGBM] [Info] Number of positive: 6256, number of negative: 8865
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1243
[LightGBM] [Info] Number of data points in the train set: 15121, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.413729 -> initscore=-0.348570
[LightGBM] [Info] Start training from score -0.348570
Accuracy: 0.6741746374575748
LGBM Classification report:
               precision    recall  f1-score   support

           0       0.71      0.75      0.73      1895
           1       0.62      0.56      0.59      1346

    accuracy                           0.67      3241
   macro avg       0.66      0.66      0.66      3241
weighted avg       0.67      0.67      0.67      3241

[[1427  468]
 [ 588  758]]


XGBOOST

In [36]:
# Fit an XGBoost classifier on the training split and score it on validation.
# `use_label_encoder` is not passed: the installed XGBoost ignores it and only
# emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_model = XGBClassifier(random_state=42, eval_metric="logloss")
xgb_model.fit(X_train, y_train)

y_val_pred_xgb = xgb_model.predict(X_val)
xgb_accuracy = accuracy_score(y_val, y_val_pred_xgb)
xgb_cl_report = classification_report(y_val, y_val_pred_xgb)
xgb_cf_matrix = confusion_matrix(y_val, y_val_pred_xgb)

print("Accuracy:", xgb_accuracy)
print("XGB Classification report:\n", xgb_cl_report)
print(xgb_cf_matrix)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6439370564640543
XGB Classification report:
               precision    recall  f1-score   support

           0       0.68      0.74      0.71      1895
           1       0.58      0.51      0.54      1346

    accuracy                           0.64      3241
   macro avg       0.63      0.62      0.63      3241
weighted avg       0.64      0.64      0.64      3241

[[1397  498]
 [ 656  690]]


Compilation of results from tree-based models

In [37]:
def get_metrics(y_true, y_pred, model_name):
    """Build one summary row of classification metrics for a model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Value stored under the ``"Model"`` key.

    Returns
    -------
    dict
        ``"Model"`` plus accuracy, precision, recall and F1 as computed by the
        scikit-learn scorers with their default arguments.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    row = {"Model": model_name}
    row["Accuracy"] = acc
    row["Precision(Class 1)"] = prec
    row["Recall(Class 1)"] = rec
    row["F1-score(Class 1)"] = f1
    return row

# One metrics row per tree-based model, all scored on the same validation labels.
results = [
    get_metrics(y_val, preds, label)
    for label, preds in (
        ("Random Forest", y_val_pred_rf),
        ("XGBoost", y_val_pred_xgb),
        ("LightGBM", y_val_pred_lgbm),
    )
]

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Accuracy,Precision(Class 1),Recall(Class 1),F1-score(Class 1)
0,Random Forest,0.653193,0.597884,0.503715,0.546774
1,XGBoost,0.643937,0.580808,0.51263,0.544594
2,LightGBM,0.674175,0.618271,0.56315,0.589425


Hyperparameter Tuning

In [38]:
# Randomized search over LightGBM hyperparameters: 20 sampled candidates,
# 5-fold cross-validation on the training split, ranked by recall.
lgbm = LGBMClassifier(random_state=42)
param = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [-1, 5, 10, 15],
    "num_leaves": [20, 31, 40, 50],
}
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param,
    n_iter=20,
    scoring="recall",
    cv=5,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best Recall Score:", random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Number of positive: 6256, number of negative: 8865
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000692 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1243
[LightGBM] [Info] Number of data points in the train set: 15121, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.413729 -> initscore=-0.348570
[LightGBM] [Info] Start training from score -0.348570
Best Parameters: {'num_leaves': 31, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.05}
Best Recall Score: 0.5586653999484119


In [39]:
# Rebuild the tuned model from the search result instead of re-typing the
# winning values by hand, so this cell cannot drift out of sync with the
# randomized search it depends on.
best_lgbm = LGBMClassifier(**random_search.best_params_, random_state=42)
best_lgbm.fit(X_train, y_train)

# Validation-set report for the tuned model.
# NOTE: this rebinds y_val_pred, which previously held the logistic-regression predictions.
y_val_pred = best_lgbm.predict(X_val)
print(classification_report(y_val, y_val_pred))


[LightGBM] [Info] Number of positive: 6256, number of negative: 8865
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1243
[LightGBM] [Info] Number of data points in the train set: 15121, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.413729 -> initscore=-0.348570
[LightGBM] [Info] Start training from score -0.348570
              precision    recall  f1-score   support

           0       0.71      0.74      0.72      1895
           1       0.61      0.57      0.59      1346

    accuracy                           0.67      3241
   macro avg       0.66      0.66      0.66      3241
weighted avg       0.67      0.67      0.67      3241



In [40]:
# Report for the tuned LightGBM model on the held-out test split.
y_test_pred = best_lgbm.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.69      0.74      0.72      1898
           1       0.60      0.54      0.57      1343

    accuracy                           0.66      3241
   macro avg       0.64      0.64      0.64      3241
weighted avg       0.65      0.66      0.65      3241



In [45]:
# Persist the tuned model, then each data split as a CSV without its row index.
joblib.dump(best_lgbm, "tuned_lgbm_model.pkl")

for split_data, csv_name in (
    (X_train, "X_train.csv"),
    (X_val, "X_val.csv"),
    (X_test, "X_test.csv"),
    (y_train, "y_train.csv"),
    (y_val, "y_val.csv"),
    (y_test, "y_test.csv"),
):
    split_data.to_csv(csv_name, index=False)