In [1]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

In [2]:
# load data
df_rf = pd.read_csv("diabetic_data_clean.csv")
df_rf.head()

  df_rf = pd.read_csv("diabetic_data_clean.csv")


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,readmit_30d
0,2278392,8222157,Caucasian,Female,5,,6,25,1,1,...,No,No,No,No,No,No,No,No,NO,0
1,149190,55629189,Caucasian,Female,15,,1,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,0
2,64410,86047875,AfricanAmerican,Female,25,,1,1,7,2,...,No,No,No,No,No,No,No,Yes,NO,0
3,500364,82442376,Caucasian,Male,35,,1,1,7,2,...,Up,No,No,No,No,No,Ch,Yes,NO,0
4,16680,42519267,Caucasian,Male,45,,1,1,7,1,...,Steady,No,No,No,No,No,Ch,Yes,NO,0


In [3]:
# Set the "readmit_30d" as target/model output y and the rest features as model inputs X
y_data = df_rf['readmit_30d']
X_data = df_rf.drop(['readmit_30d', 'readmitted', 'encounter_id', 'patient_nbr'],axis=1)

In [4]:
# Split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    X_data, 
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data
)

In [5]:
# Identify numeric vs categorical columns
numeric_cols = X_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X_data.columns if c not in numeric_cols]

# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
    #("scaler", StandardScaler()) # not needed for RF model
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)


In [6]:
# ==================================
# a) Random Forest Classifier model
# ==================================
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"  # important for readmission imbalance
)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", rf_model)
])

In [7]:
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [9]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score
)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8884740100226
ROC AUC: 0.6624869817834214
Confusion Matrix:
 [[18075     8]
 [ 2262     9]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.53      0.00      0.01      2271

    accuracy                           0.89     20354
   macro avg       0.71      0.50      0.47     20354
weighted avg       0.85      0.89      0.84     20354



In [10]:
feature_names = model.named_steps["preprocess"].get_feature_names_out()
importances = model.named_steps["classifier"].feature_importances_

feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
feat_imp.head(10)


num__num_lab_procedures          0.048854
num__num_medications             0.045494
num__number_inpatient            0.038228
num__time_in_hospital            0.036143
num__age                         0.030961
num__discharge_disposition_id    0.028849
num__number_diagnoses            0.026047
num__num_procedures              0.025592
num__admission_type_id           0.019100
num__admission_source_id         0.016879
dtype: float64

In [26]:
# simple gridsearch, not fine-tuning yet

In [11]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 5]
}

grid = GridSearchCV(
    model,
    param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

grid.fit(X_train, y_train)
best_model = grid.best_estimator_


In [24]:
print("Best parameters:")
print(grid.best_params_)

Best parameters:
{'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
