In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("diabetic_data.csv")

df.replace("?", np.nan, inplace=True)
df.dropna(subset=["diag_1", "diag_2", "diag_3"], inplace=True)

In [6]:
df["readmitted_30"] = df["readmitted"].apply(lambda x: 1 if x == "<30" else 0)

In [9]:
categorical_cols = df.select_dtypes(include="object").columns

le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [10]:
X = df.drop(["readmitted", "readmitted_30"], axis=1)
y = df["readmitted_30"]

In [13]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_leaf_nodes=100,
    class_weight="balanced",
    random_state=42
)

rf.fit(X, y)
y_pred = rf.predict(X)

print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.67      0.78     88994
           1       0.20      0.64      0.30     11250

    accuracy                           0.67    100244
   macro avg       0.57      0.65      0.54    100244
weighted avg       0.85      0.67      0.73    100244



In [16]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    scale_pos_weight=(y.value_counts()[0] / y.value_counts()[1]),
    eval_metric="logloss"
)

xgb.fit(X, y)
y_pred = xgb.predict(X)

print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.86      0.92     88994
           1       0.45      0.91      0.60     11250

    accuracy                           0.87    100244
   macro avg       0.72      0.89      0.76    100244
weighted avg       0.93      0.87      0.88    100244



In [17]:
y.value_counts(normalize=True)

readmitted_30
0    0.887774
1    0.112226
Name: proportion, dtype: float64