In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix
)


In [2]:
df = pd.read_csv("diabetic_data_clean.csv", low_memory=False)

In [3]:
if "readmit_30d" in df.columns:
    y = df["readmit_30d"].astype(int)
    leakage_cols = ["readmit_30d"]
    # if also present, drop the string outcome to avoid leakage
    if "readmitted" in df.columns:
        leakage_cols.append("readmitted")

elif "readmitted" in df.columns:
    # Create binary 30-day target from the original label column
    # readmitted typically in {"<30", ">30", "NO"}
    y = (df["readmitted"].astype(str).str.strip() == "<30").astype(int)
    leakage_cols = ["readmitted"]  # drop original label from features
else:
    raise KeyError(
        "Could not find a target column. Expected 'readmit_30d' or 'readmitted'. "
        f"Available columns include: {list(df.columns)[:20]} ..."
    )

In [4]:
drop_cols = leakage_cols.copy()

# Drop identifiers if present (avoid memorization / leakage)
for c in ["encounter_id", "patient_nbr"]:
    if c in df.columns:
        drop_cols.append(c)

X = df.drop(columns=drop_cols)

# Identify numeric vs categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X.columns if c not in numeric_cols]

# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# =========================
# 5) Baseline models
# =========================
dummy_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", DummyClassifier(strategy="most_frequent"))
])

logreg_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=5000))
])


dummy_model.fit(X_train, y_train)
logreg_model.fit(X_train, y_train)

# =========================
# 6) Evaluation
# =========================
def evaluate(pipe, name, threshold=0.50):
    if hasattr(pipe.named_steps["model"], "predict_proba"):
        proba = pipe.predict_proba(X_test)[:, 1]
        pred = (proba >= threshold).astype(int)
        auc = roc_auc_score(y_test, proba)
    else:
        pred = pipe.predict(X_test)
        auc = np.nan

    return {
        "model": name,
        "threshold": threshold,
        "accuracy": accuracy_score(y_test, pred),
        "precision": precision_score(y_test, pred, zero_division=0),
        "recall": recall_score(y_test, pred, zero_division=0),
        "f1": f1_score(y_test, pred, zero_division=0),
        "roc_auc": auc,
        "confusion_matrix": confusion_matrix(y_test, pred)
    }

results = [
    evaluate(dummy_model, "Dummy (Most Frequent)", threshold=0.50),
    evaluate(logreg_model, "Logistic Regression (Baseline)", threshold=0.50),
]

In [16]:
print(pd.DataFrame([{k:v for k,v in r.items() if k!="confusion_matrix"} for r in results]))

print("\nConfusion Matrices:")
for r in results:
    print(f"\n{r['model']} @ threshold={r['threshold']}")
    print(r["confusion_matrix"])

                            model  threshold  accuracy  precision    recall  \
0           Dummy (Most Frequent)        0.5  0.888425        0.0  0.000000   
1  Logistic Regression (Baseline)        0.5  0.888425        0.5  0.018934   

         f1   roc_auc  
0  0.000000  0.500000  
1  0.036487  0.645951  

Confusion Matrices:

Dummy (Most Frequent) @ threshold=0.5
[[18083     0]
 [ 2271     0]]

Logistic Regression (Baseline) @ threshold=0.5
[[18040    43]
 [ 2228    43]]


In [18]:
print(df.columns.tolist())
print(df[["readmitted"]].head() if "readmitted" in df.columns else "No readmitted column")

['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'readmit_30d']
  readmitted
0         NO
1        >30
2         NO
3         NO
4         NO
