In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix
)


In [2]:
df = pd.read_csv("diabetic_data_clean.csv", low_memory=False)

In [3]:
if "readmit_30d" in df.columns:
    y = df["readmit_30d"].astype(int)
    leakage_cols = ["readmit_30d"]
    # if also present, drop the string outcome to avoid leakage
    if "readmitted" in df.columns:
        leakage_cols.append("readmitted")

elif "readmitted" in df.columns:
    # Create binary 30-day target from the original label column
    # readmitted typically in {"<30", ">30", "NO"}
    y = (df["readmitted"].astype(str).str.strip() == "<30").astype(int)
    leakage_cols = ["readmitted"]  # drop original label from features
else:
    raise KeyError(
        "Could not find a target column. Expected 'readmit_30d' or 'readmitted'. "
        f"Available columns include: {list(df.columns)[:20]} ..."
    )

In [4]:
drop_cols = leakage_cols.copy()

# Drop identifiers if present (avoid memorization / leakage)
for c in ["encounter_id", "patient_nbr"]:
    if c in df.columns:
        drop_cols.append(c)

X = df.drop(columns=drop_cols)

# Identify numeric vs categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X.columns if c not in numeric_cols]

# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# =========================
# 5) Baseline models
# =========================
dummy_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", DummyClassifier(strategy="most_frequent"))
])

logreg_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=5000))
])


dummy_model.fit(X_train, y_train)
logreg_model.fit(X_train, y_train)

# =========================
# 6) Evaluation
# =========================
def evaluate(pipe, name, threshold=0.50):
    if hasattr(pipe.named_steps["model"], "predict_proba"):
        proba = pipe.predict_proba(X_test)[:, 1]
        pred = (proba >= threshold).astype(int)
        auc = roc_auc_score(y_test, proba)
    else:
        pred = pipe.predict(X_test)
        auc = np.nan

    return {
        "model": name,
        "threshold": threshold,
        "accuracy": accuracy_score(y_test, pred),
        "precision": precision_score(y_test, pred, zero_division=0),
        "recall": recall_score(y_test, pred, zero_division=0),
        "f1": f1_score(y_test, pred, zero_division=0),
        "roc_auc": auc,
        "confusion_matrix": confusion_matrix(y_test, pred)
    }

results = [
    evaluate(dummy_model, "Dummy (Most Frequent)", threshold=0.50),
    evaluate(logreg_model, "Logistic Regression (Baseline)", threshold=0.50),
]

In [6]:
print(pd.DataFrame([{k:v for k,v in r.items() if k!="confusion_matrix"} for r in results]))

print("\nConfusion Matrices:")
for r in results:
    print(f"\n{r['model']} @ threshold={r['threshold']}")
    print(r["confusion_matrix"])

                            model  threshold  accuracy  precision    recall  \
0           Dummy (Most Frequent)        0.5  0.888425   0.000000  0.000000   
1  Logistic Regression (Baseline)        0.5  0.888474   0.505747  0.019375   

        f1   roc_auc  
0  0.00000  0.500000  
1  0.03732  0.646044  

Confusion Matrices:

Dummy (Most Frequent) @ threshold=0.5
[[18083     0]
 [ 2271     0]]

Logistic Regression (Baseline) @ threshold=0.5
[[18040    43]
 [ 2227    44]]


In [7]:
print(df.columns.tolist())
print(df[["readmitted"]].head() if "readmitted" in df.columns else "No readmitted column")

['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'readmit_30d']
  readmitted
0         NO
1        >30
2         NO
3         NO
4         NO


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, confusion_matrix, classification_report
import xgboost as xgb

# Load Data
df = pd.read_csv("diabetic_data_clean.csv", low_memory=False)

# Define target
if "readmit_30d" in df.columns:
    y = df["readmit_30d"].astype(int)
    drop_cols = ["readmit_30d", "readmitted", "encounter_id", "patient_nbr"]
elif "readmitted" in df.columns:
    y = (df["readmitted"].astype(str).str.strip() == "<30").astype(int)
    drop_cols = ["readmitted", "encounter_id", "patient_nbr"]

X = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Preprocessing
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'string']).columns.tolist()

# Fill missing values
X = X.copy()
for col in numeric_cols:
    X[col] = X[col].fillna(X[col].median())

# Label encode categorical
for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5]
}

xgb_model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss'
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

print("Training XGBoost with GridSearchCV...")
grid_search.fit(X_train, y_train)

# Best model predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluation
print("\n" + "="*50)
print("XGBoost Model Performance")
print("="*50)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"\nF1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Readmit', 'Readmit']))

# Comparison with baseline
print("\n" + "="*50)
print("Comparison with Baseline")
print("="*50)
print(f"Baseline F1-Score: 0.0373")
print(f"XGBoost F1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"Improvement: {(f1_score(y_test, y_pred) - 0.0373):.4f}")

Training XGBoost with GridSearchCV...
Fitting 3 folds for each of 729 candidates, totalling 2187 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.9; total time=   1.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=300, subsample=0.9; total time=   1.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, min_child_weight=3, n_estimators=200, subsample=0.9; total time=   1.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=200, subsample=0.9; total time=   0.9s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, min_child_weight=1