In [4]:
# ==============================
# Week 7–8: Modeling & Training
# ==============================

# Step 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
import joblib
from pathlib import Path

# Directory that receives the serialized models. The path is relative, so it
# resolves to a sibling "models" folder when run from the notebooks folder.
models_path = Path("..") / "models"
# Create the folder (and any missing parents); do nothing if it already exists.
models_path.mkdir(exist_ok=True, parents=True)

# Step 2: Load Processed Data
df = pd.read_csv("../data/processed/diabetic_clean.csv")

# Step 3: Split Features & Target
# "readmitted" is the label column; every remaining column is used as a feature.
y = df["readmitted"]
X = df.drop("readmitted", axis=1)

# Step 4: Train/Test Split
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# Step 5: Define Models
# Display name -> unfitted estimator. The display name is also used later to
# label the printed report and to derive the saved model's file name.
models = {
    "Logistic Regression": LogisticRegression(
        n_jobs=-1,
        max_iter=1000,
    ),
    "Decision Tree": DecisionTreeClassifier(
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        random_state=42,
        n_estimators=100,
        n_jobs=-1,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        random_state=42,
    ),
}

# Step 6: Train & Evaluate
# For every candidate model: fit on the training split, score the held-out
# test split, record the metrics in `results`, and persist the fitted model.
results = {}
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score, so it is only computed for estimators
    # that expose predict_proba. Column 1 is the probability of the second
    # class in model.classes_ (label 1 in the reports this script prints).
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        y_pred_proba = None
        roc_auc = None

    # Metrics
    print(classification_report(y_test, y_pred))
    # Compare against None explicitly: a truthiness test would also suppress
    # a legitimate score of exactly 0.0.
    if roc_auc is not None:
        print("ROC-AUC:", roc_auc)

    # Save results (dict form of the report so it can be inspected later)
    results[name] = {
        "classification_report": classification_report(y_test, y_pred, output_dict=True),
        "roc_auc": roc_auc
    }

    # Save trained model, e.g. "Random Forest" -> random_forest.joblib
    joblib.dump(model, models_path / f"{name.replace(' ', '_').lower()}.joblib")

print("\n✅ Training complete. Models saved in 'models/' folder.")

Train shape: (81412, 2461)
Test shape: (20354, 2461)

=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.49      0.02      0.04      2271

    accuracy                           0.89     20354
   macro avg       0.69      0.51      0.49     20354
weighted avg       0.85      0.89      0.84     20354

ROC-AUC: 0.6469229305750555

=== Decision Tree ===
              precision    recall  f1-score   support

           0       0.89      0.90      0.90     18083
           1       0.16      0.15      0.15      2271

    accuracy                           0.82     20354
   macro avg       0.53      0.53      0.53     20354
weighted avg       0.81      0.82      0.82     20354

ROC-AUC: 0.5260296027712908

=== Random Forest ===
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.64      0.01      0.01      2271