# Heart Disease ML Pipeline (UCI) — Notebooks

These notebooks implement a full pipeline on the **UCI Heart Disease** dataset using your requested loader:

```python
from ucimlrepo import fetch_ucirepo
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
y = heart_disease.data.targets
```

> Bonus items (Streamlit/Ngrok) are intentionally **omitted** per the request.

## 04 — Supervised Learning

Models:
- Logistic Regression
- Decision Tree
- Random Forest
- SVM (RBF)

Outputs:
- Metrics table (Accuracy, Precision, Recall, F1, ROC-AUC)
- Micro-averaged ROC curve for the best model (saved to `results/roc_curves_micro.png`)
- A baseline best model saved to `models/baseline_model.pkl` (this cell saves the fitted estimator itself, trained on the saved `X_train.npy` array; no preprocessing step is bundled here)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, auc
)
import joblib

# Load the train/test split stored as .npy arrays under ../data
X_train, X_test, y_train, y_test = map(
    np.load,
    (
        "../data/X_train.npy",
        "../data/X_test.npy",
        "../data/y_train.npy",
        "../data/y_test.npy",
    ),
)

# Every distinct label seen in either split (sorted), and how many there are
classes = np.union1d(y_train, y_test)
n_classes = classes.size

# Define models: candidate classifiers keyed by the name used in the metrics
# table. `multi_class` is deliberately not passed to LogisticRegression:
# "auto" is already the default behaviour, and the parameter is deprecated in
# recent scikit-learn releases, so spelling it out only triggers a
# FutureWarning (and breaks once the parameter is removed).
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    # probability=True enables predict_proba, which the ROC-AUC code relies on
    "SVM_RBF": SVC(kernel="rbf", probability=True, random_state=42),
}

# Fit every candidate on the training split and score it on the test split.
# Each entry appended to `rows` is one line of the metrics table:
# [name, accuracy, macro P/R/F1, weighted P/R/F1, macro one-vs-rest ROC-AUC].
rows = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Per-class scores for ROC-AUC: prefer real probabilities; otherwise turn
    # decision-function margins into pseudo-probabilities with a softmax.
    y_proba = None
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)
    elif hasattr(model, "decision_function"):
        margins = model.decision_function(X_test)
        if margins.ndim == 1:  # binary case: build [negative, positive] columns
            margins = np.column_stack([-margins, margins])
        # Subtract the row-wise max before exponentiating for numerical stability
        exp_margins = np.exp(margins - margins.max(axis=1, keepdims=True))
        y_proba = exp_margins / exp_margins.sum(axis=1, keepdims=True)

    acc = accuracy_score(y_test, y_pred)
    prec_macro = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec_macro  = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1_macro   = f1_score(y_test, y_pred, average="macro", zero_division=0)
    prec_w     = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec_w      = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1_w       = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    # ROC-AUC stays NaN when no scores are available or the metric is undefined,
    # so a single model cannot abort the whole comparison.
    auc_macro_ovr = np.nan
    if y_proba is not None and n_classes > 1:
        try:
            if y_proba.shape[1] == 2:
                # Binary problem: roc_auc_score expects a 1-D score for the
                # positive class, which is the second column (model.classes_[1]).
                auc_macro_ovr = roc_auc_score(y_test, y_proba[:, 1])
            else:
                # Pass the model's own class order so score columns are matched
                # to labels explicitly rather than inferred from y_test.
                auc_macro_ovr = roc_auc_score(
                    y_test, y_proba,
                    multi_class="ovr", average="macro",
                    labels=model.classes_,
                )
        except ValueError as err:
            # e.g. a class missing from y_test, or a test label never seen in training
            print(f"ROC AUC undefined for {name}: {err}")

    rows.append([name, acc, prec_macro, rec_macro, f1_macro,
                 prec_w, rec_w, f1_w, auc_macro_ovr])

# Assemble the comparison table, rank it by ROC-AUC and then macro-F1
# (both descending), write it to CSV and echo it to the cell output.
metric_columns = [
    "Model",
    "Accuracy",
    "Precision_macro",
    "Recall_macro",
    "F1_macro",
    "Precision_weighted",
    "Recall_weighted",
    "F1_weighted",
    "ROC_AUC_macro_ovr",
]
metrics_df = pd.DataFrame(rows, columns=metric_columns)
metrics_df = metrics_df.sort_values(
    ["ROC_AUC_macro_ovr", "F1_macro"],
    ascending=[False, False],
)

metrics_df.to_csv("../results/supervised_metrics.csv", index=False)
print(metrics_df)

# Choose the winner: the top row among models with a defined ROC-AUC
# (metrics_df is already ranked); if none has one, rank by macro-F1 instead.
ranked = metrics_df.dropna(subset=["ROC_AUC_macro_ovr"])
if ranked.empty:
    ranked = metrics_df.sort_values("F1_macro", ascending=False)
best = ranked.head(1)

best_name = best["Model"].iloc[0]
best_model = models[best_name]

# Micro-averaged ROC curve for best model.
# Scores come from predict_proba when available; otherwise decision-function
# margins are converted to pseudo-probabilities with a softmax.
if hasattr(best_model, "predict_proba"):
    y_score = best_model.predict_proba(X_test)
elif hasattr(best_model, "decision_function"):
    margins = best_model.decision_function(X_test)
    if margins.ndim == 1:  # binary case: build [negative, positive] columns
        margins = np.column_stack([-margins, margins])
    # Subtract the row-wise max before exponentiating for numerical stability
    exp_margins = np.exp(margins - margins.max(axis=1, keepdims=True))
    y_score = exp_margins / exp_margins.sum(axis=1, keepdims=True)
else:
    y_score = None

if y_score is not None:
    # Binarize against the model's own class order so that column j of the
    # indicator matrix lines up with column j of y_score.
    y_test_bin = label_binarize(y_test, classes=best_model.classes_)
    if y_test_bin.shape[1] == 1 and y_score.shape[1] == 2:
        # label_binarize returns a single positive-class column for binary
        # problems; expand it to [negative, positive] to match y_score.
        y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

    # Micro-average: pool every (sample, class) pair into one binary problem
    fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
    roc_auc_micro = auc(fpr, tpr)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f"micro-average ROC (AUC = {roc_auc_micro:.3f})")
    ax.plot([0, 1], [0, 1], '--', label="Chance")
    ax.set_title(f"Micro-averaged ROC — {best_name}")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend()
    fig.savefig("../results/roc_curves_micro.png", dpi=150)
    plt.close(fig)  # release the figure so repeated runs do not accumulate them

# Persist the winning estimator and a plain-text copy of the metrics table
joblib.dump(best_model, "../models/baseline_model.pkl")

metrics_report = metrics_df.to_string(index=False)
with open("../results/evaluation_metrics.txt", "w") as report_file:
    report_file.write(metrics_report)

print(f"Saved baseline best model: {best_name}")


                Model  Accuracy  Precision_macro  Recall_macro  F1_macro  \
0  LogisticRegression  0.606557         0.355198      0.362771  0.357489   
3             SVM_RBF  0.557377         0.197857      0.242424  0.217863   
2        RandomForest  0.524590         0.156098      0.193939  0.172973   
1        DecisionTree  0.459016         0.223333      0.228571  0.224615   

   Precision_weighted  Recall_weighted  F1_weighted  ROC_AUC_macro_ovr  
0            0.561719         0.606557     0.582123           0.798335  
3            0.457904         0.557377     0.502744           0.789306  
2            0.422231         0.524590     0.467878           0.764620  
1            0.460929         0.459016     0.458764           0.537896  
Saved baseline best model: LogisticRegression
