# Heart Disease ML Pipeline (UCI) — Notebooks

These notebooks implement a full pipeline on the **UCI Heart Disease** dataset using your requested loader:

```python
from ucimlrepo import fetch_ucirepo
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
y = heart_disease.data.targets
```

> Bonus items (Streamlit/Ngrok) are intentionally **omitted** per the request.

## 06 — Hyperparameter Tuning

- RandomizedSearchCV (fast search) + GridSearchCV (refine) for RandomForest and SVM
- Saves the final best estimator as `models/final_model.pkl`
- Logs the best params and scores to `results/tuning_report.txt`

In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from scipy.stats import randint, loguniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

# Load the train/test feature and target arrays from ../data, in the order
# X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in (
        "../data/X_train.npy",
        "../data/X_test.npy",
        "../data/y_train.npy",
        "../data/y_test.npy",
    )
)

# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed
# seed so the fold assignment is reproducible.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# -----------------------
# Random Forest Tuning
# -----------------------
# Stage 1: randomized search. 25 configurations are sampled from the
# distributions below and scored with one-vs-rest ROC-AUC on the shared folds.
rf = RandomForestClassifier(random_state=42)
rf_param_dist = dict(
    n_estimators=randint(200, 600),
    max_depth=randint(2, 20),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    max_features=["sqrt", "log2", None],
)

rf_rand = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_dist,
    n_iter=25,
    scoring="roc_auc_ovr",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rf_rand.fit(X_train, y_train)

# Stage 2: grid search around the stage-1 winner. Every tuned value is pinned
# to what the randomized search selected; the only extra candidate is an
# unlimited tree depth (max_depth=None).
rf_refine_grid = {
    param_name: [best_value]
    for param_name, best_value in rf_rand.best_params_.items()
}
rf_refine_grid["max_depth"].append(None)

rf_grid = GridSearchCV(
    estimator=rf_rand.best_estimator_,
    param_grid=rf_refine_grid,
    scoring="roc_auc_ovr",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train, y_train)

# -----------------------
# SVM Tuning
# -----------------------
# Stage 1: randomized search over C and gamma for an RBF-kernel SVM, both
# sampled log-uniformly. probability=True enables predict_proba, which the
# ROC-AUC scoring relies on.
svm = SVC(kernel="rbf", probability=True, random_state=42)
svm_param_dist = dict(
    C=loguniform(1e-2, 1e3),
    gamma=loguniform(1e-4, 1e0),
)

svm_rand = RandomizedSearchCV(
    estimator=svm,
    param_distributions=svm_param_dist,
    n_iter=25,
    scoring="roc_auc_ovr",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
svm_rand.fit(X_train, y_train)

# Stage 2: grid search pinned to the stage-1 winner.
# NOTE: each parameter list holds exactly one value, so this grid contains a
# single candidate -- it re-scores and refits the randomized-search winner
# rather than exploring neighbouring values.
svm_refine_grid = {
    param_name: [best_value]
    for param_name, best_value in svm_rand.best_params_.items()
}

svm_grid = GridSearchCV(
    estimator=svm_rand.best_estimator_,
    param_grid=svm_refine_grid,
    scoring="roc_auc_ovr",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
svm_grid.fit(X_train, y_train)

# -----------------------
# Evaluate on Holdout
# -----------------------
rf_best = rf_grid.best_estimator_
svm_best = svm_grid.best_estimator_

# Mean cross-validated ROC-AUC of each tuned model, computed on the training
# data only. Both grid searches use the same splitter (`cv`) and the same
# scorer ("roc_auc_ovr"), so the two numbers are directly comparable.
rf_cv_auc = rf_grid.best_score_
svm_cv_auc = svm_grid.best_score_

# Holdout ROC-AUC (one-vs-rest, macro-averaged). This is reported as the final
# performance estimate only; it deliberately plays no part in model selection.
rf_auc = roc_auc_score(y_test, rf_best.predict_proba(X_test),
                       multi_class="ovr", average="macro")
svm_auc = roc_auc_score(y_test, svm_best.predict_proba(X_test),
                        multi_class="ovr", average="macro")

# Pick the final model by cross-validation score, not by holdout score:
# choosing on the holdout set would let test data drive model selection and
# make the reported holdout ROC-AUC optimistically biased. Ties go to the
# Random Forest.
best_model = rf_best if rf_cv_auc >= svm_cv_auc else svm_best

# Create the output directories up front so a missing folder cannot discard
# the results of the (slow) searches above.
os.makedirs("../models", exist_ok=True)
os.makedirs("../results", exist_ok=True)

joblib.dump(best_model, "../models/final_model.pkl")

# Save results: best parameters of every search stage, the CV scores that
# drove the selection, the holdout scores, and the chosen model.
report_lines = [
    f"RandomForest Randomized best params: {rf_rand.best_params_}",
    f"RandomForest Grid best params: {rf_grid.best_params_}",
    f"SVM Randomized best params: {svm_rand.best_params_}",
    f"SVM Grid best params: {svm_grid.best_params_}",
    f"CV ROC-AUC (macro-ovr) -> RF={rf_cv_auc:.4f}, SVM={svm_cv_auc:.4f}",
    f"Holdout ROC-AUC (macro-ovr) -> RF={rf_auc:.4f}, SVM={svm_auc:.4f}",
    f"Selected final model: {'RandomForest' if best_model is rf_best else 'SVM'}",
]
with open("../results/tuning_report.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(report_lines) + "\n")

print("Saved final best model to ../models/final_model.pkl and tuning report to ../results.")


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Saved final best model to ../models/final_model.pkl and tuning report to ../results.
