In [41]:
# ===============================
# SPECT Heart - Full Colab Script
# Compatible: scikit-learn 1.4.2, xgboost 2.1.1
# ===============================

import os, time, json, warnings, pathlib
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.utils import Bunch

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# --------------- Settings ---------------
RS = 42
CV_N_SPLITS = 3              # change to 5 if you used 5-fold in Oct13
SCORING = "roc_auc"          # change to "average_precision" if needed
TRAIN_PATH = "/content/sample_data/SPECT.train"
TEST_PATH  = "/content/sample_data/SPECT.test"

assert os.path.exists(TRAIN_PATH), f"File not found: {TRAIN_PATH}"
assert os.path.exists(TEST_PATH),  f"File not found: {TEST_PATH}"

# --------------- Data Loading ---------------
def load_spect(path: str) -> Bunch:
    # UCI SPECT: first column is label, rest are binary features
    df = pd.read_csv(path, header=None)
    n_features = df.shape[1] - 1
    df.columns = ["target"] + [f"x{i+1}" for i in range(n_features)]
    # coerce numeric + drop NA
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.dropna().astype(int)
    X = df.drop(columns=["target"]).values
    y = df["target"].values
    return Bunch(frame=df, X=X, y=y, feature_names=df.columns[1:])

train = load_spect(TRAIN_PATH)
test  = load_spect(TEST_PATH)

print("Train shape:", train.X.shape, "| Test shape:", test.X.shape)
print("Train class counts:", np.bincount(train.y))
print("Test  class counts:", np.bincount(test.y))

# If your positive class is flipped vs. Oct13, uncomment:
# train.y = 1 - train.y
# test.y  = 1 - test.y

# --------------- Pipelines ---------------
cv = StratifiedKFold(n_splits=CV_N_SPLITS, shuffle=True, random_state=RS)

pipelines = {
    "logreg": Pipeline([
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=1000, random_state=RS))
    ]),
    "dtree": Pipeline([
        ("dtree", DecisionTreeClassifier(random_state=RS))
    ]),
    "rf": Pipeline([
        ("rf", RandomForestClassifier(random_state=RS, n_jobs=-1))
    ]),
    "xgb": Pipeline([
        ("xgb", XGBClassifier(
            random_state=RS,
            n_estimators=300,
            learning_rate=0.1,
            subsample=0.9,
            colsample_bytree=0.9,
            tree_method="hist",   # CPU-friendly and portable
            n_jobs=-1,
            eval_metric="logloss"
        ))
    ]),
    "svc": Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(probability=True, random_state=RS))
    ]),
}

# --------------- Parameter Grids ---------------
param_grids = {
    "logreg": {
        "logreg__penalty": ["l2"],
        "logreg__C": [0.01, 0.1, 1.0, 10.0]
    },
    "dtree": {
        "dtree__max_depth": [None, 3, 4, 5],
        "dtree__min_samples_split": [2, 5, 10],
        "dtree__min_samples_leaf": [1, 5, 10]
    },
    "rf": {
        "rf__n_estimators": [200, 400],
        "rf__max_depth": [None, 4, 6],
        "rf__min_samples_split": [2, 5, 10],
        "rf__min_samples_leaf": [1, 5, 10]
    },
    "xgb": {
        "xgb__max_depth": [3, 4, 5],
        "xgb__min_child_weight": [1, 2, 4],
        "xgb__gamma": [0, 0.5],
        "xgb__reg_lambda": [1.0, 2.0, 5.0]
    },
    "svc": {
        "svc__C": [0.1, 1.0, 10.0],
        "svc__kernel": ["rbf", "linear"]
    },
}

# --------------- Helpers (robust to API quirks) ---------------
def proba_from_model(best_model, X, model_key):
    """
    Robustly get probabilities for positive class.
    Safely handles XGB inside a Pipeline.
    """
    if model_key == "xgb":
        est = getattr(best_model, "named_steps", {}).get("xgb", best_model)
        # XGBClassifier supports predict_proba
        proba = est.predict_proba(X)
        return proba[:, 1] if proba.ndim == 2 and proba.shape[1] > 1 else proba.ravel()

    if hasattr(best_model, "predict_proba"):
        proba = best_model.predict_proba(X)
        return proba[:, 1] if proba.ndim == 2 and proba.shape[1] > 1 else proba.ravel()

    if hasattr(best_model, "decision_function"):
        scores = best_model.decision_function(X)
        ranks = pd.Series(scores).rank(method="average").values
        return (ranks - ranks.min()) / (ranks.max() - ranks.min() + 1e-9)

    return best_model.predict(X)


def predict_from_model(best_model, X, model_key):
    """
    Robustly get hard class predictions.
    Safely handles XGB inside a Pipeline.
    """
    if model_key == "xgb":
        est = getattr(best_model, "named_steps", {}).get("xgb", best_model)
        return est.predict(X)
    return best_model.predict(X)

# --------------- Train + Evaluate ---------------
results = {}
start_all = time.time()

for name, pipe in pipelines.items():
    print(f"\nSearching: {name.upper()}")
    t0 = time.time()
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids[name],
        scoring=SCORING,
        cv=cv,
        n_jobs=-1,
        verbose=0,
        refit=True
    )
    gs.fit(train.X, train.y)
    dur_min = (time.time() - t0) / 60.0

    best_model = gs.best_estimator_

    # robust proba/preds (handles XGB inside Pipeline)
    y_proba = proba_from_model(best_model, test.X, name)
    y_pred  = predict_from_model(best_model, test.X, name)

    # Metrics
    test_auc = roc_auc_score(test.y, y_proba)
    test_acc = accuracy_score(test.y, y_pred)
    test_f1  = f1_score(test.y, y_pred, zero_division=0)

    print(f"{name.upper()} done in {dur_min:.2f} min | "
          f"AUC={test_auc:.3f} ACC={test_acc:.3f} F1={test_f1:.3f}")

    results[name] = {
        "best_score_cv": gs.best_score_,
        "best_params": gs.best_params_,
        "test_auc": float(test_auc),
        "test_acc": float(test_acc),
        "test_f1":  float(test_f1),
        "fit_time_min": dur_min
    }

elapsed_all = (time.time() - start_all) / 60.0

# --------------- Summary ---------------
print("\n=== Test Metrics (held-out TEST) ===")
for k, v in results.items():
    print(f"{k.upper():5s} | AUC={v['test_auc']:.3f}  "
          f"ACC={v['test_acc']:.3f}  F1={v['test_f1']:.3f}  "
          f"(CV best={v['best_score_cv']:.3f})")

print("\nBest params per model:")
for k, v in results.items():
    print(k, "→", v["best_params"])

print(f"\nTotal elapsed: {elapsed_all:.2f} min")

# --------------- Detailed report for top model ---------------
top = max(results.items(), key=lambda kv: kv[1]["test_auc"])[0]
print(f"\n=== Detailed classification report: {top.upper()} ===")

gs_top = GridSearchCV(
    estimator=pipelines[top],
    param_grid=param_grids[top],
    scoring=SCORING,
    cv=cv,
    n_jobs=-1,
    refit=True,
    verbose=0
)
gs_top.fit(train.X, train.y)
best_top = gs_top.best_estimator_

y_pred_top  = predict_from_model(best_top, test.X, top)
print(classification_report(test.y, y_pred_top, digits=3))
print("Confusion matrix:\n", confusion_matrix(test.y, y_pred_top))

# --------------- Save artifacts (optional) ---------------
OUTDIR = pathlib.Path("/content/spect_outputs")
OUTDIR.mkdir(parents=True, exist_ok=True)

with open(OUTDIR / "results.json", "w") as f:
    json.dump(results, f, indent=2)

# Save best estimators for each model
import joblib
for name, pipe in pipelines.items():
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids[name],
        scoring=SCORING,
        cv=cv,
        n_jobs=-1,
        refit=True,
        verbose=0
    )
    gs.fit(train.X, train.y)
    joblib.dump(gs.best_estimator_, OUTDIR / f"{name}_best.joblib")

print(f"\nArtifacts saved to: {OUTDIR}")

Train shape: (80, 22) | Test shape: (187, 22)
Train class counts: [40 40]
Test  class counts: [ 15 172]

Searching: LOGREG
LOGREG done in 0.00 min | AUC=0.841 ACC=0.754 F1=0.851

Searching: DTREE
DTREE done in 0.01 min | AUC=0.746 ACC=0.706 F1=0.814

Searching: RF
RF done in 0.94 min | AUC=0.827 ACC=0.797 F1=0.879

Searching: XGB
XGB done in 0.06 min | AUC=0.822 ACC=0.722 F1=0.827

Searching: SVC
SVC done in 0.00 min | AUC=0.817 ACC=0.856 F1=0.918

=== Test Metrics (held-out TEST) ===
LOGREG | AUC=0.841  ACC=0.754  F1=0.851  (CV best=0.792)
DTREE | AUC=0.746  ACC=0.706  F1=0.814  (CV best=0.748)
RF    | AUC=0.827  ACC=0.797  F1=0.879  (CV best=0.778)
XGB   | AUC=0.822  ACC=0.722  F1=0.827  (CV best=nan)
SVC   | AUC=0.817  ACC=0.856  F1=0.918  (CV best=0.810)

Best params per model:
logreg → {'logreg__C': 0.01, 'logreg__penalty': 'l2'}
dtree → {'dtree__max_depth': 3, 'dtree__min_samples_leaf': 1, 'dtree__min_samples_split': 2}
rf → {'rf__max_depth': 4, 'rf__min_samples_leaf': 1, 'rf__mi