In [16]:
!pip -q install shap xgboost

# Standard library
import os, sys, math, json, warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any convergence or deprecation warnings raised by the libraries below.
warnings.filterwarnings("ignore")

# Numerical stack
import numpy as np
import pandas as pd

# scikit-learn: splitting, preprocessing building blocks and metrics
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Models and explainability
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import shap

RS = 42  # global seed, passed as random_state to the splits, CV and models below

## 1) Locate the dataset (upload in Colab if missing)

In [17]:
# Default location of the Cleveland data file; PATH is what the read cell loads.
PATH = "/content/sample_data/processed.cleveland.data"

if not os.path.exists(PATH):
    # Not at the default location: fall back to an interactive Colab upload.
    try:
        from google.colab import files
        print("Upload processed.cleveland.data")
        up = files.upload()
    except Exception as e:
        # Covers both "not running in Colab" and a failed/cancelled upload.
        raise RuntimeError("Couldn't find or upload 'processed.cleveland.data'") from e

    # `up` is keyed by the uploaded file names; insist on the expected name.
    if "processed.cleveland.data" not in up:
        raise RuntimeError("Please upload 'processed.cleveland.data'.")

    # The upload is saved in the current working directory under its original
    # name, not at the default PATH, so repoint PATH at the file that now exists.
    PATH = os.path.abspath("processed.cleveland.data")

## 2) Read & basic clean

In [18]:
# Column names for the headerless file, in file order; the last one is the label.
cols = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

# "?" is read as a missing value (NaN).
df = pd.read_csv(PATH, header=None, names=cols, na_values=["?"])

# Safeguard: coerce every column to numeric; anything unparseable becomes NaN.
df = df.apply(pd.to_numeric, errors="coerce")

# Collapse the label to binary: 0 stays 0 (no disease), 1..4 become 1 (disease).
df['target'] = df['target'].gt(0).astype(int)

print("Shape:", df.shape)
print(df.head())
print("\nMissing counts:\n", df.isna().sum())

Shape: (303, 14)
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  
1    2.0  3.0   3.0       1  
2    2.0  2.0   7.0       1  
3    3.0  0.0   3.0       0  
4    1.0  0.0   3.0       0  

Missing counts:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64


## 3) Define features & preprocessing

In [19]:
# Numeric vs categorical (these are coded integers but categorical by meaning)
cat_cols = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
# Everything that is neither categorical nor the label is treated as numeric.
num_cols = [c for c in df.columns if c not in cat_cols + ['target']]

# Numeric block: fill gaps with the column median.
numeric_pipe = SimpleImputer(strategy="median")
# Categorical block: fill gaps with the mode, then one-hot encode;
# categories unseen at fit time are encoded as all zeros instead of raising.
categorical_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    sparse_threshold=1.0
)

X = df.drop(columns=["target"])
y = df["target"]

# Stratified split (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RS
)

# Inner split from TRAIN only for picking K (avoid test leakage)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.25, stratify=y_train, random_state=RS
)
# So: 60% train, 20% val, 20% test (0.25 of the 80% train portion is 20% overall)

# Fit preprocess on the inner TRAIN split only, then apply it to every split
preprocess.fit(X_tr)
Xtr_enc = preprocess.transform(X_tr)
Xval_enc = preprocess.transform(X_val)
Xte_enc  = preprocess.transform(X_test)

try:
    feat_names = preprocess.get_feature_names_out()
except AttributeError:
    # Presumably for scikit-learn releases without get_feature_names_out:
    # fall back to positional names. Only AttributeError is caught so that
    # unrelated failures (and KeyboardInterrupt) are not silently swallowed.
    feat_names = [f"f_{i}" for i in range(Xtr_enc.shape[1])]

## 4) Baseline models (FULL features)

In [20]:
# Prefer scikit-learn's own TargetEncoder; otherwise fall back to the one in
# category_encoders. _SKLEARN_TE records which implementation is available.
try:
    from sklearn.preprocessing import TargetEncoder  # added in scikit-learn 1.3
    _SKLEARN_TE = True
except ImportError:
    _SKLEARN_TE = False
    try:
        # Colab-friendly fallback
        import sys, subprocess
        # check=False: a failed install surfaces as the ImportError on the next line.
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", "category_encoders"], check=False)
        from category_encoders import TargetEncoder as CAT_TargetEncoder
    except Exception as e:
        # Chain the original error so the real cause stays visible in the traceback.
        raise RuntimeError(
            "TargetEncoder not available. Upgrade scikit-learn to >=1.3 OR allow installing category_encoders."
        ) from e

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np
import pickle
import os

In [21]:
# 1) Column typing & encoders

def n_unique_nonnull(series):
    """Return how many distinct values `series` holds, not counting missing ones."""
    present = series[series.notna()]
    return present.nunique()

# Cardinality (distinct non-missing values) of each categorical column,
# measured on the outer training split only.
card = {c: n_unique_nonnull(X_train[c]) for c in cat_cols}

# Bucket categorical columns by cardinality; each bucket gets its own encoder.
# Columns with fewer than 2 distinct values fall into no bucket and are
# therefore dropped by the ColumnTransformer below (remainder="drop").
bin_cols   = [c for c,v in card.items() if v == 2]
low_cols   = [c for c,v in card.items() if 3 <= v <= 10]
high_cols  = [c for c,v in card.items() if v > 10]

# Pipelines per block
# Numeric: median imputation, then standardisation.
num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Binary: mode imputation, then a single ordinal column; unseen values map to -1.
bin_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Low cardinality: mode imputation, then dense one-hot; unseen values become all zeros.
low_ohe_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

if _SKLEARN_TE:
    te_estimator = TargetEncoder  # sklearn TargetEncoder
    # sklearn's encoder shuffles its internal cross-fitting folds; seed it so
    # repeated runs give the same encoding.
    te_kwargs = {"random_state": RS}
else:
    te_estimator = CAT_TargetEncoder  # category_encoders fallback
    # NOTE(review): after SimpleImputer this block is a plain array; the
    # category_encoders TargetEncoder may only encode string-typed columns
    # unless `cols` is given — confirm before relying on this fallback.
    te_kwargs = {}

# High cardinality: mode imputation, then target encoding.
high_te_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("te", te_estimator(**te_kwargs))
])

# Build the ColumnTransformer; force dense output overall.
# Empty buckets are skipped so no transformer is registered with zero columns.
transformers = []
if num_cols:
    transformers.append(("num", num_pipe, num_cols))
if bin_cols:
    transformers.append(("bin", bin_pipe, bin_cols))
if low_cols:
    transformers.append(("low", low_ohe_pipe, low_cols))
if high_cols:
    transformers.append(("high", high_te_pipe, high_cols))

# Note: this rebinds `preprocess`, replacing the already-fitted transformer
# built in the feature-definition cell; this new one is unfitted.
preprocess = ColumnTransformer(
    transformers=transformers,
    remainder="drop",
    sparse_threshold=0.0   # force dense
)

In [22]:
# 2) Define base models

# Base estimators keyed by short name; the same name is used as the pipeline
# step name, so it is also the prefix of that model's hyper-parameter keys.
models = dict(
    logreg=LogisticRegression(solver="saga", max_iter=3000, random_state=RS),
    dtree=DecisionTreeClassifier(random_state=RS),
    rf=RandomForestClassifier(n_jobs=-1, random_state=RS),
    # Remaining XGBoost parameters are left at their defaults and tuned by the search.
    xgb=XGBClassifier(tree_method="hist", n_jobs=-1, random_state=RS),
    # probability=True so predict_proba is available for this model.
    svc=SVC(probability=True, random_state=RS),
)

In [23]:
# 3) Grids

# Value lists shared by several grids; every grid receives its own copy.
_DECADES = [10**e for e in range(-2, 3)]   # 0.01, 0.1, 1, 10, 100
_DEPTHS = [None, *range(3, 11, 2)]         # None, 3, 5, 7, 9
_SPLITS = list(range(2, 14, 4))            # 2, 6, 10
_LEAVES = [1, 10, 20]
_FRACTIONS = [0.7, 0.8, 0.9, 1.0]

# Keys follow the "<step name>__<parameter>" convention of sklearn pipelines.
logreg_grid = {
    'logreg__penalty': ['l2', 'l1'],
    'logreg__C': list(_DECADES),
}

dtree_grid = {
    'dtree__max_depth': list(_DEPTHS),
    'dtree__min_samples_split': list(_SPLITS),
    'dtree__min_samples_leaf': list(_LEAVES),
}

rf_grid = {
    'rf__n_estimators': [100, 200, 400, 600],
    'rf__max_depth': list(_DEPTHS),
    'rf__min_samples_split': list(_SPLITS),
    'rf__min_samples_leaf': list(_LEAVES),
}

xgb_grid = {
    'xgb__n_estimators': [100, 200, 400],
    'xgb__max_depth': [3, 4, 5],
    'xgb__min_child_weight': [1, 3, 5],
    'xgb__subsample': list(_FRACTIONS),
    'xgb__colsample_bytree': list(_FRACTIONS),  # NOTE: correct name is "colsample_bytree"
}

svc_grid = {
    'svc__C': list(_DECADES),
    'svc__gamma': list(_DECADES),
}

# Lookup from model name to its grid, in the same order as the grids above.
grids = dict(
    logreg=logreg_grid,
    dtree=dtree_grid,
    rf=rf_grid,
    xgb=xgb_grid,
    svc=svc_grid,
)

In [14]:
# 4) Pipelines & Search
# For every base model: wrap it in a pipeline with the shared preprocessing,
# tune it by cross-validation on the training split, score the refitted best
# model on the held-out test split, and pickle the fitted searcher.
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
import joblib
from scipy.stats import randint, uniform
import time

# 1) Faster CV & scoring
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RS)  # 3 folds
scoring = "roc_auc"

# 2) Optional: cache preprocessing (BIG speedup)
CACHE_DIR = "/content/cache"
memory = joblib.Memory(CACHE_DIR, verbose=0)

# 3) Use the GPU for XGB when one appears to be exposed to this process.
# Heuristic: CUDA_VISIBLE_DEVICES must be set AND non-empty AND not "-1"
# (an empty value or "-1" explicitly hides every GPU).
_cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
_use_gpu = _cuda_devices not in ("", "-1")
_xgb_major = int(xgb.__version__.split(".")[0])
if _use_gpu and _xgb_major >= 2:
    # XGBoost 2.0 deprecated "gpu_hist": the GPU is selected via `device`.
    xgb_tree_method = "hist"
    models["xgb"].set_params(tree_method=xgb_tree_method, device="cuda")
elif _use_gpu:
    xgb_tree_method = "gpu_hist"
    models["xgb"].set_params(tree_method=xgb_tree_method)
else:
    xgb_tree_method = "hist"
    models["xgb"].set_params(tree_method=xgb_tree_method)

dataset_name = "cleveland-heart"
results = {}

# 4) Lean search spaces
logreg_grid = {
    'logreg__penalty': ['l2', 'l1'],
    'logreg__C': [0.01, 0.1, 1, 10, 100],
}

dtree_grid = {
    'dtree__max_depth': [None, 3, 5, 7, 9],
    'dtree__min_samples_split': [2, 6, 10],
    'dtree__min_samples_leaf': [1, 5, 10],
}

# Randomized distributions for heavy models
rf_space = {
    'rf__n_estimators': randint(120, 301),            # 120–300
    'rf__max_depth': [None, 5, 7, 9],
    'rf__min_samples_split': [2, 6, 10],
    'rf__min_samples_leaf': [1, 3, 5],
}

xgb_space = {
    'xgb__n_estimators': randint(120, 241),           # 120–240
    'xgb__max_depth': [3, 4, 5],
    'xgb__min_child_weight': [1, 3, 5],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0],
    # tree_method already set above
}

svc_space = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 0.1, 0.01],
}

# 5) Decide which uses grid vs randomized
search_plan = {
    "logreg": ("grid", logreg_grid),
    "dtree":  ("grid", dtree_grid),
    "rf":     ("random", rf_space),
    "xgb":    ("random", xgb_space),
    "svc":    ("random", svc_space),
}

print("FAST MODE: starting searches...\n")

for name, estimator in models.items():
    # Pipeline with caching; the step name equals the model name so the
    # "<name>__<param>" keys of the search spaces resolve to this estimator.
    pipe = Pipeline(steps=[("prep", preprocess), (name, estimator)], memory=memory)

    kind, space = search_plan[name]
    if kind == "grid":
        searcher = GridSearchCV(
            pipe, space, scoring=scoring, cv=cv, n_jobs=-1, refit=True, verbose=0
        )
    else:
        # n_iter controls speed/quality tradeoff (try 15–30)
        searcher = RandomizedSearchCV(
            pipe, space, n_iter=20, scoring=scoring, cv=cv,
            n_jobs=-1, refit=True, verbose=0, random_state=RS
        )

    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # wall-clock adjustments during the search.
    t0 = time.perf_counter()
    searcher.fit(X_train, y_train)  # TRAIN ONLY
    fit_secs = time.perf_counter() - t0

    # Evaluate on TEST (refit=True, so this uses the best pipeline refitted on X_train)
    proba = searcher.predict_proba(X_test)[:, 1]
    pred  = (proba >= 0.5).astype(int)

    acc = accuracy_score(y_test, pred)
    f1  = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, proba)

    results[name] = {
        "best_params": searcher.best_params_,
        "best_score_cv": searcher.best_score_,
        "test_acc": acc,
        "test_f1": f1,
        "test_auc": auc,
        "secs": fit_secs,
        "model": searcher
    }

    # Save each fitted searcher (pipeline included) in the working directory.
    # Pickle files can execute code on load: only unpickle ones you produced.
    fname = f"{dataset_name}-{name}-full.pickle"
    with open(fname, "wb") as f:
        pickle.dump(searcher, f)

    print(f"{name.upper():5s} done in {fit_secs/60:.1f} min | "
          f"AUC={auc:.3f} ACC={acc:.3f} F1={f1:.3f}")

print("\n=== FAST MODE: Test Metrics ===")
for k, v in results.items():
    print(f"{k.upper():5s} | AUC={v['test_auc']:.3f}  ACC={v['test_acc']:.3f}  F1={v['test_f1']:.3f}  (CV best={v['best_score_cv']:.3f})")

FAST MODE: starting searches...

LOGREG done in 0.1 min | AUC=0.968 ACC=0.902 F1=0.897
DTREE done in 0.1 min | AUC=0.871 ACC=0.869 F1=0.857
RF    done in 0.4 min | AUC=0.956 ACC=0.902 F1=0.897
XGB   done in 0.0 min | AUC=0.946 ACC=0.885 F1=0.881
SVC   done in 0.0 min | AUC=0.966 ACC=0.869 F1=0.867

=== FAST MODE: Test Metrics ===
LOGREG | AUC=0.968  ACC=0.902  F1=0.897  (CV best=0.908)
DTREE | AUC=0.871  ACC=0.869  F1=0.857  (CV best=0.850)
RF    | AUC=0.956  ACC=0.902  F1=0.897  (CV best=0.901)
XGB   | AUC=0.946  ACC=0.885  F1=0.881  (CV best=0.882)
SVC   | AUC=0.966  ACC=0.869  F1=0.867  (CV best=0.900)


In [15]:
# 5) Report

# One summary line per model: held-out test metrics next to the best CV score.
print("\n=== Test Metrics (held-out TEST) ===")
for model_name, info in results.items():
    label = model_name.upper()
    auc_txt = f"AUC={info['test_auc']:.3f}"
    acc_txt = f"ACC={info['test_acc']:.3f}"
    f1_txt = f"F1={info['test_f1']:.3f}"
    cv_txt = f"(CV best={info['best_score_cv']:.3f})"
    print(f"{label:5s} | {auc_txt}  {acc_txt}  {f1_txt}  {cv_txt}")

# Winning hyper-parameters found by each search.
print("\nBest params per model:")
for model_name, info in results.items():
    print(f"{model_name} → {info['best_params']}")


=== Test Metrics (held-out TEST) ===
LOGREG | AUC=0.968  ACC=0.902  F1=0.897  (CV best=0.908)
DTREE | AUC=0.871  ACC=0.869  F1=0.857  (CV best=0.850)
RF    | AUC=0.956  ACC=0.902  F1=0.897  (CV best=0.901)
XGB   | AUC=0.946  ACC=0.885  F1=0.881  (CV best=0.882)
SVC   | AUC=0.966  ACC=0.869  F1=0.867  (CV best=0.900)

Best params per model:
logreg → {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
dtree → {'dtree__max_depth': 3, 'dtree__min_samples_leaf': 10, 'dtree__min_samples_split': 2}
rf → {'rf__max_depth': None, 'rf__min_samples_leaf': 5, 'rf__min_samples_split': 10, 'rf__n_estimators': 200}
xgb → {'xgb__colsample_bytree': 0.8, 'xgb__max_depth': 3, 'xgb__min_child_weight': 5, 'xgb__n_estimators': 137, 'xgb__subsample': 1.0}
svc → {'svc__gamma': 0.01, 'svc__C': 1}
