In [1]:
# 1. Imports
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

# =========================
# 2. Load Data
# =========================
target_col = "num"
data = pd.read_csv("heart_disease_selected.csv")

# Features are every column except the target.
X = data.drop(target_col, axis="columns")
# Binarize the target: values above 0 become 1, everything else becomes 0.
y = data[target_col].gt(0).astype("int64")

# Stratified 80/20 split with a fixed seed so the split is reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = splits

print("Train/Test sizes:", X_train.shape, X_test.shape)

# =========================
# 3. Helper Function
# =========================
def evaluate(name, model, X_test, y_test):
    """Print and return binary-classification metrics for a fitted model.

    Args:
        name: Label printed in the report header.
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X_test: Feature matrix to score.
        y_test: True binary labels for ``X_test``.

    Returns:
        dict mapping each printed metric label ("Accuracy", "Precision",
        "Recall", "F1", "ROC-AUC") to its value. "ROC-AUC" is ``None`` when
        the model exposes neither ``predict_proba`` nor ``decision_function``.
    """
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score. Prefer the positive-class probability
    # and fall back to the decision-function margin, so models without
    # ``predict_proba`` still get an AUC instead of None.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    auc = roc_auc_score(y_test, y_score) if y_score is not None else None

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": auc,
    }

    print(f"\n=== {name} ===")
    for label, value in metrics.items():
        print(f"{label}:", value)

    return metrics

# =========================
# 4. Logistic Regression Tuning
# =========================
# Standardize the features, then fit a logistic regression.
lr_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000)),
])

# Candidate values are addressed through the "clf" step of the pipeline.
lr_param_grid = dict(
    clf__C=[0.01, 0.1, 1, 10, 100],
    clf__penalty=["l2"],
    clf__solver=["lbfgs", "saga"],
)

# Exhaustive search over the grid, scored by ROC-AUC with 5-fold CV.
lr_gs = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
lr_gs.fit(X_train, y_train)

lr_best = lr_gs.best_estimator_
print("\nBest Logistic Regression Params:", lr_gs.best_params_)
evaluate("Logistic Regression (Tuned)", lr_best, X_test, y_test)

# Persist the best pipeline (scaler + classifier) to disk.
joblib.dump(lr_best, "model_logistic_regression_tuned.pkl")

# =========================
# 5. Decision Tree Tuning
# =========================
# Standardize the features, then fit a decision tree with a fixed seed.
dt_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", DecisionTreeClassifier(random_state=42)),
])

# Candidate values are addressed through the "clf" step of the pipeline.
dt_param_grid = dict(
    clf__max_depth=[None, 3, 5, 10],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, scored by ROC-AUC with 5-fold CV.
dt_gs = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
dt_gs.fit(X_train, y_train)

dt_best = dt_gs.best_estimator_
print("\nBest Decision Tree Params:", dt_gs.best_params_)
evaluate("Decision Tree (Tuned)", dt_best, X_test, y_test)

# Persist the best pipeline (scaler + classifier) to disk.
joblib.dump(dt_best, "model_decision_tree_tuned.pkl")

# =========================
# 6. Random Forest Tuning
# =========================
# Standardize the features, then fit a random forest with a fixed seed.
rf_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42)),
])

# Candidate values are addressed through the "clf" step of the pipeline.
rf_param_grid = dict(
    clf__n_estimators=[50, 100, 200],
    clf__max_depth=[None, 5, 10, 20],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)

# Randomized search: 10 sampled candidates, scored by ROC-AUC with 5-fold CV.
rf_rs = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_param_grid,
    n_iter=10,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rf_rs.fit(X_train, y_train)

rf_best = rf_rs.best_estimator_
print("\nBest Random Forest Params:", rf_rs.best_params_)
evaluate("Random Forest (Tuned)", rf_best, X_test, y_test)

# Persist the best pipeline (scaler + classifier) to disk.
joblib.dump(rf_best, "model_random_forest_tuned.pkl")

# =========================
# 7. SVM Tuning
# =========================
# Standardize the features, then fit an SVC. probability=True enables
# predict_proba on the fitted model, which evaluate() uses for ROC-AUC.
svc_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(probability=True, random_state=42))
])

# One sub-grid per kernel: gamma is only a coefficient of the RBF kernel and
# has no effect on the linear kernel, so crossing it with kernel="linear"
# would fit every linear candidate twice for no gain.
svc_param_grid = [
    {
        "clf__kernel": ["rbf"],
        "clf__C": [0.1, 1, 10, 100],
        "clf__gamma": ["scale", "auto"],
    },
    {
        "clf__kernel": ["linear"],
        "clf__C": [0.1, 1, 10, 100],
    },
]

# Exhaustive search over both sub-grids, scored by ROC-AUC with 5-fold CV.
svc_gs = GridSearchCV(svc_pipeline, svc_param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
svc_gs.fit(X_train, y_train)

print("\nBest SVC Params:", svc_gs.best_params_)
evaluate("SVC (Tuned)", svc_gs.best_estimator_, X_test, y_test)

# Persist the best pipeline (scaler + classifier) to disk.
joblib.dump(svc_gs.best_estimator_, "model_svc_tuned.pkl")

print("\n✅ Hyperparameter tuning complete! Best models saved as .pkl files.")


Train/Test sizes: (242, 7) (61, 7)

Best Logistic Regression Params: {'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}

=== Logistic Regression (Tuned) ===
Accuracy: 0.8688524590163934
Precision: 0.8125
Recall: 0.9285714285714286
F1: 0.8666666666666667
ROC-AUC: 0.9383116883116882

Best Decision Tree Params: {'clf__max_depth': 3, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2}

=== Decision Tree (Tuned) ===
Accuracy: 0.8688524590163934
Precision: 0.8571428571428571
Recall: 0.8571428571428571
F1: 0.8571428571428571
ROC-AUC: 0.8701298701298701

Best Random Forest Params: {'clf__n_estimators': 100, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 2, 'clf__max_depth': 10}

=== Random Forest (Tuned) ===
Accuracy: 0.9016393442622951
Precision: 0.8928571428571429
Recall: 0.8928571428571429
F1: 0.8928571428571429
ROC-AUC: 0.9556277056277056

Best SVC Params: {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}

=== SVC (Tuned) ===
Accuracy: 0.8688524590163934
Prec