# 06 - Hyperparameter tuning & final model

In [6]:
# =============================
# Cell 1: Imports and dataset
# =============================
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import joblib

# Load dataset
df = pd.read_csv("../data/heart_disease_selected.csv")

X = df.drop("target", axis=1)
y = df["target"]

# Train/test split (keep stratify to preserve distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set distribution before SMOTE:")
print(y_train.value_counts())



Training set distribution before SMOTE:
target
0    128
1     43
3     28
2     28
4     10
Name: count, dtype: int64


In [7]:
# =============================
# Cell 2: Baseline models (pipelines)
# =============================

models = {
    "LogisticRegression": Pipeline([
        ("smote", SMOTE(random_state=42)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, solver="liblinear"))
    ]),
    "DecisionTree": Pipeline([
        ("smote", SMOTE(random_state=42)),
        ("clf", DecisionTreeClassifier(random_state=42))
    ]),
    "RandomForest": Pipeline([
        ("smote", SMOTE(random_state=42)),
        ("clf", RandomForestClassifier(random_state=42))
    ]),
    "SVM": Pipeline([
        ("smote", SMOTE(random_state=42)),
        ("scaler", StandardScaler()),
        ("clf", SVC(random_state=42))
    ])
}

baseline_results = {}
for name, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    baseline_results[name] = acc

print("\nBaseline accuracies with SMOTE in pipeline:")
print(baseline_results)



Baseline accuracies with SMOTE in pipeline:
{'LogisticRegression': 0.65, 'DecisionTree': 0.5, 'RandomForest': 0.6333333333333333, 'SVM': 0.5166666666666667}




In [8]:
# =============================
# Cell 3: Hyperparameter tuning
# =============================

# Logistic Regression
log_param_dist = {
    "clf__C": [0.01, 0.1, 1, 10, 100],
    "clf__penalty": ["l1", "l2"]
}

log_pipe = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=10000, solver="liblinear"))
])

log_search = RandomizedSearchCV(
    estimator=log_pipe,
    param_distributions=log_param_dist,
    n_iter=10,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)
log_search.fit(X_train, y_train)
print("Best Logistic Regression params:", log_search.best_params_)

# Decision Tree
dt_param_dist = {
    "clf__max_depth": [None, 5, 10, 20, 30],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4]
}

dt_pipe = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("clf", DecisionTreeClassifier(random_state=42))
])

dt_search = RandomizedSearchCV(
    estimator=dt_pipe,
    param_distributions=dt_param_dist,
    n_iter=10,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)
dt_search.fit(X_train, y_train)
print("Best Decision Tree params:", dt_search.best_params_)

# Random Forest
rf_param_dist = {
    "clf__n_estimators": [50, 100, 200, 300],
    "clf__max_depth": [None, 5, 10, 20],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2"]
}

rf_pipe = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("clf", RandomForestClassifier(random_state=42))
])

rf_search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=rf_param_dist,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)
rf_search.fit(X_train, y_train)
print("Best Random Forest params:", rf_search.best_params_)

# SVM
svm_param_dist = {
    "clf__C": [0.1, 1, 10, 100],
    "clf__gamma": [0.001, 0.01, 0.1, 1],
    "clf__kernel": ["rbf", "poly", "sigmoid"]
}

svm_pipe = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("scaler", StandardScaler()),
    ("clf", SVC(random_state=42, max_iter=2000))
])

svm_search = RandomizedSearchCV(
    estimator=svm_pipe,
    param_distributions=svm_param_dist,
    n_iter=15,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)
svm_search.fit(X_train, y_train)
print("Best SVM params:", svm_search.best_params_)



Best Logistic Regression params: {'clf__penalty': 'l2', 'clf__C': 0.01}
Best Decision Tree params: {'clf__min_samples_split': 2, 'clf__min_samples_leaf': 2, 'clf__max_depth': 30}
Best Random Forest params: {'clf__n_estimators': 300, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 4, 'clf__max_features': 'log2', 'clf__max_depth': 20}
Best SVM params: {'clf__kernel': 'rbf', 'clf__gamma': 0.01, 'clf__C': 10}


In [9]:
# =============================
# Cell 4: Compare tuned models
# =============================

tuned_results = {
    "LogisticRegression": accuracy_score(y_test, log_search.best_estimator_.predict(X_test)),
    "DecisionTree": accuracy_score(y_test, dt_search.best_estimator_.predict(X_test)),
    "RandomForest": accuracy_score(y_test, rf_search.best_estimator_.predict(X_test)),
    "SVM": accuracy_score(y_test, svm_search.best_estimator_.predict(X_test)),
}

print("\nBaseline Results:", baseline_results)
print("Tuned Results:", tuned_results)

best_model_name = max(tuned_results, key=tuned_results.get)
best_model = {
    "LogisticRegression": log_search.best_estimator_,
    "DecisionTree": dt_search.best_estimator_,
    "RandomForest": rf_search.best_estimator_,
    "SVM": svm_search.best_estimator_
}[best_model_name]

print(f"\n✅ Best Model: {best_model_name} with accuracy {tuned_results[best_model_name]}")




Baseline Results: {'LogisticRegression': 0.65, 'DecisionTree': 0.5, 'RandomForest': 0.6333333333333333, 'SVM': 0.5166666666666667}
Tuned Results: {'LogisticRegression': 0.5833333333333334, 'DecisionTree': 0.5333333333333333, 'RandomForest': 0.6, 'SVM': 0.5666666666666667}

✅ Best Model: RandomForest with accuracy 0.6


In [10]:
# =============================
# Cell 5: Final evaluation & save
# =============================
y_pred = best_model.predict(X_test)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

joblib.dump(best_model, "../models/final_model.pkl")
print(f"Model saved as ../models/final_model.pkl ({best_model_name})")


Confusion Matrix:
[[29  3  0  0  0]
 [ 3  2  2  4  0]
 [ 0  2  2  2  1]
 [ 0  3  1  3  0]
 [ 0  2  0  1  0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        32
           1       0.17      0.18      0.17        11
           2       0.40      0.29      0.33         7
           3       0.30      0.43      0.35         7
           4       0.00      0.00      0.00         3

    accuracy                           0.60        60
   macro avg       0.35      0.36      0.35        60
weighted avg       0.60      0.60      0.60        60

Model saved as ../models/final_model.pkl (RandomForest)
