# 06 - Hyperparameter tuning & final model

In [4]:
# Cell 1: Imports and load dataset
from pathlib import Path

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE  # Added import for SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load dataset and separate the feature columns from the "target" label.
df = pd.read_csv("../data/heart_disease_selected.csv")

y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing; stratify=y keeps the class
# proportions of the target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ===========================
# Balance the training classes with SMOTE
# ===========================
# Show the class counts before resampling so the effect is visible.
print("Original class distribution in training set:")
print(y_train.value_counts())

# SMOTE is always applied here: X_train / y_train are rebound to the
# oversampled data and every later cell fits on that resampled set.
# X_test / y_test are left untouched.
# NOTE(review): the hyperparameter searches below cross-validate on this
# already-resampled data, so their CV scores are likely optimistic; running
# SMOTE inside an imblearn Pipeline passed to the search would avoid that.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE application:")
# pd.Series(...) keeps value_counts() available even if y_train is an array.
print(pd.Series(y_train).value_counts())
print(f"Training set size after SMOTE: {len(X_train)} samples")



Original class distribution in training set:
target
0    128
1     43
3     28
2     28
4     10
Name: count, dtype: int64

After SMOTE application:
target
1    128
0    128
3    128
4    128
2    128
Name: count, dtype: int64
Training set size after SMOTE: 640 samples


In [5]:
# Cell 2: Baseline models (no tuning)
# Untuned reference classifiers, keyed by the display name used in the
# result dictionaries of this notebook.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=500, solver="liblinear"),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(random_state=42),
    SVM=SVC(random_state=42),
)

# Fit each model on the training data and record its accuracy on the
# held-out test split. fit() returns the estimator, so the calls chain.
baseline_results = {
    name: accuracy_score(y_test, clf.fit(X_train, y_train).predict(X_test))
    for name, clf in models.items()
}

print("Baseline accuracies:")
print(baseline_results)



Baseline accuracies:
{'LogisticRegression': 0.65, 'DecisionTree': 0.5, 'RandomForest': 0.6333333333333333, 'SVM': 0.43333333333333335}




In [6]:
# Cell 3: Hyperparameter tuning - Logistic Regression
# Search the inverse regularisation strength C and the penalty type, scoring
# each candidate by 5-fold cross-validated accuracy on the training data.
param_dist = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"]
}

# liblinear is used because it supports both the l1 and l2 penalties above.
log_reg = LogisticRegression(max_iter=10000, solver="liblinear")

# Every parameter is a plain list, so the search samples combinations without
# replacement and there are only 5 x 2 = 10 of them. Requesting more
# iterations than exist makes scikit-learn emit a UserWarning and cap the
# run, so n_iter is derived from the grid size instead of being hard-coded.
n_candidates = len(param_dist["C"]) * len(param_dist["penalty"])

log_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_dist,
    n_iter=n_candidates,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1  # use all available CPU cores
)

log_search.fit(X_train, y_train)

print("Best Logistic Regression:", log_search.best_params_)





Best Logistic Regression: {'penalty': 'l1', 'C': 100}




In [7]:
# Cell 4: Hyperparameter tuning - Decision Tree
# Randomized search over the tree depth and the minimum split/leaf sizes,
# scored by 5-fold cross-validated accuracy on the training data.
dt = DecisionTreeClassifier(random_state=42)

param_dist = dict(
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 10 of the 5 x 3 x 3 = 45 combinations are sampled; fit() returns the
# search object itself, so construction and fitting are chained.
dt_search = RandomizedSearchCV(
    dt,
    param_dist,
    n_iter=10,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

print("Best Decision Tree:", dt_search.best_params_)



Best Decision Tree: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 30}


In [8]:
# Cell 5: Hyperparameter tuning - Random Forest
# Candidate values for the forest size, the per-tree growth limits and the
# number of features considered at each split.
param_dist = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

rf = RandomForestClassifier(random_state=42)

# Search settings: 20 sampled candidates, each scored by 5-fold CV accuracy,
# evaluated in parallel on all available cores.
rf_search_options = {
    "n_iter": 20,
    "cv": 5,
    "scoring": "accuracy",
    "random_state": 42,
    "n_jobs": -1,
}

rf_search = RandomizedSearchCV(rf, param_dist, **rf_search_options)
rf_search.fit(X_train, y_train)

print("Best Random Forest:", rf_search.best_params_)



Best Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}


In [9]:
# Cell 6: Hyperparameter tuning - SVM
# max_iter=2000 puts a hard limit on solver iterations for every fit.
svm = SVC(max_iter=2000, random_state=42)

# Candidate penalty strengths, kernel coefficients and kernel types.
param_dist = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1],
    kernel=["rbf", "poly", "sigmoid"],
)

# 20 of the 4 x 4 x 3 = 48 combinations are sampled and scored by 5-fold
# cross-validated accuracy on the training data.
svm_search = RandomizedSearchCV(
    svm,
    param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
svm_search.fit(X_train, y_train)

print("Best SVM:", svm_search.best_params_)


Best SVM: {'kernel': 'rbf', 'gamma': 0.01, 'C': 100}


In [10]:
# Cell 7: Compare all tuned models
# Best estimator found by each search, keyed by the same display names that
# the baseline results use.
tuned_estimators = {
    "LogisticRegression": log_search.best_estimator_,
    "DecisionTree": dt_search.best_estimator_,
    "RandomForest": rf_search.best_estimator_,
    "SVM": svm_search.best_estimator_,
}

# Accuracy of every tuned estimator on the held-out test split.
tuned_results = {
    label: accuracy_score(y_test, estimator.predict(X_test))
    for label, estimator in tuned_estimators.items()
}

print("Baseline Results:", baseline_results)
print("Tuned Results:", tuned_results)

# The highest test accuracy wins; on a tie max() keeps the first entry in
# dictionary order.
# NOTE(review): the winner is picked using the test split itself, so the
# accuracy printed below is an optimistic estimate for the chosen model.
best_model_name = max(tuned_results, key=tuned_results.get)
best_model = tuned_estimators[best_model_name]

print(f"✅ Best Model: {best_model_name} with accuracy {tuned_results[best_model_name]}")


Baseline Results: {'LogisticRegression': 0.65, 'DecisionTree': 0.5, 'RandomForest': 0.6333333333333333, 'SVM': 0.43333333333333335}
Tuned Results: {'LogisticRegression': 0.65, 'DecisionTree': 0.5333333333333333, 'RandomForest': 0.6, 'SVM': 0.4}
✅ Best Model: LogisticRegression with accuracy 0.65


In [11]:
# Cell 8: Save final chosen model
# Serialise the selected estimator with joblib so it can be reloaded later.
MODEL_PATH = "../models/final_model.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists first (e.g. on a fresh clone); exist_ok=True makes this a
# no-op when the folder is already there.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, MODEL_PATH)
print(f"Model saved as {MODEL_PATH} ({best_model_name})")


Model saved as ../models/final_model.pkl (LogisticRegression)
