# Hyperparameter Tuning
This notebook optimizes the models using hyperparameter tuning (GridSearchCV, RandomizedSearchCV).

In [1]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## 1. Load the feature-selected dataset

In [2]:
# Read the feature-selected dataset from disk.
df = pd.read_csv("../data/selected_feature_heart_disease.csv")

# Separate the label column ("target") from the predictor columns.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Optimize model hyperparameters

### Logistic Regression

In [3]:
# Candidate settings for logistic regression; the grid search tries every combination.
lr_param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "solver": ["liblinear", "saga"],
    "max_iter": [100, 200, 300, 400, 500],
}

# Exhaustive 5-fold cross-validated search, scored by accuracy, using all CPU cores.
grid_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=lr_param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_lr.fit(X_train, y_train)

print("Best Logistic Regression:", grid_lr.best_params_)


Best Logistic Regression: {'C': 0.1, 'max_iter': 100, 'solver': 'liblinear'}


### Decision Tree

In [4]:
# Search space for the decision tree (4 * 3 * 3 = 36 combinations).
dt_param_distributions = {
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Randomized search only evaluates a sample of the combinations (n_iter is left at
# its library default), so which candidates are tried depends on the RNG.
# random_state pins that sample so the reported best parameters are the same on
# every Restart & Run All.
rand_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_distributions,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
)

rand_dt.fit(X_train, y_train)
print("Best Decision Tree:", rand_dt.best_params_)


Best Decision Tree: {'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': None}


### Random Forest

In [5]:
# Search space for the random forest (3 * 4 * 3 * 3 * 2 = 216 combinations).
rf_param_distributions = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

# Randomized search only evaluates a sample of the combinations (n_iter is left at
# its library default), so which candidates are tried depends on the RNG.
# random_state pins that sample so the reported best parameters are the same on
# every Restart & Run All.
rand_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_distributions,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
)

rand_rf.fit(X_train, y_train)
print("Best Random Forest:", rand_rf.best_params_)


Best Random Forest: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 20, 'bootstrap': False}


### SVM

In [6]:
# Candidate settings for the SVM; the grid search tries every combination.
svm_param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"],
}

# Exhaustive 5-fold cross-validated search, scored by accuracy, using all CPU cores.
grid_svm = GridSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_grid=svm_param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_svm.fit(X_train, y_train)

print("Best SVM:", grid_svm.best_params_)


Best SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


## 3. Comparing optimized models with baseline performance

### Calculating the baseline performance for every model

In [7]:
# Untuned reference models: library defaults plus a fixed seed.
lr = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
svm = SVC(probability=True, random_state=42)

baseline_models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "Random Forest": rf,
    "SVM": svm,
}

# Fit each reference model on the training split and record its accuracy on the
# held-out test split, keyed by display name.
baseline = {}
for model_name, model in baseline_models.items():
    model.fit(X_train, y_train)
    baseline[model_name] = accuracy_score(y_test, model.predict(X_test))

In [8]:
# Fitted search objects from the tuning section, keyed by display name.
tuned_searches = {
    "Logistic Regression": grid_lr,
    "Decision Tree": rand_dt,
    "Random Forest": rand_rf,
    "SVM": grid_svm,
}

# Test-set accuracy of each search's refitted best estimator.
optimized = {
    model_name: accuracy_score(y_test, search.best_estimator_.predict(X_test))
    for model_name, search in tuned_searches.items()
}

# Side-by-side table: one row per model, baseline vs. tuned accuracy.
comparison = pd.DataFrame(
    {"Baseline Accuracy": baseline, "Tuned Accuracy": optimized}
)

print("\t\t=== Model Comparison ===")
print(comparison)


		=== Model Comparison ===
                     Baseline Accuracy  Tuned Accuracy
Logistic Regression           0.916667        0.916667
Decision Tree                 0.900000        0.916667
Random Forest                 0.883333        0.916667
SVM                           0.900000        0.866667


## 4. Model Exporting

In [9]:
# Build the export pipeline: standardize the features, then apply a logistic
# regression configured with the hyperparameters selected by the grid search.
#
# clone() yields a fresh, unfitted estimator with the same parameters. Pipeline
# does not copy its steps, so passing grid_lr.best_estimator_ directly would
# refit that very object on the scaled full dataset and silently change the
# "Tuned Accuracy" numbers if the comparison cell is run again afterwards.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", clone(grid_lr.best_estimator_)),
])

# NOTE(review): the hyperparameters were tuned on unscaled features, while the
# exported model sees standardized features — confirm this is intended.
pipeline.fit(X, y)  # Fit on full dataset (train + test)

joblib.dump(pipeline, "../models/final_model.pkl")

['../models/final_model.pkl']