<h1>Hyperparameter Tuning</h1>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import uniform
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\


In [3]:
# load the dataset
df = pd.read_csv('../data/heart_disease_cleaned.csv')

In [6]:
X = df.drop('target', axis=1)
y = df['target']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# make a function to evaluate model
def evaluate_model(name , model , X_train , X_test , y_train , y_test):
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = None
    return {
        "model": name,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba) if y_proba is not None else None
    }

In [5]:
results = []

<h3>Logistic Regression</h3>

In [9]:
baseline_lr = LogisticRegression(max_iter=5000 , random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Logistic Regression (Baseline)", baseline_lr, X_train, X_test, y_train, y_test), "Best Params": "Default"})


In [12]:
param_dist = {
    "C": uniform(0.01, 10),
    "penalty": ["l1", "l2"],
    "solver": ["saga", "liblinear"]
}

In [13]:
random_search = RandomizedSearchCV(LogisticRegression(max_iter=5000, random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=20, cv=5, scoring="accuracy", n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)
best_lr = random_search.best_estimator_
results.append({**evaluate_model("Logistic Regression (Randomized Search)", best_lr, X_train, X_test, y_train, y_test), "Best Params": random_search.best_params_})

<h3>Decision Tree</h3>

In [14]:
baseline_dt = DecisionTreeClassifier(random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Decision Tree (Baseline)", baseline_dt, X_train, X_test, y_train, y_test), "Best Params": "Default"})


In [15]:
param_grid = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}

In [16]:
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_dt = grid_search.best_estimator_
results.append({**evaluate_model("Decision Tree (Grid Search)", best_dt, X_train, X_test, y_train, y_test), "Best Params": grid_search.best_params_})

<h3>Random Forest</h3>

In [17]:
baseline_rf = RandomForestClassifier(random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Random Forest (Baseline)", baseline_rf, X_train, X_test, y_train, y_test), "Best Params": "Default"})

In [18]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

In [19]:
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
results.append({**evaluate_model("Random Forest (Grid Search)", best_rf, X_train, X_test, y_train, y_test), "Best Params": grid_search.best_params_})

<h3>SVM</h3>

In [20]:
baseline_svm = SVC(random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Support Vector Machine (Baseline)", baseline_svm, X_train, X_test, y_train, y_test), "Best Params": "Default"})

In [21]:
param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"]
}

In [22]:
grid_search = GridSearchCV(SVC(random_state=42),
                           param_grid=param_grid,
                           cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
results.append({**evaluate_model("Support Vector Machine (Grid Search)", best_svm, X_train, X_test, y_train, y_test), "Best Params": grid_search.best_params_})

In [None]:
results.sort(key=lambda x: x["accuracy"], reverse=True)
results

[{'model': 'Random Forest (Baseline)',
  'accuracy': 0.8532608695652174,
  'precision': 0.8867924528301887,
  'recall': 0.8623853211009175,
  'f1': 0.8744186046511628,
  'roc_auc': np.float64(0.9014678899082569),
  'Best Params': 'Default'},
 {'model': 'Support Vector Machine (Baseline)',
  'accuracy': 0.8369565217391305,
  'precision': 0.8623853211009175,
  'recall': 0.8623853211009175,
  'f1': 0.8623853211009175,
  'roc_auc': None,
  'Best Params': 'Default'},
 {'model': 'Random Forest (Grid Search)',
  'accuracy': 0.8260869565217391,
  'precision': 0.8666666666666667,
  'recall': 0.8348623853211009,
  'f1': 0.8504672897196262,
  'roc_auc': np.float64(0.9081345565749236),
  'Best Params': {'bootstrap': True,
   'max_depth': 5,
   'min_samples_leaf': 2,
   'min_samples_split': 5,
   'n_estimators': 100}},
 {'model': 'Logistic Regression (Baseline)',
  'accuracy': 0.8152173913043478,
  'precision': 0.8504672897196262,
  'recall': 0.8348623853211009,
  'f1': 0.8425925925925926,
  'roc_a

In [27]:
results_df = pd.DataFrame(results).sort_values("accuracy", ascending=False)
results_df

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc,Best Params
0,Random Forest (Baseline),0.853261,0.886792,0.862385,0.874419,0.901468,Default
1,Support Vector Machine (Baseline),0.836957,0.862385,0.862385,0.862385,,Default
2,Random Forest (Grid Search),0.826087,0.866667,0.834862,0.850467,0.908135,"{'bootstrap': True, 'max_depth': 5, 'min_sampl..."
3,Logistic Regression (Baseline),0.815217,0.850467,0.834862,0.842593,0.895902,Default
4,Logistic Regression (Randomized Search),0.815217,0.850467,0.834862,0.842593,0.895657,"{'C': 0.916064345328208, 'penalty': 'l2', 'sol..."
5,Logistic Regression (Randomized Search),0.809783,0.842593,0.834862,0.83871,0.895413,"{'C': 1.5701864044243652, 'penalty': 'l1', 'so..."
6,Support Vector Machine (Grid Search),0.798913,0.833333,0.825688,0.829493,,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}"
7,Decision Tree (Grid Search),0.793478,0.825688,0.825688,0.825688,0.825749,"{'criterion': 'entropy', 'max_depth': 5, 'min_..."
8,Decision Tree (Baseline),0.766304,0.823529,0.770642,0.796209,0.765321,Default
