<h1>Hyperparameter Tuning</h1>

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import uniform
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [35]:
# load the dataset
df = pd.read_csv('../data/heart_disease_cleaned.csv')

In [36]:
X = df.drop('target', axis=1)
y = df['target']

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# make a function to evaluate model
def evaluate_model(name , model , X_train , X_test , y_train , y_test):
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = None
    return {
        "model": name,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba) if y_proba is not None else None
    }

In [39]:
results = []

<h3>Logistic Regression</h3>

In [40]:
baseline_lr = LogisticRegression(max_iter=5000 , random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Logistic Regression (Baseline)", baseline_lr, X_train, X_test, y_train, y_test), "Best Params": "Default"})


In [41]:
param_dist = {
    "C": uniform(0.01, 10),
    "penalty": ["l1", "l2"],
    "solver": ["saga", "liblinear"]
}

In [42]:
random_search = RandomizedSearchCV(LogisticRegression(max_iter=5000, random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=20, cv=5, scoring="accuracy", n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)
best_lr = random_search.best_estimator_
results.append({**evaluate_model("Logistic Regression (Randomized Search)", best_lr, X_train, X_test, y_train, y_test), "Best Params": random_search.best_params_})

<h3>Decision Tree</h3>

In [43]:
baseline_dt = DecisionTreeClassifier(random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Decision Tree (Baseline)", baseline_dt, X_train, X_test, y_train, y_test), "Best Params": "Default"})


In [44]:
param_grid = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}

In [45]:
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_dt = grid_search.best_estimator_
results.append({**evaluate_model("Decision Tree (Grid Search)", best_dt, X_train, X_test, y_train, y_test), "Best Params": grid_search.best_params_})

<h3>Random Forest</h3>

In [46]:
baseline_rf = RandomForestClassifier(random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Random Forest (Baseline)", baseline_rf, X_train, X_test, y_train, y_test), "Best Params": "Default"})

In [47]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

In [48]:
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
results.append({**evaluate_model("Random Forest (Grid Search)", best_rf, X_train, X_test, y_train, y_test), "Best Params": grid_search.best_params_})

<h3>SVM</h3>

In [49]:
baseline_svm = SVC(random_state=42).fit(X_train , y_train)
results.append({**evaluate_model("Support Vector Machine (Baseline)", baseline_svm, X_train, X_test, y_train, y_test), "Best Params": "Default"})

In [50]:
param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"]
}

In [51]:
grid_search = GridSearchCV(SVC(random_state=42),
                           param_grid=param_grid,
                           cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
results.append({**evaluate_model("Support Vector Machine (Grid Search)", best_svm, X_train, X_test, y_train, y_test), "Best Params": grid_search.best_params_})

In [52]:
results.sort(key=lambda x: x["accuracy"], reverse=True)
results

[{'model': 'Random Forest (Baseline)',
  'accuracy': 0.8532608695652174,
  'precision': 0.8867924528301887,
  'recall': 0.8623853211009175,
  'f1': 0.8744186046511628,
  'roc_auc': np.float64(0.9014678899082569),
  'Best Params': 'Default'},
 {'model': 'Support Vector Machine (Baseline)',
  'accuracy': 0.8369565217391305,
  'precision': 0.8623853211009175,
  'recall': 0.8623853211009175,
  'f1': 0.8623853211009175,
  'roc_auc': None,
  'Best Params': 'Default'},
 {'model': 'Random Forest (Grid Search)',
  'accuracy': 0.8260869565217391,
  'precision': 0.8666666666666667,
  'recall': 0.8348623853211009,
  'f1': 0.8504672897196262,
  'roc_auc': np.float64(0.9081345565749236),
  'Best Params': {'bootstrap': True,
   'max_depth': 5,
   'min_samples_leaf': 2,
   'min_samples_split': 5,
   'n_estimators': 100}},
 {'model': 'Logistic Regression (Baseline)',
  'accuracy': 0.8152173913043478,
  'precision': 0.8504672897196262,
  'recall': 0.8348623853211009,
  'f1': 0.8425925925925926,
  'roc_a

In [53]:
results_df = pd.DataFrame(results).sort_values("accuracy", ascending=False)
results_df

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc,Best Params
0,Random Forest (Baseline),0.853261,0.886792,0.862385,0.874419,0.901468,Default
1,Support Vector Machine (Baseline),0.836957,0.862385,0.862385,0.862385,,Default
2,Random Forest (Grid Search),0.826087,0.866667,0.834862,0.850467,0.908135,"{'bootstrap': True, 'max_depth': 5, 'min_sampl..."
3,Logistic Regression (Baseline),0.815217,0.850467,0.834862,0.842593,0.895902,Default
4,Logistic Regression (Randomized Search),0.809783,0.842593,0.834862,0.83871,0.895413,"{'C': 1.5701864044243652, 'penalty': 'l1', 'so..."
5,Support Vector Machine (Grid Search),0.798913,0.833333,0.825688,0.829493,,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}"
6,Decision Tree (Grid Search),0.793478,0.825688,0.825688,0.825688,0.825749,"{'criterion': 'entropy', 'max_depth': 5, 'min_..."
7,Decision Tree (Baseline),0.766304,0.823529,0.770642,0.796209,0.765321,Default


<h3>Model Export & Deployment</h3>

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

In [55]:
best_pipline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", baseline_rf)
])

best_pipline.fit(X_train, y_train)
joblib.dump(best_pipline, "../models/final_model.pkl")
print("final model saved to '../models/final_model.pkl'")

final model saved to '../models/final_model.pkl'


<h3>Save the final pipeline (preprocessing + model)</h3>

In [56]:

import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


In [63]:
# 1. Load your cleaned dataset
df = pd.read_csv("../data/heart_disease_cleaned.csv") 
df.info()
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          920 non-null    float64
 1   sex          920 non-null    float64
 2   trestbps     920 non-null    float64
 3   chol         920 non-null    float64
 4   fbs          920 non-null    float64
 5   thalach      920 non-null    float64
 6   exang        920 non-null    float64
 7   oldpeak      920 non-null    float64
 8   target       920 non-null    int64  
 9   cp_2.0       920 non-null    bool   
 10  cp_3.0       920 non-null    bool   
 11  cp_4.0       920 non-null    bool   
 12  restecg_1.0  920 non-null    bool   
 13  restecg_2.0  920 non-null    bool   
 14  slope_2.0    920 non-null    bool   
 15  slope_3.0    920 non-null    bool   
 16  thal_6.0     920 non-null    bool   
 17  thal_7.0     920 non-null    bool   
dtypes: bool(9), float64(8), int64(1)
memory usage: 72.

Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
       'target', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'restecg_1.0', 'restecg_2.0',
       'slope_2.0', 'slope_3.0', 'thal_6.0', 'thal_7.0'],
      dtype='object')

In [58]:
# Drop target to get feature matrix and the target
X = df.drop("target", axis=1)
y = df["target"]

In [66]:
# --- 2. Define which columns are numeric and which are categorical (same as training) ---
NUM_FEATURES = ["age", "trestbps", "chol", "thalach", "oldpeak"]
CAT_FEATURES = ["sex", "cp_2.0", "cp_3.0", "cp_4.0", "fbs", "restecg_1.0", "restecg_2.0", "exang", "slope_2.0", "slope_3.0", "thal_6.0", "thal_7.0"]

In [67]:
# --- 3. Preprocessing pipelines ---
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, NUM_FEATURES),
        ("cat", cat_pipe, CAT_FEATURES)
    ]
)

In [68]:
# --- 4. Final pipeline with classifier ---
clf = RandomForestClassifier(random_state=42, n_estimators=100)  # baseline RF
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", clf)
])


In [69]:
# --- 5. Fit pipeline on full training data (or on X, y) ---
pipeline.fit(X , y)

In [77]:
pipeline.fit(X_train, y_train)

# save model
joblib.dump(pipeline, "models/final_model.pkl")

# save schema (feature names)
schema = {
    "num": ["age", "trestbps", "chol", "thalach", "oldpeak"],  # numerical features
    "cat": ["sex", "fbs", "exang", "cp_2.0", "cp_3.0", "cp_4.0",
            "restecg_1.0", "restecg_2.0", "slope_2.0", "slope_3.0",
            "thal_6.0", "thal_7.0"]
}
joblib.dump(schema, "models/feature_schema.pkl")

['models/feature_schema.pkl']