# 06 - Hyperparameter Tuning & Final Export

In [1]:
import pandas as pd, numpy as np, joblib
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import randint, uniform

# Load the cleaned dataset from the `data` folder that sits one level above
# the current working directory.
CLEAN_PATH = Path.cwd().parent / 'data' / 'heart_disease_clean.csv'
df = pd.read_csv(CLEAN_PATH)

# Split the frame into the label column and the feature matrix.
y = df['target']
X = df.drop('target', axis=1)

# Numeric dtypes are handled by the numeric branch; every remaining column
# (in its original order) goes to the categorical branch.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
numeric_names = set(num_cols)
cat_cols = [col for col in X.columns if col not in numeric_names]

# Numeric branch: fill gaps with the median, then standardize.
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on unseen categories.
categorical_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer([
    ('num', numeric_branch, num_cols),
    ('cat', categorical_branch, cat_cols),
])

# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate 1: random forest tuned with a randomized search.
# Parameter names carry the `clf__` prefix so they are routed to the
# classifier step of the pipeline.
rf = RandomForestClassifier(random_state=42)
rf_params = dict(
    clf__n_estimators=randint(200, 600),
    clf__max_depth=randint(3, 20),
    clf__min_samples_split=randint(2, 20),
    clf__min_samples_leaf=randint(1, 10),
)
rf_pipe = Pipeline([('prep', preprocess), ('clf', rf)])

# 25 sampled configurations x 5 folds, ranked by ROC AUC.
rf_search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=rf_params,
    n_iter=25,
    scoring='roc_auc',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rf_search.fit(X_train, y_train)

# Candidate 2: RBF-kernel SVM tuned with an exhaustive grid search.
# probability=True makes the fitted SVC expose predict_proba.
svm = SVC(probability=True, random_state=42)
svm_params = dict(
    clf__C=[0.1, 1, 3, 10],
    clf__gamma=['scale', 0.01, 0.001],
    clf__kernel=['rbf'],
)
svm_pipe = Pipeline([('prep', preprocess), ('clf', svm)])

# 4 x 3 x 1 = 12 configurations x 5 folds, ranked by ROC AUC.
svm_grid = GridSearchCV(
    estimator=svm_pipe,
    param_grid=svm_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
svm_grid.fit(X_train, y_train)

# Compare the two tuned candidates by their best mean cross-validated ROC AUC.
best_rf_auc = rf_search.best_score_
best_svm_auc = svm_grid.best_score_

print('RF best AUC (CV):', best_rf_auc)
print('SVM best AUC (CV):', best_svm_auc)

# Ties go to the random forest (>=).
best_search = rf_search if best_rf_auc >= best_svm_auc else svm_grid
best_model = best_search.best_estimator_

# Report the winner on the held-out split, which takes no part in tuning.
# A fitted search object scores with the metric it was tuned on (roc_auc).
# NOTE(review): a CV AUC of exactly 1.0 for both candidates would be unusual
# and often points to leakage or duplicated rows -- confirm against this
# hold-out number and the dataset before relying on the exported model.
print('Hold-out AUC (test):', best_search.score(X_test, y_test))

# Persist the refit winning pipeline (preprocessing + classifier).
final_path = Path.cwd().parents[0] / 'models' / 'final_model.pkl'
# joblib.dump does not create missing directories, so make sure it exists.
final_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, final_path)
print('✅ Exported final model to', final_path)


Fitting 5 folds for each of 25 candidates, totalling 125 fits


Fitting 5 folds for each of 12 candidates, totalling 60 fits
RF best AUC (CV): 1.0
SVM best AUC (CV): 1.0
✅ Exported final model to C:\Users\Moamen\Desktop\Heart_Disease_Project\Heart_Disease_Project\models\final_model.pkl
