In [1]:
# --- Cell 1: Imports ---
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# --- Cell 2: Load and Prepare Data ---
df = pd.read_csv('../data/heart_disease_cleaned.csv')
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Cell 3: Hyperparameter Tuning for Random Forest (GridSearchCV) ---
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1, # Use all available cores
    verbose=2,
    scoring='accuracy'
)

# Fit the grid search
grid_search.fit(X_train_scaled, y_train)

# --- Cell 4: Evaluate the Best Model ---
print("--- Best Hyperparameters for Random Forest ---")
print(grid_search.best_params_)

best_rf_model = grid_search.best_estimator_
y_pred_tuned = best_rf_model.predict(X_test_scaled)

print("\n--- Tuned Random Forest Classification Report ---")
print(classification_report(y_test, y_pred_tuned))
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
print(f"Tuned Model Accuracy: {tuned_accuracy:.4f}")

# --- Cell 5: Save the Tuned Model and Scaler ---
# We'll save the best model and the scaler separately for the Streamlit app
joblib.dump(best_rf_model, '../models/final_model.pkl')
joblib.dump(scaler, '../models/scaler.pkl')

print("\nBest model saved to 'models/final_model.pkl'")
print("Scaler saved to 'models/scaler.pkl'")


# --- Cell 6: Append Tuned Results to Metrics File ---
with open("../results/evaluation_metrics.txt", "a") as f:
    f.write("\n--- Tuned Model: Random Forest ---\n")
    f.write(f"Best Parameters: {grid_search.best_params_}\n")
    f.write(f"Tuned Accuracy: {tuned_accuracy:.4f}\n")
    f.write(f"Classification Report:\n{classification_report(y_test, y_pred_tuned)}\n\n")

print("\nTuned model evaluation metrics appended to results/evaluation_metrics.txt")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END m