In [3]:
import pandas as pd

df = pd.read_csv("../data/Heart_Disease_Selected_Features.csv")
df.head()

Unnamed: 0,ca_0.0,thalach,thal_3.0,oldpeak,age,cp_4,thal_7.0,trestbps,target
0,1.0,0.017494,0.0,1.068965,0.936181,0.0,0.0,0.75038,0
1,0.0,-1.816334,1.0,0.381773,1.378929,1.0,0.0,1.596266,1
2,0.0,-0.89942,0.0,1.326662,1.378929,1.0,1.0,-0.659431,1
3,1.0,1.63301,1.0,2.099753,-1.94168,0.0,0.0,-0.095506,0
4,1.0,0.978071,1.0,0.295874,-1.498933,0.0,0.0,-0.095506,0


In [4]:
from sklearn.model_selection import train_test_split
X=df.drop("target",axis=1)
y=df["target"]
X_train ,X_test , y_train, y_test = train_test_split(X,y,test_size=0.2)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

baseline_model = RandomForestClassifier(random_state=42)
baseline_model.fit(X_train, y_train)

y_pred_base = baseline_model.predict(X_test)
print("Baseline Accuracy:", accuracy_score(y_test, y_pred_base))
print(classification_report(y_test, y_pred_base))

Baseline Accuracy: 0.8666666666666667
              precision    recall  f1-score   support

           0       0.79      0.96      0.87        28
           1       0.96      0.78      0.86        32

    accuracy                           0.87        60
   macro avg       0.88      0.87      0.87        60
weighted avg       0.88      0.87      0.87        60



In [7]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)
print("Best Parameters (GridSearch):", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters (GridSearch): {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Best CV Score: 0.8183510638297872


In [8]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

param_dist = {
    "n_estimators": np.arange(100, 1000, 100),
    "max_depth": [None] + list(np.arange(5, 30, 5)),
    "min_samples_split": np.arange(2, 15, 2),
    "min_samples_leaf": np.arange(1, 10, 2),
    "bootstrap": [True, False]
}

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,  # number of random combinations to try
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
    verbose=2
)

random_search.fit(X_train, y_train)
print("Best Parameters (RandomizedSearch):", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters (RandomizedSearch): {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_depth': 25, 'bootstrap': True}
Best CV Score: 0.8181737588652483


In [9]:
best_grid = grid_search.best_estimator_
y_pred_grid = best_grid.predict(X_test)
print("GridSearch Test Accuracy:", accuracy_score(y_test, y_pred_grid))

best_random = random_search.best_estimator_
y_pred_random = best_random.predict(X_test)
print("RandomizedSearch Test Accuracy:", accuracy_score(y_test, y_pred_random))

print("Baseline Test Accuracy:", accuracy_score(y_test, y_pred_base))

GridSearch Test Accuracy: 0.8833333333333333
RandomizedSearch Test Accuracy: 0.8666666666666667
Baseline Test Accuracy: 0.8666666666666667


In [13]:
import joblib

joblib.dump(grid_search.best_estimator_, "../models/random_forest_model.pkl")

loaded_rf = joblib.load("../models/random_forest_model.pkl")
y_pred = loaded_rf.predict(X_test)

In [14]:
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.8833333333333333
