In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Read the cleaned, standardized heart dataset from disk
file_path = 'C:/Users/ashwa/OneDrive/Desktop/cleaned_standardized_heart_data.csv'  # Update path as needed
heart_data = pd.read_csv(file_path)

# Predictor columns used by the model; the label column is 'hadheartattack'
selected_features = [
    'physicalhealthdays',
    'weightinkilograms',
    'bmi',
    'heightinmeters',
    'sleephours',
    'generalhealth',
    'hadangina',
    'agecategory',
]

# Take a copy of the predictors so later preprocessing never touches heart_data
X = heart_data.loc[:, selected_features].copy()
y = heart_data.loc[:, 'hadheartattack']

# Handle missing values (imputation): fill gaps in numeric columns with the
# column median. Non-numeric columns are not touched by this step.
# NOTE(review): the medians are computed over all rows, including rows that
# later land in the test split; fit them on the training split only if a
# strictly leak-free evaluation is required.
X = X.fillna(X.median(numeric_only=True))

# One-hot encode the non-numeric features (first level of each is dropped).
# Encoding before the split keeps the train and test frames on identical columns.
X = pd.get_dummies(X, drop_first=True)

# Split BEFORE oversampling. Running SMOTE on the full dataset first would
# (a) put synthetic samples interpolated from test rows into the training set
# and (b) evaluate on synthetic, artificially balanced data, both of which
# inflate the reported metrics. stratify=y keeps the original class ratio in
# both splits so the test set reflects the real class imbalance.
X_train_pre_smote, X_test_smote, y_train_pre_smote, y_test_smote = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Address class imbalance using SMOTE, fitted on the training split only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_pre_smote, y_train_pre_smote)

# Names kept for any later cells that still refer to them: the resampled data
# is now the oversampled training split, and X_test_smote / y_test_smote hold
# the original (non-resampled) held-out rows.
X_resampled, y_resampled = X_train_smote, y_train_smote

# Define parameter grid for Random Forest (3 x 3 x 3 x 3 = 81 candidates)
param_grid_rf = {
    'n_estimators': [100, 200, 300],          # Number of trees
    'max_depth': [10, 20, None],             # Maximum depth of trees (None = unlimited)
    'min_samples_split': [2, 5, 10],         # Minimum samples to split
    'min_samples_leaf': [1, 2, 4]            # Minimum samples in a leaf node
}

# Initialize GridSearchCV for Random Forest.
# With cv=3 the search runs 81 x 3 = 243 independent fits, so n_jobs=-1 spreads
# them over all available CPU cores; the candidates, folds and seeds are
# unchanged. verbose=1 keeps the summary line without printing one log line
# per fit into the notebook.
# NOTE(review): the CV folds are cut from data that already contains SMOTE
# samples, so the validation scores are likely optimistic. Resampling inside
# each fold (e.g. an imblearn Pipeline wrapping SMOTE + the classifier) would
# avoid that, at the cost of prefixed parameter names in best_params_.
grid_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                       param_grid=param_grid_rf,
                       scoring='f1_weighted',  # Optimize for weighted F1-score
                       cv=3,
                       n_jobs=-1,
                       verbose=1)

# Perform the grid search on the oversampled training data
grid_rf.fit(X_train_smote, y_train_smote)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  54.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  52.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.7min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.7min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.7min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 3.0min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 2.5min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 2.6min
[CV] END max_depth=10, min_sa