In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("D:/heart_disease.csv")
print(df.columns)
print(df.isnull().sum())

Index(['Age', 'Gender', 'Blood Pressure', 'Cholesterol Level',
       'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'BMI',
       'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL Cholesterol',
       'Alcohol Consumption', 'Stress Level', 'Sleep Hours',
       'Sugar Consumption', 'Triglyceride Level', 'Fasting Blood Sugar',
       'CRP Level', 'Homocysteine Level', 'Heart Disease Status'],
      dtype='object')
Age                       29
Gender                    19
Blood Pressure            19
Cholesterol Level         30
Exercise Habits           25
Smoking                   25
Family Heart Disease      21
Diabetes                  30
BMI                       22
High Blood Pressure       26
Low HDL Cholesterol       25
High LDL Cholesterol      26
Alcohol Consumption     2586
Stress Level              22
Sleep Hours               25
Sugar Consumption         30
Triglyceride Level        26
Fasting Blood Sugar       22
CRP Level                 26
Homocy

In [3]:
categorical_columns = ['Gender', 'Blood Pressure', 'Cholesterol Level', 'Exercise Habits', 
                       'Smoking', 'Family Heart Disease', 'Diabetes', 
                       'High Blood Pressure', 'Low HDL Cholesterol', 
                       'High LDL Cholesterol', 'Alcohol Consumption', 
                       'Stress Level', 'Sleep Hours', 'Sugar Consumption', 
                       'Triglyceride Level', 'Fasting Blood Sugar', 
                       'CRP Level', 'Homocysteine Level']
encoder = LabelEncoder()

In [7]:
for col in categorical_columns:
    if df[col].dtype == 'object':  
        df[col] = encoder.fit_transform(df[col])

In [9]:
X = df.drop('Heart Disease Status', axis=1)  
y = df['Heart Disease Status']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
rf_default = RandomForestClassifier(random_state=42)
rf_default.fit(X_train, y_train)

In [17]:
y_pred_default = rf_default.predict(X_test)
print("Default Model Accuracy:", accuracy_score(y_test, y_pred_default))
print("Classification Report for Default Model:\n", classification_report(y_test, y_pred_default))

Default Model Accuracy: 0.8065
Classification Report for Default Model:
               precision    recall  f1-score   support

          No       0.81      1.00      0.89      1613
         Yes       0.00      0.00      0.00       387

    accuracy                           0.81      2000
   macro avg       0.40      0.50      0.45      2000
weighted avg       0.65      0.81      0.72      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap':[True,False]
}

In [21]:
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [23]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [25]:
best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}


In [29]:
best_rf = random_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

In [31]:
print("\nTuned Model Accuracy:", accuracy_score(y_test, y_pred_best))
print("Classification Report for Tuned Model:\n", classification_report(y_test, y_pred_best))



Tuned Model Accuracy: 0.8065
Classification Report for Tuned Model:
               precision    recall  f1-score   support

          No       0.81      1.00      0.89      1613
         Yes       0.00      0.00      0.00       387

    accuracy                           0.81      2000
   macro avg       0.40      0.50      0.45      2000
weighted avg       0.65      0.81      0.72      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
default_accuracy = accuracy_score(y_test, y_pred_default)
tuned_accuracy = accuracy_score(y_test, y_pred_best)

print(f"\nDefault Model Accuracy: {default_accuracy}")
print(f"Tuned Model Accuracy: {tuned_accuracy}")


Default Model Accuracy: 0.8065
Tuned Model Accuracy: 0.8065
