In [1]:
import os
import pandas as pd
### 1. Link the notebook with Google Drive to access data from your personal Drive
from google.colab import drive
drive.mount("/content/gdrive")

### 2. Set the paths for the dataset and model locations (ex: model_loc = "/content/gdrive/My Drive/Dataset/")
dataset_dir = "/content/gdrive/My Drive/Colab Notebooks/"
model_loc = "/content/gdrive/My Drive/Colab Notebooks/"

# List the folder contents as a quick check that the expected files are visible
print(os.listdir(dataset_dir))
# os.path.join yields a valid path whether or not dataset_dir ends with "/",
# unlike plain string concatenation
data = pd.read_csv(os.path.join(dataset_dir, 'heart.csv'))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
['[1_April_2024]_Heart_Disease_NN.ipynb', 'heart.csv', 'Assignment2.ipynb', 'Assignment3.ipynb']


In [2]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time

# Separate the label from the features (the label column is assumed to be 'target')
y = data['target']
X = data.drop('target', axis=1)

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Candidate values for each random-forest hyperparameter
# (two options per parameter -> 2 * 2 * 2 * 2 = 16 combinations in total)
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
)

# Base random-forest estimator; the fixed seed keeps its fits repeatable
rf = RandomForestClassifier(
    random_state=42,
)

In [5]:
# Grid search: score the combinations in param_grid with 5-fold cross-validated accuracy.
# Timing uses time.perf_counter(), a monotonic high-resolution clock intended for
# measuring intervals; time.time() is a wall clock that can jump if the system clock
# is adjusted. Only the difference between the two readings is meaningful.
start_time_grid = time.perf_counter()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
end_time_grid = time.perf_counter()

# Best hyperparameter combination and its mean cross-validation accuracy
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_

# Evaluate the best estimator found by the search on the held-out test set
best_model_grid = grid_search.best_estimator_
y_pred_grid = best_model_grid.predict(X_test)
test_accuracy_grid = accuracy_score(y_test, y_pred_grid)

# Report the Grid Search results
print("Grid Search Best Parameters:", best_params_grid)
print("Grid Search Best Cross-Validation Accuracy:", best_score_grid)
print("Grid Search Test Accuracy:", test_accuracy_grid)
print("Grid Search Computational Time: {:.2f} seconds".format(end_time_grid - start_time_grid))

Grid Search Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Grid Search Best Cross-Validation Accuracy: 0.8139455782312925
Grid Search Test Accuracy: 0.8524590163934426
Grid Search Computational Time: 34.95 seconds


In [6]:
# Randomized search: sample 10 candidate combinations from param_grid and score each
# with 5-fold cross-validated accuracy (random_state fixes which candidates are drawn).
# Timing uses time.perf_counter(), a monotonic high-resolution clock intended for
# measuring intervals; time.time() is a wall clock that can jump if the system clock
# is adjusted. Only the difference between the two readings is meaningful.
start_time_random = time.perf_counter()
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)
end_time_random = time.perf_counter()

# Best sampled hyperparameter combination and its mean cross-validation accuracy
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

# Evaluate the best estimator found by the search on the held-out test set
best_model_random = random_search.best_estimator_
y_pred_random = best_model_random.predict(X_test)
test_accuracy_random = accuracy_score(y_test, y_pred_random)

# Report the Random Search results
print("Random Search Best Parameters:", best_params_random)
print("Random Search Best Cross-Validation Accuracy:", best_score_random)
print("Random Search Test Accuracy:", test_accuracy_random)
print("Random Search Computational Time: {:.2f} seconds".format(end_time_random - start_time_random))

Random Search Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10}
Random Search Best Cross-Validation Accuracy: 0.8139455782312925
Random Search Test Accuracy: 0.8524590163934426
Random Search Computational Time: 14.83 seconds


In [7]:
# Side-by-side summary of both searches: best parameters, CV accuracy,
# test accuracy, and the time each search took (one item per output line)
print(
    "\nComparison:",
    f"Grid Search - Best Params: {best_params_grid}, CV Accuracy: {best_score_grid}, Test Accuracy: {test_accuracy_grid}",
    f"Random Search - Best Params: {best_params_random}, CV Accuracy: {best_score_random}, Test Accuracy: {test_accuracy_random}",
    "Grid Search Computational Time: {:.2f} seconds".format(end_time_grid - start_time_grid),
    "Random Search Computational Time: {:.2f} seconds".format(end_time_random - start_time_random),
    sep="\n",
)


Comparison:
Grid Search - Best Params: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}, CV Accuracy: 0.8139455782312925, Test Accuracy: 0.8524590163934426
Random Search - Best Params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10}, CV Accuracy: 0.8139455782312925, Test Accuracy: 0.8524590163934426
Grid Search Computational Time: 34.95 seconds
Random Search Computational Time: 14.83 seconds
