## Load libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

## Load dataset

In [8]:
# Read heart.csv into a DataFrame; the path is relative to the kernel's working directory.
data = pd.read_csv(filepath_or_buffer="heart.csv")
# Preview the first five rows as this cell's output.
data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [10]:
# Per-column count of missing values (isna is pandas' alias for isnull).
# Should any column report missing entries, keep the result of data.dropna()
# under a new name: dropna returns a copy and leaves `data` itself unchanged.
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [11]:
# Split the table into predictors (every column except "target") and the label column.
X = data.drop(columns=["target"])
y = data["target"]
# Confirm the container types: X stays a DataFrame, y is a Series.
print(type(X), type(y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


## Divide data into train and test sets

In [12]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# Show (train, test) shapes: features on the first line, labels on the second.
print(*(part.shape for part in (X_train, X_test)))
print(*(part.shape for part in (y_train, y_test)))

(686, 13) (339, 13)
(686,) (339,)


## Build a random forest model and tune its hyperparameters

In [15]:
# Base estimator handed to the grid search. random_state is pinned so the forest's
# bootstrap sampling and per-split feature sub-sampling, and therefore the tuning
# results, are reproducible across "Restart & Run All". All other hyperparameters
# keep their library defaults.
rf_Model = RandomForestClassifier(random_state=42)

In [16]:
# Search space for the grid search: 4 forest sizes x 3 depths x 2 split criteria
# = 24 hyperparameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=list(range(2, 5)),  # depths 2, 3 and 4
    criterion=["gini", "entropy"],
)

In [17]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Wrap the base forest in an exhaustive, cross-validated search over param_grid.
# cv=5 spells out the library default (5-fold) so the fold count is visible here.
# Note: best_params_ does not exist yet; it is only set once rf_grid.fit(...) has run.
rf_grid = GridSearchCV(
    estimator=rf_Model,
    param_grid=param_grid,
    cv=5,
)

In [23]:
# Cross-validate every combination in param_grid on the training split, then
# (refit defaults to True) retrain the best-scoring combination on all of X_train.
rf_grid.fit(X=X_train, y=y_train)

In [24]:
# Hyperparameter combination that achieved the best mean cross-validated score in the search.
rf_grid.best_params_


{'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 100}

In [25]:
# best_estimator_ is the forest that GridSearchCV refit with the winning parameters
# (available because refit is left at its default of True).
best_model = rf_grid.best_estimator_
print(type(best_model))

<class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [26]:
# Alternative to rf_grid.best_estimator_: build a fresh forest from the winning
# hyperparameters. Unpacking best_params_ keeps this cell in sync with whatever the
# search selected, instead of hard-coding values copied from one particular run.
# random_state is pinned so retraining gives the same forest on every run.
new_best_model = RandomForestClassifier(**rf_grid.best_params_, random_state=42)
# Retrain on the full training split.
new_best_model.fit(X_train, y_train)

In [27]:
# compare the accuracy score on the train and test dataset
from sklearn.metrics import accuracy_score

In [30]:
# Predict labels for the held-out rows with the tuned forest.
y_test_pred = best_model.predict(X=X_test)

In [31]:
# Fraction of held-out rows classified correctly (keywords make the argument order explicit).
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8613569321533924

In [32]:
# Score the same tuned forest on the rows it was trained on, for comparison
# with the test accuracy.
y_train_pred = best_model.predict(X=X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9139941690962099

In [None]:
# If train accuracy is only slightly higher than test accuracy, the model generalizes well.
# If train accuracy is much higher than test accuracy, the model is overfitting.
# If both train and test accuracy are poor, the model is underfitting.