In [1]:
# AIM: Compare the performance of RandomForestClassifier against DecisionTreeClassifier.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Load Heart.csv; the first CSV column is used as the row index.
df = pd.read_csv(
    '../datasets/Heart.csv',
    index_col=0,
)

In [5]:
# Display the loaded frame (pandas truncates the repr to the first and last rows).
df

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
302,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes


In [6]:
# Summarize column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 303 entries, 1 to 303
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        303 non-null    int64  
 1   Sex        303 non-null    int64  
 2   ChestPain  303 non-null    object 
 3   RestBP     303 non-null    int64  
 4   Chol       303 non-null    int64  
 5   Fbs        303 non-null    int64  
 6   RestECG    303 non-null    int64  
 7   MaxHR      303 non-null    int64  
 8   ExAng      303 non-null    int64  
 9   Oldpeak    303 non-null    float64
 10  Slope      303 non-null    int64  
 11  Ca         299 non-null    float64
 12  Thal       301 non-null    object 
 13  AHD        303 non-null    object 
dtypes: float64(2), int64(9), object(3)
memory usage: 35.5+ KB


In [7]:
# Drop every row that contains at least one missing value.
# The arguments below are the pandas defaults, spelled out for clarity.
df = df.dropna(axis=0, how='any')

In [8]:
# Re-check the summary after dropping the rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 1 to 302
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        297 non-null    int64  
 1   Sex        297 non-null    int64  
 2   ChestPain  297 non-null    object 
 3   RestBP     297 non-null    int64  
 4   Chol       297 non-null    int64  
 5   Fbs        297 non-null    int64  
 6   RestECG    297 non-null    int64  
 7   MaxHR      297 non-null    int64  
 8   ExAng      297 non-null    int64  
 9   Oldpeak    297 non-null    float64
 10  Slope      297 non-null    int64  
 11  Ca         297 non-null    float64
 12  Thal       297 non-null    object 
 13  AHD        297 non-null    object 
dtypes: float64(2), int64(9), object(3)
memory usage: 34.8+ KB


In [9]:
# Inspect the frame after the incomplete rows were removed.
df

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,asymptomatic,140,241,0,0,123,1,0.2,2,0.0,reversable,Yes
299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes


In [10]:
# One-hot encode the categorical predictors. drop_first=True omits the first
# level of each column, so that level acts as the implicit reference category.
categorical_cols = ['Sex', 'ChestPain', 'Fbs', 'RestECG', 'ExAng', 'Slope', 'Ca', 'Thal']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [11]:
# Inspect the encoded frame; the categorical columns are now boolean indicator columns.
df

Unnamed: 0,Age,RestBP,Chol,MaxHR,Oldpeak,AHD,Sex_1,ChestPain_nonanginal,ChestPain_nontypical,ChestPain_typical,...,RestECG_1,RestECG_2,ExAng_1,Slope_2,Slope_3,Ca_1.0,Ca_2.0,Ca_3.0,Thal_normal,Thal_reversable
1,63,145,233,150,2.3,No,True,False,False,True,...,False,True,False,False,True,False,False,False,False,False
2,67,160,286,108,1.5,Yes,True,False,False,False,...,False,True,True,True,False,False,False,True,True,False
3,67,120,229,129,2.6,Yes,True,False,False,False,...,False,True,True,True,False,False,True,False,False,True
4,37,130,250,187,3.5,No,True,True,False,False,...,False,False,False,False,True,False,False,False,True,False
5,41,130,204,172,1.4,No,False,False,True,False,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,Yes,False,False,False,False,...,False,False,True,True,False,False,False,False,False,True
299,45,110,264,132,1.2,Yes,True,False,False,True,...,False,False,False,True,False,False,False,False,False,True
300,68,144,193,141,3.4,Yes,True,False,False,False,...,False,False,False,True,False,False,True,False,False,True
301,57,130,131,115,1.2,Yes,True,False,False,False,...,False,False,True,True,False,True,False,False,False,True


In [12]:
# Separate the target column 'AHD' from the predictors.
y = df['AHD']
X = df.drop(columns='AHD')

In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance between train and test matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Baseline for comparison: the DecisionTreeClassifier (as seen previously) reached around 75% accuracy.

In [15]:
# Base estimator for the search: a random forest with library defaults;
# only the seed is fixed so repeated runs build the same trees.
clf = RandomForestClassifier(
    random_state=42,
)

In [16]:
# List every hyperparameter of the untuned forest (library defaults plus the fixed seed).
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [26]:
# Search space for the grid search: number of trees x feature-subset rule
# (5 x 2 = 10 candidate combinations).
param_grid = dict(
    n_estimators=[10, 100, 900, 1000, 1100],
    max_features=['sqrt', 'log2'],
)

In [27]:
# Exhaustive search over param_grid, scored by accuracy.
# verbose=3 logs the score and timing of every individual CV fit.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=3,
)

In [28]:
# Run the cross-validated search on the training split only; the test split is not used here.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_features=sqrt, n_estimators=10;, score=0.690 total time=   0.0s
[CV 2/5] END max_features=sqrt, n_estimators=10;, score=0.905 total time=   0.0s
[CV 3/5] END max_features=sqrt, n_estimators=10;, score=0.732 total time=   0.0s
[CV 4/5] END max_features=sqrt, n_estimators=10;, score=0.634 total time=   0.0s
[CV 5/5] END max_features=sqrt, n_estimators=10;, score=0.780 total time=   0.0s
[CV 1/5] END max_features=sqrt, n_estimators=100;, score=0.643 total time=   0.0s
[CV 2/5] END max_features=sqrt, n_estimators=100;, score=0.857 total time=   0.0s
[CV 3/5] END max_features=sqrt, n_estimators=100;, score=0.780 total time=   0.0s
[CV 4/5] END max_features=sqrt, n_estimators=100;, score=0.756 total time=   0.0s
[CV 5/5] END max_features=sqrt, n_estimators=100;, score=0.707 total time=   0.1s
[CV 1/5] END max_features=sqrt, n_estimators=900;, score=0.690 total time=   1.0s
[CV 2/5] END max_features=sqrt, n_estimato

In [29]:
# Hyperparameter combination selected by the grid search.
grid.best_params_

{'max_features': 'sqrt', 'n_estimators': 1000}

In [30]:
# Predict labels for the held-out test split using the fitted grid search.
y_pred = grid.predict(X_test)

In [31]:
# Accuracy of the tuned random forest on the held-out test split.
accuracy_score(y_test, y_pred)

0.8444444444444444