In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv("C:\\Users\\jalpa\\Downloads\\heart.csv")
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [40]:
x = df.iloc[:,0:12]
y = df.iloc[:,-1]

In [41]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,0:12], df.iloc[:,-1], test_size = 0.2, random_state = 42)

In [42]:
print(x_train.shape)
print(x_test.shape)

(242, 12)
(61, 12)


In [43]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
accuracy = accuracy_score(y_pred, y_test)

In [44]:
accuracy

0.8524590163934426

In [45]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(), x, y, cv = 10, scoring = 'accuracy'))

0.8080645161290322

In [46]:
cross_val_score(RandomForestClassifier(), x, y, cv = 10, scoring = 'accuracy')

array([0.87096774, 0.87096774, 0.87096774, 0.86666667, 0.9       ,
       0.73333333, 0.8       , 0.83333333, 0.76666667, 0.76666667])

# Tune the hyper parameter

In [47]:
rf = RandomForestClassifier(max_samples = 0.75, random_state = 42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
accuracy = accuracy_score(y_pred, y_test)
accuracy

0.8688524590163934

In [48]:
score = cross_val_score(RandomForestClassifier(max_samples = 0.75), x, y, cv = 10, scoring = 'accuracy')

In [49]:
average_score = np.mean(score)
average_score

0.8213978494623657

# GridSearchCV

In [50]:
# Number of tree in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in trees
max_depth = [2,8, None]

# NNumber of samples
max_samples = [0.5, 0.75, 1.0]

In [51]:
param_grid = {'n_estimators' : n_estimators,
             'max_features' : max_features,
             'max_depth' : max_depth,
             'max_samples' : max_samples}
param_grid

{'n_estimators': [20, 60, 100, 120],
 'max_features': [0.2, 0.6, 1.0],
 'max_depth': [2, 8, None],
 'max_samples': [0.5, 0.75, 1.0]}

In [52]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
rf_grid = GridSearchCV(estimator = rf,
                      param_grid = param_grid,
                      cv = 5,
                      verbose = 2,
                      n_jobs = -1)

In [53]:
rf_grid.fit(x_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [54]:
rf_grid.best_params_

{'max_depth': None,
 'max_features': 0.2,
 'max_samples': 1.0,
 'n_estimators': 60}

In [55]:
rf_grid.best_score_

0.8344387755102041

# RandomSearchCV

In [56]:
# Number of tree in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in trees
max_depth = [2,8, None]

# Number of samples
max_samples = [0.5, 0.75, 1.0]

#Bootstrap samples
bootstrap = [True, False]

#Minimum number of samples required to split a node
min_samples_split = [2,5]

#Minimum number of samples required at each leaf node
min_samples_leaf = [1,2]

In [57]:
param_grids = {'n_estimators' : n_estimators,
             'max_features' : max_features,
             'max_depth' : max_depth,
             'max_samples' : max_samples,
             'bootstrap' : bootstrap,
             'min_samples_split' : min_samples_split,
             'min_samples_leaf' : min_samples_leaf}
print(param_grids)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [59]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator = rf,
                      param_distributions = param_grids,
                      cv = 5,
                      verbose = 2,
                      n_jobs = -1)

In [66]:
rf_grid.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jalpa\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jalpa\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\jalpa\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\ensemble\_forest.py", line 433, in fit
    raise ValueError(
ValueError: `ma

In [67]:
rf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_samples': 1.0,
 'max_features': 1.0,
 'max_depth': 8,
 'bootstrap': True}

In [68]:
rf_grid.best_score_

0.8177721088435375