In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


Tuning Some important hyperparameter and comparing the Random Forest with other model

In [2]:
df = pd.read_csv("heart.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]
X.shape , y.shape

((303, 13), (303,))

In [5]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=2)
X_train.shape , X_test.shape

((242, 13), (61, 13))

In [12]:
rf = RandomForestClassifier()
lr = LogisticRegression(max_iter=1000)
svc = SVC()
gb = GradientBoostingClassifier()


In [13]:
rf.fit(X_train , y_train)
lr.fit(X_train , y_train)
svc.fit(X_train , y_train)
gb.fit(X_train , y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [14]:
y_pred_rf = rf.predict(X_test)
y_pred_lr = lr.predict(X_test)
y_pred_svc = svc.predict(X_test)
y_pred_gb = gb.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score
print("accuracy RF" , accuracy_score(y_test , y_pred_rf))
print("accuracy lr" , accuracy_score(y_test , y_pred_lr))
print("accuracy SVM" , accuracy_score(y_test , y_pred_svc))
print("accuracy GB " , accuracy_score(y_test , y_pred_gb) )

accuracy RF 0.9016393442622951
accuracy lr 0.8852459016393442
accuracy SVM 0.6721311475409836
accuracy GB  0.8852459016393442


In [35]:
from sklearn.model_selection import cross_val_score
print("cross val score" , np.mean(cross_val_score(rf , X_train , y_train , cv=10)))

cross val score 0.8098333333333333


In [None]:
# We can increase the accuracy litile bit  of Random Forest by tuning hyperparameter
rf2 = RandomForestClassifier(max_samples=0.75, random_state=21)
rf2.fit(X_train , y_train)
y_pred = rf2.predict(X_test)
print("accuracy rf2" , accuracy_score(y_test , y_pred )) 

# In random Forest it is good if we take max_samples in between 0.5 to 0.75
# by default max_fatures = sqrt(total no of columns)

accuracy rf2 0.9016393442622951


GridSearchCV

In [None]:
parameter = {
    'n_estimators' :[10 , 20 , 100 , 200 , 500],
    'max_samples' : [ 0.5 , 0.65 , 0.75,0.8 ],
    'max_features' :[0.5 , 0.6 , 0.75 , 0.8] , 
    'max_depth'  :[2 , 4 ,6, 'None']
}

# Total 5*4*4*4=320 times the model will trained and after training it will give best hyperparameter

In [27]:
gridsearch = GridSearchCV(estimator=RandomForestClassifier() , param_grid=parameter, cv=5,
                          verbose=1 , n_jobs=2)


In [28]:
# by default value of cv=5 , so the algorithm will run 320*5 = 1600 times
gridsearch.fit(X_train , y_train)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


400 fits failed out of a total of 1600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\utils\

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'max_depth': [2, 4, ...], 'max_features': [0.5, 0.6, ...], 'max_samples': [0.5, 0.65, ...], 'n_estimators': [10, 20, ...]}"
,scoring,
,n_jobs,2
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.8
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
gridsearch.best_params_


{'max_depth': 6, 'max_features': 0.8, 'max_samples': 0.5, 'n_estimators': 500}

In [30]:
gridsearch.best_score_

np.float64(0.8145408163265305)

RandomizedSearchCV

In [31]:
parameter = {
    'n_estimators' :[10 , 20 , 100 , 200 , 500],
    'max_samples' : [ 0.5 , 0.65 , 0.75,0.8 ],
    'max_features' :[0.5 , 0.6 , 0.75 , 0.8] , 
    'max_depth'  :[2 , 4 ,6, 'None'],
    'bootstrap'  :[True , False],
}

randomsearch = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                param_distributions=parameter,cv=10 , n_jobs=2 , verbose=1  )
# it will take some randomly 10 combination of all possible combination 

In [32]:
randomsearch.fit(X_train , y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


90 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Jayhind\Desktop\MachineLearning\myenv\lib\site-packages\sklearn\utils\_pa

0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'bootstrap': [True, False], 'max_depth': [2, 4, ...], 'max_features': [0.5, 0.6, ...], 'max_samples': [0.5, 0.65, ...], ...}"
,n_iter,10
,scoring,
,n_jobs,2
,refit,True
,cv,10
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
randomsearch.best_score_ 

np.float64(0.7891666666666667)

In [34]:
randomsearch.best_params_

{'n_estimators': 100,
 'max_samples': 0.65,
 'max_features': 0.6,
 'max_depth': 2,
 'bootstrap': True}

Out of Bag score (oob_score)

In [36]:
rf3 = RandomForestClassifier(oob_score=True)
rf3.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
rf3.oob_score_

0.7933884297520661

In [38]:
print("accuracy:" , accuracy_score(y_test , rf3.predict(X_test)))

accuracy: 0.9016393442622951
