# Introduction to Hyperparameter Optimization


*   Sample Code for an article in https://bigdata.go.th/big-data-101/machine-learning-model-hyperparameter-optimization/
*   Created by Tinnakorn Marlaithong  (11/11/2021)




In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

## Import Data

In [None]:
train = pd.read_csv('https://raw.githubusercontent.com/Linktnk/Hyperparameter-Tuning-/main/data/titanic_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Linktnk/Hyperparameter-Tuning-/main/data/titanic_test.csv')

In [None]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,0,0,0,23,0,2,0,1
1,1,1,0,2,0,3,1,51,1,2,0,3
2,1,3,0,1,0,1,0,22,0,1,1,2
3,1,1,0,2,0,3,0,44,1,2,0,3
4,0,3,1,2,0,1,0,24,0,1,1,1


In [None]:
X=train.drop(['Survived'],axis=1)
y=train['Survived']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=101)

## Default

In [None]:
%%time

model = RandomForestClassifier(random_state= 101).fit(X_train,y_train)
predictionforest = model.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc1 = accuracy_score(y_test,predictionforest)

[[91  8]
 [31 49]]
              precision    recall  f1-score   support

           0       0.75      0.92      0.82        99
           1       0.86      0.61      0.72        80

    accuracy                           0.78       179
   macro avg       0.80      0.77      0.77       179
weighted avg       0.80      0.78      0.78       179

CPU times: user 191 ms, sys: 2.72 ms, total: 194 ms
Wall time: 198 ms


In [None]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 101,
 'verbose': 0,
 'warm_start': False}

In [None]:
acc1

0.7821229050279329

## Manual Search

In [None]:
%%time
model = RandomForestClassifier(n_estimators=10, random_state= 101).fit(X_train,y_train)
predictionforest = model.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc2 = accuracy_score(y_test,predictionforest)

[[91  8]
 [33 47]]
              precision    recall  f1-score   support

           0       0.73      0.92      0.82        99
           1       0.85      0.59      0.70        80

    accuracy                           0.77       179
   macro avg       0.79      0.75      0.76       179
weighted avg       0.79      0.77      0.76       179

CPU times: user 32 ms, sys: 1.43 ms, total: 33.4 ms
Wall time: 34.4 ms


In [None]:
%%time
model = RandomForestClassifier(n_estimators=10,max_features = "log2", min_samples_leaf = 30, random_state= 101).fit(X_train,y_train)
predictionforest = model.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(classification_report(y_test,predictionforest))

[[98  1]
 [42 38]]
              precision    recall  f1-score   support

           0       0.70      0.99      0.82        99
           1       0.97      0.47      0.64        80

    accuracy                           0.76       179
   macro avg       0.84      0.73      0.73       179
weighted avg       0.82      0.76      0.74       179

CPU times: user 37.4 ms, sys: 1.82 ms, total: 39.2 ms
Wall time: 40.1 ms


In [None]:
acc2

0.770949720670391

## Grid Search

In [None]:
parameters ={
     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
     'criterion' : ['gini', 'entropy'],
     'max_features': [0.3,0.5,0.7,0.9],
     'min_samples_leaf': [3,5,7,10,15],
     'min_samples_split': [2,5,10],
     'n_estimators': [50,100,200,400,600]}

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
clf = RandomForestClassifier()
model = GridSearchCV(
    clf, 
    parameters, 
    cv= 4,
    scoring='accuracy',n_jobs=-1)

grid_result= model.fit(X_train, y_train)
print('Best Params: ', grid_result.best_params_)
print('Best Score: ', grid_result.best_score_)

Best Params:  {'criterion': 'gini', 'max_depth': 40, 'max_features': 0.7, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 50}
Best Score:  0.8384831460674157
CPU times: user 3min 10s, sys: 12 s, total: 3min 22s
Wall time: 2h 56min 28s


In [None]:
predictionforest = model.best_estimator_.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc3 = accuracy_score(y_test,predictionforest)

[[94  5]
 [24 56]]
              precision    recall  f1-score   support

           0       0.80      0.95      0.87        99
           1       0.92      0.70      0.79        80

    accuracy                           0.84       179
   macro avg       0.86      0.82      0.83       179
weighted avg       0.85      0.84      0.83       179



In [None]:
acc3

## Random Search

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV

clf = RandomForestClassifier()
model = RandomizedSearchCV(estimator = clf, param_distributions = parameters, n_iter = 80, 
                               cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
model.fit(X_train,y_train)
print('Best Score: ', model.best_score_*100)
print('Best Params: ', model.best_params_)

Fitting 4 folds for each of 80 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  2.6min finished


Best Score:  83.14606741573034
Best Params:  {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 0.5, 'max_depth': 30, 'criterion': 'gini'}
CPU times: user 2.99 s, sys: 196 ms, total: 3.19 s
Wall time: 2min 38s


In [None]:
predictionforest = model.best_estimator_.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc4 = accuracy_score(y_test,predictionforest)

[[93  6]
 [29 51]]
              precision    recall  f1-score   support

           0       0.76      0.94      0.84        99
           1       0.89      0.64      0.74        80

    accuracy                           0.80       179
   macro avg       0.83      0.79      0.79       179
weighted avg       0.82      0.80      0.80       179



In [None]:
acc4


0.8044692737430168

In [6]:
print('Base Accuracy vs Manual Search {:0.4f}%.'.format( 100 * (acc2 - acc1) / acc1))
print('Base Accuracy vs Grid Search {:0.4f}%.'.format( 100 * (acc3 - acc1) / acc1))
print('Base Accuracy vs Random Search {:0.4f}%.'.format( 100 * (acc4 - acc1) / acc1))


Base Accuracy vs Manual Search -1.4286%.
Base Accuracy vs Grid Search 8.4612%.
Base Accuracy vs Random Search 2.8571%.
