# Hyperparameter Tuning in Python
A deep dive into Grid Search and Random Search

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from time import time

In [2]:
# Call random forest classifier
rf = RandomForestClassifier()

In [3]:
# Get the default setting
rf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)>

In [4]:
# Load the data
df = pd.read_csv('/content/glass.csv')

In [5]:
df.sample(5)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
40,1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0.0,0.0,1
141,1.51851,13.2,3.63,1.07,72.83,0.57,8.41,0.09,0.17,2
81,1.51593,13.25,3.45,1.43,73.17,0.61,7.86,0.0,0.0,2
176,1.51905,14.0,2.39,1.56,72.37,0.0,9.57,0.0,0.0,6
95,1.5186,13.36,3.43,1.43,72.26,0.51,8.6,0.0,0.0,2


In [6]:
# Seperate X and y
X = df.drop(columns=['Type'], axis=1)
y = df['Type']

In [7]:
# Generate training and test sets for X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [8]:
# Call and fit the random forest on the training set
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [9]:
# Predict on the test set and call accuracy
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [10]:
print(accuracy)

0.813953488372093


**Grid Search**

In [23]:
# Define the grid
param_grid = {'n_estimators': [50, 100, 200, 300],
              'min_samples_leaf': [1, 5, 10],
              'max_depth': [2, 4, 6, 8, 10],
              'max_features': ['auto', 'sqrt'],
              'bootstrap': [True, False]}

# Instantiate GridSearchCV
model_gridsearch = GridSearchCV(estimator=rf_model,
                                param_grid=param_grid,
                                scoring='accuracy',
                                n_jobs=4,
                                cv=5,
                                refit=True,
                                return_train_score=True
                                )
# Record the current time              
start = time()

# Fit the selected model
model_gridsearch.fit(X_train, y_train)

# Print the time spend and number of models ran
print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % ((time() - start), len(model_gridsearch.cv_results_['params'])))

GridSearchCV took 247.79 seconds for 240 candidate parameter settings.


In [24]:
# Predict on the test set and call accuracy
y_pred_grid = model_gridsearch.predict(X_test)
accuracy_grid = accuracy_score(y_test, y_pred_grid)

In [25]:
print(accuracy_grid)

0.8837209302325582


In [26]:
model_gridsearch.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
         

In [27]:
cv_results_df = pd.DataFrame(model_gridsearch.cv_results_)
print(cv_results_df.loc[cv_results_df['rank_test_score'] == 1, 'params'])
print(model_gridsearch.best_params_)

111    {'bootstrap': True, 'max_depth': 10, 'max_feat...
Name: params, dtype: object
{'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 300}


In [28]:
print(model_gridsearch.best_estimator_)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


**Random Search**

In [35]:
# specify distributions to sample from
param_dist = {'n_estimators': list(range(100, 300, 10)),
              'min_samples_leaf': list(range(1, 50)),
              'max_depth': list(range(2, 20)),
              'max_features': ['auto', 'sqrt'],
              'bootstrap': [True, False]}

# specify number of search iterations
n_iter_search = 50

# Instantiate RandomSearchCV
model_random_search = RandomizedSearchCV(estimator=rf_model,
                                         param_distributions=param_dist,
                                         n_iter=n_iter_search)

# Record the current time
start = time()

# Fit the selected model
model_random_search.fit(X_train, y_train)

# Print the time spend and number of models ran
print("RandomizedSearchCV took %.2f seconds for %d candidate parameter settings." % ((time() - start), n_iter_search))

RandomizedSearchCV took 64.17 seconds for 50 candidate parameter settings.


In [36]:
# Predict on the test set and call accuracy
y_pred_random = model_random_search.predict(X_test)
accuracy_random = accuracy_score(y_test, y_pred_random)

In [37]:
print(accuracy_random)

0.8604651162790697


In [38]:
cv_results_random = pd.DataFrame(model_gridsearch.cv_results_)

print(cv_results_random.loc[cv_results_random['rank_test_score'] == 1, 'params'])

print(model_random_search.best_params_)

111    {'bootstrap': True, 'max_depth': 10, 'max_feat...
Name: params, dtype: object
{'n_estimators': 230, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 13, 'bootstrap': False}


In [39]:
print(model_random_search.best_estimator_)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=13, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=230,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
