# Hyperparameter Tuning Example

## Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

## Load Data

In [3]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# One-hot encodings of categorical variables
df = pd.get_dummies(df, drop_first=True, dtype=int)
df.head()

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.5,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1


In [5]:
df.shape

(244, 9)

## Train / Test

In [6]:
X = df.drop('tip', axis=1)
y = df['tip']

# Train / Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

## Cross-validation

In [7]:
# Declare KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# Declare scores to be used
scoring = {
    'RMSE': make_scorer(root_mean_squared_error),
    'MAE': make_scorer(mean_absolute_error),
}

## Hyperparameter Tuning

### Grid Search

In [9]:
# GridSearchCV with DT
dt = DecisionTreeRegressor()

param_dist = {
    'max_depth': [3, 10, 50, 100, None],
    'min_samples_split': [5, 8, 10, 15, 20],
    'min_samples_leaf': [2, 5, 10, 15, 20],
    'criterion': ['squared_error', 'absolute_error']
}

dt_gscv = GridSearchCV(dt,
                       param_grid=param_dist,
                       scoring=scoring,
                       return_train_score=True,
                       # Fuction to select the best model
                       refit=lambda x: x['mean_test_MAE'].argmin(),
                       cv=kf,
                       verbose=1,
                       n_jobs=-1)
dt_gscv.fit(X_train, y_train)

print('Best GridSearchCV parameters: ', dt_gscv.best_params_)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits
Best GridSearchCV parameters:  {'criterion': 'squared_error', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5}


In [10]:
print('Train RMSE:', dt_gscv.cv_results_['mean_train_RMSE'][dt_gscv.best_index_].round(2))
print('Validation RMSE:', dt_gscv.cv_results_['mean_test_RMSE'][dt_gscv.best_index_].round(2))
print()
print('Train MAE:', dt_gscv.cv_results_['mean_train_MAE'][dt_gscv.best_index_].round(2))
print('Validation MAE:', dt_gscv.cv_results_['mean_test_MAE'][dt_gscv.best_index_].round(2))

Train RMSE: 0.91
Validation RMSE: 1.06

Train MAE: 0.67
Validation MAE: 0.79


### Randomized Search

With a Decistion Tree.

#### Round 1

In [11]:
# RandomizedSearchCV with DT
dt = DecisionTreeRegressor()

param_dist = {
    'max_depth': [3, 5, 10, 20, 50, 100, None],
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'criterion': ['squared_error', 'absolute_error']
}

dt_rscv = RandomizedSearchCV(dt,
                             param_distributions=param_dist,
                             scoring=scoring,
                             n_iter=50,
                             return_train_score=True,
                             # Fuction to select the best model
                             refit=lambda x: x['mean_test_MAE'].argmin(),
                             cv=kf,
                             verbose=1,
                             n_jobs=-1)
dt_rscv.fit(X_train, y_train)

print('Best RandomizedSearchCV parameters: ', dt_rscv.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best RandomizedSearchCV parameters:  {'min_samples_split': np.int64(10), 'min_samples_leaf': np.int64(2), 'max_depth': 3, 'criterion': 'squared_error'}


In [12]:
print('Train RMSE:', dt_rscv.cv_results_['mean_train_RMSE'][dt_rscv.best_index_].round(2))
print('Validation RMSE:', dt_rscv.cv_results_['mean_test_RMSE'][dt_rscv.best_index_].round(2))
print()
print('Train MAE:', dt_rscv.cv_results_['mean_train_MAE'][dt_rscv.best_index_].round(2))
print('Validation MAE:', dt_rscv.cv_results_['mean_test_MAE'][dt_rscv.best_index_].round(2))

Train RMSE: 0.91
Validation RMSE: 1.06

Train MAE: 0.67
Validation MAE: 0.79


In [23]:
# Get the indices of the smallest MAE values
best_indices = np.argsort(dt_rscv.cv_results_['mean_test_MAE'])[:5]

# Show best MAEs and their corresponding hyperparams.
for i in best_indices:
    print('MAE:', dt_rscv.cv_results_['mean_test_MAE'][i].round(2))
    print('Hyperparams:', dt_rscv.cv_results_['params'][i])
    print()

MAE: 0.8
Hyperparams: {'min_samples_split': np.int64(5), 'min_samples_leaf': np.int64(10)}

MAE: 0.8
Hyperparams: {'min_samples_split': np.int64(10), 'min_samples_leaf': np.int64(7)}

MAE: 0.8
Hyperparams: {'min_samples_split': np.int64(2), 'min_samples_leaf': np.int64(7)}

MAE: 0.8
Hyperparams: {'min_samples_split': np.int64(21), 'min_samples_leaf': np.int64(9)}

MAE: 0.8
Hyperparams: {'min_samples_split': np.int64(6), 'min_samples_leaf': np.int64(9)}



#### Round 2

In [14]:
# RandomizedSearchCV with DT
dt = DecisionTreeRegressor(max_depth=3, criterion='absolute_error')

param_dist = {
    'min_samples_split': np.arange(2, 51),
    'min_samples_leaf': np.arange(1, 31),
}

dt_rscv = RandomizedSearchCV(dt,
                             param_distributions=param_dist,
                             scoring=scoring,
                             n_iter=50,
                             return_train_score=True,
                             # Fuction to select the best model
                             refit=lambda x: x['mean_test_MAE'].argmin(),
                             cv=kf,
                             verbose=1,
                             n_jobs=-1)
dt_rscv.fit(X_train, y_train)

print('Best RandomizedSearchCV parameters: ', dt_rscv.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best RandomizedSearchCV parameters:  {'min_samples_split': np.int64(5), 'min_samples_leaf': np.int64(10)}


In [15]:
print('Train RMSE:', dt_rscv.cv_results_['mean_train_RMSE'][dt_rscv.best_index_].round(2))
print('Validation RMSE:', dt_rscv.cv_results_['mean_test_RMSE'][dt_rscv.best_index_].round(2))
print()
print('Train MAE:', dt_rscv.cv_results_['mean_train_MAE'][dt_rscv.best_index_].round(2))
print('Validation MAE:', dt_rscv.cv_results_['mean_test_MAE'][dt_rscv.best_index_].round(2))

Train RMSE: 1.04
Validation RMSE: 1.11

Train MAE: 0.65
Validation MAE: 0.8


## With Pipeline

In [16]:
# RandomizedSearchCV with kNN Pipeline
knn = Pipeline([
    ('scaler', None),
    ('knn', KNeighborsRegressor())
])

param_dist = {
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'knn__n_neighbors': np.arange(2, 51),
    'knn__p': np.arange(1, 11),
}

knn_rscv = RandomizedSearchCV(knn,
                              param_distributions=param_dist,
                              scoring=scoring,
                              n_iter=50,
                              return_train_score=True,
                              # Fuction to select the best model
                              refit=lambda x: x['mean_test_MAE'].argmin(),
                              cv=kf,
                              verbose=1,
                              n_jobs=-1)
knn_rscv.fit(X_train, y_train)

print('Best RandomizedSearchCV parameters: ', knn_rscv.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best RandomizedSearchCV parameters:  {'scaler': RobustScaler(), 'knn__p': np.int64(9), 'knn__n_neighbors': np.int64(13)}


In [17]:
print('Train RMSE:', knn_rscv.cv_results_['mean_train_RMSE'][knn_rscv.best_index_].round(2))
print('Validation RMSE:', knn_rscv.cv_results_['mean_test_RMSE'][knn_rscv.best_index_].round(2))
print()
print('Train MAE:', knn_rscv.cv_results_['mean_train_MAE'][knn_rscv.best_index_].round(2))
print('Validation MAE:', knn_rscv.cv_results_['mean_test_MAE'][knn_rscv.best_index_].round(2))

Train RMSE: 1.06
Validation RMSE: 1.1

Train MAE: 0.77
Validation MAE: 0.83
