# Hyperparameter Optimisation Tutorial

https://www.youtube.com/watch?v=5nYqK-HaoKY&t=2923s

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from sklearn.model_selection import train_test_split



In [2]:
df = pd.read_csv('./mobile_price_range_data.csv')
df.head(10)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
5,1859,0,0.5,1,3,0,22,0.7,164,1,...,1004,1654,1067,17,1,10,1,0,0,1
6,1821,0,1.7,0,4,1,10,0.8,139,8,...,381,1018,3220,13,8,18,1,0,1,3
7,1954,0,0.5,1,0,0,24,0.8,187,4,...,512,1149,700,16,3,5,1,1,1,0
8,1445,1,0.5,0,0,0,53,0.7,174,7,...,386,836,1099,17,1,20,1,0,0,0
9,509,1,0.6,1,2,1,9,0.1,93,5,...,1137,1224,513,19,10,12,1,0,0,0


In [3]:
# numpy arrays of features
X = df.drop('price_range', axis=1).values
y = df['price_range'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1223)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1700, 20) (300, 20) (1700,) (300,)


## Grid Search

In [4]:
from sklearn.model_selection import GridSearchCV

classifier = RandomForestClassifier(n_jobs=-1)
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [1, 3, 5, 7],
    'criterion': ['gini', 'entropy']
}

model = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=10,
    cv=5
)

model.fit(X_train, y_train)

print("best score " + str(model.best_score_))

print('best params')
print(model.best_estimator_.get_params())

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 1/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.647 total time=   8.1s
[CV 2/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 2/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.662 total time=   0.1s
[CV 3/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 3/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.641 total time=   0.0s
[CV 4/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 4/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.653 total time=   0.0s
[CV 5/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 5/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.594 total time=   0.0s
[CV 1/5; 2/32] START criterion=gini, max_de

## Random Search

In [25]:
from sklearn.model_selection import RandomizedSearchCV

classifier = RandomForestClassifier(n_jobs=-1)
param_grid = {
    'n_estimators': np.arange(100, 1500, 100),
    'max_depth': [1, 3, 5, 7],
    'criterion': ['gini', 'entropy']
}

model = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    verbose=10,
    cv=5
)

model.fit(X_train, y_train)

print("best score " + str(model.best_score_))

print('best params')
print(model.best_estimator_.get_params())

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=entropy, max_depth=1, n_estimators=1200..........
[CV 1/5; 1/10] END criterion=entropy, max_depth=1, n_estimators=1200;, score=0.541 total time=   0.7s
[CV 2/5; 1/10] START criterion=entropy, max_depth=1, n_estimators=1200..........
[CV 2/5; 1/10] END criterion=entropy, max_depth=1, n_estimators=1200;, score=0.582 total time=   1.0s
[CV 3/5; 1/10] START criterion=entropy, max_depth=1, n_estimators=1200..........
[CV 3/5; 1/10] END criterion=entropy, max_depth=1, n_estimators=1200;, score=0.553 total time=   0.9s
[CV 4/5; 1/10] START criterion=entropy, max_depth=1, n_estimators=1200..........
[CV 4/5; 1/10] END criterion=entropy, max_depth=1, n_estimators=1200;, score=0.565 total time=   0.7s
[CV 5/5; 1/10] START criterion=entropy, max_depth=1, n_estimators=1200..........
[CV 5/5; 1/10] END criterion=entropy, max_depth=1, n_estimators=1200;, score=0.518 total time=   0.7s
[CV 1/5; 2/10] START cri

## Grid/Random Search with Pipelines

In [28]:
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import pipeline

scl = preprocessing.StandardScaler()
pca = decomposition.PCA()

rf = RandomForestClassifier(n_jobs=-1)

classifier = pipeline.Pipeline(
    [
        ('scaling', scl),
        ('pca', pca),
        ('rf', rf)
    ]
)

param_grid = {
    'pca__n_components': np.arange(5, 10),
    'rf__n_estimators': np.arange(100, 1500, 100),
    'rf__max_depth': np.arange(1,20,5),
    'rf__criterion': ['gini', 'entropy']
}

model = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    verbose=10,
    cv=5
)

model.fit(X_train, y_train)

print("best score " + str(model.best_score_))

print('best params')
print(model.best_estimator_.get_params())

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100
[CV 1/5; 1/10] END pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100;, score=0.421 total time=   4.8s
[CV 2/5; 1/10] START pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100
[CV 2/5; 1/10] END pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100;, score=0.471 total time=   0.1s
[CV 3/5; 1/10] START pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100
[CV 3/5; 1/10] END pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100;, score=0.432 total time=   0.1s
[CV 4/5; 1/10] START pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100
[CV 4/5; 1/10] END pca__n_components=9, rf__criterion=gini, rf__max_depth=6, rf__n_estimators=100;, score=0.441 total time=   0.1s
[CV 5/5; 1/10] 

## Bayesian Search

In [47]:
from sklearn.model_selection import StratifiedKFold
from functools import partial
from skopt import space
from skopt import gp_minimize

def optimise(params, param_names, X, y):
    params = dict(zip(param_names, params))
    model = RandomForestClassifier(**params)
    kf = StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=X, y=y):
        train_idx, test_idx = idx[0], idx[1]
        
        Xtrain = X[train_idx]
        ytrain = y[train_idx]
        
        Xtest = X[test_idx]
        ytest = y[test_idx]
        
        model.fit(Xtrain, ytrain)
        preds = model.predict(Xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
        
    # set for minimisation
    return -1.0 * np.mean(accuracies)

param_space = [
    space.Integer(3,15, name="max_depth"),
    space.Integer(100,600, name="n_estimators"),
    space.Categorical(['gini', 'entropy'], name="criterion"),
    space.Real(0.01, 1, prior="uniform", name="max_features")
]
param_names = [
    'max_depth',
    'n_estimators',
    'criterion',
    'max_features'
]

optimization_function = partial(
    optimise,
    param_names=param_names,
    X=X,
    y=y
)

result = gp_minimize(
    optimization_function,
    dimensions=param_space,
    n_calls=15,
    n_random_starts=4,
    verbose=10
)

print(
    dict(
        zip(param_names, result.x)
    )
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 10.3819
Function value obtained: -0.8995
Current minimum: -0.8995
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 10.6042
Function value obtained: -0.8810
Current minimum: -0.8995
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 6.6732
Function value obtained: -0.7645
Current minimum: -0.8995
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 30.4456
Function value obtained: -0.9050
Current minimum: -0.9050
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 2.3993
Function value obtained: -0.8045
Current minimum: -0.9050
Iteration 

## Hyperopt

- https://hyperopt.github.io/hyperopt/
- https://machinelearningmastery.com/hyperopt-for-automated-machine-learning-with-scikit-learn/

In [49]:
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope

# hyperopt uses a dict rather than a list - use the docs (above) for dtypes
param_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 600, 1)),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_features': hp.uniform('max_features', 0.01, 1)
}

# param_names = ['max_depth', 'n_estimators', 'criterion', 'max_features']

# optimization_function = partial(optimise, param_names=param_names, X=X, y=y)

trials = Trials()

result = fmin(
    fn=optimization_function,
    space=param_space,
    algo=tpe.suggest,
    max_evals=15,
    trials=trials
)

print(result)

  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]

job exception: The 'criterion' parameter of RandomForestClassifier must be a str among {'entropy', 'log_loss', 'gini'}. Got 'max_features' instead.



  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]


InvalidParameterError: The 'criterion' parameter of RandomForestClassifier must be a str among {'entropy', 'log_loss', 'gini'}. Got 'max_features' instead.