### Different ways to do Hyperparameter tunning ML:

1. Grid Search CV
2. Random Search CV
3. Optuna
4. HyperOpt

References:<br>
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
* https://www.kdnuggets.com/2020/05/hyperparameter-optimization-machine-learning-models.html
* https://www.analyticsvidhya.com/blog/2020/11/hyperparameter-tuning-using-optuna/#:~:text=So%20this%20function%20takes%20a%20trial%20object%20as%20its%20argument.&text=The%20objective%20function%20value%20is,the%20nature%20of%20the%20objective.
* https://machinelearningmastery.com/hyperopt-for-automated-machine-learning-with-scikit-learn/
* https://github.com/krishnaik06/All-Hyperparamter-Optimization

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
import pandas as pd

In [40]:
df = pd.read_csv("winequality_red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [124]:
df["quality"].unique()

array([5, 6, 7, 4, 8, 3])

In [41]:
X = df.drop(columns = 'quality')
y = df['quality']

In [42]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.30, random_state= 42)

### Manual Hyperparameter tunning

In [126]:


rf = RandomForestClassifier(n_estimators=10, random_state = 42)
rf.fit(x_train, y_train)
predictions=rf.predict(x_test)
print(accuracy_score(y_test,predictions))

0.6333333333333333


### GridSearch CV

In [44]:

grid_param = {
    "n_estimators" : [90,100],
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,10,1),
    'min_samples_leaf' : range(1,5,1),
    'min_samples_split': range(2,5,1),
    'max_features' : ['auto','log2']
}

grid_search = GridSearchCV(estimator=rf,param_grid=grid_param,cv=5,n_jobs =-1,verbose = 3)

In [45]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3840 out of 3840 | elapsed:  3.0min finished


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_estimators=5, random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 10),
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': range(1, 5),
                         'min_samples_split': range(2, 5),
                         'n_estimators': [90, 100]},
             verbose=3)

In [54]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 90}


In [59]:
criterion= 'entropy'
max_depth= 9
max_features= 'auto'
min_samples_leaf= 1
min_samples_split= 2
n_estimators= 90

rf_grid = RandomForestClassifier(criterion=criterion, max_depth=max_depth,
                                max_features=max_features, min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split, n_estimators=n_estimators, 
                                random_state = 1000)
rf_grid.fit(x_train, y_train)
predictions=rf_grid.predict(x_test)
print(accuracy_score(y_test,predictions))

0.6520833333333333


### RandomSearch CV

In [62]:
random_param = {
    "n_estimators" : [90,100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,20,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features' : ['auto','log2']
}


random_search = RandomizedSearchCV(rf,random_param,cv=5,n_jobs =-1,verbose = 3)

In [63]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.2s finished


RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(n_estimators=5,
                                                    random_state=42),
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': range(2, 20),
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': range(1, 10),
                                        'min_samples_split': range(2, 10),
                                        'n_estimators': [90, 100, 150]},
                   verbose=3)

In [64]:
print("Best Parameters:", random_search.best_params_)

Best Parameters: {'n_estimators': 100, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 14, 'criterion': 'gini'}


In [65]:
criterion= 'gini'
max_depth= 14
max_features= 'auto'
min_samples_leaf= 1
min_samples_split= 4
n_estimators= 100

rf_search = RandomForestClassifier(criterion=criterion, max_depth=max_depth,
                                max_features=max_features, min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split, n_estimators=n_estimators, 
                                random_state = 1000)
rf_search.fit(x_train, y_train)
predictions=rf_search.predict(x_test)
print(accuracy_score(y_test,predictions))

0.65625


### Optuna

In [127]:
!pip install optuna



In [101]:
import optuna
import sklearn
from sklearn import datasets
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 2, 150)
    max_depth = trial.suggest_int('max_depth', 1, 32)
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    return sklearn.model_selection.cross_val_score(clf, X, y, 
       n_jobs=-1, cv=5).mean()

In [102]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[32m[I 2021-05-12 11:20:31,992][0m A new study created in memory with name: no-name-0003b556-3580-4148-857b-5f5a9f84c2e8[0m
[32m[I 2021-05-12 11:20:32,409][0m Trial 0 finished with value: 0.5860148902821317 and parameters: {'n_estimators': 110, 'max_depth': 6}. Best is trial 0 with value: 0.5860148902821317.[0m
[32m[I 2021-05-12 11:20:32,734][0m Trial 1 finished with value: 0.5709894200626959 and parameters: {'n_estimators': 76, 'max_depth': 15}. Best is trial 0 with value: 0.5860148902821317.[0m
[32m[I 2021-05-12 11:20:32,986][0m Trial 2 finished with value: 0.5747492163009404 and parameters: {'n_estimators': 59, 'max_depth': 15}. Best is trial 0 with value: 0.5860148902821317.[0m
[32m[I 2021-05-12 11:20:33,116][0m Trial 3 finished with value: 0.5559717868338558 and parameters: {'n_estimators': 21, 'max_depth': 26}. Best is trial 0 with value: 0.5860148902821317.[0m
[32m[I 2021-05-12 11:20:33,218][0m Trial 4 finished with value: 0.5459815830721004 and parameters: {'n_

[32m[I 2021-05-12 11:20:47,001][0m Trial 42 finished with value: 0.5747531347962382 and parameters: {'n_estimators': 127, 'max_depth': 2}. Best is trial 31 with value: 0.594134012539185.[0m
[32m[I 2021-05-12 11:20:47,461][0m Trial 43 finished with value: 0.5847609717868338 and parameters: {'n_estimators': 150, 'max_depth': 5}. Best is trial 31 with value: 0.594134012539185.[0m
[32m[I 2021-05-12 11:20:47,902][0m Trial 44 finished with value: 0.5791300940438872 and parameters: {'n_estimators': 121, 'max_depth': 9}. Best is trial 31 with value: 0.594134012539185.[0m
[32m[I 2021-05-12 11:20:48,315][0m Trial 45 finished with value: 0.5872629310344827 and parameters: {'n_estimators': 129, 'max_depth': 7}. Best is trial 31 with value: 0.594134012539185.[0m
[32m[I 2021-05-12 11:20:48,677][0m Trial 46 finished with value: 0.5803722570532915 and parameters: {'n_estimators': 144, 'max_depth': 3}. Best is trial 31 with value: 0.594134012539185.[0m
[32m[I 2021-05-12 11:20:49,078][0

[32m[I 2021-05-12 11:21:02,244][0m Trial 85 finished with value: 0.5809952978056426 and parameters: {'n_estimators': 132, 'max_depth': 3}. Best is trial 47 with value: 0.6028859717868338.[0m
[32m[I 2021-05-12 11:21:02,569][0m Trial 86 finished with value: 0.5678683385579937 and parameters: {'n_estimators': 144, 'max_depth': 2}. Best is trial 47 with value: 0.6028859717868338.[0m
[32m[I 2021-05-12 11:21:03,076][0m Trial 87 finished with value: 0.576009012539185 and parameters: {'n_estimators': 129, 'max_depth': 20}. Best is trial 47 with value: 0.6028859717868338.[0m
[32m[I 2021-05-12 11:21:03,422][0m Trial 88 finished with value: 0.587884012539185 and parameters: {'n_estimators': 116, 'max_depth': 6}. Best is trial 47 with value: 0.6028859717868338.[0m
[32m[I 2021-05-12 11:21:03,713][0m Trial 89 finished with value: 0.5878879310344828 and parameters: {'n_estimators': 108, 'max_depth': 4}. Best is trial 47 with value: 0.6028859717868338.[0m
[32m[I 2021-05-12 11:21:04,060

In [103]:
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

Accuracy: 0.6028859717868338


In [104]:
print("Best hyperparameters: {}".format(trial.params))

Best hyperparameters: {'n_estimators': 133, 'max_depth': 6}


### HyperOpt

In [128]:
!pip install hyperopt



In [114]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [115]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [118]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, x_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [119]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [10:08<00:00,  7.60s/trial, best loss: -0.6318745996156311]


{'criterion': 0,
 'max_depth': 1140.0,
 'max_features': 2,
 'min_samples_leaf': 0.01604499012682059,
 'min_samples_split': 0.03222644181384632,
 'n_estimators': 5}

In [120]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

entropy
log2
1300


In [123]:
rf_ho = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(x_train,y_train)
rf_ho.fit(x_train, y_train)
predictions=rf_ho.predict(x_test)
print(accuracy_score(y_test,predictions))

0.5729166666666666
