# HyperParameter Tuning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
df['Glucose'] = np.where(df['Glucose']==0, df['Glucose'].median(), df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0, df['Insulin'].median(), df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0, df['SkinThickness'].median(), df['SkinThickness'])

In [11]:
df.loc[df['Glucose']==0]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [12]:
X = df.drop(columns = ['Outcome'])

In [13]:
y = df['Outcome']

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [10]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [23]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10)

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[127  22]
 [ 38  44]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       149
           1       0.67      0.54      0.59        82

    accuracy                           0.74       231
   macro avg       0.72      0.69      0.70       231
weighted avg       0.73      0.74      0.73       231



In [13]:
### Manual Hyperparameter Tuning
model=RandomForestClassifier(n_estimators=300,criterion='entropy',
                             max_features='sqrt',min_samples_leaf=10,random_state=100).fit(X_train,y_train)
predictions=model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))

[[130  19]
 [ 31  51]]
0.7835497835497836
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       149
           1       0.73      0.62      0.67        82

    accuracy                           0.78       231
   macro avg       0.77      0.75      0.75       231
weighted avg       0.78      0.78      0.78       231



# 1. Randomizd Search CV

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [15]:
##Number of Trees in Random Forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
##Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
##Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10,1000,10)]
##Minimum number of samples required to split a node
min_samples_split = [2,5,10,14]
##Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,4,5]
##Create the random grid
random_grid = {'n_estimators': n_estimators,
              'max_features' : max_features,
              'max_depth' : max_depth,
              'min_samples_split' : min_samples_split,
              'min_samples_leaf' : min_samples_leaf,
              'criterion' : ['gini', 'entropy']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 3, 4, 5], 'criterion': ['gini', 'entropy']}


In [26]:
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_jobs=-1, cv=2,n_iter=2)
rf_randomcv.fit(X_train, y_train)

RandomizedSearchCV(cv=2, estimator=RandomForestClassifier(), n_iter=2,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]})

In [27]:
rf_randomcv.best_params_

{'n_estimators': 1200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 890,
 'criterion': 'gini'}

In [28]:
rf_randomcv

RandomizedSearchCV(cv=2, estimator=RandomForestClassifier(), n_iter=2,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]})

In [29]:
best_random_grid=rf_randomcv.best_estimator_

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
y_pred = best_random_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [31]:
rf_randomcv.best_params_

{'n_estimators': 1200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 890,
 'criterion': 'gini'}

# 2. Grid Search CV

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+2, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
                          rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +1,
                          rf_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [890], 'max_features': ['log2'], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [3, 4, 5, 6, 7], 'n_estimators': [1000, 1100, 1200, 1300, 1400]}


In [47]:
rf = RandomForestClassifier()
rf_gridsearch_cv = GridSearchCV(estimator = rf, param_grid = param_grid, n_jobs = -1, cv=2,verbose =2)

In [48]:
rf_gridsearch_cv.fit(X_train, y_train)

Fitting 2 folds for each of 75 candidates, totalling 150 fits


GridSearchCV(cv=2, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [890],
                         'max_features': ['log2'],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [3, 4, 5, 6, 7],
                         'n_estimators': [1000, 1100, 1200, 1300, 1400]},
             verbose=2)

In [49]:
rf_gridsearch_cv.best_estimator_

RandomForestClassifier(max_depth=890, max_features='log2', min_samples_leaf=5,
                       min_samples_split=5, n_estimators=1100)

In [50]:
best_params = rf_gridsearch_cv.best_estimator_

In [51]:
y_pred = best_params.predict(X_test)

In [52]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[126  23]
 [ 26  56]]
0.7878787878787878
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       149
           1       0.71      0.68      0.70        82

    accuracy                           0.79       231
   macro avg       0.77      0.76      0.77       231
weighted avg       0.79      0.79      0.79       231



# 3. Bayesian Optimization

In [53]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [None]:
##hp.choice : selecting between multiple numbers/strings eg: gini, entropy, None or [10,20.30,400]
##hp.quinform : selecting integer numbers
##hp.uniform  : selecting floating numbers

In [54]:
##Let's create the space grid

In [56]:
space = {'criterion' : hp.choice('criterion', ['entropy', 'gini']),
        'max_depth' : hp.quniform('max_depth', 10,1200,10),
        'max_features' : hp.choice('max_features', ['auto','sqrt', 'log2',None]),
        'min_samples_leaf' : hp.uniform('min_samples_leaf',0,0.5),
        'min_samples_split' : hp.uniform('min_samples_split', 0,1),
        'n_estimators' : hp.choice('n_estimators',[10,50,100,750,900,1000])
        }

In [57]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x2b1e28fa430>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x2b1e28fa610>,
 'max_features': <hyperopt.pyll.base.Apply at 0x2b1e28fa850>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x2b1e28fa9a0>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x2b1e28faa90>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x2b1e28fab50>}

In [58]:
##defining objective function

In [59]:
def objective(space):
    rf = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],max_features = space['max_features'],
                               min_samples_leaf = space['min_samples_leaf'], min_samples_split = space['min_samples_split'],
                                n_estimators = space['n_estimators'])
    
    accuracy = cross_val_score(rf, X_train, y_train, cv =5).mean()
    return{'loss' : -accuracy, 'status': STATUS_OK}

In [60]:
from sklearn.model_selection import cross_val_score

In [63]:
trials = Trials()
best = fmin(fn = objective, space = space, algo = tpe.suggest,
           max_evals = 80, trials = trials)
best

100%|██████████| 80/80 [07:26<00:00,  5.58s/trial, best loss: -0.765368639667705] 


{'criterion': 0,
 'max_depth': 930.0,
 'max_features': 2,
 'min_samples_leaf': 0.03097670882881497,
 'min_samples_split': 0.11557672972149434,
 'n_estimators': 2}

In [64]:
crit = {0:'entropy', 1:'gini'}
max_feat = {0:'auto', 1:'sqrt',2:'log2'}
n_estima = {0:10, 1:50,2:100,3:750,4:900,5:1000}

In [69]:
print(crit[best['criterion']])
print(max_feat[best['max_features']])
print(n_estima[best['n_estimators']])

entropy
log2
100


In [70]:
trained_forest = RandomForestClassifier(criterion = crit[best['criterion']],max_depth = best['max_depth'],max_features =  max_feat[best['max_features']],
                                       min_samples_leaf=best['min_samples_leaf'],min_samples_split=best['min_samples_split'],n_estimators= n_estima[best['n_estimators']])

In [72]:
trained_forest.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=930.0,
                       max_features='log2',
                       min_samples_leaf=0.03097670882881497,
                       min_samples_split=0.11557672972149434)

In [73]:
y_pred = trained_forest.predict(X_test)

In [74]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[129  20]
 [ 35  47]]
0.7619047619047619
              precision    recall  f1-score   support

           0       0.79      0.87      0.82       149
           1       0.70      0.57      0.63        82

    accuracy                           0.76       231
   macro avg       0.74      0.72      0.73       231
weighted avg       0.76      0.76      0.76       231



# 4. Genetic Algorithm

In [3]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [4]:
from tpot import TPOTClassifier



In [15]:
tpot_classifier = TPOTClassifier(generations= 5, population_size= 6, offspring_size= 3,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

Optimization Progress:   0%|          | 0/21 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7820619126589277

Generation 2 - Current best internal CV score: 0.7820619126589277

Generation 3 - Current best internal CV score: 0.7820619126589277

Generation 4 - Current best internal CV score: 0.7857656163626312

Generation 5 - Current best internal CV score: 0.7857656163626312

Best pipeline: RandomForestClassifier(CombineDFs(input_matrix, CombineDFs(input_matrix, input_matrix)), criterion=entropy, max_depth=890, max_features=auto, min_samples_leaf=6, min_samples_split=14, n_estimators=1200)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [84]:
import tensorflow as tf

In [85]:
print(tf.__version__)

2.3.0


In [16]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.7272727272727273


# 5. Optuna

In [17]:
import optuna

In [18]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [19]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2021-08-03 22:01:26,067][0m A new study created in memory with name: no-name-367f8535-c911-4231-a46d-8356157ee785[0m
[32m[I 2021-08-03 22:01:39,689][0m Trial 0 finished with value: 0.7690875232774674 and parameters: {'classifier': 'RandomForest', 'n_estimators': 780, 'max_depth': 59.70443473016753}. Best is trial 0 with value: 0.7690875232774674.[0m
[32m[I 2021-08-03 22:01:50,807][0m Trial 1 finished with value: 0.7728119180633147 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1200, 'max_depth': 73.66854765653908}. Best is trial 1 with value: 0.7728119180633147.[0m
[32m[I 2021-08-03 22:02:02,408][0m Trial 2 finished with value: 0.7746741154562383 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1760, 'max_depth': 21.046830980540737}. Best is trial 2 with value: 0.7746741154562383.[0m
[32m[I 2021-08-03 22:02:02,597][0m Trial 3 finished with value: 0.6536312849162011 and parameters: {'classifier': 'SVC', 'svc_c': 0.7940367367837038}. Bes

[32m[I 2021-08-03 22:05:57,679][0m Trial 35 finished with value: 0.7728119180633146 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1830, 'max_depth': 64.23420814129545}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:05:57,772][0m Trial 36 finished with value: 0.6536312849162011 and parameters: {'classifier': 'SVC', 'svc_c': 0.0013742259828096542}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:06:06,218][0m Trial 37 finished with value: 0.7802607076350093 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1650, 'max_depth': 39.97571859029606}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:06:14,261][0m Trial 38 finished with value: 0.7783985102420856 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1630, 'max_depth': 41.17972241673026}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:06:14,334][0m Trial 39 finished with value: 

[32m[I 2021-08-03 22:09:04,414][0m Trial 70 finished with value: 0.7690875232774674 and parameters: {'classifier': 'RandomForest', 'n_estimators': 940, 'max_depth': 42.996260153076605}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:09:08,526][0m Trial 71 finished with value: 0.7821229050279329 and parameters: {'classifier': 'RandomForest', 'n_estimators': 980, 'max_depth': 54.43790748594871}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:09:13,084][0m Trial 72 finished with value: 0.7765363128491619 and parameters: {'classifier': 'RandomForest', 'n_estimators': 990, 'max_depth': 60.56765467334256}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:09:20,409][0m Trial 73 finished with value: 0.7690875232774674 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1560, 'max_depth': 32.25781249510856}. Best is trial 24 with value: 0.7821229050279329.[0m
[32m[I 2021-08-03 22:09:27,237][0m T

Accuracy: 0.7839851024208566
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 970, 'max_depth': 69.82218139852151}


In [20]:
trial

FrozenTrial(number=86, values=[0.7839851024208566], datetime_start=datetime.datetime(2021, 8, 3, 22, 10, 34, 827410), datetime_complete=datetime.datetime(2021, 8, 3, 22, 10, 40, 140769), params={'classifier': 'RandomForest', 'n_estimators': 970, 'max_depth': 69.82218139852151}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntUniformDistribution(high=2000, low=200, step=10), 'max_depth': LogUniformDistribution(high=100.0, low=10.0)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=86, state=TrialState.COMPLETE, value=None)

In [21]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 970,
 'max_depth': 69.82218139852151}

In [24]:
rf=RandomForestClassifier(n_estimators=330,max_depth=30)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=30, n_estimators=330)

In [28]:
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[118  31]
 [ 30  52]]
0.7359307359307359
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       149
           1       0.63      0.63      0.63        82

    accuracy                           0.74       231
   macro avg       0.71      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231

