## All-Hyperparamter-Optimization

1. GridSearchCV
2. RandomizedSearchCV
3. Bayesian Optimization -Automate Hyperparameter Tuning (Hyperopt)
4. Sequential Model Based Optimization(Tuning a scikit-learn estimator with skopt)
5. Optuna- Automate Hyperparameter Tuning
6. Genetic Algorithms (TPOT Classifier)
## References
1. https://github.com/fmfn/BayesianOptimization
2. https://github.com/hyperopt/hyperopt
3. https://www.jeremyjordan.me/hyperparameter-tuning/
4. https://optuna.org/
5. https://towardsdatascience.com/hyperparameters-optimization-526348bb8e2d(By Pier Paolo Ippolito )
6. https://scikit-optimize.github.io/stable/auto_examples/hyperparameter-optimization.html

In [10]:
import numpy as np
import pandas as pd


In [11]:
df=pd.read_csv('https://raw.githubusercontent.com/krishnaik06/All-Hyperparamter-Optimization/master/diabetes.csv')

In [12]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [15]:
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Pregnancies']=np.where(df['Pregnancies']==0,df['Pregnancies'].median(),df['Pregnancies'])
df['Insulin']=np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness']=np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])

In [16]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1.0,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8.0,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1.0,89.0,66,23.0,94.0,28.1,0.167,21,0
4,3.0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [17]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)


In [21]:
from sklearn.ensemble  import RandomForestClassifier
tree_model=RandomForestClassifier(n_estimators=10)
tree_model.fit(x_train,y_train)



RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
prediction=tree_model.predict(x_test)

In [24]:

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[94 13]
 [17 30]]
0.8051948051948052
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       107
           1       0.70      0.64      0.67        47

    accuracy                           0.81       154
   macro avg       0.77      0.76      0.76       154
weighted avg       0.80      0.81      0.80       154



## Randomized SearchCV

In [31]:
from sklearn.model_selection import RandomizedSearchCV

#Number of tress in randomforest
n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
#maximun number of levels in tree
max_depth=[int(x) for x in np.linspace(start=10,stop=1000,num=10)]
#number of features to consider at each spilt
max_features=['auto','sqrt','log2']
#minimum samples required at each split of node
min_samples_split=[2,5,10,14]
#minimum samples required at each leaf node
min_samples_leaf=[1,2,4,6,8]
# random_state
random_state=[int(x) for x in np.linspace(start=10,stop=100,num=10)]

In [32]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']
              'random_state':random_state}
print(random_grid)

SyntaxError: invalid syntax (<ipython-input-32-7e13d8dd7caf>, line 7)

In [33]:
rf=RandomForestClassifier()
RandomizedSearchcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100, n_jobs=-1, cv=3,
    verbose=2)
RandomizedSearchcv.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.8min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [34]:
RandomizedSearchcv.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 8,
 'max_features': 'auto',
 'max_depth': 120,
 'criterion': 'gini'}

In [42]:
best_random_grid=RandomizedSearchcv.best_estimator_
best_random_grid

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=120, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
y_pred=best_random_grid.predict(x_test)

In [40]:
from sklearn.metrics import accuracy_score

print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[98  9]
 [19 28]]
Accuracy Score 0.8181818181818182
Classification report:               precision    recall  f1-score   support

           0       0.84      0.92      0.88       107
           1       0.76      0.60      0.67        47

    accuracy                           0.82       154
   macro avg       0.80      0.76      0.77       154
weighted avg       0.81      0.82      0.81       154



## GridSearchCv

In [41]:
from sklearn.model_selection import GridSearchCV


In [43]:
param_grid = {
    'criterion': [RandomizedSearchcv.best_params_['criterion']],
    'max_depth': [RandomizedSearchcv.best_params_['max_depth']],
    'max_features': [RandomizedSearchcv.best_params_['max_features']],
    'min_samples_leaf': [RandomizedSearchcv.best_params_['min_samples_leaf'], 
                         RandomizedSearchcv.best_params_['min_samples_leaf']+2, 
                         RandomizedSearchcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [RandomizedSearchcv.best_params_['min_samples_split'] - 2,
                          RandomizedSearchcv.best_params_['min_samples_split'] - 1,
                          RandomizedSearchcv.best_params_['min_samples_split'], 
                          RandomizedSearchcv.best_params_['min_samples_split'] +1,
                          RandomizedSearchcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [RandomizedSearchcv.best_params_['n_estimators'] - 200, RandomizedSearchcv.best_params_['n_estimators'] - 100, 
                     RandomizedSearchcv.best_params_['n_estimators'], 
                     RandomizedSearchcv.best_params_['n_estimators'] + 100, RandomizedSearchcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [120], 'max_features': ['auto'], 'min_samples_leaf': [8, 10, 12], 'min_samples_split': [3, 4, 5, 6, 7], 'n_estimators': [600, 700, 800, 900, 1000]}


In [None]:
rf=RandomForestClassifier()
GridSearchcv=GridSearchCV(estimator=rf,param_grid=random_grid, n_jobs=-1, cv=3,
    verbose=2)
GridSearchcv.fit(x_train,y_train)

Fitting 3 folds for each of 12000 candidates, totalling 36000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 17.4min


In [None]:
GridSearchcv.best_params_

In [None]:
best_grid=GridSearchcv.best_estimator_

In [None]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))