# Machine Learning Tutorial Python - 16: Hyper parameter Tuning (GridSearchCV)

In this python machine learning tutorial for beginners we will look into,
1) how to hyper tune machine learning model paramers 
2) choose best model for given machine learning problem
We will start by comparing traditional train_test_split approach with k fold cross validation. 
Then we will see how GridSearchCV helps run K Fold cross validation with its convenient api. GridSearchCV helps find best parameters that gives maximum
performance. RandomizedSearchCV is another class in sklearn library that does same thing as GridSearchCVbut without running exhaustive search, this helps 
with computation time and resources. We will also see how to find best model among all the classification algorithm using GridSearchCV. In the end we have interesting exercise for you to solve.

In [2]:
from sklearn import svm, datasets
iris = datasets.load_iris()

In [4]:
import pandas as pd
df = pd.DataFrame(iris.data, columns= iris.feature_names)
df['flower'] = iris.target
df['flower'] = df['flower'].apply(lambda x: iris.target_names[x])
df[47:52]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor


In [17]:
# splitting
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size= 0.3)

In [18]:
# using SVM model
model = svm.SVC(kernel= 'rbf', C=30, gamma= 'auto')
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9777777777777777

In [21]:
# not reliable as score changes with randomness of selection
# thus use kFolds cross validation
# use kernel = 'linear'
from sklearn.model_selection import cross_val_score
cross_val_score(svm.SVC(kernel= 'linear', C= 10, gamma= 'auto'), iris.data, iris.target,cv = 5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [23]:
# kernel = 'rbf' C=10
cross_val_score(svm.SVC(kernel= 'rbf', C= 10, gamma= 'auto'), iris.data, iris.target,cv = 5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [24]:
# kerneel = 'rbf', C= 20
cross_val_score(svm.SVC(kernel= 'rbf', C= 20, gamma= 'auto'), iris.data, iris.target,cv = 5) 

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [28]:
# method is repetitive
# thus
# hyper parameter tuning
import numpy as np
kernels =['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(svm.SVC(kernel= kval, C= cval, gamma= 'auto'), iris.data, iris.target, cv= 5)
        avg_scores[kval + '_' + str(cval)] = np.average(cv_scores)

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

In [29]:
# gridsearchcv does the above 
from sklearn.model_selection import GridSearchCV
# define our classifier
# model is svm
clf = GridSearchCV(svm.SVC(gamma= 'auto'),{
    'C':[1,10,20],
    'kernel': ['rbf', 'linear']
}, cv= 5, return_train_score= False)

# model trainng
clf.fit(iris.data, iris.target)
clf.cv_results_

{'mean_fit_time': array([0.0044693 , 0.02887197, 0.00379715, 0.00579791, 0.02819805,
        0.00859923]),
 'std_fit_time': array([0.00092789, 0.04906944, 0.00116665, 0.00444411, 0.03652461,
        0.00417569]),
 'mean_score_time': array([0.0029305 , 0.00352664, 0.01360002, 0.03399682, 0.00440044,
        0.03080006]),
 'std_score_time': array([0.00132757, 0.00247218, 0.01047961, 0.05028636, 0.00338268,
        0.05312075]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [30]:
# import data to df
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004469,0.000928,0.00293,0.001328,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.028872,0.049069,0.003527,0.002472,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.003797,0.001167,0.0136,0.01048,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.005798,0.004444,0.033997,0.050286,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.028198,0.036525,0.0044,0.003383,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.008599,0.004176,0.0308,0.053121,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [32]:
# trim df down to necessary columns
df[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [33]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

In [35]:
clf.best_score_

0.9800000000000001

In [36]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [42]:
# tackling computation problem
# try randomizedssearchcv
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(svm.SVC(gamma= 'auto'),{
    'C': [1,10,20],
    'kernel': ['rbf', 'linear']
},
cv = 5,
return_train_score= False,
n_iter= 2 #onlly two iterations
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel','mean_test_score']]
 

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,20,linear,0.966667


In [43]:
# choosing the best model
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [44]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma= 'auto'),
        'params': {
            'C': [1,10,20],
            'kernel':  ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1,5,10]
        }

    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class= 'auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [47]:
# uisng gridsearch cv
scores= []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score= False)
    clf.fit(iris.data, iris.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [49]:
# append resukts to df
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}


In [None]:
# best model is svm

# Exercise: Machine Learning Finding Optimal Model and Hyperparameters

For digits dataset in sklearn.dataset, please try following classifiers and find out the one that gives best performance. Also find the optimal parameters for that classifier.

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier