# Finding the best model and tuning hyperparameters using GridSearchCV

For the iris flower dataset in the sklearn library, we are going to find the best model and the best hyperparameters using GridSearchCV.

In [47]:
from sklearn import svm ,datasets
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV ,cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the iris dataset that ships with scikit-learn; later cells read its
# .data, .target, .feature_names, .target_names and .DESCR attributes.
data = datasets.load_iris()

In [13]:
# Print the description so its newlines render as line breaks; evaluating the
# bare attribute would display the string's repr with literal "\n" escapes.
print(data.DESCR)



In [15]:
# Tabular view of the feature matrix, one column per feature name.
dataframe = pd.DataFrame(data=data.data, columns=data.feature_names)

In [18]:
# Append the integer class labels as a 'flower' column, then display the frame.
dataframe['flower'] = data.target
dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [19]:
# Map the integer class codes to species names with one vectorised lookup.
# The labels are derived from data.target rather than from the column's
# current values, so this cell is safe to re-run: the previous version indexed
# target_names with whatever 'flower' held, which fails once the column
# already contains names.
dataframe['flower'] = data.target_names[data.target]
# Positional slice of rows 47..149: the last setosa rows followed by the
# versicolor and virginica rows.
dataframe.iloc[47:150]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica



#### Approach 1: Use train_test_split and manually tune parameters by trial and error

In [20]:
# Hold out 30% of the samples for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.3,
    random_state=2,
)

In [21]:
# Fit an RBF-kernel SVM on the training split, then report its score on the
# held-out test split.
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9777777777777777

In [33]:
# Repeat the experiment with a different split (25% test, different seed) to
# see how much the score depends on which rows land in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=1,
    shuffle=True,
)
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9736842105263158

#### Approach 2: Use K Fold Cross validation
Manually try supplying models with different parameters to the cross_val_score function with 5-fold cross validation.

In [44]:
# 5-fold cross-validation of an RBF-kernel SVM with C=30; returns one score per fold.
cross_val_score(svm.SVC(kernel='rbf',C=30,gamma='auto') , data.data , data.target , cv=5)

array([0.96666667, 1.        , 0.9       , 0.93333333, 1.        ])

In [45]:
# Same C=30, but with a linear kernel instead of rbf.
cross_val_score(svm.SVC(kernel='linear',C=30,gamma='auto') , data.data , data.target , cv=5)

array([1. , 1. , 0.9, 0.9, 1. ])

In [46]:
# Back to the rbf kernel, this time with a smaller C=20.
cross_val_score(svm.SVC(kernel='rbf',C=20,gamma='auto') , data.data , data.target , cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

The above approach is tiresome and very manual. We can use a for loop as an alternative.

In [48]:
# Score every kernel/C pairing with 5-fold cross-validation and keep the mean
# of the fold scores, keyed as '<kernel>_<C>'. Kernels vary in the outer loop
# and C values in the inner loop, which fixes the order of the dict entries.
kernels = ['rbf', 'linear']
c = [1, 10, 20]
avg_scores = {
    kernel_name + '_' + str(c_value): np.average(
        cross_val_score(
            svm.SVC(kernel=kernel_name, C=c_value, gamma='auto'),
            data.data,
            data.target,
            cv=5,
        )
    )
    for kernel_name in kernels
    for c_value in c
}
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

From the above results we can say that rbf with C=1 or C=10, or linear with C=1, will give the best performance.

#### Approach 3: Use GridSearchCV
GridSearchCV does exactly the same thing as the for loop above, but in a single line of code.

In [49]:
# Exhaustive search over the same C/kernel grid as the manual loop, with each
# candidate scored by 5-fold cross-validation.
param_grid = {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']}
clf = GridSearchCV(
    svm.SVC(gamma='auto'),
    param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(data.data, data.target)
# Raw per-candidate results: timings, parameters and per-fold scores.
clf.cv_results_

{'mean_fit_time': array([0.00119653, 0.00059905, 0.00059786, 0.00019913, 0.00079775,
        0.00079799]),
 'std_fit_time': array([0.00116321, 0.00048912, 0.00048815, 0.00039825, 0.00039888,
        0.00039899]),
 'mean_score_time': array([0.00059838, 0.00040169, 0.00039911, 0.0001997 , 0.00059857,
        0.00039892]),
 'std_score_time': array([0.00048858, 0.00049198, 0.00048881, 0.0003994 , 0.00048873,
        0.00048858]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [50]:

# Load the cv_results_ dict into a DataFrame so it reads as a table
# (one row per parameter combination).
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001197,0.001163,0.000598,0.000489,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000599,0.000489,0.000402,0.000492,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000598,0.000488,0.000399,0.000489,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000199,0.000398,0.0002,0.000399,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000798,0.000399,0.000599,0.000489,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000798,0.000399,0.000399,0.000489,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,5


In [51]:
# Keep only the searched parameters and the mean cross-validated score.
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [52]:
# Parameter combination that GridSearchCV ranked best.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [53]:
# Mean cross-validated score achieved by the best parameter combination.
clf.best_score_

0.98

###### Use RandomizedSearchCV to reduce the number of iterations by trying only a random subset of parameter combinations. This is useful when you have too many parameters to try and your training time is long. It helps reduce the cost of computation.

In [54]:
# Randomised search: instead of fitting all six C/kernel combinations, sample
# only n_iter=2 of them. No random_state is passed, so which two combinations
# get sampled can differ from run to run.
RS = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']},
    cv=5,
    return_train_score=False,
    n_iter=2,
)
RS.fit(data.data, data.target)
rs_results = pd.DataFrame(RS.cv_results_)
rs_results[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,linear,0.966667
1,1,rbf,0.98


######    How about different models with different hyperparameters?

In [65]:
# Candidate estimators and the hyperparameter grid to search for each one.
# Every entry maps a display name to a dict with two keys:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid handed to GridSearchCV
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # multi_class is deliberately not passed: 'auto' is already the
        # default, and supplying the argument explicitly is deprecated in
        # recent scikit-learn releases (the warning is hidden here by
        # warnings.filterwarnings('ignore')) and fails once it is removed.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}

In [66]:
# Run a 5-fold grid search for every candidate model and record its best
# cross-validated score together with the winning parameters.
scores = []

for model_name, mp in model_params.items():
    # fit() returns the fitted search object itself, so it can be chained.
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        return_train_score=False,
    ).fit(data.data, data.target)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

# One row per model, with the columns in a fixed order.
summary_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(scores, columns=summary_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.973333,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}



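Based on the above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification. (Within the SVM grid, kernel='rbf' with C=10 and kernel='linear' with C=1 reach the same mean score of 0.98, so those settings perform equally well on this data.)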