## Finding best model and hyper parameter tunning using GridSearchCV

**For iris flower dataset in sklearn library, we are going to find out best model and best hyper parameters using GridSearchCV**

**Load iris flower dataset**

In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.datasets import load_iris
iris = load_iris()

In [4]:
dir(iris)

['DESCR',
 'data',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [6]:
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [7]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
df['flower'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
df['flower'] = df['flower'].apply(lambda x: iris.target_names[x])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [10]:
df[47:53]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor


In [11]:
df[97:103]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
97,6.2,2.9,4.3,1.3,versicolor
98,5.1,2.5,3.0,1.1,versicolor
99,5.7,2.8,4.1,1.3,versicolor
100,6.3,3.3,6.0,2.5,virginica
101,5.8,2.7,5.1,1.9,virginica
102,7.1,3.0,5.9,2.1,virginica


In [13]:
inputs = df.drop('flower', axis='columns')
inputs.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
target = df['flower']
target.head()

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: flower, dtype: object

### Approach 1: Use train_test_split and manually tune parameters by trial and error

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.3)

In [2]:
from sklearn.svm import SVC

In [16]:
model = SVC(kernel='rbf', C=30, gamma='auto')

In [17]:
model.fit(X_train, y_train)

SVC(C=30, gamma='auto')

In [18]:
model.predict(X_test)

array(['setosa', 'versicolor', 'virginica', 'virginica', 'virginica',
       'versicolor', 'versicolor', 'setosa', 'virginica', 'versicolor',
       'setosa', 'setosa', 'setosa', 'virginica', 'versicolor',
       'versicolor', 'setosa', 'setosa', 'setosa', 'versicolor', 'setosa',
       'virginica', 'versicolor', 'versicolor', 'setosa', 'virginica',
       'virginica', 'setosa', 'virginica', 'versicolor', 'setosa',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'virginica',
       'virginica', 'virginica', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'virginica'], dtype=object)

In [19]:
model.score(X_test, y_test)

0.9555555555555556

### Approach 2: Use K Fold Cross Validation

**Manually try supplying models with different parameters to cross_val_score function with 5 fold cross validation**

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
cross_val_score(SVC(kernel='linear', C=10, gamma='auto'), inputs, target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [22]:
cross_val_score(SVC(kernel='rbf', C=10, gamma='auto'), inputs, target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [23]:
cross_val_score(SVC(kernel='rbf', C=20, gamma='auto'), inputs, target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

**Above approach is tiresome and very manual. We can use *for* loop as an alternative**

In [24]:
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}

for kval in kernels:
    for cval in C:
        print(cross_val_score(SVC(kernel=kval, C=cval, gamma='auto'), inputs, target, cv=5))

[0.96666667 1.         0.96666667 0.96666667 1.        ]
[0.96666667 1.         0.96666667 0.96666667 1.        ]
[0.96666667 1.         0.9        0.96666667 1.        ]
[0.96666667 1.         0.96666667 0.96666667 1.        ]
[1.         1.         0.9        0.96666667 1.        ]
[1.         1.         0.9        0.93333333 1.        ]


In [26]:
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}

for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(SVC(kernel=kval, C=cval, gamma='auto'), inputs, target, cv=5)
        avg_scores[kval + '_' + str(cval)] = np.average(cv_scores)
        
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

**From above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance**

### Approach 3: Use GridSearchCV

**GridSearchCV does exactly same thing as *for* loop above but in a single line of code**

In [27]:
from sklearn.model_selection import GridSearchCV

In [29]:
clf = GridSearchCV(SVC(gamma='auto'), {
    'C':[1, 10, 20],
    'kernel': ['rbf', 'linear']
}, cv=5, return_train_score=False)

In [30]:
clf.fit(inputs, target)

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid={'C': [1, 10, 20], 'kernel': ['rbf', 'linear']})

In [32]:
clf.cv_results_

{'mean_fit_time': array([0.00220027, 0.00440006, 0.00440035, 0.00439939, 0.00220013,
        0.        ]),
 'std_fit_time': array([0.00440054, 0.00538895, 0.0053893 , 0.00538818, 0.00440025,
        0.        ]),
 'mean_score_time': array([0.00219874, 0.        , 0.        , 0.        , 0.00220766,
        0.00440025]),
 'std_score_time': array([0.00439749, 0.        , 0.        , 0.        , 0.00441532,
        0.00538924]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [35]:
df = pd.DataFrame(clf.cv_results_)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0022,0.004401,0.002199,0.004397,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.0044,0.005389,0.0,0.0,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.0044,0.005389,0.0,0.0,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.004399,0.005388,0.0,0.0,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.0022,0.0044,0.002208,0.004415,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [37]:
df[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [38]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [39]:
clf.best_score_

0.9800000000000001

In [40]:
clf.best_estimator_

SVC(C=1, gamma='auto')

In [41]:
clf.best_index_

0

In [42]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

**Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation**

In [43]:
from sklearn.model_selection import RandomizedSearchCV

In [44]:
rs = RandomizedSearchCV(SVC(gamma='auto'), {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear']
    },
    cv=5,
    return_train_score=False,
    n_iter=2
)

In [45]:
rs.fit(inputs, target)

RandomizedSearchCV(cv=5, estimator=SVC(gamma='auto'), n_iter=2,
                   param_distributions={'C': [1, 10, 20],
                                        'kernel': ['rbf', 'linear']})

In [46]:
pd.DataFrame(rs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002193,0.000405,0.001407,0.000489,linear,10,"{'kernel': 'linear', 'C': 10}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,2
1,0.002393,0.000496,0.001408,0.000484,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1


In [47]:
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.973333
1,10,rbf,0.98


**How about different models with different hyperparameters?**

In [48]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1, 10, 20], 
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params' : {
            'C': [1, 5, 10]
        }
    }
}

In [52]:
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(inputs, target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [54]:
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


In [None]:
scores = []

for key, value in model_params.items():
    clf = GridSearchCV(value['model'], value['params'], cv=5, return_train_score=False)
    clf.fit(inputs, target)
    scores.append({
        'model': key,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [56]:
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.946667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


**Bases on above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification**