# **Finding best model and hyper parameter tunning using GridSearchCV and RandomSearchCV**

In [60]:
import numpy as np

In [14]:
from sklearn import datasets
iris = datasets.load_iris()

In [21]:
data = pd.DataFrame(iris.data,columns=iris.feature_names)
data['flower'] = iris.target
data['flower'] = data['flower'].apply(lambda x: iris.target_names[x])

In [22]:
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [23]:
data.shape

(150, 5)

# Approach 1: Use train_test_split and manually tune parameters by trial and error

In [50]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(iris.data, iris.target, random_state =42, train_size= 0.8)
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(120, 4) (30, 4) (120,) (30,)


In [51]:
from sklearn.svm import SVC
model = SVC(kernel ='rbf', C = 30, gamma = 'auto').fit(xtrain, ytrain)
model.score(xtrain, ytrain)

0.9833333333333333

## Approach 2: Use K Fold Cross validation

Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation

In [57]:
from sklearn.model_selection import cross_val_score
x = iris.data
y = iris.target
cross_val_score(SVC(kernel= 'linear', C= 1, gamma = 'auto'), x, y, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [58]:
cross_val_score(SVC(kernel = 'linear', C = 10, gamma= 'auto'), x, y, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [59]:
cross_val_score(SVC(kernel='rbf', C=20, gamma= 'auto'), x, y, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [66]:
kernel = ['rbf', 'linear']
C = [1,20,30]
score={}
for k in kernel:
  for c in C:
    cvs = cross_val_score(SVC(kernel = k, C= c, gamma='auto'), x, y, cv =5)
    score[k+'_'+str(c)] =  np.average(cvs)

score

{'rbf_1': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'rbf_30': 0.96,
 'linear_1': 0.9800000000000001,
 'linear_20': 0.9666666666666666,
 'linear_30': 0.96}

From above results we can say that rbf with C=1 or linear with C=1 will give best performance

## Approach 3: Use GridSearchCV

GridSearchCV does exactly same thing as for loop above but in a single line of code

In [92]:
from sklearn.model_selection import GridSearchCV
svc = SVC(gamma = 'auto')
estimaters = {
    'kernel': ['rbf', 'linear'],
    'C' : [1,20,30]
}
clf = GridSearchCV(svc, estimaters, cv = 5, return_train_score = False )
clf.fit(iris.data, iris.target)
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001989,0.000183,0.001477,0.0002,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001919,0.000516,0.001037,6.1e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001655,0.0001,0.001078,4.2e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,3
3,0.002724,0.002334,0.005425,0.008668,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,4
4,0.004542,0.004226,0.003908,0.002957,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,5
5,0.001747,0.000313,0.001111,0.000114,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,5


In [93]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,20,rbf,0.966667
3,20,linear,0.966667
4,30,rbf,0.96
5,30,linear,0.96


In [94]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [95]:
clf.best_score_

0.9800000000000001

Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [98]:
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(svc, estimaters,cv=5, return_train_score=False, n_iter=2)
rs.fit(iris.data, iris.target)
rs.best_params_

{'kernel': 'rbf', 'C': 1}

How about different models with different hyperparameters?

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [115]:
best_param = {
    'svm':{
        'model': SVC(gamma = 'auto'),
        'params': {
            'kernel': ['linear', 'rbf'],
            'C': [1, 10, 20, 30]
         }
    },

    'LogisticRegression':{
        'model': LogisticRegression(solver = 'liblinear', multi_class='auto'),
        'params': {
            'C': [1,10,20]
        }
    },

    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators' : [1,5,10]
        }
    },

    'decisiontree': {
    'model': DecisionTreeClassifier(),
    'params': {
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
  }
}

In [116]:
scores=[]
for model, param in best_param.items():
  clf = GridSearchCV(param['model'], param['params'], cv=5, return_train_score=False )
  clf.fit(iris.data, iris.target)
  scores.append({
      'model': model,
      'best_score': clf.best_score_,
      'best_param': clf.best_params_
  })

df = pd.DataFrame(scores)

In [117]:
pd.options.display.max_colwidth=100

In [118]:
df

Unnamed: 0,model,best_score,best_param
0,svm,0.98,"{'C': 1, 'kernel': 'linear'}"
1,LogisticRegression,0.966667,{'C': 10}
2,RandomForestClassifier,0.953333,{'n_estimators': 10}
3,decisiontree,0.966667,"{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 5}"


Based on the above data we can conclude that SVM with C=1 and kernel=linear is the best model to sovle the problem of iris flower classification

