In [2]:
import pandas as pd
import numpy as np

# Loading Iris Dataset

In [3]:
from sklearn.datasets import load_iris

In [4]:
iris = load_iris()

In [5]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [6]:
df['flower'] = iris.target

In [7]:
df['flower'] = df['flower'].apply(lambda x : iris.target_names[x])

In [8]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Manual Approach

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## Using cross_val_score

Here we use five folds and get score at each fold.

By five folds we mean that if we have 100 samples for example, at first we use first 20 for test and rest for train. Then in second we use 1-20 and 41-100 for train training and rest for testing. Then in third 41-60 for testing and so on to last 20 for testing in fifth

With this we can know best score at a fold since train test split changes sample at each run.

In [10]:
cross_val_score(SVC(kernel='linear', C=1, gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [11]:
cross_val_score(SVC(kernel='linear', C=10, gamma='auto'), iris.data, iris.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [12]:
cross_val_score(SVC(kernel='rbf', C=10, gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [13]:
cross_val_score(SVC(kernel='rbf', C=1, gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

## Using for loop for better result

In [14]:
kernels = ['linear', 'rbf']
C = [1, 10, 20]
avg_scores = {}

for i in kernels:
    for j in C:
        scores = cross_val_score(SVC(kernel=i, C=j, gamma='auto'), iris.data, iris.target, cv=5)
        avg = np.average(scores)

        avg_scores[i + '_' + str(j)] = avg

avg_scores

{'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668}

This shows linear and rbf kernels with C=1 and linear rbf kernel with C=10 gave same and best result

# GridSearchCV Approach

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
clf = GridSearchCV(SVC(gamma='auto'), {
    'C' : [1, 10, 20],
    'kernel' : ['rbf', 'linear']
}, cv=5)

In [17]:
clf.fit(iris.data, iris.target)

In [18]:
clf.cv_results_

{'mean_fit_time': array([0.00174956, 0.00129337, 0.00304685, 0.00248137, 0.00184169,
        0.00351858]),
 'std_fit_time': array([0.00144249, 0.00184405, 0.0033295 , 0.00153569, 0.00150831,
        0.00405413]),
 'mean_score_time': array([0.00196171, 0.00055351, 0.00331893, 0.00117083, 0.00059276,
        0.0007659 ]),
 'std_score_time': array([0.00282391, 0.00072187, 0.00338791, 0.00096569, 0.00072612,
        0.00129189]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


These results are hard to read. Therefore we store them in a dataframe so that readability is increased

In [19]:
df_clf = pd.DataFrame(clf.cv_results_)
df_clf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00175,0.001442,0.001962,0.002824,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001293,0.001844,0.000554,0.000722,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.003047,0.003329,0.003319,0.003388,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.002481,0.001536,0.001171,0.000966,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001842,0.001508,0.000593,0.000726,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.003519,0.004054,0.000766,0.001292,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [20]:
df_clf[['param_kernel', 'param_C', 'mean_test_score']]

Unnamed: 0,param_kernel,param_C,mean_test_score
0,rbf,1,0.98
1,linear,1,0.98
2,rbf,10,0.98
3,linear,10,0.973333
4,rbf,20,0.966667
5,linear,20,0.966667


In [21]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

The best parameters for our model

In [22]:
clf.best_score_

0.9800000000000001

The best score we obtained

In [23]:
dir(clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 '_validate_params',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 '

# RandomizedSearchCV Approach

In [24]:
from sklearn.model_selection import RandomizedSearchCV

In [25]:
rs = RandomizedSearchCV(SVC(gamma='auto'), {
    'kernel' : ['rbf', 'linear'],
    'C' : [1, 10, 20]
}, cv=5, n_iter=2)

rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_kernel','param_C','mean_test_score']]

Unnamed: 0,param_kernel,param_C,mean_test_score
0,rbf,10,0.98
1,rbf,20,0.966667


Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation
For example of too many paramters consider many values of C

# Different Models with different Hyperparameters

In [29]:
model_params = {
    'svc' : {
        'model' : SVC(gamma='auto'),
        'params' : {
            'kernel' : ['rbf', 'linear'],
            'C' : [1, 10, 20]
        }
    },
    'random_forest' : {
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [5, 10]
        }
    },
    'logistic_regression' : {
        'model' : LogisticRegression(solver='liblinear',multi_class='auto'),
        'params' : {
            'C' : [1, 5, 10]
        }
    }
}

In [30]:
scores = []

for i, j in model_params.items():
    clf = GridSearchCV(j['model'], j['params'], cv=5)
    clf.fit(iris.data, iris.target)

    scores.append({
        'model' : i,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })

pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,svc,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


This tells us which model has best score with which parameters