In [1]:
from sklearn import datasets,svm
import pandas as pd

In [2]:
iris = datasets.load_iris()

In [3]:
df  = pd.DataFrame(iris.data,columns = iris.feature_names)

In [4]:
df["target"] = iris.target

In [5]:
df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [6]:
df["target_names"] = df["target"].apply(lambda x : iris.target_names[x])

In [7]:
df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa


In [8]:
# Approach 1: Use train_test_split and manually tune parameters by trial and error

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
xtrain,xtest,ytrain,ytest = train_test_split(iris.data,iris.target,train_size = .7)

In [11]:
from sklearn.svm import SVC

In [12]:
model = SVC(kernel = "rbf", C =30,gamma = "auto")   # we are change  parameter by trail and error mannualy

In [13]:
model.fit(xtrain,ytrain)

SVC(C=30, gamma='auto')

In [14]:
model.score(xtest,ytest)

0.9555555555555556

In [15]:
# Approach 2: Use K Fold Cross validation
# Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation


In [16]:
from sklearn.model_selection import cross_val_score,GridSearchCV

In [17]:
cross_val_score(SVC(gamma = "auto",kernel = "linear",C =10),iris.data,iris.target,cv = 5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [18]:
cross_val_score(SVC(gamma = "auto",kernel = "linear",C =30),iris.data,iris.target,cv = 5)

array([1. , 1. , 0.9, 0.9, 1. ])

In [19]:
cross_val_score(SVC(gamma = "auto",kernel = "rbf",C =10),iris.data,iris.target,cv = 5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [20]:
cross_val_score(SVC(gamma = "scale",kernel = "rbf",C =30),iris.data,iris.target,cv = 5)

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [21]:
cross_val_score(SVC(gamma = "scale",kernel = "rbf",C =10),iris.data,iris.target,cv = 5).mean()

0.9800000000000001

In [22]:
(.96666667+1+.96666667+.96666667+1)/5

0.9800000019999999

In [23]:
# Above approach is tiresome and very manual. We can use for loop as an alternative


In [24]:
import numpy as np

In [25]:
k_val = ["rbf","linear"]
c_val = [1,10,30,40]
avg_score = {}
for k in k_val:
    for c in c_val:
        score = cross_val_score(SVC(kernel = k , C = c, gamma = "auto"),iris.data,iris.target, cv = 5)
        avg_score[k + "_" +str(c) ] = np.average(score)   # score.mean()
        

In [26]:
print(avg_score)

{'rbf_1': 0.9800000000000001, 'rbf_10': 0.9800000000000001, 'rbf_30': 0.96, 'rbf_40': 0.96, 'linear_1': 0.9800000000000001, 'linear_10': 0.9733333333333334, 'linear_30': 0.96, 'linear_40': 0.96}


In [27]:
# From above results we can say that rbf or linear with C=10 or 1 will give best performance


In [28]:
# Approach 3: Use GridSearchCV
# GridSearchCV does exactly same thing as for loop above but in a single line of code


In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
clf = GridSearchCV(svm.SVC(gamma = "auto"),{"kernel":["rbf","linear"],"C" :[1,10,30]}, cv= 5, return_train_score=False) # cv = 5 is for 5 cross validation

In [31]:
clf.fit(iris.data,iris.target)

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid={'C': [1, 10, 30], 'kernel': ['rbf', 'linear']})

In [32]:
clf.cv_results_

{'mean_fit_time': array([0.00128927, 0.00079646, 0.00081868, 0.00073037, 0.00080471,
        0.00059323]),
 'std_fit_time': array([0.00086027, 0.00039826, 0.00041145, 0.00042506, 0.00040241,
        0.00048441]),
 'mean_score_time': array([0.00059834, 0.00019865, 0.0004993 , 0.00026827, 0.00040164,
        0.        ]),
 'std_score_time': array([0.00048854, 0.0003973 , 0.00044454, 0.0004505 , 0.00049193,
        0.        ]),
 'param_C': masked_array(data=[1, 1, 10, 10, 30, 30],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 30, 'kernel': 'rbf'},
  {'C': 30, 'kernel': 'linear'}],


In [33]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001289,0.00086,0.000598,0.000489,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000796,0.000398,0.000199,0.000397,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000819,0.000411,0.000499,0.000445,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.00073,0.000425,0.000268,0.00045,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000805,0.000402,0.000402,0.000492,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,5
5,0.000593,0.000484,0.0,0.0,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,5


In [34]:
df[["param_C","param_kernel","mean_test_score"]].head()

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,30,rbf,0.96


In [35]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [36]:
clf.best_score_

0.9800000000000001

In [37]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

In [38]:
# Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when
# you have too many parameters to try and your training time is longer. It helps reduce the cost of computation


In [39]:
from sklearn.model_selection import RandomizedSearchCV

In [40]:
clf = RandomizedSearchCV(SVC(gamma = "auto"),{"kernel" : ["linear","rbf"],"C" :[1,10,30]},cv = 5,return_train_score = False,n_iter = 2)

In [41]:
clf.fit(iris.data,iris.target)

RandomizedSearchCV(cv=5, estimator=SVC(gamma='auto'), n_iter=2,
                   param_distributions={'C': [1, 10, 30],
                                        'kernel': ['linear', 'rbf']})

In [42]:
clf.cv_results_

{'mean_fit_time': array([0.00072503, 0.00099854]),
 'std_fit_time': array([0.00050574, 0.00066586]),
 'mean_score_time': array([0.00019937, 0.00033679]),
 'std_score_time': array([0.00039873, 0.00036846]),
 'param_kernel': masked_array(data=['linear', 'linear'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[10, 30],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'kernel': 'linear', 'C': 10}, {'kernel': 'linear', 'C': 30}],
 'split0_test_score': array([1., 1.]),
 'split1_test_score': array([1., 1.]),
 'split2_test_score': array([0.9, 0.9]),
 'split3_test_score': array([0.96666667, 0.9       ]),
 'split4_test_score': array([1., 1.]),
 'mean_test_score': array([0.97333333, 0.96      ]),
 'std_test_score': array([0.03887301, 0.04898979]),
 'rank_test_score': array([1, 2])}

In [43]:
df = pd.DataFrame(clf.cv_results_)

In [44]:
df[["param_kernel","param_C","mean_test_score"]]

Unnamed: 0,param_kernel,param_C,mean_test_score
0,linear,10,0.973333
1,linear,30,0.96


In [45]:
clf.best_score_

0.9733333333333334

In [46]:
clf.best_params_

{'kernel': 'linear', 'C': 10}

In [47]:
# How about different models with different hyperparameters?


In [48]:
from sklearn.linear_model import LogisticRegression

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
from sklearn.svm import SVC

In [51]:
model_para = {"svm":{"model":SVC(gamma = "auto"),
                     "params":{"kernel":["linear","rbf"],
                               "C":[1,10,30]}},
             "logis":{"model":LogisticRegression(solver='liblinear',multi_class='auto'),
                     "params":{"C":[1,5,10]}},
              "Random":{"model":RandomForestClassifier(),
                       "params":{"n_estimators":[3,5,10]}}
             }

In [52]:
s = []
for model_name,mp in model_para.items():
    clf = GridSearchCV(mp["model"],mp["params"],cv = 5,return_train_score = False)
    clf.fit(iris.data,iris.target)
    s.append({"model_name":model_name,
             "best_score":clf.best_score_,
             "best_para":clf.best_params_})

In [53]:
s

[{'model_name': 'svm',
  'best_score': 0.9800000000000001,
  'best_para': {'C': 1, 'kernel': 'linear'}},
 {'model_name': 'logis',
  'best_score': 0.9666666666666668,
  'best_para': {'C': 5}},
 {'model_name': 'Random',
  'best_score': 0.96,
  'best_para': {'n_estimators': 3}}]

In [54]:
pd.DataFrame(s,columns= ["model_name","best_score","best_para"])

Unnamed: 0,model_name,best_score,best_para
0,svm,0.98,"{'C': 1, 'kernel': 'linear'}"
1,logis,0.966667,{'C': 5}
2,Random,0.96,{'n_estimators': 3}


In [55]:
# Based on above, I can conclude that SVM with C=1 and kernel='linear' is the best model for solving my problem of iris 
# flower classification


In [56]:


Exercise: Machine Learning Finding Optimal Model and Hyperparameters
For digits dataset in sklearn.dataset, please try following classifiers and find out the one that gives best performance. Also find the optimal parameters for that classifier.

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

SyntaxError: invalid syntax (Temp/ipykernel_18444/1828075965.py, line 1)

In [None]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
digits = datasets.load_digits()

In [None]:
model_para = {"svm":{
                    "model":SVC(),
                    "model_para":{
                                  "kernel":["linear","rbf"],
                                 "C":[1,10,30]
                                  }
                    },
              "logis":{
                      "model":LogisticRegression(solver='liblinear',multi_class='auto'),
                       "model_para":{
                                     "C":[1,5,10]
                                     }
                      },
              "random" : {
                          "model":RandomForestClassifier(),
                         "model_para":{
                                       "n_estimators":[5,10,15]
                                       }
              }
             }

In [None]:
score = [ ]
for model_name,mp in model_para.items():
    clf = GridSearchCV(mp["model"],mp["model_para"],cv = 5,return_train_score = False)
    clf.fit(digits.data,digits.target)
    score.append({"model_name":model_name,
                 "model_best":clf.best_score_,
                 "model_best_para":clf.best_params_})

In [None]:
pd.DataFrame(score,columns = [ "model_name","model_best","model_best_para"])

In [None]:
we can conclude svm is best in this case with parameter c =1 and kernel = "rbf"

In [None]:
dir(digits)