# Model searching

In this notebook I'm gonna focus on building searchgrid that will search for best clasification algorythm to predict target from iris_dataset.

In [8]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, LeaveOneOut, RepeatedKFold

## Loading and spliting the data

In [9]:
iris_dataset = load_iris()

X = iris_dataset.data
y = iris_dataset.target

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=6)

## Searching for best algorythm

In [15]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,5,10,15,20],
            'kernel': ['rbf','linear','poly','sigmoid'],
            'degree': [2,3]
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,4,5,10,15,45]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto', penalty="l2"),
        'params': {
            'C': [1,5,10]
        }
    },
    'KNN' : {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [7,9,11,15,17,19,21],
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    },
    'naive_bayes' : {
        'model': GaussianNB(),
        'params' : {
            'var_smoothing': [1e-8,1e-9,1e-10]
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ["gini", "entropy", "log_loss"],
            'splitter': ["best", "random"]
        }
    }
}

In [16]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=4), return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

100 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'



Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'degree': 2, 'kernel': 'linear'}"
1,random_forest,0.955333,{'n_estimators': 45}
2,logistic_regression,0.964,{'C': 10}
3,KNN,0.972,"{'n_neighbors': 15, 'p': 2, 'weights': 'distan..."
4,naive_bayes,0.956,{'var_smoothing': 1e-08}
5,decision_tree,0.946667,"{'criterion': 'gini', 'splitter': 'best'}"


As we can see the best score got SVM model

## Creating best model

In [14]:
svm = SVC(C=1, degree=2, kernel='linear')

In [15]:
svm.fit(X_train, y_train)

SVC(C=1, degree=2, kernel='linear')

In [16]:
print('testing model on training data', svm.score(X_train, y_train))

testing model on training data 0.9833333333333333


In [17]:
print('testing model on testing data', svm.score(X_test, y_test))

testing model on testing data 1.0


In [18]:
# 99% of data into training and 1 data point into testing iteratively
loo_result = cross_val_score(svm, X, y, cv=LeaveOneOut())
print('real accuracy =', loo_result.mean())

real accuracy = 0.98


In [46]:
# KFolds repeated n times
rkf_result = cross_val_score(svm, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=4))
print('real accuracy =', rkf_result.mean())

real accuracy = 0.98
