<a href="https://colab.research.google.com/github/MpRonald/Machine-Learning/blob/main/GridSearchCV_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import time
from sklearn.metrics import precision_score, recall_score, f1_score,\
                            accuracy_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
iris = pd.read_csv('https://github.com/MpRonald/datasets/raw/main/iris_data.csv')
iris

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
X = iris.iloc[:,:3]
y = iris.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
model = SVC(kernel='rbf', C=30, gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9333333333333333

In [5]:
model_params = {
    'svm': {
        'model': SVC(),
        'params' : {
            'gamma' : ['scale', 'auto'],
            'C': [1,10,20],
            'kernel': ['rbf','linear', 'sigmoid'],
            'degree' : [3,4,5,6,7],
            'probability' : [True, False],
            'decision_function_shape' : ['ovo', 'ovr']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10,50,100,150,250,500],
            'criterion' : ['gini', 'entropy', 'log_loss'],
            'max_depth' : [5, 25, 50],
            'min_samples_split' : [2,4,6,8],
            'min_samples_leaf' : [1,2,3],
            'max_features' : ['sqrt', 'log2', None]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(),
        'params': {
            'multi_class' : ['auto', 'ovr', 'multinomial'],
            'solver' : ['lbfgs', 'liblinear', 'newton-cg',
                        'newton-cholesky', 'sag', 'saga'],
            'C': [1,5,10,15,20],
            'fit_intercept' : [True, False],
            'intercept_scaling' : [1,2,3,4]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'splitter' : ['best', 'random'],
            'criterion' : ['gini', 'entropy', 'log_loss'],
            'max_depth' : [5, 25, 50],
            'min_samples_split' : [2,4,6,8],
            'min_samples_leaf' : [1,2,3],
            'max_features' : ['auto', 'sqrt', 'log2']
            
        }
    },
    'knn' : {
        'model': KNeighborsClassifier(),
        'params' : {
            'n_neighbors' : [3,4,5,6,7,8,9,10],
            'weights' : ['uniform', 'distance'],
            'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size' : [30, 60, 90],
            'n_jobs' : [2,3,4]
        }
    }   
}

In [6]:
scores = []

for model_name, mp in model_params.items():
    start = time.time()
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    end = time.time()
    finish_time = end - start
    y_pred = clf.predict(X_test)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'time_seconds' : finish_time,
        'accuracy_score': accuracy_score(y_test, y_pred)
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params',
                                  'time_seconds', 'accuracy_score'])
df

4320 fits failed out of a total of 12960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4320 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/usr/local/lib/python3.8/dist-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.8/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.8/dist-

Unnamed: 0,model,best_score,best_params,time_seconds,accuracy_score
0,svm,0.966667,"{'C': 20, 'decision_function_shape': 'ovo', 'd...",14.094789,0.933333
1,random_forest,0.973333,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",1528.269291,0.966667
2,logistic_regression,0.966667,"{'C': 20, 'fit_intercept': True, 'intercept_sc...",35.260541,0.9
3,naive_bayes_gaussian,0.88,{},0.018462,0.9
4,naive_bayes_multinomial,0.953333,{},0.018263,0.9
5,decision_tree,0.946667,"{'criterion': 'entropy', 'max_depth': 5, 'max_...",8.662148,0.966667
6,knn,0.953333,"{'algorithm': 'auto', 'leaf_size': 30, 'n_jobs...",30.662785,1.0
