<h3 align="center">Codebasics ML Course: Grid Search CV</h3>

We will generate a synthetic dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_features=10, 
    n_samples=1000, 
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2, 
    random_state=42
)

### Method 1: Evaluate the model using train, test split and tune parameters by trial and error

In [2]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(criterion="entropy", max_depth=10) # criteria: "gini" or "entropy", max_depth=5 or 10
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.75      0.79       130
           1       0.76      0.82      0.79       120

    accuracy                           0.79       250
   macro avg       0.79      0.79      0.79       250
weighted avg       0.79      0.79      0.79       250



### Method 2: Use K Fold Cross Validation

In [25]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion="gini", max_depth=5), X, y, cv=5)

array([0.775, 0.79 , 0.755, 0.805, 0.77 ])

In [26]:
cross_val_score(DecisionTreeClassifier(criterion="entropy", max_depth=5), X, y, cv=5)

array([0.765, 0.775, 0.755, 0.815, 0.78 ])

In [27]:
criterion = ["gini", "entropy"]
max_depth = [5, 10, 15]

avg_scores = {}

for c in criterion:
    for d in max_depth:
        clf = DecisionTreeClassifier(criterion=c, max_depth=d)
        score_list = cross_val_score(clf, X, y, cv=5)
        avg_scores[c + "_" + str(d)] = np.average(score_list)
        
avg_scores

{'gini_5': np.float64(0.7829999999999999),
 'gini_10': np.float64(0.784),
 'gini_15': np.float64(0.7889999999999999),
 'entropy_5': np.float64(0.7809999999999999),
 'entropy_10': np.float64(0.79),
 'entropy_15': np.float64(0.8160000000000001)}

### Method 3: Use GridSearchCV

In [28]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    DecisionTreeClassifier(),
    {'criterion': ["gini", "entropy"],'max_depth': [5, 10, 15]},
    cv=5,
    return_train_score=False
)
clf.fit(X, y)
clf.cv_results_

{'mean_fit_time': array([0.00582685, 0.01092606, 0.00782237, 0.00509067, 0.01175499,
        0.01498713]),
 'std_fit_time': array([0.0019338 , 0.00174279, 0.00233237, 0.0034416 , 0.00420317,
        0.00423576]),
 'mean_score_time': array([0.00189009, 0.00055456, 0.00080166, 0.00174704, 0.0008018 ,
        0.00040126]),
 'std_score_time': array([0.00226375, 0.00068694, 0.00098183, 0.00281618, 0.000982  ,
        0.00080252]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': 'entropy', 'max_depth': 5

In [29]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005827,0.001934,0.00189,0.002264,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.78,0.795,0.75,0.795,0.765,0.777,0.017493,6
1,0.010926,0.001743,0.000555,0.000687,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.805,0.745,0.82,0.8,0.81,0.796,0.026344,3
2,0.007822,0.002332,0.000802,0.000982,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.8,0.725,0.84,0.8,0.835,0.8,0.04111,2
3,0.005091,0.003442,0.001747,0.002816,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.785,0.755,0.815,0.79,0.782,0.020881,5
4,0.011755,0.004203,0.000802,0.000982,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.78,0.79,0.805,0.765,0.78,0.784,0.013191,4
5,0.014987,0.004236,0.000401,0.000803,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.76,0.8,0.84,0.8,0.84,0.808,0.029933,1


In [30]:
df[["param_criterion", "param_max_depth", "mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.777
1,gini,10,0.796
2,gini,15,0.8
3,entropy,5,0.782
4,entropy,10,0.784
5,entropy,15,0.808


In [31]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [32]:
model = clf.best_estimator_
model

### Now let's try different models with different parameters

In [33]:
from sklearn import svm

model_params = {
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params' : {
            'criterion': ["gini", "entropy"],
            'max_depth': [5, 10, 15]
        }  
    },
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']            
        }
    }
}

scores = []

for key, val in model_params.items():
    clf = GridSearchCV(
        val['model'],
        val['params'],
        cv=5,
        return_train_score=False
    )
    clf.fit(X, y)
    scores.append({
        'model': key,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
scores

[{'model': 'decision_tree',
  'best_score': np.float64(0.8089999999999999),
  'best_params': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': np.float64(0.9260000000000002),
  'best_params': {'C': 1, 'kernel': 'rbf'}}]

In [38]:
df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
df

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.809,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.926,"{'C': 1, 'kernel': 'rbf'}"
