<h3 align="center">Codebasics ML Course: Randomized Search CV</h3>

We will generate a synthetic dataset

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(
    n_features=10, 
    n_samples=1000, 
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2, 
    random_state=42
)

### GridSearchCV

In [4]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    DecisionTreeClassifier(),
    {'criterion': ["gini", "entropy"],'max_depth': [5, 10, 15, 20]},
    cv=5,
    return_train_score=False
)
clf.fit(X, y)
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008012,0.001538,0.001072,0.001382,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.78,0.795,0.75,0.805,0.77,0.78,0.019235,6
1,0.007415,0.004827,0.002656,0.005311,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.765,0.74,0.8,0.785,0.81,0.78,0.0251,6
2,0.008845,0.007032,0.000204,0.000407,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.805,0.73,0.815,0.82,0.815,0.797,0.033853,4
3,0.012981,0.003762,0.001688,0.002371,gini,20,"{'criterion': 'gini', 'max_depth': 20}",0.81,0.72,0.81,0.825,0.825,0.798,0.039573,3
4,0.008357,0.004842,0.0,0.0,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.775,0.75,0.815,0.79,0.779,0.022226,8
5,0.011434,0.005524,0.001115,0.001382,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.78,0.8,0.83,0.76,0.795,0.793,0.023152,5
6,0.018231,0.002301,0.000424,0.000599,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.765,0.795,0.83,0.805,0.865,0.812,0.033705,1
7,0.010793,0.008173,0.0,0.0,entropy,20,"{'criterion': 'entropy', 'max_depth': 20}",0.77,0.785,0.835,0.795,0.84,0.805,0.027749,2


### RandomizedSearchCV

Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [5]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomizedSearchCV(
    DecisionTreeClassifier(),
    {'criterion': ["gini", "entropy"],'max_depth': [5, 10, 15, 20]},
    cv=5,
    return_train_score=False,
    n_iter=3 
)
clf.fit(X, y)
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008814,0.003013,0.006185,0.008811,10,gini,"{'max_depth': 10, 'criterion': 'gini'}",0.785,0.755,0.8,0.805,0.81,0.791,0.019849,3
1,0.010714,0.003138,0.0,0.0,20,entropy,"{'max_depth': 20, 'criterion': 'entropy'}",0.765,0.8,0.835,0.805,0.84,0.809,0.027092,1
2,0.006626,0.000988,0.0,0.0,20,gini,"{'max_depth': 20, 'criterion': 'gini'}",0.79,0.725,0.815,0.815,0.815,0.792,0.034871,2


In [6]:
clf.best_params_

{'max_depth': 20, 'criterion': 'entropy'}