In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# Date: January 2014
# License: MIT

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)

In [3]:
from sklearn.datasets import load_iris

X, y = load_iris(as_frame=True, return_X_y=True)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

Grid Search

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(1, 11),
    'min_samples_leaf': np.arange(1, 11)
}

clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=3, n_jobs=-1)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
                         'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])})

In [6]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1}

In [7]:
grid_search.best_score_

0.9374110953058321

In [8]:
test_accuracy = grid_search.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')

Test accuracy: 0.9211


In [9]:
df = pd.DataFrame(grid_search.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003678,4.845241e-04,0.001995,2.973602e-07,gini,1,1,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.657895,0.648649,0.675676,0.660740,0.011216,181
1,0.004682,9.103719e-04,0.001990,6.409278e-06,gini,1,2,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.657895,0.648649,0.675676,0.660740,0.011216,181
2,0.004678,9.514294e-04,0.001994,2.973602e-07,gini,1,3,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.657895,0.648649,0.675676,0.660740,0.011216,181
3,0.003989,4.495664e-07,0.002680,9.469369e-04,gini,1,4,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.657895,0.648649,0.675676,0.660740,0.011216,181
4,0.003510,4.271640e-04,0.001994,1.946680e-07,gini,1,5,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.657895,0.648649,0.675676,0.660740,0.011216,181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.002325,4.682234e-04,0.001664,4.717101e-04,entropy,10,6,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.947368,0.918919,0.945946,0.937411,0.013089,1
196,0.003324,4.692914e-04,0.001662,4.713142e-04,entropy,10,7,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.947368,0.918919,0.945946,0.937411,0.013089,1
197,0.002992,8.142961e-04,0.001994,3.371748e-07,entropy,10,8,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.947368,0.918919,0.945946,0.937411,0.013089,1
198,0.002327,4.705836e-04,0.001330,4.708088e-04,entropy,10,9,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.947368,0.918919,0.945946,0.937411,0.013089,1


Random Search

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter space
param_dist = {    
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(1, 11),
    'min_samples_leaf': np.arange(1, 11)
}

# Create a RandomizedSearchCV instance
clf = DecisionTreeClassifier(random_state=42)
random_search = RandomizedSearchCV(clf, param_dist, n_iter=20, cv=3, random_state=42, n_jobs=-1)

# Fit to the training data
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
                                        'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
                   random_state=42)

In [11]:
random_search.best_params_

{'min_samples_leaf': 6, 'max_depth': 10, 'criterion': 'gini'}

In [12]:
random_search.best_score_

0.9374110953058321

In [13]:
test_accuracy = grid_search.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')

Test accuracy: 0.9211


Bayesian Optimization

In [14]:
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer

# Define the search space for hyperparameters
search_space = {
    'criterion': Categorical(['gini', 'entropy']),
    'max_depth': Integer(1, 11),
    'min_samples_leaf': Integer(1, 11)
}

# Create a BayesSearchCV instance
clf = DecisionTreeClassifier(random_state=42)
bayes_search = BayesSearchCV(clf, search_space, n_iter=20, cv=3, random_state=42, n_jobs=-1)

# Perform the Bayesian optimization search
bayes_search.fit(X_train, y_train)



BayesSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42),
              n_iter=20, n_jobs=-1, random_state=42,
              search_spaces={'criterion': Categorical(categories=('gini', 'entropy'), prior=None),
                             'max_depth': Integer(low=1, high=11, prior='uniform', transform='normalize'),
                             'min_samples_leaf': Integer(low=1, high=11, prior='uniform', transform='normalize')})

In [15]:
bayes_search.best_params_

OrderedDict([('criterion', 'entropy'),
             ('max_depth', 8),
             ('min_samples_leaf', 10)])

In [16]:
bayes_search.best_score_

0.9374110953058321

In [17]:
test_accuracy = bayes_search.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')

Test accuracy: 0.9211
