In [1]:
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
from dtree import *
from sklearn.metrics import f1_score

class MyClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn compatible wrapper around the decision-tree helpers.

    Training is delegated to ``experiment`` and inference to ``predicting``.
    Both names must already be in scope (presumably via the notebook's
    ``from dtree import *`` -- TODO confirm).

    The constructor only stores its hyper-parameters, which is what
    ``get_params``/``set_params``/``clone`` (and therefore the search
    utilities in ``sklearn.model_selection``) rely on.

    Parameters
    ----------
    min_leaf_size : int, default 5
        Forwarded unchanged to ``experiment`` (presumably the minimum number
        of samples per leaf -- verify against ``dtree``).
    max_level : int, default 5
        Forwarded unchanged to ``experiment`` (presumably the maximum tree
        depth -- verify against ``dtree``).
    criterion : str, default 'entropy'
        Forwarded unchanged to ``experiment``.
    splitting : str, default 'mean'
        Forwarded unchanged to ``experiment``.
    """

    def __init__(self, min_leaf_size=5, max_level=5, criterion='entropy', splitting='mean'):
        # Store the arguments verbatim and do nothing else.  The previous
        # ``params = np.asarray(x)`` line referenced a name that is not a
        # parameter of this method and never used the result, so constructing
        # the estimator depended on whatever global ``x`` happened to exist.
        self.min_leaf_size = min_leaf_size
        self.max_level = max_level
        self.criterion = criterion
        self.splitting = splitting

    def fit(self, X, y=None):
        """Build the model by calling ``experiment`` on ``X``.

        ``y`` is accepted for scikit-learn API compatibility but is not used;
        only ``X`` and the stored hyper-parameters reach ``experiment``.
        NOTE(review): this implies ``X`` must already carry the labels
        (e.g. a sequence of (features, label) pairs) -- confirm against
        ``dtree.experiment``.

        Returns ``self`` so calls can be chained.
        """
        self.model = experiment(X, self.min_leaf_size, self.max_level, self.criterion, self.splitting)
        return self

    def predict(self, X, y=None):
        """Return ``predicting(self.model, X)``; ``y`` is ignored.

        Requires ``fit`` to have been called first, since it reads
        ``self.model``.
        """
        return predicting(self.model, X)

    def score(self, X, y=None):
        """Return the macro-averaged F1 score of ``predict(X)`` against ``y``."""
        predicted = self.predict(X)
        # f1_score's signature is (y_true, y_pred): ground truth goes first so
        # that any undefined-metric warnings are attributed to the right side.
        return f1_score(y, predicted, average='macro')

In [2]:
# Load the Caltech data through the pickle_operating helper (not defined in
# this notebook; presumably provided by the `from dtree import *` line --
# TODO confirm).  The meaning of the second argument (None) is not visible
# here.  The result is indexed with 'train' and 'test' below.
dataset = pickle_operating('Caltech_data_2', None)
# Show how many items the train and test splits contain.
# NOTE(review): the recorded output is a tuple, which suggests a Python 2
# kernel where print(a, b) prints a tuple -- confirm before porting.
print(len(dataset['train']), len(dataset['test']))

Caltech_data_2.pickle
(320, 324)


In [3]:
# Utility function to report best scores
def report(results, n_top=10):
    """Print every candidate whose test rank lies between 1 and ``n_top``.

    ``results`` is indexed like a search object's ``cv_results_``: the
    ``rank_test_score``, ``mean_test_score``, ``std_test_score`` and
    ``params`` entries are read.  Candidates that tie share a rank and are
    all printed, so more than ``n_top`` entries can appear.
    """
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        # Positions of all candidates holding this rank (several on a tie).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print(score_line.format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][pos]))
            print("")

In [4]:
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..10; the lists are sampled uniformly)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 11), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
n_iter_search = 5
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 5.
report(random_search.cv_results_, 5)

  'recall', 'true', average, warn_for)


RandomizedSearchCV took 4.45 seconds for 5 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.250 (std: 0.017)
Parameters: {'max_level': 10, 'min_leaf_size': 16, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 2
Mean validation score: 0.244 (std: 0.051)
Parameters: {'max_level': 6, 'min_leaf_size': 37, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 3
Mean validation score: 0.214 (std: 0.054)
Parameters: {'max_level': 8, 'min_leaf_size': 19, 'criterion': 'entropy', 'splitting': 'median'}

Model with rank: 4
Mean validation score: 0.193 (std: 0.057)
Parameters: {'max_level': 9, 'min_leaf_size': 16, 'criterion': 'entropy', 'splitting': 'mean'}

Model with rank: 5
Mean validation score: 0.150 (std: 0.041)
Parameters: {'max_level': 4, 'min_leaf_size': 37, 'criterion': 'entropy', 'splitting': 'mean'}



In [5]:
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..15 in this cell)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 16), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
n_iter_search = 10
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 10.
report(random_search.cv_results_, 10)

RandomizedSearchCV took 7.62 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.257 (std: 0.030)
Parameters: {'max_level': 14, 'min_leaf_size': 15, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 2
Mean validation score: 0.253 (std: 0.037)
Parameters: {'max_level': 13, 'min_leaf_size': 25, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 3
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 9, 'min_leaf_size': 35, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 3
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 11, 'min_leaf_size': 49, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 5
Mean validation score: 0.225 (std: 0.027)
Parameters: {'max_level': 6, 'min_leaf_size': 18, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 6
Mean validation score: 0.214 (std: 0.054)
Parameters: {'max_level': 11, 'min_leaf_size': 24, 'criterion': 'entropy', 'splitti

In [6]:
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..10; the lists are sampled uniformly)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 11), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
n_iter_search = 15
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 15.
report(random_search.cv_results_, 15)

RandomizedSearchCV took 7.75 seconds for 15 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.250 (std: 0.017)
Parameters: {'max_level': 7, 'min_leaf_size': 16, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 2
Mean validation score: 0.246 (std: 0.012)
Parameters: {'max_level': 8, 'min_leaf_size': 15, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 3
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 5, 'min_leaf_size': 50, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 3
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 3, 'min_leaf_size': 46, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 3
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 7, 'min_leaf_size': 47, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 3
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 10, 'min_leaf_size': 40, 'criterion': 'gini', 'splitting': 'me

In [7]:
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..10; the lists are sampled uniformly)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 11), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
# NOTE(review): this cell repeats the same search with only n_iter_search and
# the report size changed; a parameterised helper would avoid the duplication.
n_iter_search = 20
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 20.
report(random_search.cv_results_, 20)

RandomizedSearchCV took 11.93 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.257 (std: 0.043)
Parameters: {'max_level': 6, 'min_leaf_size': 28, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 2
Mean validation score: 0.251 (std: 0.051)
Parameters: {'max_level': 4, 'min_leaf_size': 32, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 2
Mean validation score: 0.251 (std: 0.051)
Parameters: {'max_level': 5, 'min_leaf_size': 32, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 4
Mean validation score: 0.250 (std: 0.017)
Parameters: {'max_level': 7, 'min_leaf_size': 16, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 5
Mean validation score: 0.242 (std: 0.054)
Parameters: {'max_level': 3, 'min_leaf_size': 15, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 6
Mean validation score: 0.235 (std: 0.037)
Parameters: {'max_level': 6, 'min_leaf_size': 17, 'criterion': 'gini', 'splitting': 'median'}

In [8]:
# Load the MNIST data through the pickle_operating helper (not defined in this
# notebook; presumably provided by the `from dtree import *` line -- TODO
# confirm).  This rebinds `dataset`, so every cell executed afterwards that
# reads dataset[...] works on this data instead of the previously loaded one.
dataset = pickle_operating('MNIST_data_2', None)
# Show how many items the train and test splits contain.
print(len(dataset['train']), len(dataset['test']))

MNIST_data_2.pickle
(60000, 10000)


In [12]:
# NOTE(review): this cell carries execution count 12 but sits before the cells
# numbered 9-11, so the notebook was not executed top to bottom.
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..10; the lists are sampled uniformly)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 11), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
n_iter_search = 5
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 5.
report(random_search.cv_results_, 5)

RandomizedSearchCV took 1755.30 seconds for 5 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.754 (std: 0.006)
Parameters: {'max_level': 9, 'min_leaf_size': 27, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 2
Mean validation score: 0.732 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 49, 'criterion': 'entropy', 'splitting': 'mean'}

Model with rank: 3
Mean validation score: 0.698 (std: 0.009)
Parameters: {'max_level': 9, 'min_leaf_size': 49, 'criterion': 'entropy', 'splitting': 'mean'}

Model with rank: 4
Mean validation score: 0.682 (std: 0.009)
Parameters: {'max_level': 7, 'min_leaf_size': 38, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 5
Mean validation score: 0.622 (std: 0.011)
Parameters: {'max_level': 6, 'min_leaf_size': 24, 'criterion': 'gini', 'splitting': 'mean'}



In [9]:
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..10; the lists are sampled uniformly)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 11), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
n_iter_search = 10
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 10.
report(random_search.cv_results_, 10)

RandomizedSearchCV took 4189.25 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.785 (std: 0.004)
Parameters: {'max_level': 10, 'min_leaf_size': 39, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 1
Mean validation score: 0.785 (std: 0.004)
Parameters: {'max_level': 10, 'min_leaf_size': 30, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 3
Mean validation score: 0.754 (std: 0.006)
Parameters: {'max_level': 9, 'min_leaf_size': 48, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 4
Mean validation score: 0.732 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 15, 'criterion': 'entropy', 'splitting': 'mean'}

Model with rank: 5
Mean validation score: 0.721 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 23, 'criterion': 'entropy', 'splitting': 'median'}

Model with rank: 5
Mean validation score: 0.721 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 50, 'criterion': 'entropy'

In [10]:
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..10; the lists are sampled uniformly)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 11), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
n_iter_search = 15
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 15.
report(random_search.cv_results_, 15)

RandomizedSearchCV took 4615.67 seconds for 15 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.785 (std: 0.004)
Parameters: {'max_level': 10, 'min_leaf_size': 47, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 2
Mean validation score: 0.765 (std: 0.008)
Parameters: {'max_level': 9, 'min_leaf_size': 40, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 3
Mean validation score: 0.732 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 41, 'criterion': 'entropy', 'splitting': 'mean'}

Model with rank: 4
Mean validation score: 0.721 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 18, 'criterion': 'entropy', 'splitting': 'median'}

Model with rank: 5
Mean validation score: 0.690 (std: 0.006)
Parameters: {'max_level': 9, 'min_leaf_size': 38, 'criterion': 'entropy', 'splitting': 'median'}

Model with rank: 6
Mean validation score: 0.592 (std: 0.010)
Parameters: {'max_level': 7, 'min_leaf_size': 28, 'criterion': 'entropy',

In [11]:
# Split each training item into its first element (X) and second element (y).
# NOTE(review): X is not referenced again in this cell.
X, y  = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]
            
# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound: min_leaf_size is drawn from
# 15..50 and max_level from 2..10; the lists are sampled uniformly)
param_dist = {"min_leaf_size": sp_randint(15, 51), 
              "max_level": sp_randint(2, 11), 
              "criterion": ["entropy", "gini"], 
              "splitting": ['median', "mean"]}

# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates (and the
# recorded scores) are not reproducible from run to run.
# NOTE(review): this cell repeats the same search with only n_iter_search and
# the report size changed; a parameterised helper would avoid the duplication.
n_iter_search = 20
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search)

start = time()
# NOTE(review): the whole list of training items is passed as X here, not the
# X built above.  MyClassifier.fit forwards its X to experiment() and does not
# use y, so this looks intentional -- confirm.
random_search.fit(dataset['train'], y)
# Report the wall-clock duration of the search.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Print the candidates ranked 1 through 20.
report(random_search.cv_results_, 20)

RandomizedSearchCV took 6248.82 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.794 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 50, 'criterion': 'gini', 'splitting': 'mean'}

Model with rank: 2
Mean validation score: 0.785 (std: 0.004)
Parameters: {'max_level': 10, 'min_leaf_size': 44, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 2
Mean validation score: 0.785 (std: 0.004)
Parameters: {'max_level': 10, 'min_leaf_size': 21, 'criterion': 'gini', 'splitting': 'median'}

Model with rank: 4
Mean validation score: 0.732 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 29, 'criterion': 'entropy', 'splitting': 'mean'}

Model with rank: 5
Mean validation score: 0.721 (std: 0.008)
Parameters: {'max_level': 10, 'min_leaf_size': 47, 'criterion': 'entropy', 'splitting': 'median'}

Model with rank: 6
Mean validation score: 0.710 (std: 0.010)
Parameters: {'max_level': 8, 'min_leaf_size': 39, 'criterion': 'gini', 'sp