In [10]:
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
from dtree import *
from sklearn.metrics import f1_score

class MyClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around the tree helpers pulled in from ``dtree``.

    The wrapper lets ``GridSearchCV`` / ``RandomizedSearchCV`` tune the
    hyper-parameters that are forwarded to ``experiment`` when fitting.

    Parameters
    ----------
    min_leaf_size : int, default=5
        Forwarded unchanged to ``experiment``.
    max_level : int, default=5
        Forwarded unchanged to ``experiment``.
    criterion : str, default='entropy'
        Forwarded unchanged to ``experiment``.
    splitting : str, default='mean'
        Forwarded unchanged to ``experiment``.

    Attributes
    ----------
    model
        Whatever ``experiment`` returned during the last ``fit`` call; it is
        handed to ``predicting`` by ``predict``.
    """

    def __init__(self, min_leaf_size=5, max_level=5, criterion='entropy', splitting='mean'):
        # Only store the hyper-parameters under their own names. The previous
        # version also evaluated ``np.asarray(x)`` here, but ``x`` is neither a
        # parameter nor a local, so construction only succeeded when a global
        # ``x`` happened to exist in the kernel, and the result was never used.
        self.min_leaf_size = min_leaf_size
        self.max_level = max_level
        self.criterion = criterion
        self.splitting = splitting

    def fit(self, X, y=None):
        """Build the model from ``X`` and keep it on ``self.model``.

        ``y`` is accepted for API compatibility but is not used: only ``X`` is
        passed to ``experiment``.
        NOTE(review): this assumes every item of ``X`` already carries its own
        label -- confirm against ``dtree.experiment``.

        Returns
        -------
        self
        """
        self.model = experiment(X, self.min_leaf_size, self.max_level, self.criterion, self.splitting)
        return self

    def predict(self, X, y=None):
        """Return ``predicting(self.model, X)``; ``y`` is ignored.

        Requires a prior ``fit`` call, because ``self.model`` is only created there.
        """
        return predicting(self.model, X)

    def score(self, X, y=None):
        """Return the macro-averaged F1 score of the predictions for ``X`` against ``y``."""
        predicted = self.predict(X)
        # f1_score is documented as f1_score(y_true, y_pred); pass the ground
        # truth first so the call matches the documented argument order.
        return f1_score(y, predicted, average='macro')

In [8]:
# Load the data set through the `pickle_operating` helper (defined outside this cell).
# NOTE(review): presumably this unpickles 'Caltech_data_2.pickle' -- only load pickle
# files from a trusted source, since unpickling can execute arbitrary code.
dataset = pickle_operating('Caltech_data_2', None)
# Show the number of samples in the 'train' and 'test' splits.
print(len(dataset['train']), len(dataset['test']))

Caltech_data_2.pickle
(320, 324)


In [22]:
# Utility function to report best scores
def report(results, n_top=10):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (e.g. the ``cv_results_`` of a search).
        'rank_test_score' is compared element-wise against each rank, so it
        has to be array-like in the NumPy sense.
    n_top : int, default=10
        Highest rank to print; ranks 1..n_top are visited in increasing order.

    For every candidate whose rank equals the current rank, four lines are
    printed: the rank, the mean and std of the validation score (3 decimals),
    the parameter setting, and an empty separator line.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates that share this rank (ties are possible).
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for position in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][position]
            std_score = results['std_test_score'][position]
            score_line = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score)
            print(score_line)
            params_line = "Parameters: {0}".format(results['params'][position])
            print(params_line)
            print("")

In [18]:
# Every training sample is indexed as a pair: x[0] goes to X and x[1] to y.
X, y = [x[0] for x in dataset['train']], [x[1] for x in dataset['train']]

# Seed for the hyper-parameter sampling, so that re-running this cell draws
# the same candidates instead of a different random setting every time.
SEARCH_SEED = 0

# specify parameters and distributions to sample from
# (scipy's randint excludes the upper bound, i.e. min_leaf_size is drawn
# from 15..50 and max_level from 2..10)
param_dist = {"min_leaf_size": sp_randint(15, 51),
              "max_level": sp_randint(2, 11),
              "criterion": ["entropy", "gini"],
              "splitting": ['median', "mean"]}

# run randomized search
n_iter_search = 1
random_search = RandomizedSearchCV(MyClassifier(), param_dist,
                                   cv=5,
                                   n_iter=n_iter_search,
                                   random_state=SEARCH_SEED)

start = time()
# The complete samples (dataset['train']) are passed as the first argument on
# purpose, not the feature-only list X: MyClassifier.fit forwards only its
# first argument to `experiment` and ignores `y`. `y` is still supplied so the
# search can hand the labels to MyClassifier.score.
random_search.fit(dataset['train'], y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))


RandomizedSearchCV took 1.69 seconds for 1 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 6, 'min_leaf_size': 42, 'criterion': 'gini', 'splitting': 'median'}



In [23]:
report(random_search.cv_results_, 5)

Model with rank: 1
Mean validation score: 0.229 (std: 0.040)
Parameters: {'max_level': 6, 'min_leaf_size': 42, 'criterion': 'gini', 'splitting': 'median'}

