
# Scikit-learn model optimisation

## Start from good old GridSearch

In [0]:
# by Andrey Ustyuzhanin with heavy scikit-learn documentation re-use
# Kudos to Raghav RV <rvraghav93@gmail.com>

import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [0]:
# draw 8000 samples from the Hastie 10.2 generator; the fixed seed keeps the data reproducible
X, y = make_hastie_10_2(
    n_samples=8000,
    random_state=42,
)

In [0]:
# let's check X shape
# NOTE: `<YOUR CODE>` is an exercise placeholder; replace it with an expression before running this cell
print (<YOUR CODE>) 

In [0]:
# let's start with a simple 1D grid of numbers from 2 to 402 with step 10
# NOTE: `<YOUR CODE>` is an exercise placeholder; replace it with the grid values before running this cell
param_grid ={'min_samples_split': <YOUR CODE>} 

In [0]:
# Each scorer is either one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {'AUC': 'roc_auc'}

# Setting refit='AUC' refits an estimator on the whole dataset with the
# parameter setting that has the best cross-validated AUC score.
# That estimator is made available at ``gs.best_estimator_`` along with
# attributes like ``gs.best_score_``, ``gs.best_params_`` and
# ``gs.best_index_``
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    refit='AUC',
    return_train_score=True,
)
gs.fit(X, y);

In [0]:
# List the keys available inside 'cv_results_', one per line
print(*gs.cv_results_.keys(), sep="\n")

In [0]:
def plot_optimisation(results, scoring, param_name):
    """Plot mean train/test scores (with std bands) against one searched parameter.

    For every scorer the train curve is dashed and the test curve is solid.
    The best-ranked test score is marked with a vertical dash-dot line and
    annotated with its value.

    Parameters
    ----------
    results : mapping
        Search results laid out like ``cv_results_``. It must provide
        ``'param_<param_name>'`` (an array exposing ``.data``) and, for each
        scorer, ``'mean_<sample>_<scorer>'`` / ``'std_<sample>_<scorer>'`` for
        ``sample`` in ``('train', 'test')`` plus ``'rank_test_<scorer>'``.
    scoring : iterable of str
        Scorer names (a dict keyed by scorer name works). They are drawn in
        sorted order.
    param_name : str
        Name of the searched parameter shown on the x axis; its values must be
        convertible to float.
    """
    # Create the figure and its single Axes together and decorate that Axes
    # explicitly. Calling plt.title()/plt.xlabel() first and plt.axes()
    # afterwards can leave two overlapping Axes, because newer Matplotlib no
    # longer reuses the implicitly created one.
    _, ax = plt.subplots(figsize=(8, 8))
    ax.set_title("GridSearchCV evaluating", fontsize=14)
    ax.set_xlabel(param_name)
    ax.set_ylabel("Score")

    # Running bounds of mean +/- 1.5 * std over everything that gets plotted
    y_min, y_max = np.inf, -np.inf

    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results['param_' + param_name].data, dtype=float)

    # 'g' and 'k' stay the first two colours; the palette is cycled so that
    # additional scorers are still drawn instead of being silently dropped.
    palette = ['g', 'k', 'b', 'r', 'm', 'c']

    for position, scorer in enumerate(sorted(scoring)):
        color = palette[position % len(palette)]

        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
            sample_score_std = results['std_%s_%s' % (sample, scorer)]
            # The std band is only visible for the test curve (alpha 0 hides it for train)
            ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0, color=color)
            ax.plot(X_axis, sample_score_mean, style, color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))
            y_max = max(np.max(sample_score_mean + 1.5 * sample_score_std), y_max)
            y_min = min(np.min(sample_score_mean - 1.5 * sample_score_std), y_min)

        # First candidate holding rank 1 for this scorer
        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]

        # Plot a dash-dot vertical line up to the best score for that scorer, marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))

    # Only clamp the y range when at least one finite curve was drawn;
    # otherwise keep Matplotlib's automatic limits.
    if np.isfinite(y_min) and np.isfinite(y_max):
        ax.set_ylim(y_min, y_max)
    ax.legend(loc="best")
    plt.show()

In [0]:
# plot the scores against the first (here: only) parameter of the grid
plot_optimisation(gs.cv_results_, scoring, list(param_grid)[0])

So far, so good. Let's add `accuracy_score` as an extra metric.

In [0]:
# Multi-metric scoring: keep the predefined 'roc_auc' string and add a scorer
# built with make_scorer.
# NOTE: `<ACCURACY FUNCTION>` is an exercise placeholder; replace it before running this cell.
scoring = {'AUC': 'roc_auc', 
           'accuracy': make_scorer(<ACCURACY FUNCTION>)
          }

# refit='AUC' names which entry of `scoring` selects the parameters of the refitted estimator
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                  param_grid=param_grid,
                  scoring=scoring, cv=5, refit='AUC',
                  return_train_score=True)
gs.fit(X, y);

In [0]:
# plot every scorer in `scoring` against the first (here: only) parameter of the grid
plot_optimisation(gs.cv_results_, scoring, list(param_grid)[0])

## RandomizedSearch

In [0]:
from scipy.stats import randint as sp_randint
from time import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# load the digits dataset and pull out its data and target arrays
# NOTE(review): X and y are rebound here, replacing the arrays used in the sections above
digits = load_digits()
X = digits.data
y = digits.target


In [0]:
# Let's check X and X[0] shapes
# NOTE: `<YOUR CODE>` is an exercise placeholder; replace it with the requested expression(s) before running this cell
print(<YOUR CODE>) 

In [0]:
# show the first sample reshaped into an 8x8 grid
plt.imshow(X[0].reshape((8, 8)));

In [0]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the candidates that occupy the first ``n_top`` test-score ranks.

    Parameters
    ----------
    results : mapping
        Must provide ``'rank_test_score'``, ``'mean_test_score'``,
        ``'std_test_score'`` and ``'params'``, all indexed by candidate.
    n_top : int, default 3
        Highest rank to print; ranks are visited from 1 to ``n_top`` and
        every candidate tied on a rank is printed.
    """
    for rank in range(1, n_top + 1):
        # indices of all candidates sharing this rank
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [0]:
# build a classifier; the fixed seed makes repeated runs grow the same forest
# (consistent with the seeded estimators used earlier in this notebook)
clf = RandomForestClassifier(n_estimators=20, random_state=42)

# specify parameters and distributions to sample from:
# lists are sampled uniformly, sp_randint(low, high) draws integers in [low, high)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [0]:
# run randomized search; random_state fixes which candidates are sampled from
# the distributions so the reported ranking is reproducible across runs
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, random_state=42)

# time only the fit, then print the top-ranked candidates
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)



### Compare with GridSearch

In [0]:
# show the configuration of the estimator that is passed to both searches
print(clf)

In [0]:
# use a full grid over all parameters
# NOTE: the `<YOUR CODE>` entries are exercise placeholders; fill them in before running this cell
# NOTE(review): this rebinds `param_grid`, which the decision-tree plotting cells above also read
param_grid = {"max_depth": [3, None],
              "max_features": <YOUR CODE>, # logarithmic from 1 to 10
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": <YOUR CODE>, # either True or False
              "criterion": ["gini", "entropy"]}

# run grid search and time only the fit
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

# the number of evaluated candidates is the length of cv_results_['params']
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)