
# Scikit-learn model optimisation

## Start from good old GridSearch

In [None]:
# by Andrey Ustyuzhanin with heavy scikit-learn documentation re-use
# Kudos to Raghav RV <rvraghav93@gmail.com>

import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.datasets import make_hastie_10_2, make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.manifold import locally_linear_embedding
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

%matplotlib inline

In [None]:
# Synthetic binary classification problem: 2000 samples, 20 features of which
# 10 are informative (no redundant or repeated ones), 2 clusters per class.
# The fixed random_state keeps the generated data identical between runs.
X, y = make_classification(
    n_samples=2000,
    n_features=20,
    n_informative=10,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    random_state=0,
)



In [None]:
# Project the feature matrix onto its first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Scatter the two components against each other, coloured by class label.
plt.scatter(*X_pca.T, c=y, cmap=plt.cm.Paired)
plt.title("PCA embedding")
plt.show()




In [None]:
# LLE embedding. PCA would be a better choice here, but LLE is more fun.
# random_state seeds the eigen-solver when it resolves to 'arpack', so the
# embedding is repeatable between runs, just like the dataset generation.
X_lle, err = locally_linear_embedding(X, n_neighbors=5, n_components=2, random_state=0)

# visualize embedded data, coloured by class label
plt.scatter(X_lle[:, 0], X_lle[:, 1], c=y, cmap='bwr', alpha=0.5)
plt.title("LLE embedding")
plt.show()

# last expression of the cell: shape of the embedded data
X_lle.shape

In [None]:
# Search space with a single entry, `min_samples_split`, swept over
# 2, 12, 22, ..., 392 (start 2, stop 402 exclusive, step 10).
param_grid = dict(min_samples_split=np.arange(2, 402, 10))



In [None]:
# A scorer is given either as the name of a predefined metric or as a scorer
# callable such as the one make_scorer returns; both forms are used here.
# The grid search below is driven by these two metrics: 'accuracy' and 'roc_auc'.
scorers = dict(
    accuracy=make_scorer(accuracy_score),
    roc_auc='roc_auc',
)

In [None]:
# run grid search with DecisionTreeClassifier, param_grid, scorers that computes training scores
# and 5-fold cross-validation.
# The tree gets a fixed random_state so that the scores are reproducible
# between runs, in line with the seeded dataset above.
grid_obj = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    param_grid,
    scoring=scorers,
    refit='accuracy',  # with several metrics, refit names the one used to pick the best model
    cv=5,
    return_train_score=True,
)
grid_fit = grid_obj.fit(X, y)



In [None]:
# Let's examine the keys available inside the 'cv_results_' attribute of grid_fit.
# The plotting and reporting helpers in this notebook read per-metric entries
# named like 'mean_test_<scorer>', 'std_train_<scorer>' and 'rank_test_<scorer>'.
grid_fit.cv_results_.keys()

In [None]:
def plot_optimisation(results, scoring, param_name):
    """Plot mean train/test scores of a parameter search against one parameter.

    For every scorer a dashed (train) and a solid (test) curve of the mean
    score is drawn, the test curve gets a +/- one standard deviation band, and
    the best-ranked test score is marked with a dash-dotted vertical line and
    annotated with its value.

    Parameters
    ----------
    results : mapping
        Search results such as ``cv_results_``. Must contain
        ``'param_<param_name>'`` (an array exposing ``.data``) and, for each
        scorer, ``mean_train_<scorer>``, ``std_train_<scorer>``,
        ``mean_test_<scorer>``, ``std_test_<scorer>`` and
        ``rank_test_<scorer>``.
    scoring : iterable of str
        Scorer names; iterating a dict yields its keys. Scorers are plotted in
        sorted order.
    param_name : str
        Name of the searched parameter shown on the x axis; its values must be
        convertible to float.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Create the figure together with its single axes and label that axes
    # directly. Labelling through pyplot first and calling ``plt.axes()``
    # afterwards adds a second, overlapping axes on current Matplotlib.
    _, ax = plt.subplots(figsize=(8, 8))
    ax.set_title("GridSearchCV evaluating", fontsize=14)
    ax.set_xlabel(param_name)
    ax.set_ylabel("Score")

    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results['param_' + param_name].data, dtype=float)

    # The first two colours match the original look; the palette is cycled so
    # that additional scorers are drawn instead of being silently dropped.
    palette = ('g', 'k', 'b', 'r', 'm', 'c', 'y')
    y_min, y_max = np.inf, -np.inf

    for position, scorer in enumerate(sorted(scoring)):
        color = palette[position % len(palette)]

        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
            sample_score_std = results['std_%s_%s' % (sample, scorer)]
            # Only the test band is visible; the train band is fully transparent.
            ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0, color=color)
            ax.plot(X_axis, sample_score_mean, style, color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))
            # Track the y-range with a 1.5 std margin around every curve.
            y_max = max(np.max(sample_score_mean + 1.5 * sample_score_std), y_max)
            y_min = min(np.min(sample_score_mean - 1.5 * sample_score_std), y_min)

        # First candidate that holds rank 1 for this scorer.
        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]

        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))

    # With no scorers (or non-finite scores) keep Matplotlib's automatic limits
    # instead of passing unusable bounds to set_ylim.
    if np.isfinite(y_min) and np.isfinite(y_max):
        ax.set_ylim(y_min, y_max)
    if ax.get_legend_handles_labels()[0]:
        ax.legend(loc="best")
    plt.show()

In [None]:
# Draw the train/test score curves of the grid search over `min_samples_split`.
plot_optimisation(
    results=grid_fit.cv_results_,
    scoring=scorers,
    param_name='min_samples_split',
)

## RandomizedSearch

In [None]:
from scipy.stats import randint as sp_randint
from time import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier



In [None]:
# Helper that prints the top-ranked candidates of a parameter search
def report(results, n_top=3, score_name='roc_auc'):
    """Print the candidates holding test ranks 1..n_top for one metric.

    `results` is indexed with the keys 'rank_test_<score_name>',
    'mean_test_<score_name>', 'std_test_<score_name>' and 'params'.
    For each matching candidate the rank, the mean score with its standard
    deviation (three decimals) and the parameter setting are printed,
    followed by an empty line. Nothing is returned.
    """
    rank_col = 'rank_test_%s' % score_name
    mean_col = 'mean_test_%s' % score_name
    std_col = 'std_test_%s' % score_name

    place = 1
    while place <= n_top:
        # Several candidates can share one rank, so each place may print more than once.
        for row in np.flatnonzero(results[rank_col] == place):
            print("Model with rank: {0}".format(place))
            mean_score = results[mean_col][row]
            score_std = results[std_col][row]
            print("Mean validation {0}, score: {1:.3f} (std: {2:.3f})".format(
                score_name, mean_score, score_std))
            print("Parameters: {0}".format(results['params'][row]))
            print("")
        place += 1

In [None]:
# build a classifier; the fixed random_state makes the forest (and therefore
# every search score computed with it) reproducible between runs
clf = RandomForestClassifier(n_estimators=20, random_state=0)

# specify parameters and distributions to sample from:
# lists are sampled uniformly, sp_randint(low, high) draws integers in [low, high)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [None]:
# run randomized search: n_iter_search parameter settings are sampled from
# param_dist and each is scored with 5-fold cross-validation on both metrics.
# random_state fixes which settings get sampled, so reruns report the same
# candidates instead of a fresh random draw every time.
n_iter_search = 60
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, scoring=scorers,
                                   n_iter=n_iter_search, refit='accuracy', cv=5,
                                   return_train_score=True, random_state=0)

# time the whole search
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))




In [None]:
# Show the three best randomized-search candidates ranked by ROC AUC.
report(random_search.cv_results_, n_top=3, score_name='roc_auc')

### Homework. Compare with GridSearch

In [None]:
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": <YOUR CODE>, # logarithmic from 1 to 10
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": <YOUR CODE>, # either True or False
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)