In [1]:
import warnings

# Suppress all warnings: the 'ignore' action with no other filters matches every
# warning, which also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### Import boston dataset
First, let's import our dataset. In this case, we will use the `Boston` regression dataset
that comes with `sklearn`.

In [2]:
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably requires scikit-learn < 1.2 — confirm the pinned version.
from sklearn.datasets import load_boston as load_dataset

# Load the dataset object; its `.data` and `.target` attributes are used for the split.
dataset = load_dataset()

#### Import train_test_split and generate X_train, X_test, y_train, y_test
Next, we will split our data into `train` and `test` sets using `train_test_split`, also from `sklearn`.

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.3,
    random_state=0,
)

### Import the metric and the estimators
For this example, we will compare `RandomForestRegressor` against `ExtraTreesRegressor`: we want to
tune them and see which of them is the better one for this problem. We will also use `r2_score` as
the scoring metric.

In [4]:
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# NOTE: these starting hyperparameters are deliberately poor so that the
# BTBSession demo below has plenty of room to improve on them.
default_hyperparams = dict(
    n_estimators=2,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=2,
)

# Baseline: fit a random forest with the defaults and score it on the held-out set.
rf = RandomForestRegressor(random_state=0, **default_hyperparams).fit(X_train, y_train)
pred = rf.predict(X_test)

r2_score(y_test, pred)

0.7149946643194653

In [5]:
# Baseline: same default hyperparameters, this time with ExtraTreesRegressor.
et = ExtraTreesRegressor(random_state=0, **default_hyperparams).fit(X_train, y_train)
pred = et.predict(X_test)

r2_score(y_test, pred)

0.6108880572971567

###  Create a Tunable dict
Next, we will create a dictionary whose keys are the names of our `tunables` and whose values
contain the `tunable hyperparameters`. Those `tunable hyperparameters` are the ones that will
be tuned by the `btb.session.BTBSession` in order to improve the score.

In [6]:
def _tree_ensemble_space():
    """Return a fresh hyperparameter search space used by both tree ensembles.

    A new dictionary (with new nested dictionaries and lists) is built on
    every call, so the entries of ``tunables`` never share mutable state.
    """
    return {
        'n_estimators': dict(type='int', default=2, range=[1, 1000]),
        'max_features': dict(type='str', default='log2', range=[None, 'auto', 'log2', 'sqrt']),
        'min_samples_split': dict(type='int', default=2, range=[2, 20]),
        'min_samples_leaf': dict(type='int', default=2, range=[1, 20]),
    }


# One entry per candidate model; both start from the same search space and defaults.
tunables = {
    model_name: _tree_ensemble_space()
    for model_name in ('random_forest', 'extra_trees')
}

### Create a `score` function
As `BTBSession` requires, we will create a function that scores our estimators. In this case
we will use `cross_val_score` with an `r2_scorer` built from the `r2_score` metric that we imported before.

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# Registry mapping each model name to the estimator class that `build_model` instantiates.
models = dict(
    random_forest=RandomForestRegressor,
    extra_trees=ExtraTreesRegressor,
)

def build_model(name, hyperparameters):
    """Instantiate the estimator registered under ``name``.

    Args:
        name (str):
            Key of the estimator class in the ``models`` registry.
        hyperparameters (dict):
            Keyword arguments forwarded to the estimator constructor.

    Returns:
        An unfitted estimator created with ``random_state=0`` and the given
        hyperparameters.

    Raises:
        KeyError: If ``name`` is not a key of ``models``.
    """
    estimator_cls = models[name]
    estimator = estimator_cls(random_state=0, **hyperparameters)
    return estimator

def score_model(name, hyperparameters):
    """Score one model configuration by cross-validation on the training split.

    Args:
        name (str):
            Name of the model, passed on to ``build_model``.
        hyperparameters (dict):
            Hyperparameters to build the model with.

    Returns:
        The mean of the per-fold R2 scores returned by ``cross_val_score`` on
        ``X_train`` / ``y_train``.
    """
    fold_scores = cross_val_score(
        build_model(name, hyperparameters),
        X_train,
        y_train,
        scoring=make_scorer(r2_score),
    )
    return fold_scores.mean()

### Instantiate BTBSession
After creating our `tunables` and our `scoring` function, we can proceed to `tune` them. (We will use
`verbose=True` in order to print a bar with the progress during `run`).

In [8]:
from btb.session import BTBSession

# Create the tuning session from the search spaces and the scoring function;
# verbose=True displays a progress bar while `run` executes.
session = BTBSession(tunables, score_model, verbose=True)

### Run method
The main method, which returns the `best_proposal` after `n` iterations, is `run`. This method
will iterate through the `tunables`, create proposals and score them with the `scoring` function.
Bear in mind that `BTBSession` will first try the default configuration at least once, which means
that each `tunable` is given at least one run.

In [9]:
session.run(iterations=2)  # Two iterations: each tunable is scored once with its default hyperparameters

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




{'id': 'd02e055a1279d42b74169151047c542c',
 'name': 'extra_trees',
 'config': {'n_estimators': 2,
  'max_features': 'log2',
  'min_samples_split': 2,
  'min_samples_leaf': 2},
 'score': 0.7294475145162741}

In [10]:
# Show every proposal scored so far, keyed by proposal id.
session.proposals

{'40e9094fa4b1901410b62c95ecbe9a21': {'id': '40e9094fa4b1901410b62c95ecbe9a21',
  'name': 'random_forest',
  'config': {'n_estimators': 2,
   'max_features': 'log2',
   'min_samples_split': 2,
   'min_samples_leaf': 2},
  'score': 0.7096417128432014},
 'd02e055a1279d42b74169151047c542c': {'id': 'd02e055a1279d42b74169151047c542c',
  'name': 'extra_trees',
  'config': {'n_estimators': 2,
   'max_features': 'log2',
   'min_samples_split': 2,
   'min_samples_leaf': 2},
  'score': 0.7294475145162741}}

#### Iterate over more iterations
When calling `run` again, we will continue from the last checkpoint, which in this case was at `iteration=2`.
Let's give it some more `iterations` and save the `best_proposal` in a variable (you can always access it from
`session.best_proposal`).

In [11]:
# Run 100 more iterations and keep the best proposal returned by `run`.
best_proposal = session.run(iterations=100)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




#### best_proposal
A dictionary containing the proposal `id`, the `name` of the model (its key in `tunables`), the best
`config` found for it, and the `score` obtained with the `scoring` function.

In [12]:
# Display the best proposal returned by the previous `run` call.
best_proposal

{'id': 'fb22d93a7753e26834d07ca9c9a8dd40',
 'name': 'extra_trees',
 'config': {'n_estimators': 611,
  'max_features': 'log2',
  'min_samples_split': 2,
  'min_samples_leaf': 1},
 'score': 0.8686338104323431}

#### Build a model with the tuned configuration

In [13]:
# Re-create the winning estimator from its name and tuned hyperparameters.
best_model = build_model(
    name=best_proposal['name'],
    hyperparameters=best_proposal['config'],
)

#### Fit the model and score it.
Bear in mind that the `r2_score` on the held-out test set may be lower than the cross-validated score obtained during tuning.

In [14]:
# Fit the tuned model on the training split, then score its predictions on the test split.
pred = best_model.fit(X_train, y_train).predict(X_test)

r2_score(y_test, pred)

0.8007483075651847