# Single Table Regression + Tuning Demo

In this demo, we will learn how to use a simple MLBazaar pipeline in combination
with the BTB tuning library to predict housing prices.

In [103]:
# Setup logging and imports

from utils import get_tunables, pprint, setup

setup()

import numpy as np
from btb.tuning import GP
from mlblocks import MLPipeline
from mlprimitives.datasets import load_boston
from sklearn.model_selection import KFold
from tqdm import tnrange, tqdm_notebook

### Load the Dataset

First we load The Boston Housing Dataset form the MLPrimitives library.

The Boston Housing Dataset is a derived from information collected by the U.S. Census Service concerning housing in the area of Boston MA. The following describes the dataset columns:

- CRIM - per capita crime rate by town
- ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS - proportion of non-retail business acres per town.
- CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
- NOX - nitric oxides concentration (parts per 10 million)
- RM - average number of rooms per dwelling
- AGE - proportion of owner-occupied units built prior to 1940
- DIS - weighted distances to five Boston employment centres
- RAD - index of accessibility to radial highways
- TAX - full-value property-tax rate per \$10,000
- PTRATIO - pupil-teacher ratio by town
- B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT - % lower status of the population
- MEDV - Median value of owner-occupied homes in \$1000's

In [115]:
dataset = load_boston()

X = dataset.data
y = dataset.target

The `X` variable contains the first 13 columns of the dataset:

In [116]:
X.shape

(506, 13)

In [117]:
dataset.data[0:3]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00]])

While the `y` variable contains the 14th column, which we will try to predict:

In [122]:
y.shape

(506,)

In [123]:
y[0:5]

array([24. , 21.6, 34.7, 33.4, 36.2])

### Split the dataset in train and test

We will use the `get_split` function from the dataset object
to split the data in two parts:

- `X_train` and `y_train` is the data that we will use to tune and fit our pipeline.
- `X_test` and `y_test` is the data that we will use to evaluate our pipeline performance.

In [124]:
X_train, X_test, y_train, y_test = dataset.get_splits(1)

### Create the Pipeline

In this case we will create a very simple pipeline with only two primitives:

- A StandardScaler, that calculates the Z-score of each variable
- A Lasso Regression

In [125]:
primitives = [
    "sklearn.preprocessing.StandardScaler",
    "sklearn.linear_model.Lasso"
]

In [126]:
pipeline = MLPipeline(primitives)

### Evaluate the Pipeline

Before attempting to tune the pipeline, we will use it with its default hyperparameter
values to see how well it performs.

In [131]:
default_hyperparams = pipeline.get_hyperparameters(flat=True)
default_hyperparams

{('sklearn.preprocessing.StandardScaler#1', 'with_mean'): True,
 ('sklearn.preprocessing.StandardScaler#1', 'with_std'): True,
 ('sklearn.linear_model.Lasso#1', 'copy_X'): True,
 ('sklearn.linear_model.Lasso#1', 'warm_start'): False,
 ('sklearn.linear_model.Lasso#1', 'random_state'): None,
 ('sklearn.linear_model.Lasso#1', 'precompute'): False,
 ('sklearn.linear_model.Lasso#1', 'normalize'): False,
 ('sklearn.linear_model.Lasso#1', 'fit_intercept'): True,
 ('sklearn.linear_model.Lasso#1', 'alpha'): 1.0,
 ('sklearn.linear_model.Lasso#1', 'max_iter'): 1000,
 ('sklearn.linear_model.Lasso#1', 'tol'): 0.0001,
 ('sklearn.linear_model.Lasso#1', 'positive'): False,
 ('sklearn.linear_model.Lasso#1', 'selection'): 'cyclic'}

In [132]:
pipeline.fit(X_train, y_train)

In [133]:
predictions = pipeline.predict(X_test)

In [138]:
default_score = dataset.score(y_test, predictions)
default_score

0.6935445588519191

### Prepare for tuning

Now we will obtain the list of tunable hyperparameters and their possible ranges
from the pipeline object, which we will later on use to tune them.

In [135]:
tunable_hyperparameters = pipeline.get_tunable_hyperparameters()
pprint(tunable_hyperparameters)

{
    "sklearn.preprocessing.StandardScaler#1": {
        "with_mean": {
            "type": "bool",
            "default": true
        },
        "with_std": {
            "type": "bool",
            "default": true
        }
    },
    "sklearn.linear_model.Lasso#1": {
        "normalize": {
            "type": "bool",
            "default": false,
            "description": "This parameter is ignored when fit_intercept is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm."
        },
        "fit_intercept": {
            "type": "bool",
            "default": true,
            "description": "Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (e.g. data is expected to be already centered)."
        },
        "alpha": {
            "type": "float",
            "description": "Regularization strength. Alpha corresponds to C^-1 in other linear mod

### Start the tuning loop

Now we will start using a BTB tuner to try to find the optimal hyperparameter
values for this problem, with the goal of improving the default score that
we have previously obtained.

For this, we will start a tuning loop of 300 iterations where, for each iteration,
we will:

- Ask the tuner for new hyperparameter values to try
- Cross validate the pipeline over the training data using the obtained hyperparameters
- Inform the tuner about the score obtained using the proposed hyperparameters.

In [139]:
# Setup the tuner
tunables = get_tunables(tunable_hyperparameters)
tuner = GP(tunables, r_minimum=10)
tuner.add(default_hyperparams, default_score)

# Set up the KFold splitter
kfold = KFold(n_splits=5)

# Start the tuning loop
best = 0
for i in tnrange(300):
    params = tuner.propose()
    pipeline.set_hyperparameters(params)
    
    scores = []
    for train, test in kfold.split(X_train, y_train):
        pipeline.fit(X_train[train], y_train[train])
        predictions = pipeline.predict(X_train[test])
        scores.append(dataset.score(y_train[test], predictions))
    
    score = np.mean(scores)
    tuner.add(params, score)
    
    if score > best:
        best = score
        best_params = params
        print("{} - Best score so far: {}".format(i, best))

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))

0 - Best score so far: 0.33732400114209027
1 - Best score so far: 0.5198225258588864
18 - Best score so far: 0.5972870399338879
56 - Best score so far: 0.6505453962748078
97 - Best score so far: 0.6510096455741543
223 - Best score so far: 0.6772882922397171



In [140]:
best_params

{('sklearn.preprocessing.StandardScaler#1', 'with_mean'): True,
 ('sklearn.preprocessing.StandardScaler#1', 'with_std'): True,
 ('sklearn.linear_model.Lasso#1', 'normalize'): False,
 ('sklearn.linear_model.Lasso#1', 'fit_intercept'): True,
 ('sklearn.linear_model.Lasso#1', 'alpha'): 0.015689893350340295,
 ('sklearn.linear_model.Lasso#1', 'max_iter'): 5514,
 ('sklearn.linear_model.Lasso#1', 'tol'): 0.005459799384172081,
 ('sklearn.linear_model.Lasso#1', 'positive'): False,
 ('sklearn.linear_model.Lasso#1', 'selection'): 'random'}

### Evaluate the found hyperparameters

After the tuning has been finished, we evaluate again the performance of the pipeline
over the training data when using the new hyperparameter values.

In [141]:
pipeline.set_hyperparameters(best_params)

In [142]:
pipeline.fit(X_train, y_train)

In [143]:
predictions = pipeline.predict(X_test)

In [144]:
dataset.score(y_test, predictions)

0.7438447686752354