# Single Table Regression + Tuning Demo

In [1]:
# Imports, logging and pretty-print setup

# Keep cell outputs readable: only log records at ERROR level or above are
# let through, and all Python warnings are ignored.
import logging
import warnings

logging.basicConfig(level=logging.ERROR)
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

warnings.simplefilter(action="ignore")

import json

def pp(a_dict):
    """Pretty-print a JSON-serializable object to stdout.

    Args:
        a_dict: Any object accepted by ``json.dumps`` (typically a dict).

    Returns:
        None. The JSON text, indented by 4 spaces, is written with ``print``.
    """
    rendered = json.dumps(a_dict, indent=4)
    print(rendered)
    
import numpy as np

from btb import HyperParameter
from btb.tuning import GP
from mlblocks import MLPipeline
from sklearn.model_selection import KFold

### Load the Dataset

In [3]:
# Loader for the Boston House Prices demo dataset, provided by mlprimitives.
from mlprimitives.datasets import load_boston

# The returned object exposes the features (`data`), the targets (`target`) and
# the helper methods used in the cells below: `describe`, `get_splits`, `score`.
dataset = load_boston()

In [4]:
dataset.describe()

Boston House Prices Dataset.


In [4]:
dataset.data.shape

(506, 13)

In [6]:
dataset.data[0]

array([6.320e-03, 1.800e+01, 2.310e+00, 0.000e+00, 5.380e-01, 6.575e+00,
       6.520e+01, 4.090e+00, 1.000e+00, 2.960e+02, 1.530e+01, 3.969e+02,
       4.980e+00])

In [5]:
dataset.target.shape

(506,)

In [7]:
dataset.target[0:5]

array([24. , 21.6, 34.7, 33.4, 36.2])

### Split the dataset into train and test sets

In [10]:
# Partition the dataset into train and test features (X_*) and targets (y_*).
# NOTE(review): the argument `1` is presumably the number of splits to
# generate - confirm against the mlprimitives dataset `get_splits` signature.
X_train, X_test, y_train, y_test = dataset.get_splits(1)

### Create the Pipeline

In [8]:
# The pipeline consists of a single primitive: an XGBoost regressor.
primitives = ["xgboost.XGBRegressor"]

# Initial hyperparameters, keyed by block name (primitive name plus a "#1"
# counter suffix, the same key that get_tunable_hyperparameters() reports).
hyperparameters = dict()
hyperparameters['xgboost.XGBRegressor#1'] = {'objective': 'reg:squarederror'}

In [9]:
# Build the pipeline and apply the initial hyperparameters declared in the
# previous cell. Without `init_params` the `hyperparameters` dict is defined
# but never reaches the XGBRegressor block, so its `objective` setting is lost.
pipeline = MLPipeline(primitives, init_params=hyperparameters)

### Evaluate the Pipeline

In [11]:
pipeline.fit(X_train, y_train)

In [12]:
predictions = pipeline.predict(X_test)

In [13]:
dataset.score(y_test, predictions)

0.8840473293667634

### Prepare for tuning

In [14]:
pp(pipeline.get_tunable_hyperparameters())

{
    "xgboost.XGBRegressor#1": {
        "n_estimators": {
            "type": "int",
            "default": 100,
            "range": [
                10,
                1000
            ]
        },
        "max_depth": {
            "type": "int",
            "default": 3,
            "range": [
                3,
                10
            ]
        },
        "learning_rate": {
            "type": "float",
            "default": 0.1,
            "range": [
                0,
                1
            ]
        },
        "gamma": {
            "type": "float",
            "default": 0.1,
            "range": [
                0,
                1
            ]
        },
        "min_child_weight": {
            "type": "int",
            "default": 1,
            "range": [
                1,
                10
            ]
        }
    }
}


In [15]:
# Convert the pipeline's tunable hyperparameter specs into the structures used
# for tuning:
#   tunables - list of ((block_name, param_name), HyperParameter) pairs
#   defaults - {(block_name, param_name): default value from the spec}
tunables = []
defaults = {}
tunable_specs = pipeline.get_tunable_hyperparameters()
for block_name, block_specs in tunable_specs.items():
    for param_name, spec in block_specs.items():
        key = (block_name, param_name)

        # The spec calls the type 'str'; it is passed to HyperParameter as
        # 'string' (presumably the name BTB expects - confirm).
        param_type = spec['type']
        if param_type == 'str':
            param_type = 'string'

        # Booleans carry no explicit range in the spec handling below, so both
        # values are enumerated; everything else uses 'range', falling back to
        # 'values' when 'range' is missing or empty.
        if param_type == 'bool':
            param_range = [True, False]
        else:
            param_range = spec.get('range') or spec.get('values')

        tunables.append((key, HyperParameter(param_type, param_range)))
        defaults[key] = spec['default']

In [16]:
tunables

[(('xgboost.XGBRegressor#1', 'n_estimators'),
  <btb.hyper_parameter.IntHyperParameter at 0x7f38de05ca20>),
 (('xgboost.XGBRegressor#1', 'max_depth'),
  <btb.hyper_parameter.IntHyperParameter at 0x7f38de05cdd8>),
 (('xgboost.XGBRegressor#1', 'learning_rate'),
  <btb.hyper_parameter.FloatHyperParameter at 0x7f38de05cb38>),
 (('xgboost.XGBRegressor#1', 'gamma'),
  <btb.hyper_parameter.FloatHyperParameter at 0x7f38de05cc18>),
 (('xgboost.XGBRegressor#1', 'min_child_weight'),
  <btb.hyper_parameter.IntHyperParameter at 0x7f38de05cd30>)]

In [17]:
defaults

{('xgboost.XGBRegressor#1', 'n_estimators'): 100,
 ('xgboost.XGBRegressor#1', 'max_depth'): 3,
 ('xgboost.XGBRegressor#1', 'learning_rate'): 0.1,
 ('xgboost.XGBRegressor#1', 'gamma'): 0.1,
 ('xgboost.XGBRegressor#1', 'min_child_weight'): 1}

### Start the tuning loop

In [18]:
# Tuning loop: the BTB `GP` tuner proposes a set of hyperparameters, the
# proposal is scored with 5-fold cross validation on the training partition
# only (the test partition stays untouched), and the mean score is reported
# back to the tuner so it can inform later proposals.
# NOTE(review): `r_minimum=10` is presumably the number of initial proposals
# made before the tuner's model is used - confirm against the BTB docs.
tuner = GP(tunables, r_minimum=10)
kfold = KFold(n_splits=5)

# Start from -inf instead of 0: the score can be zero or negative (e.g. an
# R^2-style metric on a bad fit), and with `best = 0` such runs would never
# assign `best_params`, making the evaluation cells below fail with NameError
# or silently reuse a stale value left over from a previous run.
best = -np.inf
best_params = None
for i in range(100):
    params = tuner.propose()
    pipeline.set_hyperparameters(params)

    # Cross-validate the proposal; `train` / `test` are index arrays into the
    # training partition.
    scores = []
    for train, test in kfold.split(X_train, y_train):
        pipeline.fit(X_train[train], y_train[train])
        predictions = pipeline.predict(X_train[test])
        scores.append(dataset.score(y_train[test], predictions))

    score = np.mean(scores)
    tuner.add(params, score)

    # Higher is better: keep the best-scoring proposal seen so far.
    if score > best:
        best = score
        best_params = params
        print("{} - Best score so far: {}".format(i, best))

0 - Best score so far: 0.8131233290167268
2 - Best score so far: 0.8185586557020965
5 - Best score so far: 0.82290229127515
13 - Best score so far: 0.8244420866234726
17 - Best score so far: 0.829687744856996
39 - Best score so far: 0.8373185176550653
47 - Best score so far: 0.8398385688230501
49 - Best score so far: 0.8411160438899634


### Evaluate the best hyperparameters found

In [19]:
pipeline.set_hyperparameters(best_params)

In [20]:
pipeline.fit(X_train, y_train)

In [21]:
predictions = pipeline.predict(X_test)

In [22]:
dataset.score(y_test, predictions)

0.8952775404471299