# Basic utilities

In [17]:
import random


# Seed Python's global RNG so the notebook is reproducible, then draw the
# single integer seed (inclusive range 0..10**6) reused by later cells.
random.seed(0)
SEED = random.randrange(0, 10**6 + 1)

In [18]:
from sklearn.metrics import mean_squared_error
import numpy as np


def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Loading house prices from OpenML (ID 42092)

In [19]:
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


# Fetch OpenML dataset 42092 as a DataFrame and derive three columns in one
# chain. `assign` evaluates its keyword arguments in order, so `building_age`
# can read the `year` column created just before it.
df = (
    fetch_openml(data_id=42092, as_frame=True)["frame"]
    .assign(
        # The first four characters of `date` are taken as the year.
        year=lambda frame: frame["date"].str[:4].astype(int),
        zipcode=lambda frame: frame["zipcode"].astype(int),
        building_age=lambda frame: frame["year"] - frame["yr_built"],
    )
)

# Columns used as model features.
xvars = [
    "grade",
    "year",
    "building_age",
    "sqft_living",
    "sqft_lot",
    "bedrooms",
    "bathrooms",
    "floors",
    "zipcode",
    "lat",
    "long",
    "condition",
    "waterfront",
]

# The model is fitted on the natural log of the price.
X = df[xvars]
y = np.log(df["price"])

# 66% of the rows go to training; the split is made reproducible via SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.66,
    random_state=SEED,
)

# Hyperparameter-optimized XGBRegressor

In [20]:
from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical
from math import sqrt, floor


# Number of features; used for the default column-sampling fraction below.
m = len(xvars)

# Search space for the regressor. Each hyperparameter is registered under its
# own `name`, so the mapping is derived from the objects instead of repeating
# every key by hand.
cs = ConfigurationSpace({
    hp.name: hp
    for hp in (
        Float(name="learning_rate", bounds=(0.01, 1.0), default=0.2, log=True),
        Float(name="subsample", bounds=(0.01, 1.0), default=0.63),
        # `colsample_by` and `colsample_amount` are a (mode, fraction) pair
        # that `train` folds into a single `colsample_by<mode>` argument.
        Categorical(name="colsample_by", items=["node", "level", "tree"], default="node"),
        # Default fraction: floor(sqrt(m)) out of the m features.
        Float(name="colsample_amount", bounds=(0.01, 1.0), default=floor(sqrt(m)) / m, log=True),
        Integer(name="n_estimators", bounds=(10, 25), default=10),
        Integer(name="num_parallel_tree", bounds=(10, 25), default=10),
        Float(name="reg_lambda", bounds=(0.0, 10.0), default=0),
        Float(name="min_child_weight", bounds=(0.0, 10.0), default=2.0),
    )
})

In [21]:
from xgboost import XGBRegressor


# Best fitted model seen across all `train` calls, and its RMSE.
BEST_MODEL = None
# Start at +inf (not an arbitrary threshold such as 1.0) so the first finite
# score is always recorded and BEST_MODEL cannot stay None merely because no
# trial happened to beat a hard-coded cut-off.
BEST_SCORE = float("inf")


def train(config, seed=0):
    """Fit an XGBRegressor for one configuration and return its RMSE.

    Parameters
    ----------
    config : mapping
        Hyperparameters for the model. Must contain ``"colsample_by"``
        (the suffix of the ``colsample_by*`` argument to set) and
        ``"colsample_amount"`` (the value given to that argument); every
        other entry is forwarded to ``XGBRegressor`` unchanged.
    seed : int, default 0
        Seed forwarded to ``XGBRegressor``.

    Returns
    -------
    The RMSE of the fitted model's predictions for ``X_test`` against
    ``y_test``.

    Side effects
    ------------
    Updates the module-level ``BEST_SCORE`` / ``BEST_MODEL`` whenever the new
    score is strictly lower than the best one recorded so far.
    """
    global BEST_SCORE, BEST_MODEL

    # Copy so the caller's configuration object is never mutated.
    params = dict(config)

    # Collapse the (mode, amount) pair into the single keyword argument
    # `colsample_by<mode>` and drop the two helper entries.
    col_mode = params.pop("colsample_by")
    params[f"colsample_by{col_mode}"] = params.pop("colsample_amount")

    model = XGBRegressor(
        **params,
        objective="reg:squarederror",
        seed=seed,
        seed_per_iteration=True,
    )
    model.fit(X_train, y_train)

    # NOTE(review): the score is computed on X_test / y_test, so any
    # configuration chosen by minimising it is selected on the same data a
    # final evaluation would use. Consider a separate validation split.
    res = rmse(y_test, model.predict(X_test))
    if res < BEST_SCORE:
        BEST_SCORE, BEST_MODEL = res, model
    return res

In [22]:
from smac import HyperparameterOptimizationFacade, Scenario


# Optimisation budget: 100 trials. `deterministic=False` is set because the
# target function takes a seed. Wall-clock limits (walltime_limit,
# trial_walltime_limit) are intentionally left unset.
scenario = Scenario(configspace=cs, deterministic=False, n_trials=100)

smac = HyperparameterOptimizationFacade(scenario=scenario, target_function=train)

# Emit a blank line before the optimizer starts writing, then print whatever
# `optimize()` returns followed by the best score tracked by `train`.
print()
print(smac.optimize())
print(f"Best score: {BEST_SCORE}")

[INFO][abstract_initial_design.py:95] Reducing the number of initial configurations from 80 to 25 (max_ratio == 0.25).
[INFO][abstract_initial_design.py:147] Using 25 initial design configurations and 0 additional configurations.



[INFO][abstract_intensifier.py:515] Added config f3e8b7 as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 81127e and rejected config f3e8b7 as incumbent because it is not better than the incumbents on 2 instances:
[INFO][abstract_intensifier.py:590] Added config ee6dae and rejected config 81127e as incumbent because it is not better than the incumbents on 3 instances:
[INFO][abstract_intensifier.py:590] Added config 768116 and rejected config ee6dae as incumbent because it is not better than the incumbents on 3 instances:
[INFO][smbo.py:319] Finished 50 trials.
[INFO][abstract_intensifier.py:590] Added config 284f12 and rejected config 768116 as incumbent because it is not better than the incumbents on 3 instances:
[INFO][abstract_intensifier.py:590] Added config 5893c4 and rejected config 284f12 as incumbent because it is not better than the incumbents on 3 instances:
[INFO][smbo.py:319] Finished 100 trials.
[INFO][smbo.py:327] Conf