# Basic utilities

In [5]:
import random


# Start the global generator from a fixed state so the value drawn below is
# the same on every run of the notebook.
random.seed(0)

# One integer in [0, 10**6] (upper bound inclusive), reused as the seed for
# the train/test split and every model in the notebook.
SEED = random.randrange(0, 10**6 + 1)

In [6]:
from sklearn.metrics import mean_squared_error
import numpy as np


def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both arguments are passed unchanged to ``mean_squared_error``; the
    square root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def score(y_test, y_pred):
    """Print the RMSE of ``y_pred`` against ``y_test`` with three decimals.

    The value is only printed; nothing is returned.
    """
    error = rmse(y_test, y_pred)
    print(f"RMSE: {error:.03f}")


# Loading house prices from openml (ID 42092)

In [7]:
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


# Download OpenML dataset 42092 as a pandas DataFrame.
df = fetch_openml(data_id=42092, as_frame=True).frame

# Derived columns. ``building_age`` lives in its own ``assign`` call because
# it is computed from the ``year`` column created in the first one.
df = df.assign(
    # The first four characters of the date string are taken as the year.
    year=lambda frame: frame["date"].str.slice(0, 4).astype(int),
    zipcode=lambda frame: frame["zipcode"].astype(int),
).assign(
    building_age=lambda frame: frame["year"] - frame["yr_built"],
)

# Feature columns used by every model below (order is kept as-is).
xvars = [
    "grade",
    "year",
    "building_age",
    "sqft_living",
    "sqft_lot",
    "bedrooms",
    "bathrooms",
    "floors",
    "zipcode",
    "lat",
    "long",
    "condition",
    "waterfront",
]

X = df[xvars]
y = np.log(df["price"])  # the target is the natural log of the price

# 66% of the rows go to training; the split is fixed by SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.66,
    random_state=SEED,
)

# sklearn RandomForestRegressor

In [8]:
from sklearn.ensemble import RandomForestRegressor


# Random forest baseline: 500 trees, depth capped at 20, "sqrt" feature
# sub-sampling, seeded with SEED; n_jobs=-1 requests all available cores.
rf = RandomForestRegressor(
    random_state=SEED,
    n_jobs=-1,
    max_depth=20,
    max_features="sqrt",
    n_estimators=500,
)

# fit() returns the fitted estimator, so training and prediction are chained.
y_pred = rf.fit(X_train, y_train).predict(X_test)
score(y_test, y_pred)

RMSE: 0.178


# XGBoost in random forest mode (approximately equivalent to sklearn's RandomForestRegressor)

In [9]:
import xgboost as xgb

m = len(xvars)
n_split_features = int(np.sqrt(m))  # floor of sqrt(number of features)

# Random-forest-style configuration: a single boosting round (see
# num_boost_round below) that grows 500 trees in parallel with learning
# rate 1, each split drawing from a fraction n_split_features / m of the
# columns and each tree from 63% of the rows.
params = dict(
    objective="reg:squarederror",
    learning_rate=1,
    num_parallel_tree=500,
    subsample=0.63,
    colsample_bynode=n_split_features / m,
    reg_lambda=0,
    max_depth=20,
    min_child_weight=2,
    seed=SEED,
    seed_per_iteration=True,
)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

model = xgb.train(params, dtrain, num_boost_round=1)
y_pred = model.predict(dtest)
score(y_test, y_pred)

RMSE: 0.180


# XGBRegressor with better hyperparameters

In [10]:
from xgboost import XGBRegressor
from math import sqrt, floor


m = len(xvars)  # number of feature columns
node_feature_fraction = floor(sqrt(m)) / m

# Boosted random forests: n_estimators boosting rounds, each round growing
# num_parallel_tree trees. The ranges in the comments are the author's
# candidate tuning ranges ("log" = search on a log scale).
model = XGBRegressor(
    # List of objectives: https://xgboost.readthedocs.io/en/stable/parameter.html
    # (search for "objective").
    objective="reg:squarederror",

    # 0.0 - 1.0, log. Unclear whether each forest can have its own rate;
    # that might lead to overfitting anyway.
    learning_rate=0.2,

    # 100 - 500: number of random forests (boosting rounds) in the booster.
    n_estimators=100,
    # 100 - 500: number of trees in each random forest.
    num_parallel_tree=100,

    # 2 - 50; 6 - 20 is a more conservative alternative.
    max_depth=20,
    # 0.0 - 1.0
    subsample=0.63,
    # Only colsample_bynode is set here; colsample_bytree and
    # colsample_bylevel are left at their defaults.
    # Range 0 - m, where m is the number of features; log?
    colsample_bynode=node_feature_fraction,

    # -10.0 - 10.0, log. The XGBoost parameter docs describe this as the L2
    # regularization term on weights (higher -> more conservative model).
    # NOTE(review): whether negative values have any effect is unconfirmed.
    reg_lambda=0,
    # 0.0 - 10.0, log. Higher value -> fewer admissible splits when growing
    # new nodes in the trees.
    min_child_weight=2,

    seed=SEED,
    seed_per_iteration=True,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score(y_test, y_pred)