In [None]:
from itertools import product
import joblib

import optuna as opt
import dalex as dx
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error as MAE,
    mean_squared_error as MSE
)

In [None]:
# Load the housing table from disk and show its size and per-column summary.
main_df = pd.read_csv('../data/boston_housing.csv')
print(main_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
main_df.info()

In [None]:
# Overwrite the header with fixed column names. 'MEDV' must stay last: the
# split cell takes the final column as the target.
main_df.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    'MEDV',
]

# Preview the first rows under the new names.
print(main_df.head())

In [None]:
# Features are every column except the last; the last column is the target.
X = main_df.iloc[:, :-1]
y = main_df.iloc[:, -1]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [None]:
def calculate_errors(predicted, test_y):
    """Print the MAE and MSE of ``predicted`` measured against ``test_y``.

    Parameters
    ----------
    predicted
        Predicted target values.
    test_y
        True target values the predictions are compared with.

    Returns
    -------
    None
        The two metrics are only printed, never returned.
    """
    # Both metrics take the ground truth first and the prediction second.
    mae, mse = MAE(test_y, predicted), MSE(test_y, predicted)
    print(f'MAE: {mae}\nMSE: {mse}\n')

In [None]:
# Candidate values for the grid search over XGBRegressor settings.
max_depths = list(range(5, 10, 2))        # 5, 7, 9
n_estimators = [10**2, 10**3, 10**4]      # 100, 1000, 10_000

In [None]:
# Grid search: fit one XGBRegressor per (n_estimators, max_depth) pair, print
# its hold-out errors, and remember the configuration with the lowest MAE.
best_mae = float('inf')
best_model = None
best_config = None

for n, depth in product(n_estimators, max_depths):

    model = xgb.XGBRegressor(
        n_estimators=n,
        max_depth=depth,
        eta=0.1,
        subsample=0.7,
        colsample_bytree=0.8
    )
    model.fit(train_X, train_y)
    pred = model.predict(test_X)

    print(f'n_estimators: {n}\nmax_depth: {depth}')
    calculate_errors(pred, test_y)

    # calculate_errors only prints, so recompute the MAE here for selection.
    current_mae = MAE(test_y, pred)
    if current_mae < best_mae:
        best_mae, best_model, best_config = current_mae, model, (n, depth)

# Later cells explain and persist `model`. Without this rebinding they would
# receive whichever configuration happened to be fitted last, regardless of
# how it scored.
# NOTE(review): selection uses the same hold-out set that is reported above;
# consider a separate validation split if an unbiased test score is needed.
if best_model is not None:
    model = best_model
    print(f'selected n_estimators: {best_config[0]}\n'
          f'selected max_depth: {best_config[1]}\n'
          f'selected MAE: {best_mae}\n')

In [None]:
# Build the native XGBoost matrices from the train/test split.
# The features must come from the same split as the labels: the previous code
# paired the full feature table with the training labels (row-count mismatch)
# and fed the target Series as validation features, where `y.columns` does not
# even exist.
# Feature names are passed as a plain list of str.
dtrain = xgb.DMatrix(train_X.values, label=train_y,
                     feature_names=list(train_X.columns))
dval = xgb.DMatrix(test_X.values, label=test_y,
                   feature_names=list(test_X.columns))

In [None]:
def objective(trial: opt.Trial):
    """Optuna objective: train one booster and return its best validation MAE.

    Reads the module-level ``dtrain`` and ``dval`` matrices.

    Parameters
    ----------
    trial : opt.Trial
        Trial used to sample the booster parameters and the maximum number
        of boosting rounds.

    Returns
    -------
    float
        Lowest ``mae`` recorded on the ``val`` set during training. The
        matching number of trees is stored on the trial under the
        ``best_ntree`` user attribute.
    """
    # XGBoost documents the row/column sampling ratios as (0, 1]; a lower bound
    # of 0 let the sampler propose values at or next to zero, which leave
    # (almost) nothing to build a tree from.
    min_fraction = 0.05

    param = {'objective': 'reg:squarederror',
             'eval_metric': 'mae',
             "max_depth": trial.suggest_int('max_depth', 1, 12),
             "eta": trial.suggest_float('eta', 0.001, 0.99),
             "gamma": trial.suggest_float('gamma', 0, 50000),
             "subsample": trial.suggest_float('subsample', min_fraction, 1),
             "lambda": trial.suggest_float('lambda', 1, 20),
             "alpha": trial.suggest_float('alpha', 0, 20),
             "colsample_bytree": trial.suggest_float("colsample_bytree", min_fraction, 1),
             "colsample_bylevel": trial.suggest_float("colsample_bylevel", min_fraction, 1),
             "colsample_bynode": trial.suggest_float("colsample_bynode", min_fraction, 1),
             "verbosity": 0}

    n_trees = trial.suggest_int('ntrees', 10, 3000)
    results = {}
    reg = xgb.train(
        param,
        dtrain,
        n_trees,
        evals=[(dval, 'val')],
        evals_result=results,
        early_stopping_rounds=10,
        # "verbosity" only silences the library log; without this flag
        # xgb.train still prints the eval metric on every boosting round.
        verbose_eval=False,
    )
    loss = min(results['val']['mae'])
    # best_iteration is zero-based, so +1 gives the number of trees to keep.
    # This replaces Booster.best_ntree_limit, which XGBoost 2.0 removed.
    trial.set_user_attr('best_ntree', reg.best_iteration + 1)
    return loss

In [None]:
# Wrap the `model` left by the grid-search cell, together with the hold-out
# features and targets, in a dalex explainer.
explainer = dx.Explainer(model, data=test_X, y=test_y)

In [None]:
# Break-down explanation of the prediction for the test row labelled 42.
bd = dx.predict_explanations.BreakDown()
# NOTE(review): .loc[42] is a label lookup, not a position; it raises KeyError
# if row 42 did not end up in the test split - confirm for this seed.
observation = test_X.loc[42]
bd.fit(explainer, observation)
bd.plot()

In [None]:
# SHAP-style explanation of the prediction for the test row labelled 42.
sh = dx.predict_explanations.Shap()
# NOTE(review): .loc[42] is a label lookup, not a position; it raises KeyError
# if row 42 did not end up in the test split - confirm for this seed.
observation = test_X.loc[42]
sh.fit(explainer, observation)
sh.plot()

In [None]:
# Persist the `model` object as a compressed pickle.
joblib.dump(value=model, filename='../models/boston_model.pkl', compress=True)