In [None]:
# import
import os
import pickle

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.models import Sequential

In [None]:
# load file
# Deserialize the preprocessed XGBoost data sets from the pickle on disk.
with open('preprocessed_data/preprocessed_XGboost.pkl', mode='rb') as pkl_file:
    XGBoost_preprocessed_data = pickle.load(pkl_file)

In [None]:
# XGBoost_preprocessed_data is a nested list indexed as [bin][asset][split]:
#   - [0-4]  selects one of our 5 bins,
#   - [0-14] selects one of the 15 stocks or ETFs belonging to that bin,
#   - [0-5]  selects a split, in the order X_train, y_train, X_val, y_val, X_test, y_test.

# For the tree-based model the accessed feature data is 2D: (sample_size, features).

XGBoost_preprocessed_data[0][0][4].shape
# Index 4 is X_test, so this shows the shape of the X_test set of the first stock in the first bin (index 0).

In [None]:
# Create XGBoost objective with outer function to pass data sets
def createXGB_objective(X_train, y_train, X_val, y_val):
    """Build an Optuna objective that tunes an XGBoost regressor on one data set.

    The outer function only captures the data sets; the returned closure is
    what gets passed to ``study.optimize``.

    Parameters
    ----------
    X_train, y_train
        Features and targets used to fit the booster.
    X_val, y_val
        Features and targets used for early stopping, pruning and for the
        value returned to Optuna.

    Returns
    -------
    callable
        ``xgb_objective(trial) -> float`` returning the validation MAPE in
        percent (to be minimized). RMSE, MAE and R2 on the validation set are
        stored on the trial as user attributes ("mape", "rmse", "mae", "r2").

    Notes
    -----
    * Pruning is driven by the "validation-rmse" values reported during
      boosting, while the value returned to Optuna is the MAPE.
    * NOTE(review): MAPE divides by ``y_val``; targets equal to zero yield
      inf/nan. Confirm the targets can never be zero.
    """
    def xgb_objective(trial):
        # Suggest hyperparameters
        param = {
            "verbosity": 0,
            "objective": "reg:squarederror",
            "booster": "gbtree",
            "seed": 42,
            "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "eta": trial.suggest_float("eta", 1e-4, 1e-1, log=True),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }

        # prepare data for training
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-rmse")

        # Train with early stopping and pruning
        trained_xgb = xgb.train(
            param,
            dtrain,
            num_boost_round=1000,
            evals=[(dval, "validation")],
            early_stopping_rounds=20,
            callbacks=[pruning_callback],
            verbose_eval=False
        )

        # predict
        # xgb.train hands back the booster from the LAST boosting round, which
        # can be up to `early_stopping_rounds` rounds past the best one.
        # Restrict prediction to the best iteration so the score reflects the
        # model that early stopping actually selected.
        y_pred = trained_xgb.predict(
            dval, iteration_range=(0, trained_xgb.best_iteration + 1)
        )

        # calculate errors (cast to plain floats so the stored attributes do
        # not depend on numpy scalar types)
        mape = float(np.mean(np.abs((y_pred - y_val) / y_val)) * 100)
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mae = float(mean_absolute_error(y_val, y_pred))
        r2 = float(r2_score(y_val, y_pred))

        # store results in trial
        trial.set_user_attr("mape", mape)
        trial.set_user_attr("rmse", rmse)
        trial.set_user_attr("mae", mae)
        trial.set_user_attr("r2", r2)

        return mape
    return xgb_objective

In [None]:
# 5.3. XGBoost study
# Number of Optuna trials per asset.
N_TRIALS = 70
# Seed shared by the TPE sampler and the final XGBoost refit so that a
# Restart & Run All reproduces the same results.
SEED = 42
# Non-tuned XGBoost settings (the same values the tuning objective uses).
# study.best_params only contains the values suggested through the trial, so
# these have to be merged back in for the refit; otherwise the final model
# falls back to XGBoost's default seed/verbosity and no longer matches the
# configuration that was tuned.
FIXED_XGB_PARAMS = {
    "verbosity": 0,
    "objective": "reg:squarederror",
    "booster": "gbtree",
    "seed": SEED,
}
RESULTS_DIR = 'results_simple_ret/XGBoost'
# Create the output folder up front so a long tuning run cannot fail at the
# first write because the directory is missing.
os.makedirs(RESULTS_DIR, exist_ok=True)

# create outer list for bins
XGBoost_data_results = []
for i, bin_data in enumerate(XGBoost_preprocessed_data):
    # create list per bin to store assets
    XGBoost_bin_results = []
    for j, asset_data in enumerate(bin_data):
        X_train, y_train, X_val, y_val, X_test, y_test = asset_data

        # Tune the hyperparameters for this asset on its validation set.
        xgb_study = optuna.create_study(
            direction="minimize",
            study_name="xgb_regression_study",
            sampler=optuna.samplers.TPESampler(seed=SEED),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
        )
        xgb_study.optimize(createXGB_objective(X_train, y_train, X_val, y_val), n_trials=N_TRIALS)

        # prepare the data sets
        # Fixed settings first, tuned values second (tuned values win on clash).
        params = {**FIXED_XGB_PARAMS, **xgb_study.best_params}
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # Refit with the best hyperparameters; early stopping on the
        # validation set (no pruning here, this is the final model).
        trained_xgb = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dval, "validation")],
            early_stopping_rounds=20,
            verbose_eval=False
        )

        # xgb.train returns the booster from the last round, not the best one,
        # so limit prediction to the best iteration found by early stopping.
        y_pred = trained_xgb.predict(
            dtest, iteration_range=(0, trained_xgb.best_iteration + 1)
        )

        # calculate errors
        # NOTE(review): MAPE divides by y_test; targets equal to zero yield
        # inf/nan. Confirm the targets can never be zero.
        test_mape = np.mean(np.abs((y_pred - y_test) / y_test)) * 100
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        test_mae = mean_absolute_error(y_test, y_pred)
        test_r2 = r2_score(y_test, y_pred)

        # store the test metrics of this asset
        results = {
            "test_mape": test_mape,
            "test_rmse": test_rmse,
            "test_mae":  test_mae,
            "test_r2":   test_r2
        }

        XGBoost_bin_results.append(results)
    # write out the per-bin results as soon as the bin is finished
    with open(f'{RESULTS_DIR}/Performance_metrices_XGBoost_bin_{i}.pkl', 'wb') as f:
        pickle.dump(XGBoost_bin_results, f)
    XGBoost_data_results.append(XGBoost_bin_results)
# write out the results of all bins
with open(f'{RESULTS_DIR}/Performance_metrices_XGBoost_full.pkl', 'wb') as f:
    pickle.dump(XGBoost_data_results, f)