In [2]:
import numpy as np
import optuna
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import xgboost as xgb

In [6]:
import pickle
import pandas as pd
with open('preprocessed_data/preprocessed_XGboost.pkl', 'rb') as f:
    XGBoost_preprocessed_data = pickle.load(f)

In [7]:
import sys

In [8]:
print(sys.executable)

c:\Users\jerem\miniconda3\envs\tf-gpu\python.exe


In [9]:
# The variable XGBoost_preprocessed_data is a list of our 5 bins [0-4][][]. Each bin contains the 15 belonging stocks or ETFs [0-4][0-14][].
# After accessing the stock we can get X_train, y_train, X_val, y_val, X_test, y_test [0-4][0-14][0-5].

# For the Tree based model the accessed data is 2D (sample_size, features)

XGBoost_preprocessed_data[0][0][0].shape
# This accesses the X_train set of the first stock in bin 1.

(976, 153)

In [None]:
# Create XGBoost objective
def createXGB_objective(X_train, y_train, X_val, y_val):
    def xgb_objective(trial):
        # Suggest hyperparameters
        param = {
            "verbosity": 0,
            "objective": "reg:squarederror",
            "booster": "gbtree",
            "seed": 42,
            "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "eta": trial.suggest_float("eta", 1e-4, 1e-1, log=True),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-rmse")

        # Train with early stopping and pruning
        trained_xgb = xgb.train(
            param,
            dtrain,
            num_boost_round=1000,
            evals=[(dval, "validation")],
            early_stopping_rounds=20,
            callbacks=[pruning_callback],
            verbose_eval=False
        )

        y_pred = trained_xgb.predict(dval)
        
        mape   = np.mean(np.abs((y_pred - y_val) / y_val)) * 100
        rmse   = np.sqrt(mean_squared_error(y_val, y_pred))
        mae    = mean_absolute_error(y_val, y_pred)
        r2     = r2_score(y_val, y_pred)

        trial.set_user_attr("mape", mape)
        trial.set_user_attr("rmse", rmse)
        trial.set_user_attr("mae", mae)
        trial.set_user_attr("r2", r2)



        return mape
    return xgb_objective

In [12]:
# 5.3. XGBoost study
N_TRIALS = 70

XGBoost_data_results = []
for i in range(len(XGBoost_preprocessed_data)):
    XGBoost_bin_results = []
    for j in range(len(XGBoost_preprocessed_data[i])):
        X_train, y_train, X_val, y_val, X_test, y_test = XGBoost_preprocessed_data[i][j]
        xgb_study = optuna.create_study(
            direction="minimize",
            study_name="xgb_regression_study",
            sampler=optuna.samplers.TPESampler(),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
        )
        xgb_study.optimize(createXGB_objective(X_train, y_train, X_val, y_val), n_trials=N_TRIALS)

        params = xgb_study.best_params
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test, label=y_test)


        # Train with early stopping and pruning
        trained_xgb = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dval, "validation")],
            early_stopping_rounds=20,
            verbose_eval=False
        )

        y_pred = trained_xgb.predict(dtest)


        test_mape   = np.mean(np.abs((y_pred - y_test) / y_test)) * 100
        test_rmse   = np.sqrt(mean_squared_error(y_test, y_pred))
        test_mae    = mean_absolute_error(y_test, y_pred)
        test_r2     = r2_score(y_test, y_pred)

        results = {
            "test_mape": test_mape,
            "test_rmse": test_rmse,
            "test_mae":  test_mae,
            "test_r2":   test_r2
        }


        XGBoost_bin_results.append(results)
    # Save the data
    with open(f'results/XGBoost/Performance_metrices_XGBoost_bin_{i}.pkl', 'wb') as f:
        pickle.dump(XGBoost_bin_results, f)
    XGBoost_data_results.append(XGBoost_bin_results)
# Save the data
with open(f'results/XGBoost/Performance_metrices_XGBoost_full.pkl', 'wb') as f:
    pickle.dump(XGBoost_data_results, f)

[I 2025-06-23 13:57:46,706] A new study created in memory with name: xgb_regression_study
[I 2025-06-23 13:57:46,872] Trial 0 finished with value: 0.528053932918385 and parameters: {'lambda': 2.2521622013717137, 'alpha': 0.0036565889283123854, 'max_depth': 10, 'eta': 0.06935524366096003, 'gamma': 1.1502113012234183, 'subsample': 0.7479358082668809, 'colsample_bytree': 0.5058516695683815}. Best is trial 0 with value: 0.528053932918385.
[I 2025-06-23 13:57:47,303] Trial 1 finished with value: 0.3378392965119025 and parameters: {'lambda': 5.008719377640034, 'alpha': 0.021245846719205933, 'max_depth': 3, 'eta': 0.030360903641257077, 'gamma': 0.3150518604224517, 'subsample': 0.9293614828471425, 'colsample_bytree': 0.9859109941587683}. Best is trial 1 with value: 0.3378392965119025.
[I 2025-06-23 13:57:47,900] Trial 2 finished with value: 0.9425389307905454 and parameters: {'lambda': 3.468901241921573, 'alpha': 3.901702123208081, 'max_depth': 8, 'eta': 0.010054746004809965, 'gamma': 2.484329