In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import gc, time, warnings, joblib, pickle
# Walk the read-only Kaggle input tree and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from itertools import combinations
from warnings import simplefilter

# Notebook-wide pandas display settings: 4-decimal floats and wide/long frames shown in full.
pd.options.display.float_format = lambda value: format(value, '.4f')
pd.options.display.max_columns = 200
pd.options.display.max_rows = 1000
pd.options.display.width = 1000

# Silence every warning for the rest of the session, then explicitly register
# pandas' PerformanceWarning as ignored as well.
warnings.filterwarnings(action="ignore")
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)

/kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
/kaggle/input/optiver-trading-at-the-close/train.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
/kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py
/kaggle/input/data1030-optiver-trading-at-close/test_dataset.pkl
/kaggle/input/data1030-optiver-trading-at-close/train_val_dataset.pkl


#### Import Packages

In [2]:
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error

from mlxtend.evaluate.time_series import (
    GroupTimeSeriesSplit,
    plot_splits,
    print_cv_info,
    print_split_info,
)

## Pipeline Function

In [1]:
def gb_CV_MAE(X_other, y_other, groups_other,
              X_test, y_test, groups_test,
              preprocessor, model, param_grid, 
              path, model_name,
              n_split=4, early_stopping_rounds=50):
    """Grid-search a gradient-boosting regressor over grouped time-series folds, scored by MAE.

    For every fold produced by ``GroupTimeSeriesSplit`` the preprocessor is fitted
    on the training part only, every combination in ``param_grid`` is fitted with
    early stopping on the validation part, and the combination with the lowest
    validation MAE is refitted. That refitted model is scored on train, validation
    and the held-out test set. Scores and models are pickled into ``path``.

    Note that the validation fold drives both early stopping and parameter
    selection, so the reported validation MAE is optimistic; the test MAE is only
    reported, never used for selection.

    Parameters
    ----------
    X_other, y_other, groups_other :
        Features, target and group labels used for train/validation splitting.
        They are sliced with ``.iloc`` and ``groups_other.nunique()`` is called,
        so pandas objects are expected.
    X_test, y_test :
        Held-out features and target, evaluated once per fold.
    groups_test :
        Accepted for interface symmetry; not used by this function.
    preprocessor :
        Transformer wrapped in a ``Pipeline`` and refitted on each training fold.
    model :
        Estimator exposing ``set_params``, ``predict`` and a ``fit`` that accepts
        ``eval_set``, ``early_stopping_rounds`` and ``verbose``. It is mutated in
        place during the search.
    param_grid : dict
        Grid passed to ``ParameterGrid``.
    path : str
        Directory receiving ``<model_name>_scores.pkl`` and
        ``<model_name>_best_models.pkl``; created if it does not exist.
    model_name : str
        Prefix for the two output file names.
    n_split : int, default 4
        Number of time-series folds.
    early_stopping_rounds : int, default 50
        Patience forwarded to ``model.fit``.

    Returns
    -------
    list
        One independently fitted model per fold (best parameter set of that fold).
    """
    import copy  # local import: only needed to snapshot the fitted model per fold

    prep = Pipeline(steps=[('preprocessor', preprocessor)]) # for now we only preprocess, later we will add other steps here

    best_models = []
    train_scores = np.zeros(n_split)
    val_scores = np.zeros(n_split)
    test_scores = np.zeros(n_split)

    # The grid does not depend on the fold, so build it once.
    pg = ParameterGrid(param_grid)

    # Split train-validation data: each validation window covers 25% of the distinct groups.
    len_group_other = groups_other.nunique()
    gts = GroupTimeSeriesSplit(test_size=int(len_group_other*0.25), n_splits=n_split)
    for fold, (i_train, i_val) in enumerate(gts.split(X_other, y_other, groups_other)):
        print(f'\nFold {fold+1} Parameter Searching:')
        print("\t Train index:", i_train, "Val index:", i_val)
        print("\t Train size:", len(i_train), "Val size:", len(i_val))
        X_train, y_train = X_other.iloc[i_train], y_other.iloc[i_train]
        X_val, y_val = X_other.iloc[i_val], y_other.iloc[i_val]

        # Fit the preprocessing on the training fold only to avoid leaking
        # validation/test statistics into the scaler/encoder.
        X_train_preprocessed = prep.fit_transform(X_train)
        X_val_preprocessed = prep.transform(X_val)
        X_test_preprocessed  = prep.transform(X_test)

        # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
        # estimator rather than in fit() -- confirm against the installed version.
        fit_kwargs = dict(eval_set=[(X_val_preprocessed, y_val)],
                          early_stopping_rounds=early_stopping_rounds,
                          verbose=False)

        # Parameter Searching
        scores = np.zeros(len(pg))
        print(f'\t Fitting {len(pg)} Parameters...')
        for i, params in enumerate(pg):
            model.set_params(**params)
            model.fit(X_train_preprocessed, y_train, **fit_kwargs)
            scores[i] = mean_absolute_error(y_val, model.predict(X_val_preprocessed))

        # First grid point reaching the minimum validation MAE.
        best_params = pg[int(np.argmin(scores))]

        # apply the model with best parameter set
        print('\t Fitting Model with Best Parameter Set...')
        model.set_params(**best_params)
        model.fit(X_train_preprocessed, y_train, **fit_kwargs)
        # Store a copy: `model` is refitted in the next fold, so appending the
        # object itself would leave every list entry pointing at the last fit.
        fold_model = copy.deepcopy(model)
        best_models.append(fold_model)

        # predict result
        print('\t Predicting Results...')
        train_mae = mean_absolute_error(y_train, fold_model.predict(X_train_preprocessed))
        val_mae = mean_absolute_error(y_val, fold_model.predict(X_val_preprocessed))
        test_mae = mean_absolute_error(y_test, fold_model.predict(X_test_preprocessed))

        print('\t Best Paramter:', best_params)
        print('\t Training MAE:', np.round(train_mae,4))
        print('\t Val MAE:', np.round(val_mae,4))
        print('\t Test MAE:', np.round(test_mae,4))

        # save results
        train_scores[fold] = train_mae
        val_scores[fold] = val_mae
        test_scores[fold] = test_mae

    # summary report
    scores_dict = {'Train_Scores': train_scores, 
                   'Val_Scores': val_scores, 
                   'Test_Scores': test_scores}
    print(scores_dict)
    print('test mean:', np.mean(test_scores))
    print('test std:', np.std(test_scores))

    # save train and test scores; make sure the target directory exists so a
    # long search is not lost to a missing folder at the very end
    if path:
        os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, f'{model_name}_scores.pkl'), 'wb') as file:
        pickle.dump(scores_dict, file)
    with open(os.path.join(path, f'{model_name}_best_models.pkl'), 'wb') as file:
        pickle.dump(best_models, file)

    # Fold-specific arrays are plain locals and are released when the function
    # returns; trigger a collection so the memory is reclaimed promptly.
    gc.collect()

    return best_models

## Data Preparation

In [4]:
# Load the pre-split datasets (train/validation pool and held-out test set).
# NOTE: pickle.load can execute arbitrary code -- only point this at trusted files.
data_folder = '/kaggle/input/data1030-optiver-trading-at-close/'
# data_folder = '../data/preprocessed/'

with open(os.path.join(data_folder, 'train_val_dataset.pkl'), mode='rb') as file:
    train_val_dataset = pickle.load(file)

with open(os.path.join(data_folder, 'test_dataset.pkl'), mode='rb') as file:
    test_dataset = pickle.load(file)

# Unpack the stored entries into the names used by the rest of the notebook.
X_other, y_other, groups_other = (
    train_val_dataset[key] for key in ('X_other', 'y_other', 'groups_other')
)
X_test, y_test, groups_test, submission_id = (
    test_dataset[key] for key in ('X_test', 'y_test', 'groups_test', 'submission_id')
)

# Feature groups, keyed by the encoder each group is routed to.
# Categorical columns -> one-hot encoding.
onehot_ftrs = ['imbalance_buy_sell_flag', 'stock_id']

# Numeric columns -> standard scaling. The repetitive name families are generated
# rather than typed out; the resulting list and its order are unchanged.
std_ftrs = [
    # raw order-book / auction columns
    'seconds_in_bucket', 'imbalance_size', 'reference_price', 'matched_size',
    'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size',
    'wap',
    # lagged targets: suffixes 0, 10, ..., 540
    *(f'lagged_target_1d_{lag}' for lag in range(0, 550, 10)),
    # size-based features
    'volume', 'mid_price', 'liquidity_imbalance', 'matched_imbalance', 'size_imbalance',
    # pairwise imbalance for every unordered pair of price columns
    *(f'{left}_{right}_imb'
      for left, right in combinations(
          ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap'], 2)),
    # spread / pressure features
    'price_spread', 'price_pressure', 'market_urgency', 'depth_pressure',
    # row-wise statistics over all prices and all sizes
    *(f'all_{kind}_{stat}'
      for stat in ('mean', 'std', 'skew', 'kurt')
      for kind in ('prices', 'sizes')),
    # calendar / clock features
    'dow', 'seconds', 'minute',
]

# Column-wise preprocessing: one-hot encode the categorical columns and
# standard-scale the numeric ones. Columns not listed in either group are
# not passed to any transformer here.
preprocessor = ColumnTransformer(transformers=[
    (
        'onehot',
        # dense output; handle_unknown='ignore' so categories unseen at fit time
        # do not raise at transform time
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        onehot_ftrs,
    ),
    (
        'std',
        StandardScaler(),
        std_ftrs,
    ),
])

# Directory the CV routine writes its score/model pickles into.
result_path = '/kaggle/working/result/'
# result_path = '../result/'

# Relative folder name kept for backward compatibility with the original cell.
model_save_path = 'result'

# Create the relative folder as before, and also the directory that is actually
# passed to the training call. The two only coincide when the working directory
# is /kaggle/working, so creating just `model_save_path` is not enough once
# `result_path` is switched to another location.
os.makedirs(model_save_path, exist_ok=True)
os.makedirs(result_path, exist_ok=True)

# XGB Model

In [5]:
# Hyperparameter grid for XGBoost: 4 (reg_alpha) x 4 (reg_lambda) x 5 (max_depth)
# = 80 combinations; every other entry is a single fixed value.
param_grid = {"learning_rate": [0.01],
              "n_estimators": [6000],   # upper bound; early stopping inside gb_CV_MAE cuts it short
              'reg_alpha': [0, 0.01, 0.1, 1],
              'reg_lambda': [0.1, 0.5, 1, 2],
              "missing": [np.nan], 
              "max_depth": [1, 3, 7, 11, 13],
              "colsample_bytree": [0.9],              
              "subsample": [0.66],
              "eval_metric": ['mae'],
              "seed": [42],
              # was "n_thread", which is not an XGBoost parameter name and was
              # therefore ignored; the scikit-learn wrapper's name is n_jobs
              "n_jobs": [-1],
              # NOTE(review): 'gpu_hist' requires a GPU runtime and is deprecated in
              # newer XGBoost releases -- confirm against the installed version
              "tree_method": ['gpu_hist'],
              "verbosity": [0]
             }
model = XGBRegressor()

# Run the fold-wise grid search and report the wall-clock time in seconds.
start_time = time.time()
xgb_best_models = gb_CV_MAE(X_other, y_other, groups_other,
                            X_test, y_test, groups_test,
                            preprocessor, model, param_grid, 
                            result_path, 'xgboost')
print('XGB Running time:', time.time()-start_time)


Fold 1 Parameter Searching:
	 Train index: [     0      1      2 ... 743181 743182 743183] Val index: [ 743184  743185  743186 ... 1002231 1002232 1002233]
	 Train size: 743184 Val size: 259050
	 Fitting 80 Parameters...
	 Fitting Model with Best Parameter Set...
	 Predicting Results...
	 Best Paramter: {'colsample_bytree': 0.9, 'eval_metric': 'mae', 'learning_rate': 0.01, 'max_depth': 3, 'missing': nan, 'n_estimators': 6000, 'n_thread': -1, 'reg_alpha': 0.1, 'reg_lambda': 2, 'seed': 42, 'subsample': 0.66, 'tree_method': 'gpu_hist', 'verbosity': 0}
	 Training MAE: 5.2573
	 Val MAE: 6.5528
	 Test MAE: 6.7388

Fold 2 Parameter Searching:
	 Train index: [ 10505  10506  10507 ... 753961 753962 753963] Val index: [ 753964  753965  753966 ... 1013066 1013067 1013068]
	 Train size: 743459 Val size: 259105
	 Fitting 80 Parameters...
	 Fitting Model with Best Parameter Set...
	 Predicting Results...
	 Best Paramter: {'colsample_bytree': 0.9, 'eval_metric': 'mae', 'learning_rate': 0.01, 'max_de