In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import gc, time, warnings, joblib, pickle
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from itertools import combinations
from warnings import simplefilter

# Pandas display settings: floats rendered with exactly four decimals, and limits
# large enough to show wide / long frames without truncation.
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.width', 1000)

# Suppress every warning; the second filter explicitly covers pandas' PerformanceWarning.
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

/kaggle/input/data1030-optiver-trading-at-close/test_dataset.pkl
/kaggle/input/data1030-optiver-trading-at-close/train_val_dataset.pkl


In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error

from mlxtend.evaluate.time_series import (
    GroupTimeSeriesSplit,
    plot_splits,
    print_cv_info,
    print_split_info,
)

In [3]:
# Load the pre-split datasets.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only point
# data_folder at datasets you trust.
data_folder = '/kaggle/input/data1030-optiver-trading-at-close/'
# data_folder = '../data/preprocessed/'

with open(os.path.join(data_folder, 'train_val_dataset.pkl'), 'rb') as file:
    train_val_dataset = pickle.load(file)

with open(os.path.join(data_folder, 'test_dataset.pkl'), 'rb') as file:
    test_dataset = pickle.load(file)

# Train/validation part: features, target and the group label of every row.
X_other, y_other, groups_other = (
    train_val_dataset[key] for key in ('X_other', 'y_other', 'groups_other')
)

# Test part: the same three pieces plus the submission ids.
X_test, y_test, groups_test, submission_id = (
    test_dataset[key] for key in ('X_test', 'y_test', 'groups_test', 'submission_id')
)

# Feature groups, one list per encoder.
# Columns that get one-hot encoded:
onehot_ftrs = ['imbalance_buy_sell_flag', 'stock_id']

# Columns that get standardised. The list is assembled from pieces, but the resulting
# order is exactly the order of the original hand-written list.
std_ftrs = (
    ['seconds_in_bucket', 'imbalance_size', 'reference_price', 'matched_size',
     'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']
    # lagged_target_1d_0, lagged_target_1d_10, ..., lagged_target_1d_540
    + [f'lagged_target_1d_{lag}' for lag in range(0, 550, 10)]
    + ['volume', 'mid_price', 'liquidity_imbalance', 'matched_imbalance', 'size_imbalance']
    # one '<a>_<b>_imb' column for every unordered pair of the six price columns
    + [f'{first}_{second}_imb'
       for first, second in combinations(
           ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap'], 2)]
    + ['price_spread', 'price_pressure', 'market_urgency', 'depth_pressure']
    + ['all_prices_mean', 'all_sizes_mean', 'all_prices_std', 'all_sizes_std',
       'all_prices_skew', 'all_sizes_skew', 'all_prices_kurt', 'all_sizes_kurt']
    + ['dow', 'seconds', 'minute']
)

# Combine the encoders: one-hot for the categorical columns (categories unseen during
# fitting are ignored at transform time, dense output), standard scaling for the rest.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'onehot',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            onehot_ftrs,
        ),
        (
            'std',
            StandardScaler(),
            std_ftrs,
        ),
    ]
)

# Relative output directory (resolved against the current working directory).
model_save_path = 'result'

# Directory the pipeline actually writes its pickled models, scores and predictions to.
result_path = '/kaggle/working/result/'
# result_path = '../result/'

# Create both directories. Previously only the relative 'result' folder was created,
# which matches result_path only when the working directory happens to be
# /kaggle/working; with any other result_path the later writes failed.
os.makedirs(model_save_path, exist_ok=True)
os.makedirs(result_path, exist_ok=True)

# Pipeline

In [4]:
from  sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [1]:
def MLpipe_reduced_feature(X_other, y_other, groups_other, X_test, y_test, groups_test,
                           preprocessor, model, param_grid, path, model_Name):
    """Tune and evaluate ``model`` with the reduced-features approach to missing values.

    For each of the 4 ``GroupTimeSeriesSplit`` folds of the train/validation data:

    1. fit ``preprocessor`` on the training part, then transform train, validation and test;
    2. find every distinct pattern of missing columns among the transformed test rows;
    3. for each pattern: drop the columns missing in that pattern, drop the train and
       validation rows that still contain NaN, evaluate every ``param_grid`` setting by
       validation MAE, refit the best setting on the training subset and predict the test
       rows that have this pattern;
    4. compute the MAE of the combined test predictions and pickle the fold's models to
       ``<path>/<model_Name>_reduced_feature_best_models_<fold>.pkl``.

    After all folds, the per-fold test scores and test predictions are pickled to
    ``<path>/<model_Name>_reduced_feature_test_scores.pkl`` and
    ``<path>/<model_Name>_reduced_feature_pred_results.pkl``.

    Parameters
    ----------
    X_other, y_other : pandas objects
        Train/validation features and target; rows are selected positionally (``.iloc``).
    groups_other : pandas.Series
        Group label of every row in ``X_other``; passed to the splitter.
    X_test, y_test : pandas objects
        Held-out features and target. ``y_test.index`` is used to label the rows of the
        transformed test set, so it must be in the same row order as ``X_test``.
    groups_test :
        Not used; kept so existing calls keep working.
    preprocessor :
        Transformer that is refitted on the training part of every fold.
    model :
        scikit-learn compatible estimator. It is cloned for every missing-value pattern,
        so the object passed in is neither fitted nor modified.
    param_grid : dict or list of dict
        Grid understood by ``sklearn.model_selection.ParameterGrid``.
    path : str
        Output directory for the pickle files; created if it does not exist.
    model_Name : str
        Prefix of the output file names.

    Returns
    -------
    list
        The fitted estimators of the LAST fold, one per missing-value pattern (in the
        order the patterns were processed). Empty if the splitter produced no fold.
    """
    # Function-scope import: every pattern needs its own, independent estimator.
    from sklearn.base import clone

    # Use the argument for the file names instead of a global that merely has a
    # similar name (the global lookup raised NameError or picked up a stale value).
    model_name = model_Name

    # Make sure the pickles below have somewhere to go.
    os.makedirs(path, exist_ok=True)

    prep = Pipeline(steps=[('preprocessor', preprocessor)])

    n_split = 4
    test_scores = np.zeros(n_split)
    pred_results = []
    best_models = []  # defined up-front so the return value exists even without folds

    # The grid does not change between folds or patterns.
    pg = ParameterGrid(param_grid)

    # Sorted copy used only for scoring. y_test itself must keep its original order,
    # because its index labels the rows of the transformed test set in every fold.
    y_test_sorted = y_test.sort_index()

    # Split train-val data: each validation block covers 25% of the distinct groups.
    len_group_other = groups_other.nunique()
    gts = GroupTimeSeriesSplit(test_size=int(len_group_other*0.25), n_splits=n_split)

    for count, (i_train, i_val) in enumerate(gts.split(X_other, y_other, groups_other)):
        print(f'\nFold {count+1} Reduced Features:')
        best_models = []

        print("\t Train index:", i_train, "Val index:", i_val)
        print("\t Train size:", len(i_train), "Val size:", len(i_val))
        X_train, y_train = X_other.iloc[i_train], y_other.iloc[i_train]
        X_val, y_val = X_other.iloc[i_val], y_other.iloc[i_val]

        # Fit the preprocessing on the training part only.
        X_train_preprocessed = prep.fit_transform(X_train)
        feature_names = prep.get_feature_names_out()

        print('\t Preparing datasets...')
        df_train = pd.DataFrame(data=X_train_preprocessed, columns=feature_names, index=y_train.index)
        del X_train_preprocessed, X_train
        gc.collect()

        X_val_preprocessed = prep.transform(X_val)
        df_val = pd.DataFrame(data=X_val_preprocessed, columns=feature_names, index=y_val.index)
        del X_val_preprocessed, X_val
        gc.collect()

        X_test_preprocessed = prep.transform(X_test)
        df_test = pd.DataFrame(data=X_test_preprocessed, columns=feature_names, index=y_test.index)

        # Free up memory
        del X_test_preprocessed
        gc.collect()

        # reduced feature
        # find all unique patterns of missing value in test set
        mask = df_test.isnull()
        unique_rows = np.unique(mask.to_numpy(), axis=0)
        print('\t there are', len(unique_rows), 'unique missing value patterns.')

        # Per-pattern predictions are collected here and concatenated once per fold.
        fold_predictions = []

        # divide test sets into subgroups according to the unique patterns
        for i, pattern in enumerate(unique_rows):
            print ('\t working on unique pattern', i)
            # Columns that are present (not NaN) in this pattern.
            kept_columns = df_test.columns[~pattern]

            # Test rows whose missing-value pattern equals this one. A positional
            # boolean array is used so duplicate index labels cannot duplicate rows.
            in_pattern = mask.eq(pattern, axis=1).all(axis=1).to_numpy()
            sub_X_test = df_test.loc[in_pattern, kept_columns]
            sub_y_test = y_test.loc[in_pattern]

            ## prepare train-val subset
            # 1. keep only the feature columns that are available in sub_X_test
            # 2. drop the train / validation rows that still contain NaN
            sub_X_train = df_train[kept_columns].dropna()
            sub_X_val = df_val[kept_columns].dropna()
            # 3. cut the targets accordingly and flatten them to 1-D arrays
            sub_y_train_array = y_train.loc[sub_X_train.index].values.ravel()
            sub_y_val_array = y_val.loc[sub_X_val.index].values.ravel()

            # Fresh estimator per pattern: appending one shared object would leave
            # every entry of best_models pointing at the model fitted last.
            ML_algo = clone(model)
            val_scores = np.zeros(len(pg))

            for p, params in enumerate(pg):
                print('\t\t params:', params)
                ML_algo.set_params(**params)
                ML_algo.fit(sub_X_train, sub_y_train_array)
                sub_y_val_pred = ML_algo.predict(sub_X_val)
                val_scores[p] = mean_absolute_error(sub_y_val_array, sub_y_val_pred)
                print('\t\t val_score:', val_scores[p])

            # First setting that reaches the lowest validation MAE.
            best_params = pg[int(np.argmin(val_scores))]
            print(f'\t best model parameters for pattern {i}:\n', best_params)
            print('\t corresponding validation score:', np.min(val_scores))

            # Refit with the winning setting and keep this pattern's model.
            ML_algo.set_params(**best_params)
            ML_algo.fit(sub_X_train, sub_y_train_array)
            best_models.append(ML_algo)

            fold_predictions.append(
                pd.DataFrame(ML_algo.predict(sub_X_test), index=sub_y_test.index,
                             columns=['sub_y_test_pred']))

        # Put the predictions in the same (sorted) order as the sorted targets.
        all_y_test_pred = pd.concat(fold_predictions).sort_index()
        pred_results.append(all_y_test_pred)

        # test mae in one-fold
        test_mae = mean_absolute_error(y_test_sorted, all_y_test_pred)
        test_scores[count] = test_mae

        # save best models
        with open(os.path.join(path, f'{model_name}_reduced_feature_best_models_{count}.pkl'), 'wb') as file:
            pickle.dump(best_models, file)

        print('\t test MAE:', test_mae)

    # mean and standard deviation of the test MAE over all folds
    print(f'overall test mean: {np.mean(test_scores)}')
    print(f'overall test std: {np.std(test_scores)}\n')

    # save test scores and prediction results
    with open(os.path.join(path, f'{model_name}_reduced_feature_test_scores.pkl'), 'wb') as file:
        pickle.dump(test_scores, file)

    with open(os.path.join(path, f'{model_name}_reduced_feature_pred_results.pkl'), 'wb') as file:
        pickle.dump(pred_results, file)

    return best_models

## LASSO Result

In [6]:
# LASSO: search 15 log-spaced alpha values between 1e-2 and 1e1.
model_name = 'lasso'
model = Lasso(max_iter=100_000_000)
param_grid = dict(alpha=np.logspace(-2, 1, 15), random_state=[42])

# Run the reduced-features pipeline and report the wall-clock time it took.
start_time = time.time()
best_models = MLpipe_reduced_feature(
    X_other, y_other, groups_other,
    X_test, y_test, groups_test,
    preprocessor, model, param_grid, result_path, model_name,
)
print('LASSO reduced features model running time:', time.time()-start_time)


Fold 1 Reduced Features:
	 Train index: [     0      1      2 ... 743181 743182 743183] Val index: [ 743184  743185  743186 ... 1002231 1002232 1002233]
	 Train size: 743184 Val size: 259050
	 Preparing datasets...
	 there are 3 unique missing value patterns.
	 working on unique pattern 0
		 params: {'random_state': 42, 'alpha': 0.01}
		 val_score: 5.864886135817049
		 params: {'random_state': 42, 'alpha': 0.016378937069540637}
		 val_score: 5.863475188786026
		 params: {'random_state': 42, 'alpha': 0.02682695795279726}
		 val_score: 5.862385569110619
		 params: {'random_state': 42, 'alpha': 0.043939705607607904}
		 val_score: 5.860803881702747
		 params: {'random_state': 42, 'alpha': 0.07196856730011521}
		 val_score: 5.859857765453186
		 params: {'random_state': 42, 'alpha': 0.11787686347935872}
		 val_score: 5.85841840229215
		 params: {'random_state': 42, 'alpha': 0.19306977288832497}
		 val_score: 5.857890658944066
		 params: {'random_state': 42, 'alpha': 0.31622776601683794}
		 

## Ridge Result

In [7]:
# Ridge: search 15 log-spaced alpha values between 1e-2 and 1e1.
model_name = 'ridge'
model = Ridge()
param_grid = dict(alpha=np.logspace(-2, 1, 15), random_state=[42])

# Run the reduced-features pipeline and report the wall-clock time it took.
start_time = time.time()
best_models = MLpipe_reduced_feature(
    X_other, y_other, groups_other,
    X_test, y_test, groups_test,
    preprocessor, model, param_grid, result_path, model_name,
)
print('Ridge reduced features model running time:', time.time()-start_time)


Fold 1 Reduced Features:
	 Train index: [     0      1      2 ... 743181 743182 743183] Val index: [ 743184  743185  743186 ... 1002231 1002232 1002233]
	 Train size: 743184 Val size: 259050
	 Preparing datasets...
	 there are 3 unique missing value patterns.
	 working on unique pattern 0
		 params: {'random_state': 42, 'alpha': 0.01}
		 val_score: 5.873671278218968
		 params: {'random_state': 42, 'alpha': 0.016378937069540637}
		 val_score: 5.8735957993408086
		 params: {'random_state': 42, 'alpha': 0.02682695795279726}
		 val_score: 5.8735051242526835
		 params: {'random_state': 42, 'alpha': 0.043939705607607904}
		 val_score: 5.87339838578866
		 params: {'random_state': 42, 'alpha': 0.07196856730011521}
		 val_score: 5.873276855838279
		 params: {'random_state': 42, 'alpha': 0.11787686347935872}
		 val_score: 5.87313838914267
		 params: {'random_state': 42, 'alpha': 0.19306977288832497}
		 val_score: 5.872976536159286
		 params: {'random_state': 42, 'alpha': 0.31622776601683794}
		