In [1]:
import gc
import os
import time
import warnings
import joblib
import pickle
import xgboost

import pandas as pd
import numpy as np

from itertools import combinations
from warnings import simplefilter
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error

from mlxtend.evaluate.time_series import (
    GroupTimeSeriesSplit,
    plot_splits,
    print_cv_info,
    print_split_info,
)

# Pandas display configuration: floats rendered with 4 decimals, and limits
# large enough that wide / long frames are printed without truncation.
for option_name, option_value in (
    ('display.float_format', lambda x: '%.4f' % x),
    ('display.max_columns', 200),
    ('display.max_rows', 1000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Suppress every warning, then explicitly suppress pandas' PerformanceWarning.
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
# Load Data
# Both splits are stored as pickled dictionaries under `data_folder`.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data_folder = '../data/preprocessed/'

with open(os.path.join(data_folder, 'train_val_dataset.pkl'), 'rb') as file:
    train_val_dataset = pickle.load(file)

with open(os.path.join(data_folder, 'test_dataset.pkl'), 'rb') as file:
    test_dataset = pickle.load(file)

# Train/validation portion: features, target and group labels.
X_other = train_val_dataset['X_other']
y_other = train_val_dataset['y_other']
groups_other = train_val_dataset['groups_other']

# Held-out test portion, plus the submission ids stored alongside it.
X_test = test_dataset['X_test']
y_test = test_dataset['y_test']
groups_test = test_dataset['groups_test']
submission_id = test_dataset['submission_id']

# collect which encoder to use on each feature
# Columns that get one-hot encoded.
onehot_ftrs = ['imbalance_buy_sell_flag', 'stock_id']

# Columns that get standardized. The list is assembled from its regular
# sub-families; the resulting names and their order are exactly those of the
# equivalent hand-written list (101 entries).
std_ftrs = (
    ['seconds_in_bucket', 'imbalance_size', 'reference_price', 'matched_size',
     'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size',
     'wap']
    # lagged targets: lagged_target_1d_0, _10, ..., _540
    + [f'lagged_target_1d_{lag}' for lag in range(0, 550, 10)]
    + ['volume', 'mid_price', 'liquidity_imbalance', 'matched_imbalance',
       'size_imbalance']
    # one "<a>_<b>_imb" feature per unordered pair of price columns
    + [f'{first}_{second}_imb'
       for first, second in combinations(
           ('reference_price', 'far_price', 'near_price',
            'ask_price', 'bid_price', 'wap'), 2)]
    + ['price_spread', 'price_pressure', 'market_urgency', 'depth_pressure']
    # summary statistics, prices before sizes for each statistic
    + [f'all_{group}_{stat}'
       for stat in ('mean', 'std', 'skew', 'kurt')
       for group in ('prices', 'sizes')]
    + ['dow', 'seconds', 'minute']
)

# collect all the encoders
# One-hot encode the categorical columns (unknown categories are ignored at
# transform time, dense output) and standardize the numeric columns.
onehot_step = ('onehot',
               OneHotEncoder(handle_unknown='ignore', sparse_output=False),
               onehot_ftrs)
std_step = ('std', StandardScaler(), std_ftrs)

preprocessor = ColumnTransformer(transformers=[onehot_step, std_step])

# Directory where scores and fitted models are written.
result_path = '../result/'

In [3]:
def MLpipe_reduced_feature(X_other, y_other, groups_other, X_test, y_test, groups_test,
                           preprocessor, model, param_grid, path, model_Name):
    """Grid-search and evaluate `model` with the reduced-features approach.

    For each of 4 ``GroupTimeSeriesSplit`` folds of the train/validation data:

    1. the preprocessor is fit on the training part and applied to the
       validation part and to the test set;
    2. the test rows are grouped by their pattern of missing values;
    3. for every pattern, the columns missing in that pattern are dropped,
       train/validation rows that still contain NaNs are discarded, every
       combination in `param_grid` is scored by validation MAE, and the best
       combination is refit on the training subset to predict the matching
       test rows;
    4. the fold's test MAE is computed over the predictions of all patterns.

    Parameters
    ----------
    X_other, y_other : pandas objects
        Train/validation features and target, row-aligned (sliced with
        ``.iloc`` using the split indices).
    groups_other : pandas.Series
        Group label of each train/validation row, used by the splitter.
    X_test, y_test : pandas objects
        Held-out features and target. They must be row-aligned: the
        preprocessed test frame is indexed with ``y_test.index``.
    groups_test : any
        Not used; kept so existing callers keep working.
    preprocessor : transformer
        Must provide ``fit_transform``, ``transform`` and
        ``get_feature_names_out``. It is refit in every fold.
    model : estimator
        Template estimator. It is cloned for every missing-value pattern, so
        the object passed in is never modified.
    param_grid : dict
        Grid passed to ``ParameterGrid``.
    path : str
        Directory the result files are written to (created if missing).
    model_Name : str
        Prefix of the two pickle files written to `path`.

    Returns
    -------
    list
        The fitted best estimators of the LAST fold, one independent object
        per missing-value pattern, in pattern order.

    Side effects
    ------------
    Prints progress, and writes ``<model_Name>_reduced_feature_test_scores.pkl``
    and ``<model_Name>_reduced_feature_best_models.pkl`` into `path`.
    """
    # Local import: sklearn is already a dependency of this notebook.
    from sklearn.base import clone

    prep = Pipeline(steps=[('preprocessor', preprocessor)])

    n_split = 4
    test_scores = np.zeros(n_split)
    best_models = []
    pg = ParameterGrid(param_grid)

    # Sorted copy used only for scoring. The `y_test` argument itself is never
    # rebound, so `y_test.index` stays aligned with the rows of `X_test` in
    # every fold.
    y_test_sorted = y_test.sort_index()

    # Split train-val data
    len_group_other = groups_other.nunique()
    gts = GroupTimeSeriesSplit(test_size=int(len_group_other*0.25), n_splits=n_split)

    for count, (i_train, i_val) in enumerate(gts.split(X_other, y_other, groups_other)):
        print(f'\nFold {count+1} Reduced Features:')
        best_models = []  # only the models of the last fold are kept/saved

        print("\t Train index:", i_train, "Val index:", i_val)
        print("\t Train size:", len(i_train), "Val size:", len(i_val))
        X_train, y_train = X_other.iloc[i_train], y_other.iloc[i_train]
        X_val, y_val = X_other.iloc[i_val], y_other.iloc[i_val]

        # Fit the preprocessor on this fold's training data only.
        X_train_preprocessed = prep.fit_transform(X_train)
        feature_names = preprocessor.get_feature_names_out()

        print('\t Preparing datasets...')
        df_train = pd.DataFrame(data=X_train_preprocessed, columns=feature_names, index=y_train.index)
        del X_train_preprocessed, X_train
        gc.collect()

        X_val_preprocessed = prep.transform(X_val)
        df_val = pd.DataFrame(data=X_val_preprocessed, columns=feature_names, index=y_val.index)
        del X_val_preprocessed, X_val
        gc.collect()

        X_test_preprocessed = prep.transform(X_test)
        df_test = pd.DataFrame(data=X_test_preprocessed, columns=feature_names, index=y_test.index)

        # Free up memory
        del X_test_preprocessed
        gc.collect()

        # reduced feature
        # find all unique patterns of missing value in test set
        mask = df_test.isnull()
        unique_rows = np.unique(mask.to_numpy(), axis=0)
        pattern_predictions = []
        print('\t there are', len(unique_rows), 'unique missing value patterns.')

        # divide test sets into subgroups according to the unique patterns
        for i, pattern in enumerate(unique_rows):
            print ('\t working on unique pattern', i)
            kept_columns = df_test.columns[~pattern]  # columns present in this pattern

            ## test rows whose missing-value mask equals this pattern
            index_subset = df_test[mask.eq(pattern, axis=1).all(axis=1)].index
            sub_X_test = df_test.loc[index_subset, kept_columns]
            sub_y_test = y_test.loc[index_subset]

            ## prepare train-val subset
            # 1. keep only the columns available in this pattern
            # 2. drop the train/val rows that still contain NaNs
            sub_X_train = df_train[kept_columns].dropna()
            sub_X_val = df_val[kept_columns].dropna()
            # 3. cut the targets accordingly, as 1-D arrays
            sub_y_train_array = y_train.loc[sub_X_train.index].values.ravel()
            sub_y_val_array = y_val.loc[sub_X_val.index].values.ravel()

            # run model
            # A fresh clone per pattern: every entry of `best_models` is then
            # its own fitted estimator instead of a reference to one shared
            # object that the next pattern would refit.
            ML_algo = clone(model)
            val_scores = np.zeros(len(pg))

            for p in range(len(pg)):
                params = pg[p]
                print('\t\t params:', params)
                ML_algo.set_params(**params)
                ML_algo.fit(sub_X_train, sub_y_train_array)
                sub_y_val_pred = ML_algo.predict(sub_X_val)
                val_scores[p] = mean_absolute_error(sub_y_val_array, sub_y_val_pred)
                print('\t\t val_score:', val_scores[p])

            best_params = np.array(pg)[val_scores == np.min(val_scores)]
            print(f'\t best model parameters for pattern {i}:\n', best_params)
            print('\t corresponding validation score:', np.min(val_scores))

            # Refit with the best (first, on ties) parameter combination.
            ML_algo.set_params(**best_params[0])
            ML_algo.fit(sub_X_train, sub_y_train_array)
            best_models.append(ML_algo)
            pattern_predictions.append(
                pd.DataFrame(ML_algo.predict(sub_X_test), index=sub_y_test.index,
                             columns=['sub_y_test_pred']))

        # Concatenate once, then sort so predictions line up with the sorted target.
        all_y_test_pred = pd.concat(pattern_predictions).sort_index()

        # test mae in one-fold
        test_mae = mean_absolute_error(y_test_sorted, all_y_test_pred)
        test_scores[count] = test_mae
        print('\t test MAE:', test_mae)

    # mean and spread of the test MAE across folds
    print(f'overall test mean: {np.mean(test_scores)}')
    print(f'overall test std: {np.std(test_scores)}\n')

    # Make sure the output directory exists before writing results.
    if path:
        os.makedirs(path, exist_ok=True)

    # save the per-fold test scores and the best models of the last fold
    with open(os.path.join(path, f'{model_Name}_reduced_feature_test_scores.pkl'), 'wb') as file:
        pickle.dump(test_scores, file)

    with open(os.path.join(path, f'{model_Name}_reduced_feature_best_models.pkl'), 'wb') as file:
        pickle.dump(best_models, file)

    return best_models

In [9]:
from  sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

## LASSO

In [None]:
# Lasso: search `alpha` over 10 log-spaced values between 1e-2 and 1e1.
model_name = 'lasso'
model = Lasso(max_iter=100000000)
param_grid = dict(alpha=np.logspace(-2, 1, 10), random_state=[42])

start_time = time.time()
best_models = MLpipe_reduced_feature(
    X_other, y_other, groups_other,
    X_test, y_test, groups_test,
    preprocessor, model, param_grid, result_path, model_name,
)
print('LASSO reduced features model running time:', time.time()-start_time)

## Ridge

In [None]:
# Ridge: search `alpha` over 10 log-spaced values between 1e-2 and 1e1.
model_name = 'ridge'
model = Ridge()
param_grid = dict(alpha=np.logspace(-2, 1, 10), random_state=[42])

start_time = time.time()
best_models = MLpipe_reduced_feature(
    X_other, y_other, groups_other,
    X_test, y_test, groups_test,
    preprocessor, model, param_grid, result_path, model_name,
)
print('Ridge reduced features model running time:', time.time()-start_time)

## Random Forest

In [None]:
# Random forest: grid over the fraction of features tried per split and the
# maximum tree depth, with a fixed number of trees and a fixed seed.
model_name = 'randomforest'
model = RandomForestRegressor()
# `max_features=None` was dropped from the grid: for RandomForestRegressor it
# is equivalent to 1.0 (all features), so it only repeated fits already made
# for 1.0 and could never change which model is selected.
param_grid = {'max_features': [0.5, 0.75, 1.0],
              'max_depth': [1, 5, 7, 11, 13, None],
              'n_estimators': [100],
              'random_state': [42]}

start_time = time.time()
best_models = MLpipe_reduced_feature(X_other, y_other, groups_other, X_test, y_test, groups_test,
                                     preprocessor, model, param_grid, result_path, model_name)
print('RF reduced features model running time:', time.time()-start_time)