In [1]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np
from scipy import stats
import random
import glob
import os
import gc

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

path_root = './'
data_dir ='./'
path_submissions = '/'

target_name = 'target'

DEBUG = False


In [2]:
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff() 

def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

def realized_mad(series_log_return):
    return np.mean(np.absolute(series_log_return - np.mean(series_log_return)))

def realized_median_abs_dev(series_log_return):
    return stats.median_absolute_deviation(series_log_return, nan_policy='omit')

def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

def calc_wap(df):
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1'])/(df['bid_size1'] + df['ask_size1'])
    return wap

def calc_wap2(df):
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2'])/(df['bid_size2'] + df['ask_size2'])
    return wap

def count_unique(series):
    return len(np.unique(series))

In [3]:
def preprocessor_book(file_path):
    df = pd.read_parquet(file_path)
    
    df['wap'] = calc_wap(df)
    df['log_return'] = df.groupby('time_id')['wap'].apply(log_return)
    
    df['wap2'] = calc_wap(df)
    df['log_return2'] = df.groupby('time_id')['wap2'].apply(log_return)
    
    df['wap_imbalance'] = abs(df['wap'] - df['wap2'])
    
    df['spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1'])/2)
    
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    agg_dict = {
        'log_return':[realized_volatility,realized_mad,realized_median_abs_dev],
        'log_return2':[realized_volatility,realized_mad,realized_median_abs_dev],
        'wap_imbalance':[np.mean],
        'spread':[np.mean],
        'bid_spread':[np.mean],
        'ask_spread':[np.mean],
        'volume_imbalance':[np.mean],
        'total_volume':[np.mean],
        'wap':[np.mean],
    }
    
    
    df_feature = pd.DataFrame(df.groupby(['time_id']).agg(agg_dict)).reset_index()
    
    df_feature.columns = ['_'.join(col) for col in df_feature.columns] #time_id is changed to time_id_
        
    #create row_id
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'],axis=1)
    
    return df_feature
    

In [6]:
%%time
file_path = data_dir + "book_train.parquet/stock_id=0"
preprocessor_book(file_path)

Wall time: 5.51 s


Unnamed: 0,log_return_realized_volatility,log_return_realized_mad,log_return_realized_median_abs_dev,log_return2_realized_volatility,log_return2_realized_mad,log_return2_realized_median_abs_dev,wap_imbalance_mean,spread_mean,bid_spread_mean,ask_spread_mean,volume_imbalance_mean,total_volume_mean,wap_mean,row_id
0,0.004499,0.000157,0.000053,0.004499,0.000157,0.000053,0.0,0.000852,0.000176,-0.000151,134.894040,323.496689,1.003725,0-5
1,0.001204,0.000038,0.000005,0.001204,0.000038,0.000005,0.0,0.000394,0.000142,-0.000135,142.050000,411.450000,1.000239,0-11
2,0.002369,0.000092,0.000020,0.002369,0.000092,0.000020,0.0,0.000725,0.000197,-0.000198,141.414894,416.351064,0.999542,0-16
3,0.002574,0.000113,0.000012,0.002574,0.000113,0.000012,0.0,0.000860,0.000190,-0.000108,146.216667,435.266667,0.998832,0-31
4,0.001894,0.000068,0.000011,0.001894,0.000068,0.000011,0.0,0.000397,0.000191,-0.000109,123.846591,343.221591,0.999619,0-62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3825,0.002579,0.000078,0.000016,0.002579,0.000078,0.000016,0.0,0.000552,0.000083,-0.000182,197.144781,374.235690,0.997938,0-32751
3826,0.002206,0.000063,0.000001,0.002206,0.000063,0.000001,0.0,0.000542,0.000092,-0.000172,233.781553,621.131068,1.000310,0-32753
3827,0.002913,0.000134,0.000075,0.002913,0.000134,0.000075,0.0,0.000525,0.000202,-0.000083,115.829787,343.734043,0.999552,0-32758
3828,0.003046,0.000107,0.000070,0.003046,0.000107,0.000070,0.0,0.000480,0.000113,-0.000166,132.074919,385.429967,1.002357,0-32763


In [4]:
def preprocessor_trade(file_path):
    df = pd.read_parquet(file_path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    df['dollar_volume'] = df['price'] * df['size']
    
    
    agg_dict = {
        'log_return':[realized_volatility,realized_mad,realized_median_abs_dev],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
        'dollar_volume':[np.sum],
    }
    
    df_feature = df.groupby('time_id').agg(agg_dict).reset_index()
    
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'],axis=1)
    
    return df_feature

In [8]:
%%time
file_path = data_dir + "trade_train.parquet/stock_id=0"
preprocessor_trade(file_path)

Wall time: 2.66 s


Unnamed: 0,trade_log_return_realized_volatility,trade_log_return_realized_mad,trade_log_return_realized_median_abs_dev,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_dollar_volume_sum,row_id
0,0.002006,0.000271,0.000345,40,3179,2.75,3190.139181,0-5
1,0.000901,0.000133,0.000149,30,1289,1.9,1289.353432,0-11
2,0.001961,0.000298,0.000307,25,2161,2.72,2158.608928,0-16
3,0.001561,0.000321,0.000383,15,1962,3.933333,1959.605547,0-31
4,0.000871,0.000138,0.000158,22,1791,4.045455,1790.254496,0-62
...,...,...,...,...,...,...,...,...
3825,0.001519,0.000162,0.000155,52,3450,3.057692,3441.815546,0-32751
3826,0.001411,0.000236,0.000315,28,4547,3.892857,4548.671493,0-32753
3827,0.001521,0.000206,0.000298,36,4250,3.5,4247.563002,0-32758
3828,0.001794,0.000200,0.000227,53,3217,2.150943,3224.421796,0-32763


In [5]:
def preprocessor(list_stock_ids, is_train = True):
    from joblib import Parallel, delayed # parallel computing to save time
    df = pd.DataFrame()
    
    def for_joblib(stock_id):
        if is_train:
            file_path_book = data_dir + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_train.parquet/stock_id=" + str(stock_id)
        else:
            file_path_book = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_test.parquet/stock_id=" + str(stock_id)
            
        df_tmp = pd.merge(preprocessor_book(file_path_book),preprocessor_trade(file_path_trade),on='row_id',how='left')
     
        return pd.concat([df,df_tmp])
    
    df = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
        )

    df =  pd.concat(df,ignore_index = True)
    return df

In [10]:
list_stock_ids = [0,1]
preprocessor(list_stock_ids, is_train = True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.8s finished


Unnamed: 0,log_return_realized_volatility,log_return_realized_mad,log_return_realized_median_abs_dev,log_return2_realized_volatility,log_return2_realized_mad,log_return2_realized_median_abs_dev,wap_imbalance_mean,spread_mean,bid_spread_mean,ask_spread_mean,...,total_volume_mean,wap_mean,row_id,trade_log_return_realized_volatility,trade_log_return_realized_mad,trade_log_return_realized_median_abs_dev,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_dollar_volume_sum
0,0.004499,0.000157,0.000053,0.004499,0.000157,0.000053,0.0,0.000852,0.000176,-0.000151,...,323.496689,1.003725,0-5,0.002006,0.000271,0.000345,40,3179,2.75,3190.139181
1,0.001204,0.000038,0.000005,0.001204,0.000038,0.000005,0.0,0.000394,0.000142,-0.000135,...,411.450000,1.000239,0-11,0.000901,0.000133,0.000149,30,1289,1.9,1289.353432
2,0.002369,0.000092,0.000020,0.002369,0.000092,0.000020,0.0,0.000725,0.000197,-0.000198,...,416.351064,0.999542,0-16,0.001961,0.000298,0.000307,25,2161,2.72,2158.608928
3,0.002574,0.000113,0.000012,0.002574,0.000113,0.000012,0.0,0.000860,0.000190,-0.000108,...,435.266667,0.998832,0-31,0.001561,0.000321,0.000383,15,1962,3.933333,1959.605547
4,0.001894,0.000068,0.000011,0.001894,0.000068,0.000011,0.0,0.000397,0.000191,-0.000109,...,343.221591,0.999619,0-62,0.000871,0.000138,0.000158,22,1791,4.045455,1790.254496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7655,0.003723,0.000122,0.000046,0.003723,0.000122,0.000046,0.0,0.000597,0.000157,-0.000118,...,296.185668,1.000142,1-32751,0.001776,0.000205,0.000253,49,3249,2.77551,3248.982168
7656,0.010829,0.000288,0.000188,0.010829,0.000288,0.000188,0.0,0.000922,0.000159,-0.000125,...,567.840081,1.007503,1-32753,0.008492,0.000484,0.000619,183,75903,7.874317,76486.652579
7657,0.003135,0.000093,0.000013,0.003135,0.000093,0.000013,0.0,0.000648,0.000141,-0.000132,...,426.603834,1.000854,1-32758,0.001927,0.000317,0.000479,26,2239,2.615385,2240.755934
7658,0.003750,0.000122,0.000115,0.003750,0.000122,0.000115,0.0,0.000421,0.000190,-0.000231,...,526.317972,1.003032,1-32763,0.002856,0.000202,0.000182,109,16648,2.93578,16696.638857


In [11]:
%%time
train = pd.read_csv(os.path.join(data_dir,'train.csv'))

train_ids = train.stock_id.unique()

df_train = preprocessor(list_stock_ids=train_ids, is_train=True)

print(f'train shape {df_train.shape}')
display(df_train.head())
display(df_train.tail())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min


train shape (428932, 21)


[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:  4.5min finished


Unnamed: 0,log_return_realized_volatility,log_return_realized_mad,log_return_realized_median_abs_dev,log_return2_realized_volatility,log_return2_realized_mad,log_return2_realized_median_abs_dev,wap_imbalance_mean,spread_mean,bid_spread_mean,ask_spread_mean,...,total_volume_mean,wap_mean,row_id,trade_log_return_realized_volatility,trade_log_return_realized_mad,trade_log_return_realized_median_abs_dev,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_dollar_volume_sum
0,0.004499,0.000157,5.3e-05,0.004499,0.000157,5.3e-05,0.0,0.000852,0.000176,-0.000151,...,323.496689,1.003725,0-5,0.002006,0.000271,0.000345,40,3179.0,2.75,3190.139181
1,0.001204,3.8e-05,5e-06,0.001204,3.8e-05,5e-06,0.0,0.000394,0.000142,-0.000135,...,411.45,1.000239,0-11,0.000901,0.000133,0.000149,30,1289.0,1.9,1289.353432
2,0.002369,9.2e-05,2e-05,0.002369,9.2e-05,2e-05,0.0,0.000725,0.000197,-0.000198,...,416.351064,0.999542,0-16,0.001961,0.000298,0.000307,25,2161.0,2.72,2158.608928
3,0.002574,0.000113,1.2e-05,0.002574,0.000113,1.2e-05,0.0,0.00086,0.00019,-0.000108,...,435.266667,0.998832,0-31,0.001561,0.000321,0.000383,15,1962.0,3.933333,1959.605547
4,0.001894,6.8e-05,1.1e-05,0.001894,6.8e-05,1.1e-05,0.0,0.000397,0.000191,-0.000109,...,343.221591,0.999619,0-62,0.000871,0.000138,0.000158,22,1791.0,4.045455,1790.254496


Unnamed: 0,log_return_realized_volatility,log_return_realized_mad,log_return_realized_median_abs_dev,log_return2_realized_volatility,log_return2_realized_mad,log_return2_realized_median_abs_dev,wap_imbalance_mean,spread_mean,bid_spread_mean,ask_spread_mean,...,total_volume_mean,wap_mean,row_id,trade_log_return_realized_volatility,trade_log_return_realized_mad,trade_log_return_realized_median_abs_dev,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_dollar_volume_sum
428927,0.003691,0.00011,2.9e-05,0.003691,0.00011,2.9e-05,0.0,0.000878,9.1e-05,-0.000202,...,406.045161,0.999582,126-32751,0.002171,0.000297,0.000409,37,2570.0,2.783784,2568.838117
428928,0.004104,0.00017,8.9e-05,0.004104,0.00017,8.9e-05,0.0,0.000706,0.000126,-0.000142,...,243.32287,1.002476,126-32753,0.00218,0.000225,0.000213,43,2323.0,3.418605,2327.828627
428929,0.003118,9.7e-05,1.6e-05,0.003118,9.7e-05,1.6e-05,0.0,0.000739,0.000189,-0.000192,...,348.09375,1.001082,126-32758,0.001921,0.000261,0.000332,35,3740.0,2.8,3742.254714
428930,0.003661,0.00011,8.1e-05,0.003661,0.00011,8.1e-05,0.0,0.00053,0.000143,-0.000134,...,426.41604,1.001809,126-32763,0.002051,0.000182,0.000224,80,9389.0,2.925,9406.795437
428931,0.002091,8.9e-05,5.7e-05,0.002091,8.9e-05,5.7e-05,0.0,0.000432,0.000109,-0.000159,...,531.313364,1.000272,126-32767,0.001041,0.000136,0.00013,36,5325.0,3.0,5326.415054


Wall time: 4min 29s


In [12]:
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
df_train = train.merge(df_train, on = ['row_id'], how = 'left')

In [13]:
df_train.head()

Unnamed: 0,row_id,target,log_return_realized_volatility,log_return_realized_mad,log_return_realized_median_abs_dev,log_return2_realized_volatility,log_return2_realized_mad,log_return2_realized_median_abs_dev,wap_imbalance_mean,spread_mean,...,volume_imbalance_mean,total_volume_mean,wap_mean,trade_log_return_realized_volatility,trade_log_return_realized_mad,trade_log_return_realized_median_abs_dev,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_dollar_volume_sum
0,0-5,0.004136,0.004499,0.000157,5.3e-05,0.004499,0.000157,5.3e-05,0.0,0.000852,...,134.89404,323.496689,1.003725,0.002006,0.000271,0.000345,40,3179.0,2.75,3190.139181
1,0-11,0.001445,0.001204,3.8e-05,5e-06,0.001204,3.8e-05,5e-06,0.0,0.000394,...,142.05,411.45,1.000239,0.000901,0.000133,0.000149,30,1289.0,1.9,1289.353432
2,0-16,0.002168,0.002369,9.2e-05,2e-05,0.002369,9.2e-05,2e-05,0.0,0.000725,...,141.414894,416.351064,0.999542,0.001961,0.000298,0.000307,25,2161.0,2.72,2158.608928
3,0-31,0.002195,0.002574,0.000113,1.2e-05,0.002574,0.000113,1.2e-05,0.0,0.00086,...,146.216667,435.266667,0.998832,0.001561,0.000321,0.000383,15,1962.0,3.933333,1959.605547
4,0-62,0.001747,0.001894,6.8e-05,1.1e-05,0.001894,6.8e-05,1.1e-05,0.0,0.000397,...,123.846591,343.221591,0.999619,0.000871,0.000138,0.000158,22,1791.0,4.045455,1790.254496


In [25]:
# df_train.to_csv('train_processed.csv',index=False)
train = pd.read_csv(os.path.join(data_dir,'train.csv'))

In [6]:
df_train = pd.read_csv('train_processed.csv')

In [7]:
df_train

Unnamed: 0,row_id,target,log_return_realized_volatility,log_return_realized_mad,log_return_realized_median_abs_dev,log_return2_realized_volatility,log_return2_realized_mad,log_return2_realized_median_abs_dev,wap_imbalance_mean,spread_mean,...,volume_imbalance_mean,total_volume_mean,wap_mean,trade_log_return_realized_volatility,trade_log_return_realized_mad,trade_log_return_realized_median_abs_dev,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_dollar_volume_sum
0,0-5,0.004136,0.004499,0.000157,0.000053,0.004499,0.000157,0.000053,0.0,0.000852,...,134.894040,323.496689,1.003725,0.002006,0.000271,0.000345,40.0,3179.0,2.750000,3190.139181
1,0-11,0.001445,0.001204,0.000038,0.000005,0.001204,0.000038,0.000005,0.0,0.000394,...,142.050000,411.450000,1.000239,0.000901,0.000133,0.000149,30.0,1289.0,1.900000,1289.353432
2,0-16,0.002168,0.002369,0.000092,0.000020,0.002369,0.000092,0.000020,0.0,0.000725,...,141.414894,416.351064,0.999542,0.001961,0.000298,0.000307,25.0,2161.0,2.720000,2158.608928
3,0-31,0.002195,0.002574,0.000113,0.000012,0.002574,0.000113,0.000012,0.0,0.000860,...,146.216667,435.266667,0.998832,0.001561,0.000321,0.000383,15.0,1962.0,3.933333,1959.605547
4,0-62,0.001747,0.001894,0.000068,0.000011,0.001894,0.000068,0.000011,0.0,0.000397,...,123.846591,343.221591,0.999619,0.000871,0.000138,0.000158,22.0,1791.0,4.045455,1790.254496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126-32751,0.003461,0.003691,0.000110,0.000029,0.003691,0.000110,0.000029,0.0,0.000878,...,161.638710,406.045161,0.999582,0.002171,0.000297,0.000409,37.0,2570.0,2.783784,2568.838117
428928,126-32753,0.003113,0.004104,0.000170,0.000089,0.004104,0.000170,0.000089,0.0,0.000706,...,150.578475,243.322870,1.002476,0.002180,0.000225,0.000213,43.0,2323.0,3.418605,2327.828627
428929,126-32758,0.004070,0.003118,0.000097,0.000016,0.003118,0.000097,0.000016,0.0,0.000739,...,254.406250,348.093750,1.001082,0.001921,0.000261,0.000332,35.0,3740.0,2.800000,3742.254714
428930,126-32763,0.003357,0.003661,0.000110,0.000081,0.003661,0.000110,0.000081,0.0,0.000530,...,145.654135,426.416040,1.001809,0.002051,0.000182,0.000224,80.0,9389.0,2.925000,9406.795437


In [8]:
test = pd.read_csv(os.path.join(data_dir,'test.csv'))
test_ids = test.stock_id.unique()

In [9]:
%%time
df_test = preprocessor(list_stock_ids=test_ids, is_train=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Wall time: 1.03 s


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.9s finished


In [10]:
df_test = test.merge(df_test, on=['row_id'], how='left')

In [11]:
df_train['stock_id'] = df_train['row_id'].apply(lambda x:x.split('-')[0])
df_test['stock_id'] = df_test['row_id'].apply(lambda x:x.split('-')[0])
df_train['time_id'] = df_train['row_id'].apply(lambda x:x.split('-')[1])
df_test['time_id'] = df_test['row_id'].apply(lambda x:x.split('-')[1])

In [12]:
df_train.columns

Index(['row_id', 'target', 'log_return_realized_volatility',
       'log_return_realized_mad', 'log_return_realized_median_abs_dev',
       'log_return2_realized_volatility', 'log_return2_realized_mad',
       'log_return2_realized_median_abs_dev', 'wap_imbalance_mean',
       'spread_mean', 'bid_spread_mean', 'ask_spread_mean',
       'volume_imbalance_mean', 'total_volume_mean', 'wap_mean',
       'trade_log_return_realized_volatility', 'trade_log_return_realized_mad',
       'trade_log_return_realized_median_abs_dev',
       'trade_seconds_in_bucket_count_unique', 'trade_size_sum',
       'trade_order_count_mean', 'trade_dollar_volume_sum', 'stock_id',
       'time_id'],
      dtype='object')

In [13]:
## import libraries

#PyTorch 

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data

In [14]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [15]:
class OptiveDataset(Dataset):
    def __init__(self, X, Y, emb_cols=['stock_id', 'time_id']):
        X = X.copy()
        self.X1 = X.loc[:,emb_cols].copy().values.astype(np.int64) #categorical columns
        self.X2 = X.drop(columns=emb_cols).copy().values.astype(np.float32) #numerical columns
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return (self.X1[idx], self.X2[idx]), self.y[idx]
    
class OptiveDatasetTest(Dataset):
    def __init__(self, X, emb_cols=['stock_id', 'time_id']):
        X = X.copy()
        self.X1 = X.loc[:,emb_cols].copy().values.astype(np.int64) #categorical columns
        self.X2 = X.drop(columns=emb_cols).copy().values.astype(np.float32) #numerical columns
        
    def __len__(self):
        return len(self.X1)
    
    def __getitem__(self, idx):
        return (self.X1[idx], self.X2[idx])

In [33]:
# is_nan = df_train.isna()
# row_has_nan = is_nan.any(axis=1)
# df_train[row_has_nan]

Unnamed: 0,row_id,target,log_return_realized_volatility,log_return_realized_mad,log_return_realized_median_abs_dev,log_return2_realized_volatility,log_return2_realized_mad,log_return2_realized_median_abs_dev,wap_imbalance_mean,spread_mean,...,wap_mean,trade_log_return_realized_volatility,trade_log_return_realized_mad,trade_log_return_realized_median_abs_dev,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_dollar_volume_sum,stock_id,time_id


In [34]:
# df_test.isna().sum()

stock_id                                    0
time_id                                     0
row_id                                      0
log_return_realized_volatility              0
log_return_realized_mad                     0
log_return_realized_median_abs_dev          0
log_return2_realized_volatility             0
log_return2_realized_mad                    0
log_return2_realized_median_abs_dev         0
wap_imbalance_mean                          0
spread_mean                                 0
bid_spread_mean                             0
ask_spread_mean                             0
volume_imbalance_mean                       0
total_volume_mean                           0
wap_mean                                    0
trade_log_return_realized_volatility        0
trade_log_return_realized_mad               0
trade_log_return_realized_median_abs_dev    0
trade_seconds_in_bucket_count_unique        0
trade_size_sum                              0
trade_order_count_mean            

In [16]:
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [17]:
train_dataset = OptiveDataset(df_train.drop(['target', 'time_id','row_id'], axis=1), df_train['target'], emb_cols=['stock_id'])
train_dl = DataLoader(train_dataset, batch_size=4, shuffle=True)

#test the dataset class
for (emb, count), target in train_dl:
    print((emb.shape, count.shape), target.shape)
    break;

(torch.Size([4, 1]), torch.Size([4, 20])) torch.Size([4])


In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

def RMSELoss(yhat,y):
    return torch.sqrt(torch.mean((yhat-y)**2))

def RMSPELoss(y_pred, y_true):
    return torch.sqrt(torch.mean( ((y_true - y_pred) / y_true) ** 2 ))

def train_epoch(train_dl, valid_dl, model, loss_fn, opt, sch, epoch, fold, device=device):
    # taining loop
    model.train()
    running_loss_ = 0
    
    pbar = tqdm(enumerate(train_dl), total=len(train_dl))
    for i, ((cats, counts), targets) in pbar:
        cats, counts, targets = cats.to(device), counts.to(device), targets.unsqueeze(1).to(device)
        
        opt.zero_grad()
        y_pred = model(cats, counts)
        loss = loss_fn(y_pred.float(), targets.float())
        
        loss.backward()
        opt.step()
        
        running_loss_ += loss.item()
        if (i+1) % 100 == 0:
            pbar.set_description(f"running loss:{running_loss_ / (i+1): 0.6f}")
    
    sch.step(loss)

    epoch_loss = running_loss_ / len(train_dl)
    #print(f'==> Epoch {epoch} TRAIN loss: {epoch_loss:.6f}')
    
    # Validation loop
    model.eval()
    valid_loss = 0
    best_loss = np.inf
    
    for i, ((cats, counts), targets) in enumerate(valid_dl):
        cats, counts, targets = cats.to(device), counts.to(device), targets.unsqueeze(1).to(device)
        
        with torch.no_grad():
            y_pred = model(cats, counts)
            val_loss = loss_fn(y_pred.float(), targets.float())
            
        valid_loss += val_loss.item() * targets.shape[0]
    sch.step(valid_loss)
    
    valid_epoch_loss = valid_loss / len(valid_dl)
    print(f'==>F{fold}, Epoch {epoch} VALID loss: {valid_epoch_loss:.8f}')
    
    if valid_epoch_loss < best_loss:
        best_loss = valid_epoch_loss
        torch.save(model.state_dict(), f'FOLD{fold}_optive_model.pth')
    
    model.train()
    return model, epoch_loss, valid_epoch_loss

In [21]:
torch.cuda.get_device_name()

'GeForce GTX 1650 with Max-Q Design'

In [22]:
def perpare_dataset(train, valid, test=None, batch_size=64, drop_cols=['target', 'time_id', 'row_id'], emb_cols=['stock_id']):
    train_dataset = OptiveDataset(train.drop(drop_cols, axis=1), train['target'], emb_cols=emb_cols)
    valid_dataset = OptiveDataset(valid.drop(drop_cols, axis=1), valid['target'], emb_cols=emb_cols)    
    
    train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
    
    return train_dl, valid_dl

In [29]:
class OptiverModel(nn.Module):
    def __init__(self, embedding_sizes=16, num_embeddings=max(df_train['stock_id'].astype(np.int8))+1):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_sizes)
        self.emb_drop = nn.Dropout(0.25)
        
        self.bn1 = nn.BatchNorm1d(20)
        self.lin1 = nn.Linear(embedding_sizes+20, 32)
        self.lin2 = nn.Linear(32, 16)
        self.lin3 = nn.Linear(16, 8)
        self.lin4 = nn.Linear(8, 4)
        self.lin5 = nn.Linear(4, 1)
        
        

    def forward(self, x_cat, x_cont):
        x1 = self.emb(x_cat)
        x1 = torch.flatten(x1, end_dim=1)
        x1 = self.emb_drop(x1)
        x2 = self.bn1(x_cont)
        x = torch.cat([x1, x2], 1)
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = F.relu(self.lin3(x))
        x = F.relu(self.lin4(x))
        x = self.lin5(x)
        x = torch.sigmoid(x)
        
        
        return x

In [35]:
n_folds = 10
epochs = 10

kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=42)
seed_everything(46)

for fold_idx, (dev_index, val_index) in enumerate(kf.split(range(len(train)))):
    
    if fold_idx > 4:
        break #train 5 folds
        
    train_ = df_train.loc[dev_index,].reset_index(drop=True)
    valid_ = df_train.loc[val_index, ].reset_index(drop=True)
    
    train_dl, valid_dl = perpare_dataset(train_, valid_)
    
    model = OptiverModel(embedding_sizes=29,).to(device)
    loss_fn = RMSELoss
    
    opt = optim.Adam(model.parameters(), lr=0.01)
    sch = optim.lr_scheduler.ReduceLROnPlateau(opt, factor=0.2, patience=3)
    
    counter = 0
    for epoch in range(epochs):
        model, epoch_loss, valid_epoch_loss = train_epoch(train_dl, valid_dl, 
                                                                   model, loss_fn, opt, 
                                                                   sch, epoch, fold_idx, device=device)

running loss: 0.005783: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 287.21it/s]
  0%|▎                                                                              | 21/6032 [00:00<00:29, 205.89it/s]

==>F0, Epoch 0 VALID loss: 0.30995481


running loss: 0.004820: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 277.43it/s]
  1%|▌                                                                              | 43/6032 [00:00<00:27, 216.73it/s]

==>F0, Epoch 1 VALID loss: 0.31034059


running loss: 0.004821: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 286.81it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 225.50it/s]

==>F0, Epoch 2 VALID loss: 0.31016487


running loss: 0.004820: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:22<00:00, 273.94it/s]
  0%|▎                                                                              | 21/6032 [00:00<00:29, 203.89it/s]

==>F0, Epoch 3 VALID loss: 0.31009783


running loss: 0.004819: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 288.25it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 225.50it/s]

==>F0, Epoch 4 VALID loss: 0.31015672


running loss: 0.002684: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 294.29it/s]
  1%|▋                                                                              | 50/6032 [00:00<00:23, 251.46it/s]

==>F0, Epoch 5 VALID loss: 0.08447283


running loss: 0.001378: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 289.68it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:27, 221.16it/s]

==>F0, Epoch 6 VALID loss: 0.08710619


running loss: 0.001358: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 288.72it/s]
  1%|▋                                                                              | 51/6032 [00:00<00:23, 251.15it/s]

==>F0, Epoch 7 VALID loss: 0.08241050


running loss: 0.001343: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 288.10it/s]
  1%|▌                                                                              | 47/6032 [00:00<00:25, 232.99it/s]

==>F0, Epoch 8 VALID loss: 0.08289237


running loss: 0.001335: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 286.74it/s]


==>F0, Epoch 9 VALID loss: 0.08942819


running loss: 0.007501: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 291.37it/s]
  0%|▎                                                                              | 22/6032 [00:00<00:28, 213.65it/s]

==>F1, Epoch 0 VALID loss: 0.30690218


running loss: 0.004824: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 289.86it/s]
  0%|▎                                                                              | 20/6032 [00:00<00:30, 194.17it/s]

==>F1, Epoch 1 VALID loss: 0.30707449


running loss: 0.004825: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 291.16it/s]
  0%|▎                                                                              | 24/6032 [00:00<00:25, 233.01it/s]

==>F1, Epoch 2 VALID loss: 0.30672186


running loss: 0.004825: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 283.23it/s]
  0%|▏                                                                              | 18/6032 [00:00<00:33, 178.23it/s]

==>F1, Epoch 3 VALID loss: 0.30682029


running loss: 0.004826: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 277.42it/s]
  0%|▎                                                                              | 20/6032 [00:00<00:30, 198.02it/s]

==>F1, Epoch 4 VALID loss: 0.30712609


running loss: 0.004825: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 280.99it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 227.80it/s]

==>F1, Epoch 5 VALID loss: 0.30679324


running loss: 0.004825: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 281.54it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 230.00it/s]

==>F1, Epoch 6 VALID loss: 0.30705644


running loss: 0.004824: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 279.29it/s]
  0%|▎                                                                              | 21/6032 [00:00<00:29, 203.97it/s]

==>F1, Epoch 7 VALID loss: 0.30672093


running loss: 0.004825: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 283.13it/s]
  0%|▎                                                                              | 21/6032 [00:00<00:29, 203.89it/s]

==>F1, Epoch 8 VALID loss: 0.30670721


running loss: 0.004824: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 283.70it/s]


==>F1, Epoch 9 VALID loss: 0.30660016


running loss: 0.005807: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 274.23it/s]
  0%|▏                                                                              | 18/6032 [00:00<00:33, 180.00it/s]

==>F2, Epoch 0 VALID loss: 0.30633942


running loss: 0.004826: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:22<00:00, 273.08it/s]
  0%|▎                                                                              | 22/6032 [00:00<00:27, 217.82it/s]

==>F2, Epoch 1 VALID loss: 0.30633423


running loss: 0.004830: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 292.19it/s]
  0%|▏                                                                              | 18/6032 [00:00<00:34, 174.82it/s]

==>F2, Epoch 2 VALID loss: 0.30630205


running loss: 0.004826: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 293.94it/s]
  1%|▋                                                                              | 49/6032 [00:00<00:24, 244.15it/s]

==>F2, Epoch 3 VALID loss: 0.30637833


running loss: 0.004824: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 293.38it/s]
  0%|▎                                                                              | 22/6032 [00:00<00:27, 217.82it/s]

==>F2, Epoch 4 VALID loss: 0.30650075


running loss: 0.004828: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 287.01it/s]
  0%|▎                                                                              | 21/6032 [00:00<00:28, 210.00it/s]

==>F2, Epoch 5 VALID loss: 0.30656387


running loss: 0.004825: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 293.14it/s]
  0%|▎                                                                              | 22/6032 [00:00<00:28, 211.54it/s]

==>F2, Epoch 6 VALID loss: 0.30634347


running loss: 0.004828: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 293.84it/s]
  1%|▋                                                                              | 52/6032 [00:00<00:23, 256.04it/s]

==>F2, Epoch 7 VALID loss: 0.30633633


running loss: 0.004826: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 291.64it/s]
  1%|▌                                                                              | 42/6032 [00:00<00:28, 212.28it/s]

==>F2, Epoch 8 VALID loss: 0.30644360


running loss: 0.004825: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 292.12it/s]


==>F2, Epoch 9 VALID loss: 0.30642188


running loss: 0.015125: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 294.46it/s]
  0%|▎                                                                              | 22/6032 [00:00<00:27, 217.82it/s]

==>F3, Epoch 0 VALID loss: 0.18196693


running loss: 0.002865: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 293.96it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 223.30it/s]

==>F3, Epoch 1 VALID loss: 0.18224991


running loss: 0.002863: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 294.04it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 223.31it/s]

==>F3, Epoch 2 VALID loss: 0.18198008


running loss: 0.002865: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 294.97it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 223.30it/s]

==>F3, Epoch 3 VALID loss: 0.18175023


running loss: 0.002865: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 295.47it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:27, 221.22it/s]

==>F3, Epoch 4 VALID loss: 0.18232707


running loss: 0.002864: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 289.12it/s]
  1%|▋                                                                              | 49/6032 [00:00<00:24, 244.94it/s]

==>F3, Epoch 5 VALID loss: 0.18185335


running loss: 0.002864: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 295.16it/s]
  0%|▎                                                                              | 21/6032 [00:00<00:29, 205.94it/s]

==>F3, Epoch 6 VALID loss: 0.18187032


running loss: 0.002862: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 291.65it/s]
  0%|▎                                                                              | 22/6032 [00:00<00:27, 215.67it/s]

==>F3, Epoch 7 VALID loss: 0.18192098


running loss: 0.002865: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 290.28it/s]
  1%|▋                                                                              | 50/6032 [00:00<00:23, 252.05it/s]

==>F3, Epoch 8 VALID loss: 0.18165210


running loss: 0.002863: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 292.60it/s]


==>F3, Epoch 9 VALID loss: 0.18171885


running loss: 0.005734: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:21<00:00, 278.00it/s]
  0%|▎                                                                              | 24/6032 [00:00<00:25, 235.30it/s]

==>F4, Epoch 0 VALID loss: 0.30780367


running loss: 0.004821: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:22<00:00, 271.76it/s]
  1%|▋                                                                              | 51/6032 [00:00<00:23, 256.21it/s]

==>F4, Epoch 1 VALID loss: 0.30795386


running loss: 0.004824: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 295.54it/s]
  0%|▎                                                                              | 24/6032 [00:00<00:25, 233.09it/s]

==>F4, Epoch 2 VALID loss: 0.30775689


running loss: 0.004823: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 293.53it/s]
  0%|▏                                                                              | 17/6032 [00:00<00:36, 163.47it/s]

==>F4, Epoch 3 VALID loss: 0.30793766


running loss: 0.004822: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 292.93it/s]
  0%|▎                                                                              | 22/6032 [00:00<00:27, 217.83it/s]

==>F4, Epoch 4 VALID loss: 0.30789807


running loss: 0.004683: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 294.33it/s]
  1%|▋                                                                              | 50/6032 [00:00<00:23, 249.45it/s]

==>F4, Epoch 5 VALID loss: 0.12305228


running loss: 0.001547: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 293.54it/s]
  0%|▎                                                                              | 23/6032 [00:00<00:26, 225.58it/s]

==>F4, Epoch 6 VALID loss: 0.08489565


running loss: 0.001398: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 291.73it/s]
  1%|▋                                                                              | 49/6032 [00:00<00:24, 246.51it/s]

==>F4, Epoch 7 VALID loss: 0.08509579


running loss: 0.001372: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 289.85it/s]
  1%|▋                                                                              | 49/6032 [00:00<00:24, 246.94it/s]

==>F4, Epoch 8 VALID loss: 0.08155748


running loss: 0.001337: 100%|█████████████████████████████████████████████████████| 6032/6032 [00:20<00:00, 288.90it/s]


==>F4, Epoch 9 VALID loss: 0.08102972


In [36]:
test_dataset = OptiveDatasetTest(df_test.drop(['row_id','time_id'],axis=1), emb_cols=['stock_id'])
test_dl = DataLoader(test_dataset, batch_size=1, shuffle=False)

test_preds = []

model_paths = glob.glob('./*.pth')

for model_path in model_paths:
    model.load_state_dict(torch.load(model_path))
    model.to(torch.device('cpu'))
    model.eval()
    
    y_preds = []
    
    with torch.no_grad():
        for x_cat, x_cont in test_dl:
            y_preds += [model(x_cat, x_cont).detach().cpu().numpy()[0][0]]
    test_preds.append(y_preds)
        
y_preds = np.mean(test_preds, axis=0)

In [37]:
y_preds

array([0.00122885, 0.0009977 , 0.0009977 ], dtype=float32)

In [40]:
test_preds = test.copy()
test_preds['target'] = y_preds

In [42]:
test_preds[['row_id','target']].to_csv('submission.csv',index=False)