## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


**Lib Import / Data loading**

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import time

# ML
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Maths
from scipy.interpolate import interp1d

# Paths tricks
import os
from pathlib import Path

# Support code
from support_file import *
from information_measures import *

###########################
machine = 'local'
###########################

if machine == 'local':
    datapath = os.path.join(str(Path.home()), 'ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction')

    # Load dataset
    train = pd.read_csv(os.path.join(datapath,'train.csv')) 
    all_stocks_ids = train['stock_id'].unique()
    all_time_ids = train['time_id'].unique()

    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    train = train[['row_id','target']]

    # Load test ids
    test = pd.read_csv(os.path.join(datapath,'test.csv'))
    test = test.drop(['stock_id','time_id'],axis=1)
    
elif machine == 'kaggle':
    
    # Load dataset
    train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    all_stocks_ids = train['stock_id'].unique()
    all_time_ids = train['time_id'].unique()

    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    train = train[['row_id','target']]

    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv') 
    test = test.drop(['stock_id','time_id'],axis=1)
    

**Functions**

In [65]:
# Competition metric
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

# Prediction function (chose here which prediction strategy to use)
def prediction_function(pred, machine, targets, all_stocks_ids):
        
    if pred == 'entropy':
        # Features computation
        df_features_encoded_test = computeFeatures_wEntropy(machine=machine, dataset='test', all_stocks_ids=all_stocks_ids)
        df_features_encoded_train = computeFeatures_wEntropy(machine=machine, dataset='train', all_stocks_ids=all_stocks_ids)
        
        # Training model
        X = df_features_encoded_train.drop(['row_id'],axis=1)
        y = targets
        
        catboost_default = CatBoostRegressor(verbose=0)
        catboost_default.fit(X,y)
        
        # Predicting targets from test
        X_test = df_features_encoded_test.drop(['row_id'],axis=1)
        yhat = catboost_default.predict(X_test)
        
        # Submission file
        yhat_pd = pd.DataFrame(yhat,columns=['target'])
        submission_file = pd.concat([df_features_encoded_test['row_id'],yhat_pd],axis=1)

    return submission_file

In [None]:
submission = prediction_function(pred='entropy',machine='local',targets=train['target'],all_stocks_ids=all_stocks_ids)

**Test code**

In [8]:
# Computational time optimized with groupby numba

book_all_features = pd.DataFrame()
encoder = np.eye(len(all_stocks_ids))

list_rv, list_rv2, list_rv3 = [], [], []
list_ent, list_fin, list_fin2 = [], [], []
list_others, list_others2, list_others3 = [], [], []

for stock_id in range(127):

    start = time.time()
    
    try:
        book_stock = load_book_data_by_id(stock_id,datapath,'train')
    except:
        continue
    
    # Calculate wap for the book
    book_stock['wap'] = calc_wap(book_stock)
    book_stock['wap2'] = calc_wap2(book_stock)
    book_stock['wap3'] = calc_wap3(book_stock)
    
    # Calculate realized volatility
    df_sub = book_stock.groupby('time_id')['wap'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub2 = book_stock.groupby('time_id')['wap2'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index().drop(['time_id'],axis=1)
    df_sub3 = book_stock.groupby('time_id')['wap3'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub['time_id']]
    df_sub = pd.concat([df_sub,df_sub2['wap2'],df_sub3['wap3']],axis=1)
    df_sub = df_sub.rename(columns={'time_id':'row_id','wap': 'rv', 'wap2': 'rv2', 'wap3': 'rv3'})
    
    # Calculate realized volatility last 5 min
    df_sub_5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id'])['wap'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub2_5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id'])['wap2'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub3_5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id'])['wap3'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub_5['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_5['time_id']]
    df_sub_5 = pd.concat([df_sub_5,df_sub2_5['wap2'],df_sub3_5['wap3']],axis=1)
    df_sub_5 = df_sub_5.rename(columns={'time_id':'row_id','wap': 'rv_5', 'wap2': 'rv2_5', 'wap3': 'rv3_5'})
    
    # Calculate realized volatility last 2 min
    df_sub_2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id'])['wap'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub2_2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id'])['wap2'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub3_2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id'])['wap3'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()    
    df_sub_2['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_2['time_id']] 
    df_sub_2 = pd.concat([df_sub_2,df_sub2_2['wap2'],df_sub3_2['wap3']],axis=1)
    df_sub_2 = df_sub_2.rename(columns={'time_id':'row_id','wap': 'rv_2', 'wap2': 'rv2_2', 'wap3': 'rv3_2'})
    
    list_rv.append(df_sub)
    list_rv2.append(df_sub_5)
    list_rv3.append(df_sub_2)
    
    # Calculate other financial metrics from book 
    df_sub_book_feats = book_stock.groupby(['time_id']).apply(financial_metrics).to_frame().reset_index()
    df_sub_book_feats = df_sub_book_feats.rename(columns={0:'embedding'})
    df_sub_book_feats[['wap_imbalance','price_spread','bid_spread','ask_spread','total_vol','vol_imbalance']] = pd.DataFrame(df_sub_book_feats.embedding.tolist(), index=df_sub_book_feats.index)
    df_sub_book_feats['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_book_feats['time_id']] 
    df_sub_book_feats = df_sub_book_feats.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    
    df_sub_book_feats5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id']).apply(financial_metrics).to_frame().reset_index()
    df_sub_book_feats5 = df_sub_book_feats5.rename(columns={0:'embedding'})
    df_sub_book_feats5[['wap_imbalance5','price_spread5','bid_spread5','ask_spread5','total_vol5','vol_imbalance5']] = pd.DataFrame(df_sub_book_feats5.embedding.tolist(), index=df_sub_book_feats5.index)
    df_sub_book_feats5['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_book_feats5['time_id']] 
    df_sub_book_feats5 = df_sub_book_feats5.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    
    list_fin.append(df_sub_book_feats)
    list_fin2.append(df_sub_book_feats5)
    
    # Compute entropy 
    df_ent = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(entropy_from_df).to_frame().reset_index().fillna(0)
    df_ent2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(entropy_from_df2).to_frame().reset_index().fillna(0)
    df_ent3 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(entropy_from_df3).to_frame().reset_index().fillna(0)
    df_ent['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_ent['time_id']]
    df_ent = df_ent.rename(columns={'time_id':'row_id',0:'entropy'})
    df_ent2 = df_ent2.rename(columns={0:'entropy2'}).drop(['time_id'],axis=1)
    df_ent3 = df_ent3.rename(columns={0:'entropy3'}).drop(['time_id'],axis=1)
    df_ent = pd.concat([df_ent,df_ent2,df_ent3],axis=1)
    list_ent.append(df_ent)
    
    # Compute other metrics
    df_others = book_stock.groupby(['time_id']).apply(other_metrics).to_frame().reset_index().fillna(0)
    df_others = df_others.rename(columns={0:'embedding'})
    df_others[['linearFit1_1','linearFit1_2','linearFit1_3','wap_std1_1','wap_std1_2','wap_std1_3']] = pd.DataFrame(df_others.embedding.tolist(), index=df_others.index)
    df_others['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_others['time_id']] 
    df_others = df_others.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    list_others.append(df_others)
    
    df_others2 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id']).apply(other_metrics).to_frame().reset_index().fillna(0)
    df_others2 = df_others2.rename(columns={0:'embedding'})
    df_others2[['linearFit2_1','linearFit2_2','linearFit2_3','wap_std2_1','wap_std2_2','wap_std2_3']] = pd.DataFrame(df_others2.embedding.tolist(), index=df_others2.index)
    df_others2['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_others2['time_id']] 
    df_others2 = df_others2.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    list_others2.append(df_others2)
    
    df_others3 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(other_metrics).to_frame().reset_index().fillna(0)
    df_others3 = df_others3.rename(columns={0:'embedding'})
    df_others3[['linearFit3_1','linearFit3_2','linearFit3_3','wap_std3_1','wap_std3_2','wap_std3_3']] = pd.DataFrame(df_others3.embedding.tolist(), index=df_others3.index)
    df_others3['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_others3['time_id']] 
    df_others3 = df_others3.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    list_others3.append(df_others3)
    
    print('Computing one stock took', time.time() - start, 'seconds for stock ', stock_id)
    
# Create features dataframe
df_submission = pd.concat(list_rv)
df_submission2 = pd.concat(list_rv2)
df_submission3 = pd.concat(list_rv3)
df_ent_concat = pd.concat(list_ent)
df_fin_concat = pd.concat(list_fin)
df_fin2_concat = pd.concat(list_fin2)
df_others = pd.concat(list_others)
df_others2 = pd.concat(list_others2)
df_others3 = pd.concat(list_others3)

df_book_features = df_submission.merge(df_submission2, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_submission3, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_ent_concat, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_fin_concat, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_fin2_concat, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_others, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_others2, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_others3, on = ['row_id'], how='left').fillna(0)



Computing one stock took 78.40056204795837 seconds for stock  0
Computing one stock took 78.00121140480042 seconds for stock  1
Computing one stock took 80.0335214138031 seconds for stock  2
Computing one stock took 77.10368728637695 seconds for stock  3
Computing one stock took 76.08033490180969 seconds for stock  4
Computing one stock took 74.6215660572052 seconds for stock  5
Computing one stock took 76.14969182014465 seconds for stock  6




Computing one stock took 70.95528101921082 seconds for stock  7
Computing one stock took 72.4843225479126 seconds for stock  8
Computing one stock took 69.70015931129456 seconds for stock  9
Computing one stock took 74.99949765205383 seconds for stock  10
Computing one stock took 71.16744899749756 seconds for stock  11
Computing one stock took 74.54515361785889 seconds for stock  13
Computing one stock took 74.01098704338074 seconds for stock  14
Computing one stock took 72.29604053497314 seconds for stock  15




Computing one stock took 68.91552662849426 seconds for stock  16
Computing one stock took 73.71882963180542 seconds for stock  17
Computing one stock took 71.53120589256287 seconds for stock  18
Computing one stock took 74.39288830757141 seconds for stock  19
Computing one stock took 75.80600929260254 seconds for stock  20
Computing one stock took 76.04484176635742 seconds for stock  21
Computing one stock took 71.34460830688477 seconds for stock  22
Computing one stock took 71.54472756385803 seconds for stock  23
Computing one stock took 76.63270950317383 seconds for stock  26
Computing one stock took 72.77859354019165 seconds for stock  27
Computing one stock took 73.19689536094666 seconds for stock  28
Computing one stock took 74.00018978118896 seconds for stock  29
Computing one stock took 71.16373562812805 seconds for stock  30
Computing one stock took 72.95074248313904 seconds for stock  31
Computing one stock took 73.59148073196411 seconds for stock  32




Computing one stock took 67.9111602306366 seconds for stock  33
Computing one stock took 74.64365911483765 seconds for stock  34
Computing one stock took 77.53510332107544 seconds for stock  35
Computing one stock took 74.66532444953918 seconds for stock  36




Computing one stock took 71.06928706169128 seconds for stock  37
Computing one stock took 72.09638714790344 seconds for stock  38
Computing one stock took 79.92956733703613 seconds for stock  39
Computing one stock took 70.34637403488159 seconds for stock  40
Computing one stock took 73.83815932273865 seconds for stock  41
Computing one stock took 74.81204462051392 seconds for stock  42
Computing one stock took 75.82460713386536 seconds for stock  43
Computing one stock took 5411.685176610947 seconds for stock  44
Computing one stock took 76.66262698173523 seconds for stock  46
Computing one stock took 73.4518723487854 seconds for stock  47




Computing one stock took 71.74880480766296 seconds for stock  48
Computing one stock took 73.98410868644714 seconds for stock  50
Computing one stock took 76.69444346427917 seconds for stock  51
Computing one stock took 75.22188568115234 seconds for stock  52
Computing one stock took 74.59291529655457 seconds for stock  53
Computing one stock took 72.48686003684998 seconds for stock  55
Computing one stock took 76.00697565078735 seconds for stock  56
Computing one stock took 72.14395666122437 seconds for stock  58
Computing one stock took 76.70502424240112 seconds for stock  59




Computing one stock took 73.61695098876953 seconds for stock  60
Computing one stock took 77.13644814491272 seconds for stock  61
Computing one stock took 75.95353507995605 seconds for stock  62
Computing one stock took 76.01673579216003 seconds for stock  63
Computing one stock took 77.28658199310303 seconds for stock  64
Computing one stock took 73.42819595336914 seconds for stock  66
Computing one stock took 78.18217444419861 seconds for stock  67
Computing one stock took 77.50853395462036 seconds for stock  68
Computing one stock took 77.48137879371643 seconds for stock  69
Computing one stock took 76.56007957458496 seconds for stock  70
Computing one stock took 72.46697616577148 seconds for stock  72
Computing one stock took 78.35754084587097 seconds for stock  73
Computing one stock took 74.36866235733032 seconds for stock  74
Computing one stock took 70.91724133491516 seconds for stock  75
Computing one stock took 75.04919099807739 seconds for stock  76
Computing one stock took 



Computing one stock took 69.38282322883606 seconds for stock  83
Computing one stock took 73.48994994163513 seconds for stock  84
Computing one stock took 75.46828055381775 seconds for stock  85
Computing one stock took 77.88152623176575 seconds for stock  86
Computing one stock took 72.36755323410034 seconds for stock  87




Computing one stock took 68.77231240272522 seconds for stock  88
Computing one stock took 70.66145038604736 seconds for stock  89
Computing one stock took 70.15222787857056 seconds for stock  90
Computing one stock took 73.22786045074463 seconds for stock  93
Computing one stock took 78.39202833175659 seconds for stock  94
Computing one stock took 74.84231734275818 seconds for stock  95
Computing one stock took 73.80405569076538 seconds for stock  96




Computing one stock took 69.95719146728516 seconds for stock  97
Computing one stock took 69.86900925636292 seconds for stock  98
Computing one stock took 73.57280087471008 seconds for stock  99
Computing one stock took 71.64879179000854 seconds for stock  100
Computing one stock took 76.23623132705688 seconds for stock  101
Computing one stock took 73.07208061218262 seconds for stock  102
Computing one stock took 69.06587648391724 seconds for stock  103
Computing one stock took 70.62345790863037 seconds for stock  104
Computing one stock took 72.45526337623596 seconds for stock  105
Computing one stock took 70.85469627380371 seconds for stock  107
Computing one stock took 73.34256720542908 seconds for stock  108
Computing one stock took 71.6957654953003 seconds for stock  109
Computing one stock took 70.00637483596802 seconds for stock  110
Computing one stock took 73.74567747116089 seconds for stock  111
Computing one stock took 68.05250597000122 seconds for stock  112
Computing one 

In [9]:
df_book_features.to_csv('book_features_tot.csv')

In [31]:
# Add encoded stock
encoder = np.eye(len(all_stocks_ids))
encoded = list()

for i in range(df_book_features.shape[0]):
    stock_id = int(df_book_features['row_id'][i].split('-')[0])
    encoded_stock = encoder[np.where(all_stocks_ids == int(stock_id))[0],:]
    encoded.append(encoded_stock)

encoded_pd = pd.DataFrame(np.array(encoded).reshape(df_book_features.shape[0],np.array(all_stocks_ids).shape[0]))
df_book_features_encoded = pd.concat([df_book_features, encoded_pd],axis=1)

In [6]:
book_all_features = pd.DataFrame()
encoder = np.eye(len(all_stocks_ids))

stocks_id_list, row_id_list = [], []
volatility_list, entropy2_list = [], []
linearFit_list, linearFit5_list, linearFit2_list = [], [], []
wap_std_list, wap_std5_list, wap_std2_list = [], [], []

for file in book_path_train:
    start = time.time()
    
    book_stock = pd.read_parquet(file)
    stock_id = file.split('=')[1]
    print('stock id computing = ' + str(stock_id))
    stock_time_ids = book_stock['time_id'].unique()
    for time_id in stock_time_ids:     
        
        # Access book data at this time + stock
        book_stock_time = book_stock[book_stock['time_id'] == time_id]

        # Create feature matrix
        stocks_id_list.append(stock_id)
        row_id_list.append(str(f'{stock_id}-{time_id}'))
        volatility_list.append(realized_volatility_from_book_pd(book_stock_time=book_stock_time))
        entropy2_list.append(entropy_from_book(book_stock_time=book_stock_time,last_min=2))
        linearFit_list.append(linearFit(book_stock_time=book_stock_time,last_min=10))
        linearFit5_list.append(linearFit(book_stock_time=book_stock_time,last_min=5))
        linearFit2_list.append(linearFit(book_stock_time=book_stock_time,last_min=2))
        wap_std_list.append(wapStat(book_stock_time=book_stock_time,last_min=10))
        wap_std5_list.append(wapStat(book_stock_time=book_stock_time,last_min=5))
        wap_std2_list.append(wapStat(book_stock_time=book_stock_time,last_min=2))
        
    print('Computing one stock entropy took', time.time() - start, 'seconds for stock ', stock_id)

# Merge targets
stocks_id_pd = pd.DataFrame(stocks_id_list,columns=['stock_id'])
row_id_pd = pd.DataFrame(row_id_list,columns=['row_id'])
volatility_pd = pd.DataFrame(volatility_list,columns=['volatility'])
entropy2_pd = pd.DataFrame(entropy2_list,columns=['entropy2'])
linearFit_pd = pd.DataFrame(linearFit_list,columns=['linearFit_coef'])
linearFit5_pd = pd.DataFrame(linearFit5_list,columns=['linearFit_coef5'])
linearFit2_pd = pd.DataFrame(linearFit2_list,columns=['linearFit_coef2'])
wap_std_pd = pd.DataFrame(wap_std_list,columns=['wap_std'])
wap_std5_pd = pd.DataFrame(wap_std5_list,columns=['wap_std5'])
wap_std2_pd = pd.DataFrame(wap_std2_list,columns=['wap_std2'])

book_all_features = pd.concat([stocks_id_pd,row_id_pd,volatility_pd,entropy2_pd,linearFit_pd,linearFit5_pd,linearFit2_pd,
                              wap_std_pd,wap_std5_pd,wap_std2_pd],axis=1)

book_all_features = train.merge(book_all_features, on = ['row_id'])

# Add encoded stock
encoded = list()

for i in range(book_all_features.shape[0]):
    stock_id = book_all_features['stock_id'][i]
    encoded_stock = encoder[np.where(all_stocks_ids == int(stock_id))[0],:]
    encoded.append(encoded_stock)

encoded_pd = pd.DataFrame(np.array(encoded).reshape(book_all_features.shape[0],np.array(all_stocks_ids).shape[0]))
book_all_features_encoded = pd.concat([book_all_features, encoded_pd],axis=1)

stock id computing = 0
Computing one stock entropy took 31.55561923980713 seconds for stock  0
stock id computing = 1


KeyboardInterrupt: 

In [62]:
book_all_features_encoded

Unnamed: 0,row_id,target,stock_id,volatility,entropy2,linearFit_coef,linearFit_coef5,linearFit_coef2,wap_std,wap_std5,...,102,103,104,105,106,107,108,109,110,111
0,0-5,0.004136,0,0.004499,0.357092,3.874554e-06,-7.296451e-07,-0.000002,0.000698,0.000498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0-11,0.001445,0,0.001204,0.173592,6.045011e-07,2.164258e-07,0.000001,0.000258,0.000186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0-16,0.002168,0,0.002369,0.066508,-3.463848e-06,-8.714889e-06,-0.000008,0.000924,0.000911,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0-31,0.002195,0,0.002574,0.076961,-4.829273e-06,-4.213925e-06,-0.000007,0.000791,0.000401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0-62,0.001747,0,0.001894,0.164630,-3.157112e-09,2.151035e-06,0.000006,0.000265,0.000184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126-32751,0.003461,126,0.003691,0.264883,-8.803732e-07,1.123249e-06,0.000005,0.000473,0.000302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428928,126-32753,0.003113,126,0.004104,0.218649,7.455736e-06,1.272477e-05,0.000024,0.001142,0.000777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428929,126-32758,0.004070,126,0.003118,0.244481,2.550217e-06,7.278551e-06,0.000011,0.000503,0.000443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428930,126-32763,0.003357,126,0.003661,0.423108,4.275249e-07,-5.802447e-06,-0.000010,0.000466,0.000477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [66]:
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


X = book_all_features_encoded.drop(['row_id','target','stock_id'],axis=1)
y = book_all_features_encoded['target']


print('Persistence model perf :', rmspe(y,book_all_features_encoded['volatility']))

xgboost_default = xgb.XGBRegressor(random_state=0)
xgboost_default.fit(X,y)

yhat_xgb = xgboost_default.predict(X)
print('New model xgb perf : ', rmspe(y, yhat_xgb))

lightgbm_default = LGBMRegressor()
lightgbm_default.fit(X,y)
yhat_light = lightgbm_default.predict(X)
print('New model lgbm perf : ', rmspe(y, yhat_light))

catboost_default = CatBoostRegressor(verbose=0)
catboost_default.fit(X,y)
yhat_cat = catboost_default.predict(X)
print('New model catboost perf : ', rmspe(y, yhat_cat))

print('New model mean gradient boosted trees : ', rmspe(y,(yhat_xgb + yhat_light + yhat_cat)/3))


Persistence model perf : 0.34135449018801606
New model xgb perf :  0.306084377117114
New model lgbm perf :  0.2927659975146637
New model catboost perf :  0.28643356812572324
New model mean gradient boosted trees :  0.2931403264914029


In [38]:
df_book_features_encoded

Unnamed: 0,row_id,rv,rv2,rv3,rv_5,rv2_5,rv3_5,rv_2,rv2_2,rv3_2,...,102,103,104,105,106,107,108,109,110,111
0,0-5,0.004499,0.006999,0.006119,0.002929,0.004861,0.004492,0.001460,0.003019,0.002928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0-11,0.001204,0.002476,0.002320,0.000980,0.001999,0.001600,0.000857,0.001777,0.001355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0-16,0.002330,0.004787,0.004684,0.001293,0.003195,0.002354,0.000668,0.002516,0.001668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0-31,0.002574,0.003637,0.002881,0.001776,0.002713,0.001814,0.000942,0.001351,0.000377,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0-62,0.001880,0.003254,0.003164,0.001518,0.002105,0.002344,0.001154,0.000797,0.001381,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126-32751,0.003691,0.005876,0.006107,0.002899,0.003776,0.004230,0.001557,0.001777,0.002148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428928,126-32753,0.004104,0.004991,0.004879,0.003422,0.003392,0.003256,0.002529,0.002465,0.002324,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428929,126-32758,0.003118,0.006019,0.005305,0.002787,0.005387,0.004479,0.001166,0.002926,0.003021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428930,126-32763,0.003661,0.005362,0.005186,0.002378,0.003156,0.003025,0.001035,0.001602,0.001675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
# df_book_features_encoded 

X = df_book_features_encoded.drop(['row_id'],axis=1)
y = train['target']


print('Persistence model perf :', rmspe(y,df_book_features_encoded['rv']))

xgboost_default = xgb.XGBRegressor(random_state=0)
xgboost_default.fit(X,y)

yhat_xgb = xgboost_default.predict(X)
print('New model xgb perf : ', rmspe(y, yhat_xgb))

lightgbm_default = LGBMRegressor()
lightgbm_default.fit(X,y)
yhat_light = lightgbm_default.predict(X)
print('New model lgbm perf : ', rmspe(y, yhat_light))

catboost_default = CatBoostRegressor(verbose=0)
catboost_default.fit(X,y)
yhat_cat = catboost_default.predict(X)
print('New model catboost perf : ', rmspe(y, yhat_cat))

print('New model mean gradient boosted trees : ', rmspe(y,(yhat_xgb + yhat_light + yhat_cat)/3))

Persistence model perf : 0.3396985576673107
New model xgb perf :  0.296904151446481
New model lgbm perf :  0.2851266471230485
New model catboost perf :  0.2775810141798436
New model mean gradient boosted trees :  0.2843762726709955


In [64]:
yhat_pd = pd.DataFrame(yhat_cat,columns=['target'])
submission_file = pd.concat([df_book_features['row_id'],yhat_pd],axis=1)
submission_file

Unnamed: 0,row_id,target
0,0-5,0.004207
1,0-11,0.001555
2,0-16,0.002867
3,0-31,0.002766
4,0-62,0.001988
...,...,...
428927,126-32751,0.003823
428928,126-32753,0.004055
428929,126-32758,0.003705
428930,126-32763,0.003727


In [54]:
import lightgbm as lgbm

lgbm_params = {
    'objective': 'rmse', 
    'metric': 'rmse', 
    'boosting': 'gbdt',
    'early_stopping_rounds': 30,
    'learning_rate': 0.01,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8}

lgbm_train = lgbm.Dataset(X,y)
model = lgbm.train(params=lgbm_params,train_set=lgbm_train, valid_sets=lgbm_train)
yhat_light_bis = model.predict(X)
print('New model lgbm perf : ', rmspe(y, yhat_light_bis))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10934
[LightGBM] [Info] Number of data points in the train set: 428932, number of used features: 154
[LightGBM] [Info] Start training from score 0.003880
[1]	training's rmse: 0.00291376
Training until validation scores don't improve for 30 rounds
[2]	training's rmse: 0.00289171
[3]	training's rmse: 0.00286982
[4]	training's rmse: 0.00284822
[5]	training's rmse: 0.00282697
[6]	training's rmse: 0.00280587
[7]	training's rmse: 0.00278512
[8]	training's rmse: 0.00276454
[9]	training's rmse: 0.00274428
[10]	training's rmse: 0.00272428
[11]	training's rmse: 0.00270451
[12]	training's rmse: 0.00268488
[13]	training's rmse: 0.00266561
[14]	training's rmse: 0.00264648
[15]	training's rmse: 0.00262755
[16]	training's rmse: 0.00260885
[17]	training's rmse: 0.00259039
[18]	training's rmse: 0.00257219
[19]	training's rmse: 0.00255419
[20]	training's rmse: 0.00253641
[21]	training's rmse: 0.00251885
[22]	training'

**Main evaluation code**

In [5]:
# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Compute predictions
predictions = prediction_function(pred='entropy',
                                 book_path_train=list_order_book_file_train,
                                 trade_path_train=list_trade_file_train,
                                 targets=train,
                                 book_path_test=list_order_book_file_test,
                                 trade_path_test=list_trade_file_test,
                                 all_stocks_ids=all_stocks_ids,
                                 test_file=test)

predictions.to_csv('submission.csv',index = False)

stock id computing = 0
Computing one stock entropy took 0.03986024856567383 seconds for stock  0
  row_id stock_id  volatility  entropy2  linearFit_coef  linearFit_coef5  \
0    0-4        0    0.000294         0        0.000059                0   

   linearFit_coef2   wap_std  wap_std5  wap_std2  ...  102  103  104  105  \
0                0  0.000118         0         0  ...  0.0  0.0  0.0  0.0   

   106  107  108  109  110  111  
0  0.0  0.0  0.0  0.0  0.0  0.0  

[1 rows x 122 columns]
stock id computing = 0


KeyboardInterrupt: 

**Notes**