## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


**Lib Import / Data loading**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import time

# ML
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Maths
from scipy.interpolate import interp1d

# Paths tricks
import os
from pathlib import Path

# Support code
from support_file import *
from information_measures import *

datapath = os.path.join(str(Path.home()), 'ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction')

# Load dataset
train = pd.read_csv(os.path.join(datapath,'train.csv')) 
all_stocks_ids = train['stock_id'].unique()
all_time_ids = train['time_id'].unique()

train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]

# Load test ids
test = pd.read_csv(os.path.join(datapath,'test.csv'))
test = test.drop(['stock_id','time_id'],axis=1)

**Functions**

In [3]:
# Competition metric
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

# Prediction function (chose here which prediction strategy to use)
def prediction_function(pred,book_path_train,trade_path_train,targets,book_path_test,trade_path_test,all_stocks_ids,test_file):
    
    if pred == 'naive':
        # Naive prediction (persistence model)
        prediction = past_realized_volatility_per_stock(list_file=book_path_train,prediction_column_name='pred')
        
        # Merge and evaluate results
        prediction = train.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
        print(prediction.head(5))

        # Estimate performances
        R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
        RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

        print('--')
        print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')
        
        prediction = prediction.drop(columns=['target'])
        prediction = prediction.rename(columns={'pred': 'target'})

    if pred == 'stupid_RF':
        # Stupid nonlinear regression between persistence and next volatility (random forest)
        prediction = stupidForestPrediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)
        
    if pred == 'entropy':
        prediction = entropy_Prediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test,
                                            all_stocks_ids=all_stocks_ids,
                                            test_file=test_file)
        
    return prediction

**Test code**

In [4]:
# Try a prediction code

# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Given variables
pred = 'entropy_based'
book_path_train = list_order_book_file_train
trade_path_train = list_trade_file_train
targets = train
book_path_test = list_order_book_file_test
trade_path_test = list_trade_file_test

In [None]:
# Computational time optimized with groupby numba

def calc_wap(df):
    return (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])

def calc_wap2(df):
    return (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])

def calc_wap3(df):
    return (df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']) / (df['bid_size2'] + df['ask_size2'])

def calc_rv_from_wap_numba(values, index):
    log_return = np.diff(np.log(values))
    realized_vol = np.sqrt(np.sum(np.square(log_return[1:])))
    return realized_vol

def load_book_data_by_id(stock_id,datapath,train_test):
    file_to_read = os.path.join(datapath,'book_' + train_test + str('.parquet'),'stock_id=' + str(stock_id))
    df = pd.read_parquet(file_to_read)
    return df

def entropy_from_df(df):
    
    if df.shape[0] < 3:
        return 0
        
    t_init = df['seconds_in_bucket']
    t_new = np.arange(np.min(t_init),np.max(t_init)) 
    
    # Closest neighbour interpolation (no changes in wap between lines)
    nearest = interp1d(t_init, df['wap'], kind='nearest')
    resampled_wap = nearest(t_new)
    
    # Compute sample entropy
    # sampleEntropy = nolds.sampen(resampled_wap)
    sampleEntropy = sampen(resampled_wap)
    
    return sampleEntropy

def entropy_from_df2(df):
    
    if df.shape[0] < 3:
        return 0
        
    t_init = df['seconds_in_bucket']
    t_new = np.arange(np.min(t_init),np.max(t_init)) 
    
    # Closest neighbour interpolation (no changes in wap between lines)
    nearest = interp1d(t_init, df['wap2'], kind='nearest')
    resampled_wap = nearest(t_new)
    
    # Compute sample entropy
    # sampleEntropy = nolds.sampen(resampled_wap)
    sampleEntropy = sampen(resampled_wap)
    
    return sampleEntropy

def entropy_from_df3(df):
    
    if df.shape[0] < 3:
        return 0
        
    t_init = df['seconds_in_bucket']
    t_new = np.arange(np.min(t_init),np.max(t_init)) 
    
    # Closest neighbour interpolation (no changes in wap between lines)
    nearest = interp1d(t_init, df['wap3'], kind='nearest')
    resampled_wap = nearest(t_new)
    
    # Compute sample entropy
    sampleEntropy = sampen(resampled_wap)
    
    return sampleEntropy

def financial_metrics(df):
    
    wap_imbalance = np.mean(df['wap'] - df['wap2'])
    price_spread = np.mean((df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1'])/2))
    bid_spread = np.mean(df['bid_price1'] - df['bid_price2'])
    ask_spread = np.mean(df['ask_price1'] - df['ask_price2'])
    total_volume = np.mean((df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2']))
    volume_imbalance = np.mean(abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2'])))
    
    return [wap_imbalance,price_spread,bid_spread,ask_spread,total_volume,volume_imbalance]

def other_metrics(df):
    
    linearFit = (df['wap'].iloc[-1] - df['wap'].iloc[0]) / ((np.max(df['seconds_in_bucket']) - np.min(df['seconds_in_bucket']))) 
    linearFit2 = (df['wap2'].iloc[-1] - df['wap2'].iloc[0]) / ((np.max(df['seconds_in_bucket']) - np.min(df['seconds_in_bucket']))) 
    linearFit3 = (df['wap3'].iloc[-1] - df['wap3'].iloc[0]) / ((np.max(df['seconds_in_bucket']) - np.min(df['seconds_in_bucket']))) 
    
    # Resampling
    t_init = df['seconds_in_bucket']
    t_new = np.arange(np.min(t_init),np.max(t_init)) 
    
    # Closest neighbour interpolation (no changes in wap between lines)
    nearest = interp1d(t_init, df['wap'], kind='nearest')
    nearest2 = interp1d(t_init, df['wap2'], kind='nearest')
    nearest3 = interp1d(t_init, df['wap3'], kind='nearest')
    
    std_1 = np.std(nearest(t_new))
    std_2 = np.std(nearest2(t_new))
    std_3 = np.std(nearest3(t_new))
    
    return [linearFit, linearFit2, linearFit3, std_1, std_2, std_3]


book_all_features = pd.DataFrame()
encoder = np.eye(len(all_stocks_ids))

list_rv, list_rv2, list_rv3 = [], [], []
list_ent, list_fin, list_fin2 = [], [], []
list_others, list_others2, list_others3 = [], [], []

for stock_id in range(127):
    
    start = time.time()
    
    try:
        book_stock = load_book_data_by_id(stock_id,datapath,'train')
    except:
        continue
    
    # Calculate wap for the book
    book_stock['wap'] = calc_wap(book_stock)
    book_stock['wap2'] = calc_wap2(book_stock)
    book_stock['wap3'] = calc_wap3(book_stock)
    
    # Calculate realized volatility
    df_sub = book_stock.groupby('time_id')['wap'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub2 = book_stock.groupby('time_id')['wap2'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index().drop(['time_id'],axis=1)
    df_sub3 = book_stock.groupby('time_id')['wap3'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub['time_id']]
    df_sub = pd.concat([df_sub,df_sub2['wap2'],df_sub3['wap3']],axis=1)
    df_sub = df_sub.rename(columns={'time_id':'row_id','wap': 'rv', 'wap2': 'rv2', 'wap3': 'rv3'})
    
    # Calculate realized volatility last 5 min
    df_sub_5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id'])['wap'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub2_5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id'])['wap2'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub3_5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id'])['wap3'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub_5['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_5['time_id']]
    df_sub_5 = pd.concat([df_sub_5,df_sub2_5['wap2'],df_sub3_5['wap3']],axis=1)
    df_sub_5 = df_sub_5.rename(columns={'time_id':'row_id','wap': 'rv_5', 'wap2': 'rv2_5', 'wap3': 'rv3_5'})
    
    # Calculate realized volatility last 2 min
    df_sub_2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id'])['wap'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub2_2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id'])['wap2'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()
    df_sub3_2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id'])['wap3'].agg(calc_rv_from_wap_numba, engine='numba').to_frame().reset_index()    
    df_sub_2['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_2['time_id']] 
    df_sub_2 = pd.concat([df_sub_2,df_sub2_2['wap2'],df_sub3_2['wap3']],axis=1)
    df_sub_2 = df_sub_2.rename(columns={'time_id':'row_id','wap': 'rv_2', 'wap2': 'rv2_2', 'wap3': 'rv3_2'})
    
    list_rv.append(df_sub)
    list_rv2.append(df_sub_5)
    list_rv3.append(df_sub_2)
    
    # Calculate other financial metrics from book 
    df_sub_book_feats = book_stock.groupby(['time_id']).apply(financial_metrics).to_frame().reset_index()
    df_sub_book_feats = df_sub_book_feats.rename(columns={0:'embedding'})
    df_sub_book_feats[['wap_imbalance','price_spread','bid_spread','ask_spread','total_vol','vol_imbalance']] = pd.DataFrame(df_sub_book_feats.embedding.tolist(), index=df_try.index)
    df_sub_book_feats['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_book_feats['time_id']] 
    df_sub_book_feats = df_sub_book_feats.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    
    df_sub_book_feats5 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id']).apply(financial_metrics).to_frame().reset_index()
    df_sub_book_feats5 = df_sub_book_feats5.rename(columns={0:'embedding'})
    df_sub_book_feats5[['wap_imbalance5','price_spread5','bid_spread5','ask_spread5','total_vol5','vol_imbalance5']] = pd.DataFrame(df_sub_book_feats5.embedding.tolist(), index=df_try.index)
    df_sub_book_feats5['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_sub_book_feats5['time_id']] 
    df_sub_book_feats5 = df_sub_book_feats5.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    
    list_fin.append(df_sub_book_feats)
    list_fin2.append(df_sub_book_feats5)
    
    # Compute entropy 
    df_ent = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(entropy_from_df).to_frame().reset_index().fillna(0)
    df_ent2 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(entropy_from_df2).to_frame().reset_index().fillna(0)
    df_ent3 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(entropy_from_df3).to_frame().reset_index().fillna(0)
    df_ent['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_ent['time_id']]
    df_ent = df_ent.rename(columns={'time_id':'row_id',0:'entropy'})
    df_ent2 = df_ent2.rename(columns={0:'entropy2'}).drop(['time_id'],axis=1)
    df_ent3 = df_ent3.rename(columns={0:'entropy3'}).drop(['time_id'],axis=1)
    df_ent = pd.concat([df_ent,df_ent2,df_ent3],axis=1)
    list_ent.append(df_ent)
    
    # Compute other metrics
    df_others = book_stock.groupby(['time_id']).apply(other_metrics).to_frame().reset_index().fillna(0)
    df_others = df_others.rename(columns={0:'embedding'})
    df_others[['linearFit1_1','linearFit1_2','linearFit1_3','wap_std1_1','wap_std1_2','wap_std1_3']] = pd.DataFrame(df_others.embedding.tolist(), index=df_try.index)
    df_others['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_others['time_id']] 
    df_others = df_others.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    list_others.append(df_others)
    
    df_others2 = book_stock.query(f'seconds_in_bucket >= 300').groupby(['time_id']).apply(other_metrics).to_frame().reset_index().fillna(0)
    df_others2 = df_others2.rename(columns={0:'embedding'})
    df_others2[['linearFit2_1','linearFit2_2','linearFit2_3','wap_std2_1','wap_std2_2','wap_std2_3']] = pd.DataFrame(df_others2.embedding.tolist(), index=df_try.index)
    df_others2['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_others2['time_id']] 
    df_others2 = df_others2.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    list_others2.append(df_others2)
    
    df_others3 = book_stock.query(f'seconds_in_bucket >= 480').groupby(['time_id']).apply(other_metrics).to_frame().reset_index().fillna(0)
    df_others3 = df_others3.rename(columns={0:'embedding'})
    df_others3[['linearFit3_1','linearFit3_2','linearFit3_3','wap_std3_1','wap_std3_2','wap_std3_3']] = pd.DataFrame(df_others3.embedding.tolist(), index=df_try.index)
    df_others3['time_id'] = [f'{stock_id}-{time_id}' for time_id in df_others3['time_id']] 
    df_others3 = df_others3.rename(columns={'time_id':'row_id'}).drop(['embedding'],axis=1)
    list_others3.append(df_others3)
    
    print('Computing one stock took', time.time() - start, 'seconds for stock ', stock_id)
    
# Create features dataframe
df_submission = pd.concat(list_rv)
df_submission2 = pd.concat(list_rv2)
df_submission3 = pd.concat(list_rv3)
df_ent_concat = pd.concat(list_ent)
df_fin_concat = pd.concat(list_fin)
df_fin2_concat = pd.concat(list_fin2)
df_others = pd.concat(list_others)
df_others2 = pd.concat(list_others2)
df_others3 = pd.concat(list_others3)

df_book_features = df_submission.merge(df_submission2, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_submission3, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_ent_concat, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_fin_concat, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_fin2_concat, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_others, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_others2, on = ['row_id'], how='left').fillna(0)
df_book_features = df_book_features.merge(df_others3, on = ['row_id'], how='left').fillna(0)



In [None]:
df_book_features.to_csv('book_features_tot.csv')

In [116]:
### print(book_stock['time_id'].nunique())
start = time.time()
grouped_wap = list(book_stock.groupby('time_id')['wap'])
grouped_seconds = list(book_stock.groupby('time_id')['seconds_in_bucket'])
entropies = list()
for time_id in range(book_stock['time_id'].nunique()):
    wap = np.array(grouped_wap[time_id][1])
    seconds = np.array(grouped_seconds[time_id][1])
    #entropies.append(entropy_from_wap(wap,seconds,120))
    entropies.append(entropy(pd.Series(wap).value_counts()))
    
print('Computing one stock took', time.time() - start, 'seconds for stock ', stock_id)
#np.array(list(book_stock.groupby('time_id')['seconds_in_bucket'])[time_id_list][1])

print(seconds)
idx = np.where(seconds > 590)[0]
print(len(idx))
#df_entropies = pd.concat(entropies)
#df_entropies

NameError: name 'entropy' is not defined

In [30]:
from scipy.stats import entropy
entropy(pd.Series(wap).value_counts())
#pd.Series(wap).value_counts().to_frame().to_csv('test_shannon.csv')

def ApEn_new(U, m, r):
    U = np.array(U)
    N = U.shape[0]
            
    def _phi(m):
        z = N - m + 1.0
        x = np.array([U[i:i+m] for i in range(int(z))])
        X = np.repeat(x[:, np.newaxis], 1, axis=2)
        C = np.sum(np.absolute(x - X).max(axis=2) <= r, axis=0) / z
        return np.log(C).sum() / z
    
    return abs(_phi(m + 1) - _phi(m))

wap
ApEn_new(wap,3,0.001)

0.027788768059913704

In [49]:
df_submission

Unnamed: 0,row_id,target
0,0-5,0.004499
1,0-11,0.001204
2,0-16,0.002330
3,0-31,0.002574
4,0-62,0.001880
...,...,...
3825,0-32751,0.002539
3826,0-32753,0.002173
3827,0-32758,0.002913
3828,0-32763,0.003046


In [6]:
book_all_features = pd.DataFrame()
encoder = np.eye(len(all_stocks_ids))

stocks_id_list, row_id_list = [], []
volatility_list, entropy2_list = [], []
linearFit_list, linearFit5_list, linearFit2_list = [], [], []
wap_std_list, wap_std5_list, wap_std2_list = [], [], []

for file in book_path_train:
    start = time.time()
    
    book_stock = pd.read_parquet(file)
    stock_id = file.split('=')[1]
    print('stock id computing = ' + str(stock_id))
    stock_time_ids = book_stock['time_id'].unique()
    for time_id in stock_time_ids:     
        
        # Access book data at this time + stock
        book_stock_time = book_stock[book_stock['time_id'] == time_id]

        # Create feature matrix
        stocks_id_list.append(stock_id)
        row_id_list.append(str(f'{stock_id}-{time_id}'))
        volatility_list.append(realized_volatility_from_book_pd(book_stock_time=book_stock_time))
        entropy2_list.append(entropy_from_book(book_stock_time=book_stock_time,last_min=2))
        linearFit_list.append(linearFit(book_stock_time=book_stock_time,last_min=10))
        linearFit5_list.append(linearFit(book_stock_time=book_stock_time,last_min=5))
        linearFit2_list.append(linearFit(book_stock_time=book_stock_time,last_min=2))
        wap_std_list.append(wapStat(book_stock_time=book_stock_time,last_min=10))
        wap_std5_list.append(wapStat(book_stock_time=book_stock_time,last_min=5))
        wap_std2_list.append(wapStat(book_stock_time=book_stock_time,last_min=2))
        
    print('Computing one stock entropy took', time.time() - start, 'seconds for stock ', stock_id)

# Merge targets
stocks_id_pd = pd.DataFrame(stocks_id_list,columns=['stock_id'])
row_id_pd = pd.DataFrame(row_id_list,columns=['row_id'])
volatility_pd = pd.DataFrame(volatility_list,columns=['volatility'])
entropy2_pd = pd.DataFrame(entropy2_list,columns=['entropy2'])
linearFit_pd = pd.DataFrame(linearFit_list,columns=['linearFit_coef'])
linearFit5_pd = pd.DataFrame(linearFit5_list,columns=['linearFit_coef5'])
linearFit2_pd = pd.DataFrame(linearFit2_list,columns=['linearFit_coef2'])
wap_std_pd = pd.DataFrame(wap_std_list,columns=['wap_std'])
wap_std5_pd = pd.DataFrame(wap_std5_list,columns=['wap_std5'])
wap_std2_pd = pd.DataFrame(wap_std2_list,columns=['wap_std2'])

book_all_features = pd.concat([stocks_id_pd,row_id_pd,volatility_pd,entropy2_pd,linearFit_pd,linearFit5_pd,linearFit2_pd,
                              wap_std_pd,wap_std5_pd,wap_std2_pd],axis=1)

book_all_features = train.merge(book_all_features, on = ['row_id'])

# Add encoded stock
encoded = list()

for i in range(book_all_features.shape[0]):
    stock_id = book_all_features['stock_id'][i]
    encoded_stock = encoder[np.where(all_stocks_ids == int(stock_id))[0],:]
    encoded.append(encoded_stock)

encoded_pd = pd.DataFrame(np.array(encoded).reshape(book_all_features.shape[0],np.array(all_stocks_ids).shape[0]))
book_all_features_encoded = pd.concat([book_all_features, encoded_pd],axis=1)

stock id computing = 0
Computing one stock entropy took 31.55561923980713 seconds for stock  0
stock id computing = 1


KeyboardInterrupt: 

In [62]:
book_all_features_encoded

Unnamed: 0,row_id,target,stock_id,volatility,entropy2,linearFit_coef,linearFit_coef5,linearFit_coef2,wap_std,wap_std5,...,102,103,104,105,106,107,108,109,110,111
0,0-5,0.004136,0,0.004499,0.357092,3.874554e-06,-7.296451e-07,-0.000002,0.000698,0.000498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0-11,0.001445,0,0.001204,0.173592,6.045011e-07,2.164258e-07,0.000001,0.000258,0.000186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0-16,0.002168,0,0.002369,0.066508,-3.463848e-06,-8.714889e-06,-0.000008,0.000924,0.000911,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0-31,0.002195,0,0.002574,0.076961,-4.829273e-06,-4.213925e-06,-0.000007,0.000791,0.000401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0-62,0.001747,0,0.001894,0.164630,-3.157112e-09,2.151035e-06,0.000006,0.000265,0.000184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126-32751,0.003461,126,0.003691,0.264883,-8.803732e-07,1.123249e-06,0.000005,0.000473,0.000302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428928,126-32753,0.003113,126,0.004104,0.218649,7.455736e-06,1.272477e-05,0.000024,0.001142,0.000777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428929,126-32758,0.004070,126,0.003118,0.244481,2.550217e-06,7.278551e-06,0.000011,0.000503,0.000443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428930,126-32763,0.003357,126,0.003661,0.423108,4.275249e-07,-5.802447e-06,-0.000010,0.000466,0.000477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [66]:
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


X = book_all_features_encoded.drop(['row_id','target','stock_id'],axis=1)
y = book_all_features_encoded['target']


print('Persistence model perf :', rmspe(y,book_all_features_encoded['volatility']))

xgboost_default = xgb.XGBRegressor(random_state=0)
xgboost_default.fit(X,y)

yhat_xgb = xgboost_default.predict(X)
print('New model xgb perf : ', rmspe(y, yhat_xgb))

lightgbm_default = LGBMRegressor()
lightgbm_default.fit(X,y)
yhat_light = lightgbm_default.predict(X)
print('New model lgbm perf : ', rmspe(y, yhat_light))

catboost_default = CatBoostRegressor(verbose=0)
catboost_default.fit(X,y)
yhat_cat = catboost_default.predict(X)
print('New model catboost perf : ', rmspe(y, yhat_cat))

print('New model mean gradient boosted trees : ', rmspe(y,(yhat_xgb + yhat_light + yhat_cat)/3))


Persistence model perf : 0.34135449018801606
New model xgb perf :  0.306084377117114
New model lgbm perf :  0.2927659975146637
New model catboost perf :  0.28643356812572324
New model mean gradient boosted trees :  0.2931403264914029


**Main evaluation code**

In [5]:
# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Compute predictions
predictions = prediction_function(pred='entropy',
                                 book_path_train=list_order_book_file_train,
                                 trade_path_train=list_trade_file_train,
                                 targets=train,
                                 book_path_test=list_order_book_file_test,
                                 trade_path_test=list_trade_file_test,
                                 all_stocks_ids=all_stocks_ids,
                                 test_file=test)

predictions.to_csv('submission.csv',index = False)

stock id computing = 0
Computing one stock entropy took 0.03986024856567383 seconds for stock  0
  row_id stock_id  volatility  entropy2  linearFit_coef  linearFit_coef5  \
0    0-4        0    0.000294         0        0.000059                0   

   linearFit_coef2   wap_std  wap_std5  wap_std2  ...  102  103  104  105  \
0                0  0.000118         0         0  ...  0.0  0.0  0.0  0.0   

   106  107  108  109  110  111  
0  0.0  0.0  0.0  0.0  0.0  0.0  

[1 rows x 122 columns]
stock id computing = 0


KeyboardInterrupt: 

**Notes**