## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


**Lib Import / Data loading**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import time

# ML
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Maths
import nolds
from scipy.interpolate import interp1d

# Paths tricks
import os
from pathlib import Path

# Support code
from support_file import *

# Folder holding the competition files, located under the user's home directory
datapath = str(Path.home().joinpath('ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction'))

# Training targets
train = pd.read_csv(os.path.join(datapath, 'train.csv'))

# Distinct stock and time identifiers found in train.csv
all_stocks_ids, all_time_ids = (train[column].unique() for column in ('stock_id', 'time_id'))

# Reduce the training frame to the '<stock_id>-<time_id>' key and its target
train = train.assign(
    row_id=lambda frame: frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
)[['row_id', 'target']]

# Test identifiers, without the stock_id / time_id columns
test = pd.read_csv(os.path.join(datapath, 'test.csv')).drop(columns=['stock_id', 'time_id'])

**Functions**

In [3]:
# Competition metric
def rmspe(y_true, y_pred):
    """Root mean squared percentage error.

    Computes sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Parameters
    ----------
    y_true : array-like supporting elementwise arithmetic
        Reference values; they are used as the divisor, so zeros in
        ``y_true`` lead to a division by zero.
    y_pred : array-like supporting elementwise arithmetic
        Predicted values.

    Returns
    -------
    The scalar error.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)

# Prediction function (choose here which prediction strategy to use)
def prediction_function(pred,book_path_train,trade_path_train,targets,book_path_test,trade_path_test,all_stocks_ids,test_file):
    """Run the prediction strategy named by ``pred`` and return its result.

    Parameters
    ----------
    pred : str
        Strategy name: 'naive', 'stupid_RF' or 'entropy'.
    book_path_train
        Training order-book file list, forwarded to the selected strategy.
    trade_path_train
        Accepted for interface symmetry; not used by any current strategy.
    targets
        Training targets. The 'naive' strategy merges its predictions onto
        this frame via 'row_id' and scores them against its 'target' column;
        the other strategies receive it as ``train_targets_pd``.
    book_path_test
        Test order-book file list, forwarded to 'stupid_RF' and 'entropy'.
    trade_path_test
        Accepted for interface symmetry; not used by any current strategy.
    all_stocks_ids
        Forwarded to the 'entropy' strategy.
    test_file
        Forwarded to the 'entropy' strategy.

    Returns
    -------
    The value produced by the selected strategy. For 'naive' this is
    ``targets`` with the original 'target' column dropped and the 'pred'
    column renamed to 'target'.

    Raises
    ------
    ValueError
        If ``pred`` is not one of the known strategy names.
    """
    if pred == 'naive':
        # Naive prediction (persistence model)
        prediction = past_realized_volatility_per_stock(list_file=book_path_train,prediction_column_name='pred')

        # Merge onto the targets that were passed in (not a notebook global)
        prediction = targets.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
        print(prediction.head(5))

        # Estimate performances
        R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
        RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

        print('--')
        print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')

        # Return the predictions under the 'target' column name
        prediction = prediction.drop(columns=['target'])
        prediction = prediction.rename(columns={'pred': 'target'})

    elif pred == 'stupid_RF':
        # Stupid nonlinear regression between persistence and next volatility (random forest)
        prediction = stupidForestPrediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)

    elif pred == 'entropy':
        prediction = entropy_Prediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test,
                                            all_stocks_ids=all_stocks_ids,
                                            test_file=test_file)

    else:
        # Without this branch an unknown name reached `return prediction`
        # with the variable unbound and failed with an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown prediction strategy {pred!r}; expected 'naive', 'stupid_RF' or 'entropy'"
        )

    return prediction

**Test code**

In [4]:
# Set up notebook-level inputs for trying out a prediction strategy

# Glob book/trade folders (each list contains the paths of every entry in that folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Given variables (named like the parameters of prediction_function)
# `pred` must be one of the names prediction_function tests for
# ('naive', 'stupid_RF', 'entropy'); 'entropy_based' matched none of them.
pred = 'entropy'
book_path_train = list_order_book_file_train
trade_path_train = list_trade_file_train
targets = train
book_path_test = list_order_book_file_test
trade_path_test = list_trade_file_test

In [59]:
# Build the training feature table: one row per (stock, time_id) bucket of the
# order books, merged with the targets and extended with one-hot stock columns.

# Row k of the identity matrix is the one-hot vector of all_stocks_ids[k]
encoder = np.eye(len(all_stocks_ids))

# One list per output column; the dict order fixes the column order of the table
feature_lists = {
    'stock_id': [],
    'row_id': [],
    'volatility': [],
    'entropy2': [],
    'linearFit_coef': [],
    'linearFit_coef5': [],
    'linearFit_coef2': [],
    'wap_std': [],
    'wap_std5': [],
    'wap_std2': [],
}

for file in book_path_train:
    start = time.time()

    book_stock = pd.read_parquet(file)
    stock_id = file.split('=')[1]
    print('stock id computing = ' + str(stock_id))

    # A single groupby pass replaces re-filtering the whole book with a boolean
    # mask for every time_id; sort=False keeps the first-appearance order that
    # iterating over .unique() gave.
    for time_id, book_stock_time in book_stock.groupby('time_id', sort=False):

        # Identify the row
        feature_lists['stock_id'].append(stock_id)
        feature_lists['row_id'].append(f'{stock_id}-{time_id}')

        # Features computed by the support-file helpers on this bucket
        feature_lists['volatility'].append(realized_volatility_from_book_pd(book_stock_time=book_stock_time))
        feature_lists['entropy2'].append(entropy_from_book(book_stock_time=book_stock_time,last_min=2))
        feature_lists['linearFit_coef'].append(linearFit(book_stock_time=book_stock_time,last_min=10))
        feature_lists['linearFit_coef5'].append(linearFit(book_stock_time=book_stock_time,last_min=5))
        feature_lists['linearFit_coef2'].append(linearFit(book_stock_time=book_stock_time,last_min=2))
        feature_lists['wap_std'].append(wapStat(book_stock_time=book_stock_time,last_min=10))
        feature_lists['wap_std5'].append(wapStat(book_stock_time=book_stock_time,last_min=5))
        feature_lists['wap_std2'].append(wapStat(book_stock_time=book_stock_time,last_min=2))

    print('Computing one stock entropy took', time.time() - start, 'seconds for stock ', stock_id)

# Assemble the columns side by side. Each column is still built with
# pd.DataFrame(values, columns=[name]) so the result does not depend on what
# exactly the helpers return.
book_all_features = pd.concat(
    [pd.DataFrame(values, columns=[name]) for name, values in feature_lists.items()],
    axis=1)

# Merge targets
book_all_features = train.merge(book_all_features, on = ['row_id'])

# Add encoded stock: look up each row's position in all_stocks_ids once,
# then pick all one-hot rows from the identity matrix in a single indexing step.
stock_position = {int(stock): position for position, stock in enumerate(all_stocks_ids)}
row_positions = book_all_features['stock_id'].astype(int).map(stock_position)
if row_positions.isna().any():
    # Fail loudly rather than emit a wrong encoding for an unknown stock
    raise ValueError('book_all_features contains a stock_id that is not in all_stocks_ids')

encoded_pd = pd.DataFrame(encoder[row_positions.astype(int).to_numpy()])
book_all_features_encoded = pd.concat([book_all_features, encoded_pd],axis=1)

stock id computing = 0
Computing one stock entropy took 49.275190114974976 seconds for stock  0
stock id computing = 1
Computing one stock entropy took 54.06134223937988 seconds for stock  1
stock id computing = 10
Computing one stock entropy took 55.17041039466858 seconds for stock  10
stock id computing = 100
Computing one stock entropy took 52.73424696922302 seconds for stock  100
stock id computing = 101
Computing one stock entropy took 55.06243658065796 seconds for stock  101
stock id computing = 102
Computing one stock entropy took 52.77800631523132 seconds for stock  102
stock id computing = 103
Computing one stock entropy took 48.65172719955444 seconds for stock  103
stock id computing = 104
Computing one stock entropy took 53.03036284446716 seconds for stock  104
stock id computing = 105
Computing one stock entropy took 55.25834321975708 seconds for stock  105
stock id computing = 107
Computing one stock entropy took 54.821441650390625 seconds for stock  107
stock id computing

In [62]:
# Display the assembled training table: the merged targets and book features,
# followed by the one-hot stock columns concatenated onto them.
book_all_features_encoded

Unnamed: 0,row_id,target,stock_id,volatility,entropy2,linearFit_coef,linearFit_coef5,linearFit_coef2,wap_std,wap_std5,...,102,103,104,105,106,107,108,109,110,111
0,0-5,0.004136,0,0.004499,0.357092,3.874554e-06,-7.296451e-07,-0.000002,0.000698,0.000498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0-11,0.001445,0,0.001204,0.173592,6.045011e-07,2.164258e-07,0.000001,0.000258,0.000186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0-16,0.002168,0,0.002369,0.066508,-3.463848e-06,-8.714889e-06,-0.000008,0.000924,0.000911,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0-31,0.002195,0,0.002574,0.076961,-4.829273e-06,-4.213925e-06,-0.000007,0.000791,0.000401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0-62,0.001747,0,0.001894,0.164630,-3.157112e-09,2.151035e-06,0.000006,0.000265,0.000184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126-32751,0.003461,126,0.003691,0.264883,-8.803732e-07,1.123249e-06,0.000005,0.000473,0.000302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428928,126-32753,0.003113,126,0.004104,0.218649,7.455736e-06,1.272477e-05,0.000024,0.001142,0.000777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428929,126-32758,0.004070,126,0.003118,0.244481,2.550217e-06,7.278551e-06,0.000011,0.000503,0.000443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
428930,126-32763,0.003357,126,0.003661,0.423108,4.275249e-07,-5.802447e-06,-0.000010,0.000466,0.000477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [66]:
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# Feature matrix (everything except identifiers and the target) and target vector
X = book_all_features_encoded.drop(columns=['row_id','target','stock_id'])
y = book_all_features_encoded['target']


def _fit_and_report(model, message):
    """Fit ``model`` on (X, y), print its RMSPE on those same rows, return the predictions.

    The score is in-sample: the model predicts the rows it was fitted on.
    """
    model.fit(X,y)
    fitted_values = model.predict(X)
    print(message, rmspe(y, fitted_values))
    return fitted_values


# Baseline: the past realized volatility used directly as the prediction
print('Persistence model perf :', rmspe(y,book_all_features_encoded['volatility']))

# Gradient boosted trees with default settings
xgboost_default = xgb.XGBRegressor(random_state=0)
yhat_xgb = _fit_and_report(xgboost_default, 'New model xgb perf : ')

lightgbm_default = LGBMRegressor()
yhat_light = _fit_and_report(lightgbm_default, 'New model lgbm perf : ')

catboost_default = CatBoostRegressor(verbose=0)
yhat_cat = _fit_and_report(catboost_default, 'New model catboost perf : ')

# Unweighted average of the three models' predictions
yhat_blend = (yhat_xgb + yhat_light + yhat_cat)/3
print('New model mean gradient boosted trees : ', rmspe(y, yhat_blend))


Persistence model perf : 0.34135449018801606
New model xgb perf :  0.306084377117114
New model lgbm perf :  0.2927659975146637
New model catboost perf :  0.28643356812572324
New model mean gradient boosted trees :  0.2931403264914029


**Main evaluation code**

In [None]:
def _folder_entries(dataset_folder):
    """Return every path matched by '<datapath>/<dataset_folder>/*'."""
    return glob.glob(os.path.join(datapath, dataset_folder, '*'))


# Paths of all entries in each book/trade folder
list_order_book_file_train = _folder_entries('book_train.parquet')
list_order_book_file_test = _folder_entries('book_test.parquet')
list_trade_file_train = _folder_entries('trade_train.parquet')
list_trade_file_test = _folder_entries('trade_test.parquet')

# Compute predictions with the 'entropy' strategy
strategy_inputs = dict(
    pred='entropy',
    book_path_train=list_order_book_file_train,
    trade_path_train=list_trade_file_train,
    targets=train,
    book_path_test=list_order_book_file_test,
    trade_path_test=list_trade_file_test,
    all_stocks_ids=all_stocks_ids,
    test_file=test,
)
predictions = prediction_function(**strategy_inputs)

# Write the submission file
predictions.to_csv('submission.csv', index=False)

stock id computing = 0
Computing one stock entropy took 0.013932466506958008 seconds for stock  0
stock id computing = 0
Computing one stock entropy took 46.5729238986969 seconds for stock  0
stock id computing = 1
Computing one stock entropy took 50.43742537498474 seconds for stock  1
stock id computing = 10
Computing one stock entropy took 53.414494037628174 seconds for stock  10
stock id computing = 100
Computing one stock entropy took 49.98269701004028 seconds for stock  100
stock id computing = 101
Computing one stock entropy took 53.656184911727905 seconds for stock  101
stock id computing = 102
Computing one stock entropy took 50.270257234573364 seconds for stock  102
stock id computing = 103


**Notes**