## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


**Lib Import / Data loading**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import time

# ML
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Maths
import nolds
from scipy.interpolate import interp1d

# Paths tricks
import os
from pathlib import Path

# Support code
from support_file import *

datapath = os.path.join(str(Path.home()), 'ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction')

# Load dataset
train = pd.read_csv(os.path.join(datapath,'train.csv')) 
all_stocks_ids = train['stock_id'].unique()
all_time_ids = train['time_id'].unique()

train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]

**Functions**

In [3]:
# Competition metric
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

# Prediction function (chose here which prediction strategy to use)
def prediction_function(pred,book_path_train,trade_path_train,targets,book_path_test,trade_path_test):
    
    if pred == 'naive':
        # Naive prediction (persistence model)
        prediction = past_realized_volatility_per_stock(list_file=book_path_train,prediction_column_name='pred')
        
        # Merge and evaluate results
        prediction = train.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
        print(prediction.head(5))

        # Estimate performances
        R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
        RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

        print('--')
        print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')
        
        prediction = prediction.drop(columns=['target'])
        prediction = prediction.rename(columns={'pred': 'target'})

    if pred == 'stupid_RF':
        # Stupid nonlinear regression between persistence and next volatility (random forest)
        prediction = stupidForestPrediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)
        
    if pred == 'entropy_based':
        prediction = entropy_Prediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)
        
    return prediction

**Test code**

In [4]:
# Try a prediction code

# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Given variables
pred = 'entropy_based'
book_path_train = list_order_book_file_train
trade_path_train = list_trade_file_train
targets = train
book_path_test = list_order_book_file_test
trade_path_test = list_trade_file_test

In [5]:
# Memory efficient version
#book_all_features = pd.DataFrame()
#encoder = np.eye(len(all_stocks_ids))

#for file in book_path_train:

#file = book_path_train[0]
#start = time.time()

#book_stock = pd.read_parquet(file)
#stock_id = file.split('=')[1]
#print('stock id computing = ' + str(stock_id) + '...')

# Compute outside of loops
#book_stock['wap'] = compute_wap(book_stock)
#book_stock['log_return'] = book_stock.groupby(['time_id'])['wap'].apply(log_return)
#book_stock = book_stock[~book_stock['log_return'].isnull()]

#print(book_stock.head(5))

# Compute the square root of the sum of log return squared to get realized volatility
#realized_vol = book_stock.groupby(['time_id'])['log_return'].agg(realized_volatility)
#df_realized_vol_per_stock = pd.DataFrame(realized_vol)
#df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns = {'log_return':'realized_volatility'})

#entropy = book_stock.groupby(['time_id']).agg(entropy_from_book,last_min=10)
#entropy_5 = book_stock.groupby(['time_id']).agg(entropy_from_book,last_min=5)
#entropy_2 = book_stock.groupby(['time_id']).agg(entropy_from_book,last_min=2)

#print(entropy_2.head(5))

#encoded_stock = encoder[np.where(all_stocks_ids == int(stock_id))[0],:]   
#encoded_stock_pd = pd.DataFrame(encoded_stock)

# Concatenate features, rows
#book_features = pd.concat([book_features,encoded_stock_pd],axis=1)
#book_all_features = pd.concat([book_all_features,book_features])
        
#print('Computing one stock entropy took', time.time() - start, 'seconds for stock ', stock_id)


In [7]:
book_all_features = pd.DataFrame()
encoder = np.eye(len(all_stocks_ids))

for file in book_path_train:
    start = time.time()

    #file = book_path_train[48]
    book_stock = pd.read_parquet(file)
    stock_id = file.split('=')[1]
    print('stock id computing = ' + str(stock_id))
    stock_time_ids = book_stock['time_id'].unique()
    for time_id in stock_time_ids:     
        
        # Access book data at this time + stock
        book_stock_time = book_stock[book_stock['time_id'] == time_id]

        # Create feature matrix
        book_features = pd.DataFrame()
        book_features['stock_id'] = [stock_id]
        book_features['time_id'] = [time_id]
        book_features['row_id'] = book_features['time_id'].apply(lambda x:f'{stock_id}-{x}')

        # Hand-designed features
        book_features['volatility'] = realized_volatility_from_book_pd(book_stock_time=book_stock_time)
        #book_features['entropy'] = entropy_from_book(book_stock_time=book_stock_time,last_min=10)  
        #book_features['entropy_last5'] = entropy_from_book(book_stock_time=book_stock_time,last_min=5)

        book_features['entropy_last2'] = entropy_from_book(book_stock_time=book_stock_time,last_min=2)

        encoded_stock = encoder[np.where(all_stocks_ids == int(stock_id))[0],:]   
        encoded_stock_pd = pd.DataFrame(encoded_stock)

        # Concatenate features, rows
        #book_features = pd.concat([book_features,encoded_stock_pd],axis=1)   
        book_all_features = pd.concat([book_all_features,book_features])
    
    print('Computing one stock entropy took', time.time() - start, 'seconds for stock ', stock_id)

# Merge targets
#book_all_features = book_all_features.merge(train, on = ['row_id'])
book_all_features = train.merge(book_all_features, on = ['row_id'])

stock id computing = 0
Computing one stock entropy took 38.01057291030884 seconds for stock  0
stock id computing = 1
Computing one stock entropy took 43.15756893157959 seconds for stock  1
stock id computing = 10
Computing one stock entropy took 46.92417049407959 seconds for stock  10
stock id computing = 100
Computing one stock entropy took 44.92473387718201 seconds for stock  100
stock id computing = 101
Computing one stock entropy took 45.46704125404358 seconds for stock  101
stock id computing = 102
Computing one stock entropy took 44.04671335220337 seconds for stock  102
stock id computing = 103
Computing one stock entropy took 44.03094959259033 seconds for stock  103
stock id computing = 104
Computing one stock entropy took 46.51736521720886 seconds for stock  104
stock id computing = 105
Computing one stock entropy took 49.05899930000305 seconds for stock  105
stock id computing = 107
Computing one stock entropy took 49.36283206939697 seconds for stock  107
stock id computing =

ValueError: cannot embed data of length 1 with embedding dimension 3 and lag 1, minimum required length is 3

In [6]:
book_all_features

Unnamed: 0,stock_id,time_id,row_id,volatility,entropy_last2,0,1,2,3,4,...,102,103,104,105,106,107,108,109,110,111
0,0,5,0-5,0.004499,0.357092,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,11,0-11,0.001204,0.173592,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,16,0-16,0.002369,0.066508,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,31,0-31,0.002574,0.076961,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,62,0-62,0.001894,0.164630,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,20,4329,20-4329,0.002262,0.582752,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,20,4361,20-4361,0.001112,0.646385,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,20,4364,20-4364,0.002585,0.473661,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,20,4367,20-4367,0.001945,0.275590,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
import xgboost as xgb

X = book_all_features.drop(['row_id','target','stock_id','time_id'],axis=1)
y = book_all_features['target']

x_test = X # to change

xgboost_default = xgb.XGBRegressor(random_state=0)
xgboost_default.fit(X,y)

yhat = xgboost_default.predict(x_test)
print('Persistence model perf :', rmspe(y,book_all_features['volatility']))
print('New model perf : ', rmspe(y, yhat))

Persistence model perf : 0.39322701284497674
New model perf :  0.32213065513839995


In [None]:
import lightgbm

In [None]:
import catboost

**Main evaluation code**

In [15]:
# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Compute predictions
prediction = prediction_function(pred='stupid_RF',
                                 book_path_train=list_order_book_file_train,
                                 trade_path_train=list_trade_file_train,
                                 targets=train,
                                 book_path_test=list_order_book_file_test,
                                 trade_path_test=list_trade_file_test)

**Notes**

In [14]:
prediction

Unnamed: 0,row_id,target
0,0-4,0.001062
