## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [28]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


**Lib Import / Data loading**

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
from sklearn.metrics import r2_score

# Maths
import nolds

# Paths tricks
import os
from pathlib import Path

# Models
from support_file import *

# Competition folder under the user's home directory (kept as a str because it is fed to os.path.join below)
datapath = str(Path.home().joinpath('ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction'))

# Load dataset
train = pd.read_csv(os.path.join(datapath, 'train.csv'))

# Unique identifiers must be collected before the id columns are dropped
all_stocks_ids = train['stock_id'].unique()
all_time_ids = train['time_id'].unique()

# Keep only the '<stock_id>-<time_id>' key and the target
train = train.assign(
    row_id=train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
)[['row_id', 'target']]

**Functions**

In [30]:
# Competition metric
def rmspe(y_true, y_pred):
    """Root mean squared percentage error.

    Computes sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Parameters
    ----------
    y_true : array-like supporting element-wise arithmetic
        Reference values; they are used as the divisor.
    y_pred : array-like supporting element-wise arithmetic
        Predicted values.

    Returns
    -------
    The scalar error value.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Prediction function (choose here which prediction strategy to use)
def prediction_function(pred,book_path_train,trade_path_train,targets,book_path_test,trade_path_test):
    """Run the prediction strategy selected by ``pred`` and return its result.

    Parameters
    ----------
    pred : str
        Strategy name: 'naive', 'stupid_RF' or 'entropy_based'.
    book_path_train : list
        Paths of the training order-book files.
    trade_path_train : list
        Paths of the training trade files (not used by any strategy yet).
    targets : pandas.DataFrame
        Training targets. The 'naive' strategy merges on 'row_id' and scores
        against 'target', so both columns are required for that strategy.
    book_path_test : list
        Paths of the test order-book files.
    trade_path_test : list
        Paths of the test trade files (not used by any strategy yet).

    Returns
    -------
    The object produced by the selected strategy. For 'naive' this is
    ``targets`` with its 'target' column replaced by the persistence prediction.

    Raises
    ------
    ValueError
        If ``pred`` is not one of the known strategy names.
    """
    known_strategies = ('naive', 'stupid_RF', 'entropy_based')
    if pred not in known_strategies:
        # Fail early with a clear message instead of an UnboundLocalError at the return
        raise ValueError(f'Unknown prediction strategy {pred!r}; expected one of {known_strategies}')

    if pred == 'naive':
        # Naive prediction (persistence model)
        prediction = past_realized_volatility_per_stock(list_file=book_path_train,prediction_column_name='pred')

        # Merge and evaluate results (against the targets passed in, not a notebook global)
        prediction = targets.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
        print(prediction.head(5))

        # Estimate performances
        R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
        RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

        print('--')
        print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')

        # Return the prediction under the 'target' column name
        prediction = prediction.drop(columns=['target'])
        prediction = prediction.rename(columns={'pred': 'target'})

    elif pred == 'stupid_RF':
        # Stupid nonlinear regression between persistence and next volatility (random forest)
        prediction = stupidForestPrediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)

    else:  # 'entropy_based' (the only remaining known strategy)
        prediction = entropy_Prediction(book_path_train=book_path_train,
                                        prediction_column_name='pred',
                                        train_targets_pd=targets,
                                        book_path_test=book_path_test)

    return prediction

**Test code**

In [33]:
# Try a prediction code

# List every entry of each parquet folder (train/test order books, then train/test trades)
(list_order_book_file_train,
 list_order_book_file_test,
 list_trade_file_train,
 list_trade_file_test) = (
    glob.glob(os.path.join(datapath, folder, '*'))
    for folder in ('book_train.parquet', 'book_test.parquet',
                   'trade_train.parquet', 'trade_test.parquet')
)

# Given variables (same names as the arguments of prediction_function)
pred = 'entropy_based'
targets = train
book_path_train, trade_path_train = list_order_book_file_train, list_trade_file_train
book_path_test, trade_path_test = list_order_book_file_test, list_trade_file_test
                    
# Code

def computeBook_Entropy():
    """Placeholder for the order-book entropy computation (not implemented yet).

    Raises
    ------
    NotImplementedError
        Always. Raising explicitly makes an accidental call fail with a clear
        message rather than with a NameError on an undefined variable.
    """
    raise NotImplementedError('computeBook_Entropy is not implemented yet')
    
def entropy_Prediction(book_path_train,prediction_column_name,train_targets_pd,book_path_test):
    """Entropy-based prediction strategy (placeholder).

    None of the arguments are used yet: every call returns a new empty list.
    """
    # TODO: read each file of book_path_train with pd.read_parquet, compute its
    # entropy, and build the prediction from those values.
    return []

# Accumulator for entropy values (nothing is appended to it in this cell)
entropies = []

# Load the first training book file and attach the 'wap' column.
# compute_wap is not defined in this notebook - presumably it comes from support_file; confirm.
book_pd = pd.read_parquet(book_path_train[0])
book_pd = book_pd.assign(wap=compute_wap(book_pd))
#resampled_wap = wap.interpolate()

In [34]:
# Inspect the order book loaded above; pandas abbreviates the rendering of long frames.
book_pd

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,wap
0,5,0,1.001422,1.002301,1.001370,1.002353,3,226,2,100,1.001434
1,5,1,1.001422,1.002301,1.001370,1.002353,3,100,2,100,1.001448
2,5,5,1.001422,1.002301,1.001370,1.002405,3,100,2,100,1.001448
3,5,6,1.001422,1.002301,1.001370,1.002405,3,126,2,100,1.001443
4,5,7,1.001422,1.002301,1.001370,1.002405,3,126,2,100,1.001443
...,...,...,...,...,...,...,...,...,...,...,...
917548,32767,568,0.998275,0.998754,0.997796,0.998946,90,90,48,28,0.998515
917549,32767,569,0.998275,0.998754,0.997892,0.998946,91,90,200,28,0.998516
917550,32767,571,0.998275,0.998754,0.997892,0.998946,91,90,100,28,0.998516
917551,32767,572,0.998275,0.998754,0.997892,0.998946,92,90,100,28,0.998517


In [36]:
# Build one row per (stock, time_id) pair found in the training book files.
book_feature_rows = []
for file in book_path_train:
    # The text after '=' in the file name is used as the stock id (presumably
    # 'stock_id=<id>' - confirm); basename keeps an '=' elsewhere in the path
    # from shifting the split.
    stock_id = os.path.basename(file).split('=')[1]
    book_stock = pd.read_parquet(file)

    # groupby visits every time_id present in this file in a single pass,
    # instead of building one boolean mask per time_id.
    for time_id, book_stock_time in book_stock.groupby('time_id'):
        # book_stock_time holds the rows of this bucket; no feature is derived from it yet.
        book_feature_rows.append({
            'stock_id': stock_id,
            'time_id': time_id,
            'row_id': f'{stock_id}-{time_id}',
        })

# Create the frame once from the collected records (explicit columns keep it well-formed when empty)
book_all_features = pd.DataFrame(book_feature_rows, columns=['stock_id', 'time_id', 'row_id'])
        

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000017DD4259688>

In [27]:
#sampleEntropy = nolds.sampen(resampled_wap)

**Main evaluation code**

In [15]:
# List every entry of each parquet folder (train/test order books, then train/test trades)
(list_order_book_file_train,
 list_order_book_file_test,
 list_trade_file_train,
 list_trade_file_test) = [
    glob.glob(os.path.join(datapath, folder, '*'))
    for folder in ('book_train.parquet', 'book_test.parquet',
                   'trade_train.parquet', 'trade_test.parquet')
]

# Compute predictions with the random-forest strategy
prediction = prediction_function(
    pred='stupid_RF',
    targets=train,
    book_path_train=list_order_book_file_train,
    book_path_test=list_order_book_file_test,
    trade_path_train=list_trade_file_train,
    trade_path_test=list_trade_file_test,
)

**Notes**

In [14]:
# Show the predictions produced by the evaluation cell.
prediction

Unnamed: 0,row_id,target
0,0-4,0.001062
