## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [76]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


**Lib Import / Data loading**

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
from sklearn.metrics import r2_score

import os
import earthpy as et

datapath = os.path.join(et.io.HOME, 'data', 'optiver-realized-volatility-prediction')
# datapath = os.path.join(et.io.HOME, 'ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction')

# Load dataset
train = pd.read_csv(os.path.join(datapath,'train.csv')) 
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]

In [78]:
datapath

'/Users/vladimir.levin/data/optiver-realized-volatility-prediction'

In [79]:
train

Unnamed: 0,row_id,target
0,0-5,0.004136
1,0-11,0.001445
2,0-16,0.002168
3,0-31,0.002195
4,0-62,0.001747
...,...,...
428927,126-32751,0.003461
428928,126-32753,0.003113
428929,126-32758,0.004070
428930,126-32763,0.003357


**Functions**

In [80]:
# Competition metric
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

# Prediction function (chose here which prediction strategy to use)
def prediction_function(pred, book_path_train, trade_path_train, targets, book_path_test, trade_path_test):
    
    if pred == 'naive':
        # Naive prediction (persistence model)
        prediction = past_realized_volatility_per_stock(list_file=book_path_train,prediction_column_name='pred')
        
        # Merge and evaluate results
        prediction = train.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
        prediction = prediction[prediction.pred.notnull()]
        print(prediction.head(5))

        # Estimate performances
        R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
        RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

        print('--')
        print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')
        
        prediction = prediction.drop(columns=['target'])
        prediction = prediction.rename(columns={'pred': 'target'})

    if pred == 'stupid_RF':
        # Stupid nonlinear regression between persistence and next volatility (random forest)
        prediction = stupidForestPrediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)
        
    if pred == 'garch':
        # fit garch and predict
        prediction = garch_volatility_per_stock(list_file=book_path_train, prediction_column_name='pred')
        
        # Merge and evaluate results
        prediction = train.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
        prediction = prediction[prediction.pred.notnull()]
        print(prediction.head(5))
        
        # Estimate performances
        R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
        RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

        print('--')
        print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')
        
#         prediction = prediction.drop(columns=['target'])
#         prediction = prediction.rename(columns={'pred': 'target'})
    
    return prediction

**Main evaluation code**

In [81]:
# Models
# from persistence import *
from custom1 import *
import warnings
warnings.filterwarnings('ignore')

In [83]:
# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Compute predictions
prediction = prediction_function(pred='naive',
                                 book_path_train=list_order_book_file_train[0:30],
                                 trade_path_train=list_trade_file_train,
                                 targets=train,
                                 book_path_test=list_order_book_file_test,
                                 trade_path_test=list_trade_file_test)

      row_id    target      pred
38300   10-5  0.005707  0.005378
38301  10-11  0.002352  0.002376
38302  10-16  0.002363  0.002343
38303  10-31  0.002341  0.002042
38304  10-62  0.002007  0.001426
--
Performance of prediction: R2 score: 0.628, RMSPE: 0.343


30 stocks

1. naive: Performance of prediction: R2 score: 0.628, RMSPE: 0.343
2. garch: Performance of prediction: R2 score: -1.759, RMSPE: 0.966
3. stupid_RF:

**Notes**

In [60]:
prediction

Unnamed: 0,row_id,target
61279,17-5,0.004091
61280,17-11,0.002155
61281,17-16,0.002566
61282,17-31,0.002221
61283,17-62,0.002155
...,...,...
99574,28-32751,0.002121
99575,28-32753,0.001796
99576,28-32758,0.002111
99577,28-32763,0.002089
