In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import r2_score
import glob

In [2]:
# A function to calculate realized volatility for all time intervals in a single book file
def realized_volatility_single_stock(file_path, prediction_column_name):
    """Compute the realized volatility of every time bucket in one book file.

    A weighted average price (WAP) is built from the ``bid_price1``,
    ``ask_price1``, ``bid_size1`` and ``ask_size1`` columns. For each distinct
    ``time_id`` the realized volatility is the square root of the sum of
    squared differences of ``log(WAP)`` between consecutive rows of that
    bucket.

    Parameters
    ----------
    file_path : str
        Path passed to ``pd.read_parquet``. The text after the first ``'='``
        in the path is used as the stock id inside ``row_id``.
    prediction_column_name : str
        Name of the output column that holds the volatility.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``time_id`` (in ascending ``time_id`` order) with
        the columns ``row_id`` (``'<stock_id>-<time_id>'``) and
        ``prediction_column_name``.
    """
    df_book_data = pd.read_parquet(file_path)
    stock_id = file_path.split('=')[1]
    time_ids, bpr, bsz, apr, asz = (
        df_book_data[col].values
        for col in ['time_id', 'bid_price1', 'bid_size1', 'ask_price1', 'ask_size1']
    )
    wap = (bpr * asz + apr * bsz) / (asz + bsz)
    log_wap = np.log(wap)

    # np.unique returns first-occurrence indices ordered by *value*, which are
    # valid split points for np.split only when the rows are already sorted by
    # time_id. A stable sort enforces that precondition while keeping the
    # original row order inside each bucket (a no-op for sorted input).
    order = np.argsort(time_ids, kind='stable')
    time_ids = time_ids[order]
    log_wap = log_wap[order]
    ids, index = np.unique(time_ids, return_index=True)

    # index[0] is always 0 for non-empty input, so only the later boundaries
    # are passed as split points.
    splits = np.split(log_wap, index[1:])
    ret = []
    for time_id, x in zip(ids.tolist(), splits):
        log_ret = np.diff(x)
        volatility = np.sqrt((log_ret ** 2).sum())
        ret.append((f'{stock_id}-{time_id}', volatility.item()))
    return pd.DataFrame(ret, columns=['row_id', prediction_column_name])

In [3]:
def realized_volatility_all(files_list, prediction_column_name):
    """Compute realized volatility for several book files and stack the results.

    Each path in ``files_list`` is handed to
    ``realized_volatility_single_stock`` together with
    ``prediction_column_name``; the resulting frames are concatenated in the
    order the paths are given. The index of each per-file frame is kept as-is.
    """
    per_stock_frames = []
    for book_path in files_list:
        per_stock_frames.append(
            realized_volatility_single_stock(book_path, prediction_column_name)
        )
    return pd.concat(per_stock_frames)

Run on the training set as a sanity check.

In [5]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the processing order (and the row order of anything built from this list)
# reproducible across runs.
list_order_book_file_train = sorted(
    glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*')
)


In [6]:
%%time
# Realized volatility of every training book file, stored in the 'pred' column.
df_past_realized_train = realized_volatility_all(
    files_list=list_order_book_file_train,
    prediction_column_name='pred',
)

CPU times: user 20.3 s, sys: 1.92 s, total: 22.2 s
Wall time: 18.4 s


In [7]:
# Load the training targets and build the '<stock_id>-<time_id>' key used to
# join them with the realized-volatility frame; only the key and the target
# are kept.
train = (
    pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    .assign(
        row_id=lambda frame: frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
    )
    .loc[:, ['row_id', 'target']]
)

# Left join: every target row is kept, with 'pred' attached where a matching
# row_id exists in df_past_realized_train.
df_joined = pd.merge(
    train,
    df_past_realized_train[['row_id', 'pred']],
    how='left',
    on='row_id',
)


In [8]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` vs ``y_true``.

    Each error is expressed relative to the corresponding true value, squared,
    averaged over all elements, and the square root of that mean is returned.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
# Score the naive prediction against the training targets; both metrics take
# (y_true, y_pred) in that order and are rounded to three decimals.
R2 = round(
    r2_score(df_joined['target'], df_joined['pred']),
    3,
)
RMSPE = round(
    rmspe(df_joined['target'], df_joined['pred']),
    3,
)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

Performance of the naive prediction: R2 score: 0.628, RMSPE: 0.341


In [10]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting the
# paths gives submission.csv a deterministic row order across runs.
list_order_book_file_test = sorted(
    glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*')
)
# The naive prediction is the realized volatility computed from each test book
# file, written under the 'target' column.
df_naive_pred_test = realized_volatility_all(list_order_book_file_test, 'target')
df_naive_pred_test.to_csv('submission.csv', index=False)

In [11]:
# Read the written file back and display its first rows.
pd.read_csv('submission.csv').head(n=5)


Unnamed: 0,row_id,target
0,0-4,0.000294
