<a href="https://colab.research.google.com/github/Meta-Sean/Practical-Deep-Learning/blob/main/optiver_realized_vol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Colab-only: opens a file-upload dialog. The recorded output shows kaggle.json
# being saved into the working directory; the next cell copies it to ~/.kaggle.
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [4]:
! mkdir ~/.kaggle #creating folder
! cp kaggle.json ~/.kaggle/ #copying kaggle.json
! chmod 600 ~/.kaggle/kaggle.json #reading the file with full access

In [5]:
from pathlib import Path
import os

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/optiver-realized-volatility-prediction')
    !pip install -Uqq fastai
else:
    import zipfile,kaggle
    path = Path('optiver-realized-volatility-prediction')
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f'{path}.zip').extractall(path)

Downloading optiver-realized-volatility-prediction.zip to /content


100%|██████████| 1.59G/1.59G [00:11<00:00, 148MB/s]





# Weighted average price

In [4]:
def weighted_average_price(bid_price, ask_price, bid_size, ask_size):
    """Return the weighted average price (WAP) of one bid/ask level.

    Each price is weighted by the size quoted on the opposite side: the bid
    price by the ask size and the ask price by the bid size. The weighted sum
    is divided by the total size of both sides.
    """
    cross_weighted_sum = (bid_price * ask_size) + (ask_price * bid_size)
    total_size = bid_size + ask_size
    return cross_weighted_sum / total_size

weighted_average_price(147, 148, 251, 221)

147.53177966101694

# Log returns
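## How can we compare the price of a stock between yesterday and today?
Use the log return $r_{t_1, t_2} = \log\left(\frac{S_{t_2}}{S_{t_1}}\right)$, where $S_t$ is the price at time $t$. Log returns are:
 - additive across time
 - unbounded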

# Realized volatility
- Volatility is usually normalized to a 1-year period (the annualized standard deviation of log returns), but here we do not annualize it, and we assume that log returns have zero mean.
- It is the square root of the sum of squared log returns: $\sigma = \sqrt{\sum_{t} r_{t-1,t}^2}$
- Using WAP of the stock to compute log returns

In [6]:
import pandas as pd
import numpy as np
import plotly.express as px
# Read the targets from the `path` chosen by the download cell, so the same
# line works for both the Kaggle and the local/Colab data location.
train = pd.read_csv(path / 'train.csv')
train.head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [8]:
# Load the order book and the trades of one stock, then keep a single time bucket.
stock_id = '0'
example_time_id = 5
book_example = pd.read_parquet(path / 'book_train.parquet' / f'stock_id={stock_id}')
trade_example = pd.read_parquet(path / 'trade_train.parquet' / f'stock_id={stock_id}')
# .assign() returns a new frame, so the stock_id column is not written into a
# filtered slice of the full data (which raises SettingWithCopyWarning).
book_example = book_example[book_example['time_id'] == example_time_id].assign(stock_id=stock_id)
trade_example = trade_example[trade_example['time_id'] == example_time_id].assign(stock_id=stock_id)

In [9]:
# Preview the first order-book rows of the selected stock and time bucket.
book_example.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100,0
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100,0
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100,0
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100,0
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100,0


In [10]:
# Preview the first trade rows of the selected stock and time bucket.
trade_example.head()

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count,stock_id
0,5,21,1.002301,326,12,0
1,5,46,1.002778,128,4,0
2,5,50,1.002818,55,1,0
3,5,57,1.003155,121,5,0
4,5,68,1.003646,4,1,0


In [11]:
# Level-1 WAP of every book row, computed with the helper defined earlier
# (bid price weighted by ask size, ask price weighted by bid size).
book_example['wap'] = weighted_average_price(
    book_example['bid_price1'],
    book_example['ask_price1'],
    book_example['bid_size1'],
    book_example['ask_size1'],
)

In [12]:
# Preview the book rows again; the frame now carries the wap column.
book_example.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id,wap
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100,0,1.001434
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100,0,1.001448
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100,0,1.001448
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100,0,1.001443
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100,0,1.001443


In [13]:
# Line chart of the WAP against the seconds elapsed inside the bucket.
fig = px.line(
    book_example,
    x="seconds_in_bucket",
    y="wap",
    title="WAP of stock_id_0, time_id_5",
)
fig.show()

In [14]:
def log_return(list_stock_prices):
    """Return the log returns between consecutive prices.

    Args:
        list_stock_prices: Prices in time order. A pandas Series or DataFrame
            is used as is, so its index is preserved. Any other sequence
            (list, tuple, NumPy array) is wrapped in a Series first.

    Returns:
        A pandas object of the same length as the input holding
        ``log(price[i]) - log(price[i - 1])``; the first entry is NaN because
        it has no predecessor.
    """
    if not isinstance(list_stock_prices, (pd.Series, pd.DataFrame)):
        # np.log() of a plain list/array returns an ndarray, which has no .diff().
        list_stock_prices = pd.Series(list_stock_prices)
    return np.log(list_stock_prices).diff()

In [15]:
# The first row has no previous WAP, so its log return is NaN and the row is dropped.
# Because book_example is overwritten, re-running this cell drops one more row each time.
book_example.loc[:, 'log_return'] = log_return(book_example['wap'])
book_example = book_example.dropna(subset=['log_return'])

In [16]:
# Line chart of the log returns against the seconds elapsed inside the bucket.
fig = px.line(
    book_example,
    x="seconds_in_bucket",
    y="log_return",
    title="Log return of stock_id_0, time_id_5",
)
fig.show()

In [21]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)
# Realized volatility of the example bucket, from its per-row log returns.
realized_vol = realized_volatility(book_example['log_return'])
print(f'Realized vol for sotck_id 0 on time_id 5 is {realized_vol}')

Realized vol for sotck_id 0 on time_id 5 is 0.004499364172786558


In [22]:
import os
from sklearn.metrics import r2_score
import glob
# Every entry under book_train.parquet (e.g. .../stock_id=0), located through
# the `path` chosen by the download cell; sorted so the processing order is reproducible.
list_order_book_file_train = sorted(glob.glob(str(path / 'book_train.parquet' / '*')))

In [23]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of every time_id in one book parquet.

    Args:
        file_path: Location of the book parquet data. The stock id is taken
            from the text after the last '=' (e.g. ``.../stock_id=0``).
        prediction_column_name: Name of the output column that holds the
            realized volatility.

    Returns:
        DataFrame with the columns ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``prediction_column_name``, one row per time_id that has at least one
        non-null log return.
    """
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])
    # transform() returns a result aligned with the original row index, so it
    # can be assigned as a column directly. groupby.apply() may add the group
    # key to the index (pandas 2.x), which makes the assignment fail.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    # The first row of each time_id has no previous WAP, hence a NaN log return.
    df_book_data = df_book_data[df_book_data['log_return'].notna()]
    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .rename(prediction_column_name)
        .reset_index()
    )
    stock_id = str(file_path).rsplit('=', 1)[1]
    # Vectorised string concatenation instead of a row-by-row apply().
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

In [24]:
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Stack the per-time_id realized volatility of several book files.

    Args:
        list_file: Iterable of book parquet locations, one per stock.
        prediction_column_name: Name of the realized-volatility column.

    Returns:
        DataFrame with the columns ``row_id`` and ``prediction_column_name``
        containing the rows of every file. It is empty, but still has both
        columns, when ``list_file`` is empty.
    """
    # Collect all frames first and concatenate once; concatenating inside the
    # loop copies the accumulated rows again on every iteration.
    frames = [realized_volatility_per_time_id(file, prediction_column_name)
              for file in list_file]
    if not frames:
        # pd.concat() raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['row_id', prediction_column_name])
    return pd.concat(frames)
# Realized volatility of every listed training book file, stored in column 'pred'.
df_past_realized_train = past_realized_volatility_per_stock(
    list_file=list_order_book_file_train,
    prediction_column_name='pred',
)

In [25]:
# Show the target table again before the join key is built from stock_id and time_id.
train.head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [26]:
# Build the "<stock_id>-<time_id>" key and keep only key + target.
# `train` is overwritten without its stock_id/time_id columns, so this cell
# cannot be re-run unless train.csv is loaded again first.
train = train.assign(
    row_id=train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
)[['row_id', 'target']]
# Left join: every target row is kept; rows without a matching prediction get NaN in 'pred'.
df_joined = train.merge(
    df_past_realized_train[['row_id', 'pred']],
    on=['row_id'],
    how='left',
)

In [29]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of y_pred against y_true.

    The error of each element is taken relative to its true value, squared,
    averaged over all elements, and the square root of that mean is returned.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
# Score the naive prediction against the targets, rounded to three decimals.
R2 = round(
    r2_score(y_true=df_joined['target'], y_pred=df_joined['pred']),
    3,
)
RMSPE = round(
    rmspe(y_true=df_joined['target'], y_pred=df_joined['pred']),
    3,
)
print(f'Performance of the naive prediciton: R2 score: {R2}, RMSPE: {RMSPE}')

Performance of the naive prediciton: R2 score: 0.628, RMSPE: 0.341


In [31]:
# Apply the same naive prediction to the test order books and write it as
# submission.csv with the columns row_id and target.
list_order_book_file_test = sorted(glob.glob(str(path / 'book_test.parquet' / '*')))
# A pattern that matches nothing would otherwise produce an empty submission silently.
assert list_order_book_file_test, 'no test order-book files found under book_test.parquet'
df_naive_pred_test = past_realized_volatility_per_stock(list_file=list_order_book_file_test,
                                                        prediction_column_name='target')

df_naive_pred_test.to_csv('submission.csv', index = False)

In [33]:
# Upload submission.csv to the competition through the Kaggle CLI.
# NOTE(review): the recorded traceback below complains about
# 'optiver_realized_vol.ipynb', not submission.csv, so the saved output appears
# to come from a different invocation of this command. Re-run to confirm.
!kaggle competitions submit -c optiver-realized-volatility-prediction -f submission.csv -m "Naive Prediciton using past realized vol"

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.7/dist-packages/kaggle/cli.py", line 67, in main
    out = args.func(**command_args)
  File "/usr/local/lib/python3.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 562, in competition_submit_cli
    competition, quiet)
  File "/usr/local/lib/python3.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 513, in competition_submit
    content_length=os.path.getsize(file_name),
  File "/usr/lib/python3.7/genericpath.py", line 50, in getsize
    return os.stat(filename).st_size
FileNotFoundError: [Errno 2] No such file or directory: 'optiver_realized_vol.ipynb'


In [37]:
# List the submissions made to the competition; the recorded output shows none,
# which is consistent with the failed submit command above.
! kaggle competitions submissions -c optiver-realized-volatility-prediction

No submissions found
