In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import os
from sklearn.metrics import r2_score
import glob
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pdb
from tqdm import tqdm
from joblib import Parallel, delayed
import time

In [2]:
train = pd.read_csv('./optiver-realized-volatility-prediction/train.csv')
test = pd.read_csv('./optiver-realized-volatility-prediction/test.csv')

## Define Feature

### 0, base feature

In [3]:
def base_feat(ts):
    return [ts.mean(), ts.std(), ts.median(), ts.min(), ts.max()]

### 1, book feature

In [4]:
def bidAskSpread(df_book_data): 
    return df_book_data.ask_price1/df_book_data.bid_price1 -1 

In [5]:
def wap(df_book_data):
    return (df_book_data['bid_price1'] * df_book_data['ask_size1']+df_book_data['ask_price1'] * df_book_data['bid_size1'])  / (
                                      df_book_data['bid_size1']+ df_book_data['ask_size1'])

In [6]:
def logRet_book(df_book_data):
    return np.log(wap(df_book_data)).diff()[1:]

In [7]:
def size_imb(df_book_data):
    ask_size = df_book_data.ask_size1 + df_book_data.ask_size2
    bid_size = df_book_data.bid_size1 + df_book_data.bid_size2
    return (ask_size - bid_size) / (ask_size + bid_size)

In [8]:
book_feat_list = [bidAskSpread, wap, logRet_book, size_imb]

### 2, trade feature

In [9]:
def logRet_trade(df_trade_data):
    return np.log(df_trade_data.price).diff()

In [10]:
def size_trade(df_trade_data):
    return df_trade_data['size']

In [11]:
def order_count(df_trade_data):
    return df_trade_data.order_count

In [12]:
trade_feat_list = [logRet_trade, size_trade, order_count]

## Dataloader

In [55]:
stock_id_list_train = np.unique(train.stock_id)
stock_id_list_test = np.unique(test.stock_id)

In [58]:
def get_data(data, book_feat_list, trade_feat_list, base_feat, stock_id, tag='train'):
    book_file_path = f'optiver-realized-volatility-prediction/book_{tag}.parquet/stock_id={stock_id}'
    df_book_data = pd.read_parquet(book_file_path)
    trade_file_path = f'optiver-realized-volatility-prediction/trade_{tag}.parquet/stock_id={stock_id}'
    df_trade_data = pd.read_parquet(trade_file_path)
    time_id_list = np.unique(df_trade_data.time_id)
    feat_list = []
    target_list = []
    for time_id in tqdm(time_id_list):
        if tag == 'train':
            target = data.query(f'stock_id == {stock_id} & time_id == {time_id}').target.item()
        else:
            target = np.nan
        df_book_data_sub = df_book_data.query(f'time_id == {time_id}')
        df_trade_data_sub = df_trade_data.query(f'time_id == {time_id}')
        feat_list_sub = []
        for func in book_feat_list:
            feat_list_sub.extend(base_feat(func(df_book_data_sub)))
        for func in trade_feat_list:
            feat_list_sub.extend(base_feat(func(df_trade_data_sub)))
        feat_list.append(np.array(feat_list_sub))
        target_list.append(target)
    return np.array(target_list), np.stack(feat_list, axis=0)

In [60]:
start_time = time.time()
result_test = Parallel(n_jobs = -1, verbose = 1)(delayed(get_data)(test, book_feat_list, trade_feat_list, base_feat, stock_id, tag='test') for stock_id in stock_id_list_test)
print("--- %s seconds ---" % (time.time() - start_time))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


--- 0.382068395614624 seconds ---


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished


In [64]:
start_time = time.time()
result_train = Parallel(n_jobs = -1, verbose = 1)(delayed(get_data)(train, book_feat_list, trade_feat_list, base_feat, stock_id, tag='train') for stock_id in stock_id_list_train)
print("--- %s seconds ---" % (time.time() - start_time))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 12.4min


--- 2668.3007152080536 seconds ---


[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed: 44.5min finished


In [83]:
train_y = []
train_x = []
for data in result_train:
    train_y.append(data[0])
    train_x.append(data[1])

In [84]:
train_x = np.vstack(train_x)
train_y = np.hstack(train_y)

In [87]:
np.save('train_x.npy', train_x)
np.save('train_y.npy', train_y)

In [None]:
np.save('train_x.npy', train_x)
np.save('train_y.npy', train_y)

In [109]:
test_x = []
for data in result_test:
    test_x.append(data[1])

In [110]:
np.save('test_x.npy', test_x)

In [None]:
# class MarketDataset(Dataset):
#     def __init__(self, train_data, ob_feat_list=None, t_feat_list=None):
#         self.train_data = train_data
#         self.ob_feat_list = ob_feat_list
#         self.t_feat_list = t_feat_list
    
#     def __len__(self):
#         return len(self.train_data)
    
#     def __getitem__(self, index):
        
#         stock_id = int(self.train_data.loc[index].stock_id)
#         time_id = int(self.train_data.loc[index].time_id)
#         book_file_path = f'optiver-realized-volatility-prediction/book_train.parquet/stock_id={stock_id}'
#         df_book_data = pd.read_parquet(book_file_path).query(f'time_id == {time_id}')
#         trade_file_path = f'optiver-realized-volatility-prediction/trade_train.parquet/stock_id={stock_id}'
#         df_trade_data = pd.read_parquet(trade_file_path).query(f'time_id == {time_id}')
#         target = self.train_data.query(f'stock_id == {stock_id} & time_id == {time_id}').target.item()

#         feat_list = []
#         for func in book_feat_list:
#           feat_list.extend(base_feat(func(df_book_data)))
#         for func in trade_feat_list:
#           feat_list.extend(base_feat(func(df_trade_data)))

#         return target, np.array(feat_list)