In [None]:
#import all the book data
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from arch import arch_model
import glob
list_order_book_file_train = glob.glob('book_train.parquet/*')

In [2]:
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff() 

In [None]:
#computimg logreturn for every time id
def logreturn_per_timeid(file_path, stock_id):
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] =(df_book_data['bid_price1'] * df_book_data['ask_size1']+df_book_data['ask_price1'] * df_book_data['bid_size1'])  / (
                                      df_book_data['bid_size1']+ df_book_data[
                                  'ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].transform(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    log_returns_dict = {
        f'stockid{stock_id}_timeid{time_id}': group['log_return'].values
        for time_id, group in df_book_data.groupby('time_id')
    }

    return log_returns_dict

In [4]:
print(list_order_book_file_train)

['book_train.parquet/stock_id=17', 'book_train.parquet/stock_id=28', 'book_train.parquet/stock_id=10', 'book_train.parquet/stock_id=26', 'book_train.parquet/stock_id=19', 'book_train.parquet/stock_id=21', 'book_train.parquet/stock_id=75', 'book_train.parquet/stock_id=81', 'book_train.parquet/stock_id=86', 'book_train.parquet/stock_id=72', 'book_train.parquet/stock_id=44', 'book_train.parquet/stock_id=88', 'book_train.parquet/stock_id=43', 'book_train.parquet/stock_id=20', 'book_train.parquet/stock_id=27', 'book_train.parquet/stock_id=18', 'book_train.parquet/stock_id=11', 'book_train.parquet/stock_id=16', 'book_train.parquet/stock_id=29', 'book_train.parquet/stock_id=89', 'book_train.parquet/stock_id=42', 'book_train.parquet/stock_id=73', 'book_train.parquet/stock_id=87', 'book_train.parquet/stock_id=80', 'book_train.parquet/stock_id=74', 'book_train.parquet/stock_id=103', 'book_train.parquet/stock_id=104', 'book_train.parquet/stock_id=105', 'book_train.parquet/stock_id=102', 'book_tra

In [None]:
#iterating for each stock
def logreturn_per_stock(list_file):
    all_log_returns = {}

    for file in list_file:
        stock_id = int(os.path.basename(file).split('=')[1].split('.')[0])
        one_stock_log = logreturn_per_timeid(file, stock_id)
        all_log_returns.update(one_stock_log)

    return all_log_returns

In [None]:
#ditionary with all the log return vectors
all_logreturn = logreturn_per_stock(list_file=list_order_book_file_train)

In [None]:
#apply GJR in each time id rescaling to avoid numerical instability
data = []
success_count=0
for key, vector in all_logreturn.items():
    parts = key.replace("stockid", "").split("_timeid")
    stock_id = int(parts[0])
    time_id = int(parts[1])
    model = arch_model(vector* 0.3*1e5, vol='GARCH', p=1, o=1,q=1, dist='normal')
    res = model.fit(disp='off')
    sigma_GJR = np.nan 
    if res.optimization_result.success:
        success_count += 1 
        sigma_GJR = res.conditional_volatility[-1]
    
    data.append({
        "stock_id": stock_id,
        "time_id": time_id,
        "sigma_GJR": sigma_GJR
    })
df_GJR = pd.DataFrame(data)


In [None]:
#visualize how many GJR converges out of all the time ids
print(success_count)
print(success_count/len(all_logreturn))

428430
0.9988296513200228


In [None]:

df_GJR['sigma_GJR']=df_GJR['sigma_GJR']/(0.3*1e5)
print(df_GJR)

        stock_id  time_id  sigma_GJR
0             17        5   0.000191
1             17       11   0.000126
2             17       16   0.000239
3             17       31   0.000146
4             17       62   0.000264
...          ...      ...        ...
428927        98    32751   0.000208
428928        98    32753   0.000072
428929        98    32758   0.000168
428930        98    32763   0.000203
428931        98    32767   0.000149

[428932 rows x 3 columns]


In [None]:
#import the train dataset with volatility target for each stock and time id
# add GJR column
train = pd.read_csv('train.csv')
df_joined = train.merge(
    df_GJR[['stock_id', 'time_id', 'sigma_GJR']],
    on=['stock_id', 'time_id'],
    how='left'
)

In [20]:
print(df_joined)

        stock_id  time_id    target  sigma_GJR
0              0        5  0.004136   0.000207
1              0       11  0.001445   0.000202
2              0       16  0.002168   0.000165
3              0       31  0.002195   0.000224
4              0       62  0.001747   0.000343
...          ...      ...       ...        ...
428927       126    32751  0.003461   0.000185
428928       126    32753  0.003113   0.000355
428929       126    32758  0.004070   0.000248
428930       126    32763  0.003357   0.000173
428931       126    32767  0.002090   0.000149

[428932 rows x 4 columns]
