In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

(parent_folder_path, current_dir) = os.path.split(os.path.abspath(''))
sys.path.append(parent_folder_path)

from pathlib import Path
from typing import Optional
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from decimal import Decimal

from equities.data_processing.lobster import lobster_preproc
from equities.data_processing import itch_preproc
from equities.data_processing.lobster import lobster_encoding
from equities.data_processing import itch_encoding
from equities.data_processing.lobster import lobster_dataloader

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Let's set up a few paths! For convenience, we will redirect all output artefacts to the `MarketSimT/artefacts` directory — this includes the processed datasets, tokenizer files, and experimental artefacts (configs, saved checkpoints, trained models, etc.).

In [21]:
# BASE_DIR = os.path.abspath(".")
# ARTEFACTS_DIR = os.path.join(BASE_DIR, "artefacts")
# SCRIPTS_DIR = os.path.join(BASE_DIR, "scripts")
# CONFIGS_DIR = os.path.join(BASE_DIR, "scripts/configs")

In [None]:
# !process_dataset.py \
#     --data_dir={os.path.join(parent_folder_path, "dataset/raw/ITCH/")} \
#     --save_dir={os.path.join(parent_folder_path, "dataset/ITCH/")} \
#     --filter_above_lvl=20 \
#     --n_tick_range=500

In [4]:
# Raw-data folders, both located under the repository's parent folder.
lobster_load_path = f"{parent_folder_path}/dataset/raw/LOBSTER/"
itch_load_path = f"{parent_folder_path}/dataset/raw/ITCH/"

# LOBSTER: collect message and order-book CSVs in sorted (deterministic) order.
lobster_message_files = glob(f"{lobster_load_path}*message*.csv")
lobster_message_files.sort()
lobster_book_files = glob(f"{lobster_load_path}*orderbook*.csv")
lobster_book_files.sort()

print(f"found {len(lobster_message_files)} LOBSTER message files")
print(f"found {len(lobster_book_files)} LOBSTER book files")
print()

# ITCH: same idea, but the book files only need to contain "book" in their name.
itch_message_files = glob(f"{itch_load_path}*message*.csv")
itch_message_files.sort()
itch_book_files = glob(f"{itch_load_path}*book*.csv")
itch_book_files.sort()

print(f"found {len(itch_message_files)} ITCH message files")
print(f"found {len(itch_book_files)} ITCH book files")

found 1 LOBSTER message files
found 1 LOBSTER book files

found 1 ITCH message files
found 1 ITCH book files


In [5]:
# Every message file must be paired with exactly one book file. zip() stops
# silently at the shorter list, so a missing file would otherwise go unnoticed
# and could pair later message files with the wrong book files.
assert len(lobster_message_files) == len(lobster_book_files), \
    'number of LOBSTER message and book files differs'
assert len(itch_message_files) == len(itch_book_files), \
    'number of ITCH message and book files differs'

# load LOBSTER data
# NOTE: each iteration rebinds `lobster_messages` / `lobster_book`, so only the
# last file pair is still available once the loop has finished.
for m_f, b_f in tqdm(zip(lobster_message_files, lobster_book_files),
                     total=len(lobster_message_files)):  # total= lets tqdm draw a real progress bar
    print(m_f)
    print(b_f)

    lobster_messages = lobster_preproc.load_message_df(m_f)

    # the LOBSTER book CSV is read without a header row and without an index column
    lobster_book = pd.read_csv(
        b_f,
        index_col=False,
        header=None
    )
    # messages and book rows are expected to line up one-to-one
    assert len(lobster_messages) == len(lobster_book)

    print("LOBSTER messages shape:", lobster_messages.shape)
    print("LOBSTER book shape:", lobster_book.shape)

# load ITCH data
# NOTE: same caveat as above -- only the last ITCH file pair survives the loop,
# and `m_f` / `b_f` stay bound to that last pair afterwards.
for m_f, b_f in tqdm(zip(itch_message_files, itch_book_files),
                     total=len(itch_message_files)):
    print(m_f)
    print(b_f)

    itch_messages = itch_preproc.load_message_df(m_f)

    # unlike the LOBSTER book, the ITCH book CSV is read with pandas' defaults
    # (first row is used as the header)
    itch_book = pd.read_csv(
        b_f,
        # index_col=False,
        # header=None
    )
    # messages and book rows are expected to line up one-to-one
    assert len(itch_messages) == len(itch_book)

    print("ITCH messages shape:", itch_messages.shape)
    print("ITCH book shape:", itch_book.shape)

0it [00:00, ?it/s]

/home/aaron/Documents/Github/MarketSimT/dataset/raw/LOBSTER/AAPL_2012-06-21_34200000_57600000_message_10.csv
/home/aaron/Documents/Github/MarketSimT/dataset/raw/LOBSTER/AAPL_2012-06-21_34200000_57600000_orderbook_10.csv


1it [00:00,  1.51it/s]

LOBSTER messages shape: (400391, 6)
LOBSTER book shape: (400391, 40)





In [5]:
# load symbols file
symbols_load_path = parent_folder_path + '/dataset/symbols/'
# take the first match in sorted order; fail with a clear message instead of an
# opaque IndexError when the symbols folder holds no matching file
symbols_file = next(iter(sorted(glob(symbols_load_path + '*sp500*.txt'))), None)
if symbols_file is None:
    raise FileNotFoundError(
        "no '*sp500*.txt' symbols file found in " + symbols_load_path
    )

print(symbols_file)

# create symbol mapping: stripped line content -> 1-based line number
# (if a symbol occurs more than once, the last occurrence wins)
with open(symbols_file) as f:
    d = {line.strip(): idx for idx, line in enumerate(f, start=1)}

# Derive the ticker from the name of the last ITCH message file. The file list
# is used directly rather than the loop variable left over from the loading
# cell, so this cell does not depend on which loop happened to run last.
# NOTE(review): the [:-12] slice presumably strips a '_message.csv' suffix
# (12 characters), leaving the ticker as the last '_'-separated token --
# confirm against the actual ITCH file naming scheme.
ticker = os.path.basename(itch_message_files[-1])[:-12].rsplit('_', maxsplit=1)[-1]
ticker

/home/aaron/Documents/Github/MarketSimT/dataset/symbols/sp500_constituents_symbols_3_06_2023.txt


### Message Processing

In [8]:
# Display the LOBSTER message frame (the last expression of a cell is rendered).
# The commented names below are alternative frames to inspect instead.
lobster_messages
# lobster_book

# itch_messages
# itch_book

Unnamed: 0,time,event_type,order_id,size,price,direction
0,34200.004241176,1,16113575,18,5853300,1
1,34200.00426064,1,16113584,18,5853200,1
2,34200.004447484,1,16113594,18,5853100,1
3,34200.025551909,1,16120456,18,5859100,-1
4,34200.025579546,1,16120480,18,5859200,-1
...,...,...,...,...,...,...
400386,57599.444019561,1,287150868,48,5776100,-1
400387,57599.444794893,1,287150931,100,5777200,-1
400388,57599.625827171,3,286898608,100,5774900,1
400389,57599.913117637,4,287150868,48,5776100,-1


In [9]:
# Show the raw input: column names plus the second raw message. Index 1 is
# printed here and index 0 after pre-processing.
# NOTE(review): that pairing presumably lines up because preproc drops the
# first raw row -- confirm against lobster_encoding.Message_Tokenizer.preproc.
print("Columns:", list(lobster_messages.columns))
print("Sample:", lobster_messages.values[1])
print()

lobster_tok = lobster_encoding.Message_Tokenizer()

print('<< pre processing LOBSTER dataset >>')
# preproc takes the message frame together with the matching order book and
# returns an array (indexed positionally below).
lobster_m_ = lobster_tok.preproc(lobster_messages, lobster_book)

print("Shape:", lobster_m_.shape)
# Hand-written labels for the columns of the pre-processed array. The last four
# are the fields of the referenced (original) order; they carry a '_ref' suffix
# so they are not confused with the message's own price/size/time columns.
# NOTE(review): the labels are not read from the tokenizer -- keep them in sync
# with the column order produced by preproc.
print("Columns:", ['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size',
               'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns',
               # ref fields
               'price_ref', 'size_ref', 'time_s_ref', 'time_ns_ref'])
print("Sample:", lobster_m_[0])
lobster_m_

Columns: ['time', 'event_type', 'order_id', 'size', 'price', 'direction']
Sample: [Decimal('34200.00426064') 1 16113584 18 5853200 1]

<< pre processing LOBSTER dataset >>
truncating 0.0000% of prices > 99900
truncating 0.0000% of prices < -99900
Shape: (389058, 14)
Columns: ['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size', 'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns', 'price', 'size', 'time_s', 'time_ns']
Sample: [16113584        1        1  5853200      -31       18        0    19464
    34200  4260640    -9999    -9999    -9999    -9999]


array([[ 16113584,         1,         1, ...,     -9999,     -9999,
            -9999],
       [ 16113594,         1,         1, ...,     -9999,     -9999,
            -9999],
       [ 16120456,         1,         0, ...,     -9999,     -9999,
            -9999],
       ...,
       [287150931,         1,         0, ...,     -9999,     -9999,
            -9999],
       [286898608,         3,         1, ...,       100,     57594,
        339740972],
       [287150868,         4,         0, ...,        48,     57599,
        444019561]])

In [None]:
# # filter out only allowed event types ...
# allowed_event_types=[1,2,3,4]
# lobster_messages = lobster_messages.loc[lobster_messages.event_type.isin(allowed_event_types)].copy()
# # ... and corresponding book changes
# lobster_book = lobster_book.loc[lobster_messages.index]

# # TIME
# # DELTA_T: time since previous order --> 4 tokens of length 3
# lobster_messages.insert(
#     loc=1,
#     column='delta_t_ns',
#     value=lobster_messages['time'].diff().fillna(0)
# )
# lobster_messages.insert(
#     loc=1,
#     column='delta_t_s',
#     value=lobster_messages.delta_t_ns.astype(int)
# )
# lobster_messages.delta_t_ns = ((lobster_messages.delta_t_ns % 1) * 1000000000).astype(int)

# lobster_messages.insert(0, 'time_s', lobster_messages.time.astype(int))
# lobster_messages.rename(columns={'time': 'time_ns'}, inplace=True)
# lobster_messages.time_ns = ((lobster_messages.time_ns % 1) * 1000000000).astype(int)

# # SIZE
# lobster_messages.loc[lobster_messages['size'] > 9999, 'size'] = 9999
# lobster_messages['size'] = lobster_messages['size'].astype(int)

# # PRICE
# lobster_messages['price_abs'] = lobster_messages.price  # keep absolute price for later (simulator)
# # mid-price reference, rounded down to nearest tick_size
# tick_size = 100
# p_ref = ((lobster_book.iloc[:, 0] + lobster_book.iloc[:, 2]) / 2).shift()#.round(-2).astype(int).shift()
# p_ref = (p_ref // tick_size) * tick_size
# # --> 1999 price levels // ...00 since tick size is 100
# lobster_messages.price = lobster_tok._preproc_prices(lobster_messages.price, p_ref, p_lower_trunc=-99900, p_upper_trunc=99900)
# lobster_messages = lobster_messages.iloc[1:]
# lobster_messages.price = lobster_messages.price.astype(int)

# # DIRECTION
# lobster_messages.direction = ((lobster_messages.direction + 1) / 2).astype(int)

# # change column order
# lobster_messages = lobster_messages[['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size',
#         'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns']]

# lobster_messages

In [None]:
# # add original message as feature
# # for all referential order types (2, 3, 4)
# modif_types={2,3,4}
# modif_fields=['price', 'size', 'time_s', 'time_ns']
# nan_val=-9999

# m_changes = pd.merge(
#     lobster_messages.loc[lobster_messages.event_type.isin(modif_types)].reset_index(),
#     lobster_messages.loc[lobster_messages.event_type == 1, ['order_id'] + modif_fields],
#     how='left', on='order_id', suffixes=['', '_ref']).set_index('index')

# # m_changes

# # add new empty columns for referenced order
# modif_cols = [field + '_ref' for field in modif_fields]
# lobster_messages[modif_cols] = nan_val

# # replace order changes by original order and additional new fields
# lobster_messages.loc[m_changes.index] = m_changes
# lobster_messages[modif_cols] = lobster_messages[modif_cols].fillna(nan_val).astype(int)

# lobster_messages

In [10]:
# Display the ITCH message frame as loaded, before the filtering / conversion cell below.
itch_messages

Unnamed: 0,time,type,id,side,size,price,cancSize,execSize,oldId,oldSize,oldPrice,mpid
0,14400006432545,A,13301,1,18.0,207.85,,,,,,
1,14400008777412,A,15969,0,100.0,129.33,,,,,,
2,14400016498868,A,20677,0,1.0,114.94,,,,,,
3,14400017857990,A,22061,0,1.0,98.39,,,,,,
4,14403597489791,A,98453,1,300.0,192.70,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2010131,72000073334596,D,336886329,0,0.0,186.45,100.0,,,,,
2010132,72000073447563,D,342882157,0,0.0,188.00,1.0,,,,,
2010133,72000073477970,D,335312241,0,0.0,175.00,50.0,,,,,
2010134,72000074023439,D,343725045,1,0.0,195.00,2600.0,,,,,


In [11]:
# filter messages that are above a certain level
filter_above_lvl = 20
itch_messages, itch_book = itch_preproc.filter_by_lvl(itch_messages, itch_book, filter_above_lvl)

# remove mpid field from ITCH data
itch_messages = itch_messages.drop(columns=['mpid'])

# remove pre-market and after-market hours from ITCH data
# The bounds are 34200 s (09:30) and 57600 s (16:00) after midnight, written in
# the integer nanosecond units the `time` column appears to use.
# NOTE(review): only the messages are filtered here; `itch_book` keeps its rows.
remove_premarket = True # False
remove_aftermarket = True

if remove_premarket:
    itch_messages = itch_messages[itch_messages['time'] >= 34_200_000_000_000]
if remove_aftermarket:
    itch_messages = itch_messages[itch_messages['time'] <= 57_600_000_000_000]

# Work on an explicit copy so the column assignments below modify this frame
# itself rather than a slice of the unfiltered one (avoids pandas'
# SettingWithCopyWarning).
itch_messages = itch_messages.copy()

# format time for pre-processing: integer nanoseconds -> Decimal seconds.
# Shifting the decimal exponent by -9 is exact and works for any number of
# digits, whereas splitting the string after the 5th character is only correct
# for timestamps with exactly 14 digits.
itch_messages['time'] = itch_messages['time'].apply(lambda t: Decimal(int(t)).scaleb(-9))

# convert price to pennies from dollars
# Round before casting: a bare int cast truncates binary floating-point error
# (e.g. 0.29 * 100 == 28.999999999999996 would become 28 instead of 29).
itch_messages['price'] = itch_messages['price'].mul(100).round().astype('int')
# oldPrice still contains NaNs, so it stays float here (rounded to whole
# pennies); make int after dealing with NaNs
itch_messages['oldPrice'] = itch_messages['oldPrice'].mul(100).round()

itch_messages

Unnamed: 0,time,type,id,side,size,price,cancSize,execSize,oldId,oldSize,oldPrice
12347,34200.010884166,A,9449261,1,100.0,18885,,,,,
12352,34200.029662346,P,0,0,,18875,,600.0,,,
12353,34200.029662346,P,0,0,,18875,,2.0,,,
12366,34200.059062044,E,9177349,0,0.0,18874,,100.0,,,
12367,34200.059062044,P,0,0,,18874,,100.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
1973110,57599.967690895,A,352936041,1,1600.0,18853,,,,,
1973113,57599.990456021,A,352942509,0,10.0,18847,,,,,
1973114,57599.998220184,A,352943165,0,100.0,18847,,,,,
1973115,57599.998239790,D,352930741,1,0.0,18850,100.0,,,,


In [12]:
# Raw input: column names and the second message row, followed by a blank line.
print(f"Columns: {itch_messages.columns.tolist()}")
print(f"Sample: {itch_messages.values[1]}\n")

itch_tok = itch_encoding.Message_Tokenizer()

print('<< pre processing ITCH dataset >>')
# The tokenizer's preproc consumes the message frame plus the order book.
itch_m_ = itch_tok.preproc(itch_messages, itch_book)

print(f"Shape: {itch_m_.shape}")
# Hand-written labels for the columns of the pre-processed array.
print("Columns:", [
    'id', 'type', 'side', 'price_abs', 'price', 'size',
    'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns',
    # fields describing the referenced order
    'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice',
    'time_s_ref', 'time_ns_ref',
])
print(f"Sample: {itch_m_[0]}")
itch_m_

Columns: ['time', 'type', 'id', 'side', 'size', 'price', 'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice']
Sample: [Decimal('34200.029662346') 'P' 0 0 nan 18875 nan 600.0 nan nan nan]

<< pre processing ITCH dataset >>
truncating 0.0000% of prices > 999
truncating 0.0000% of prices < -999
Shape: (1720463, 17)
Columns: ['id', 'type', 'side', 'price_abs', 'price', 'size', 'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns', 'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice', 'time_s_ref', 'time_ns_ref']
Sample: [9177349 'E' 0 18874 -5 0 0 48177878 34200 59062044 -9999 100 -9999 -9999
 -9999 -9999 -9999]


array([[9177349, 'E', 0, ..., -9999, -9999, -9999],
       [9482317, 'A', 1, ..., -9999, -9999, -9999],
       [9482325, 'A', 1, ..., -9999, -9999, -9999],
       ...,
       [352943165, 'A', 0, ..., -9999, -9999, -9999],
       [352930741, 'D', 1, ..., -9999, 57599, 940363730],
       [352930753, 'D', 1, ..., -9999, 57599, 940388253]], dtype=object)

In [13]:
print(itch_m_.shape)

# Look up the numeric ID of the current ticker in the symbol mapping and put it
# in front of every message as a new first column.
# NOTE: re-running this cell prepends yet another ID column.
ticker_id = d[ticker]
itch_m_ = np.concatenate(
    (np.full(shape=(len(itch_m_), 1), fill_value=ticker_id), itch_m_),
    axis=1,
)
itch_m_


(1720463, 17)


array([[40, 9177349, 'E', ..., -9999, -9999, -9999],
       [40, 9482317, 'A', ..., -9999, -9999, -9999],
       [40, 9482325, 'A', ..., -9999, -9999, -9999],
       ...,
       [40, 352943165, 'A', ..., -9999, -9999, -9999],
       [40, 352930741, 'D', ..., -9999, 57599, 940363730],
       [40, 352930753, 'D', ..., -9999, 57599, 940388253]], dtype=object)

In [None]:
# itch_tok = itch_encoding.Message_Tokenizer()

# # filter out only allowed event types ...
# allowed_event_types=['A','E','C','D','R']
# itch_messages = itch_messages.loc[itch_messages.type.isin(allowed_event_types)].copy()
# # ... and corresponding book changes
# itch_book = itch_book.loc[itch_messages.index]

# # TIME
# # DELTA_T: time since previous order --> 4 tokens of length 3
# itch_messages.insert(
#     loc=1,
#     column='delta_t_ns',
#     value=itch_messages['time'].diff().fillna(0)
# )
# itch_messages.insert(
#     loc=1,
#     column='delta_t_s',
#     value=itch_messages.delta_t_ns.astype(int)
# )
# itch_messages.delta_t_ns = ((itch_messages.delta_t_ns % 1) * 1000000000).astype(int)

# itch_messages.insert(0, 'time_s', itch_messages.time.astype(int))
# itch_messages.rename(columns={'time': 'time_ns'}, inplace=True)
# itch_messages.time_ns = ((itch_messages.time_ns % 1) * 1000000000).astype(int)

# # SIZE
# itch_messages.loc[itch_messages['size'] > 9999, 'size'] = 9999
# itch_messages.loc[itch_messages['oldSize'] > 9999, 'oldSize'] = 9999
# itch_messages['size'] = itch_messages['size'].astype(int)

# # PRICE
# itch_messages['price_abs'] = itch_messages.price  # keep absolute price for later (simulator)
# # mid-price reference, rounded down to nearest tick_size
# tick_size = 1
# p_ref = (((itch_book.iloc[:, 1] * 100) + (itch_book.iloc[:, 3] * 100)) / 2).shift()#.round(-2).astype(int).shift()
# p_ref = (p_ref // tick_size) * tick_size
# # --> 1999 price levels // ...00 since tick size is 100
# itch_messages.price = itch_tok._preproc_prices(itch_messages.price, p_ref, p_lower_trunc=-999, p_upper_trunc=999)
# itch_messages = itch_messages.iloc[1:]
# itch_messages.price = itch_messages.price.astype(int)

# # # DIRECTION
# # itch_messages.direction = ((itch_messages.direction + 1) / 2).astype(int)

# # change column order
# # m = m[['order_id', 'event_type', 'direction', 'price_abs', 'price', 'size',
# #        'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns']]
# itch_messages = itch_messages[['id', 'type', 'side', 'price_abs', 'price', 'size',
#         'delta_t_s', 'delta_t_ns', 'time_s', 'time_ns',
#         'cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice']]

# itch_messages

In [None]:
# # add time elements of original message as feature and process NaNs
# # for all referential order types ('E','C','D','R')
# modif_types={'E','C','D'}
# modif_types_special={'R'}
# modif_fields=['time_s', 'time_ns']
# ref_cols = ['cancSize', 'execSize', 'oldId', 'oldSize', 'oldPrice']
# nan_val=-9999

# # make df that converts 'R' values to 'A' values
# itch_r_messages = itch_messages.copy()
# itch_r_messages['type'] = itch_r_messages['type'].replace('R', 'A')

# m_changes = pd.merge(
#     itch_messages.loc[itch_messages.type.isin(modif_types)].reset_index(),
#     itch_r_messages.loc[itch_r_messages.type == 'A', ['id'] + modif_fields],
#     how='left', on='id', suffixes=['', '_ref']).set_index('index')

# m_changes

# # find modif_fields of A events that match with R event oldId
# m_changes_special = pd.merge(
#     itch_messages.loc[itch_messages.type.isin(modif_types_special)].reset_index(),
#     (itch_r_messages.loc[itch_r_messages.type == 'A', ['id'] + modif_fields]).rename(columns={'id': 'oldId'}),
#     how='left', on='oldId', suffixes=['', '_ref']).set_index('index')

# m_changes_special

# # add new empty columns for referenced order
# modif_cols = [field + '_ref' for field in modif_fields]
# itch_messages[modif_cols] = nan_val

# # replace order changes by original order and additional new fields
# itch_messages.loc[m_changes.index] = m_changes
# itch_messages.loc[m_changes_special.index] = m_changes_special
# itch_messages[modif_cols] = itch_messages[modif_cols].fillna(nan_val).astype(int)

# # process other ref fields
# itch_messages[ref_cols] = itch_messages[ref_cols].fillna(nan_val).astype(int)

# itch_messages

### Book Processing

In [14]:
# Reload the raw ITCH messages and book so that book processing starts from
# unmodified frames (only the last file pair is kept after the loop).
for m_f, b_f in tqdm(zip(itch_message_files, itch_book_files)):
    itch_messages = itch_preproc.load_message_df(m_f)
    itch_book = pd.read_csv(b_f)
    assert len(itch_messages) == len(itch_book)

    print(f"ITCH messages shape: {itch_messages.shape}")
    print(f"ITCH book shape: {itch_book.shape}")

# Restrict the messages to regular trading hours. The two flags are not set in
# this cell; they come from the message-processing cell earlier in the notebook.
if remove_premarket:
    itch_messages = itch_messages.loc[itch_messages['time'] >= 34200000000000]
if remove_aftermarket:
    itch_messages = itch_messages.loc[itch_messages['time'] <= 57600000000000]

# The book has not been filtered yet at this point, so its shape is unchanged.
print(f"ITCH messages shape 2: {itch_messages.shape}")
print(f"ITCH book shape 2: {itch_book.shape}")

# Keep only the allowed order types ...
allowed_events = ['A', 'E', 'C', 'D', 'R']
itch_messages = itch_messages.loc[itch_messages['type'].isin(allowed_events)]
# ... and select the book rows that share an index label with a surviving message.
itch_book = itch_book.loc[itch_messages.index]

print(f"ITCH messages shape 3: {itch_messages.shape}")
print(f"ITCH book shape 3: {itch_book.shape}")

# Drop messages (and their book rows) above the given book level.
filter_above_lvl = 20
itch_messages, itch_book = itch_preproc.filter_by_lvl(
    itch_messages, itch_book, filter_above_lvl
)

print(f"ITCH messages shape 4: {itch_messages.shape}")
print(f"ITCH book shape 4: {itch_book.shape}")

# The book's own time column is not needed for book processing.
itch_book = itch_book.drop(columns='time')

assert len(itch_messages) == len(itch_book)
itch_book

1it [00:04,  4.05s/it]

ITCH messages shape: (2010136, 12)
ITCH book shape: (2010136, 81)
ITCH messages shape 2: (1960770, 12)
ITCH book shape 2: (2010136, 81)





ITCH messages shape 3: (1953937, 12)
ITCH book shape 3: (1953937, 81)
ITCH messages shape 4: (1720464, 12)
ITCH book shape 4: (1720464, 81)


Unnamed: 0,1_bid_price,1_bid_vol,1_ask_price,1_ask_vol,2_bid_price,2_bid_vol,2_ask_price,2_ask_vol,3_bid_price,3_bid_vol,...,18_ask_price,18_ask_vol,19_bid_price,19_bid_vol,19_ask_price,19_ask_vol,20_bid_price,20_bid_vol,20_ask_price,20_ask_vol
12347,188.74,100.0,188.85,117.0,188.58,2.0,188.88,50.0,188.49,31.0,...,189.65,305.0,187.60,100.0,189.66,100.0,187.57,50.0,189.68,100.0
12366,188.58,2.0,188.85,117.0,188.49,31.0,188.88,50.0,188.37,100.0,...,189.65,305.0,187.57,50.0,189.66,100.0,187.53,500.0,189.68,100.0
12368,188.58,2.0,188.82,100.0,188.49,31.0,188.85,117.0,188.37,100.0,...,189.60,100.0,187.57,50.0,189.65,305.0,187.53,500.0,189.66,100.0
12369,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
12376,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973110,188.47,8320.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973113,188.47,8330.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973114,188.47,8430.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973115,188.47,8430.0,188.50,100.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0


In [16]:
# itch_messages
# lobster_book

In [None]:
# # process book
# price_levels = 500 # n_price_series

# # mid-price rounded to nearest tick (100)
# p_ref = ((lobster_book.iloc[:, 0] + lobster_book.iloc[:, 2]) / 2).round(-2).astype(int)
# b_indices = lobster_book.iloc[:, ::2].sub(p_ref, axis=0).div(100).astype(int) # div 100 bc tick
# b_indices = b_indices + price_levels // 2 # make fit between span of 0 to price_levels
# b_indices.columns = list(range(b_indices.shape[1])) # reset col indices
# vol_book = lobster_book.iloc[:, 1::2].copy()
# # convert sell volumes (ask side) to negative
# vol_book.iloc[:, ::2] = vol_book.iloc[:, ::2].mul(-1)
# vol_book.columns = list(range(vol_book.shape[1])) # reset col indices

# # convert to book representation with volume at each price level relative to reference price (mid)
# # whilst preserving empty levels to maintain sparse representation of book
# # i.e. at each time we have a fixed width snapshot around the mid price
# # therefore movement of the mid price needs to be a separate feature (e.g. relative to previous price)

# mybook = np.zeros((len(lobster_book), price_levels), dtype=np.int32)

# a = b_indices.values
# for i in range(a.shape[0]):
#     for j in range(a.shape[1]):
#         price = a[i, j]
#         # remove prices outside of price_levels range
#         if price >= 0 and price < price_levels:
#             mybook[i, price] = vol_book.values[i, j]

# # prepend column with best bid changes (in ticks)
# mid_diff = p_ref.div(100).diff().fillna(0).astype(int).values
# mybook = np.concatenate([mid_diff[:, None], mybook], axis=1)

# mybook

In [17]:
# Display the filtered ITCH book frame that is passed to process_book below.
itch_book

Unnamed: 0,1_bid_price,1_bid_vol,1_ask_price,1_ask_vol,2_bid_price,2_bid_vol,2_ask_price,2_ask_vol,3_bid_price,3_bid_vol,...,18_ask_price,18_ask_vol,19_bid_price,19_bid_vol,19_ask_price,19_ask_vol,20_bid_price,20_bid_vol,20_ask_price,20_ask_vol
12347,188.74,100.0,188.85,117.0,188.58,2.0,188.88,50.0,188.49,31.0,...,189.65,305.0,187.60,100.0,189.66,100.0,187.57,50.0,189.68,100.0
12366,188.58,2.0,188.85,117.0,188.49,31.0,188.88,50.0,188.37,100.0,...,189.65,305.0,187.57,50.0,189.66,100.0,187.53,500.0,189.68,100.0
12368,188.58,2.0,188.82,100.0,188.49,31.0,188.85,117.0,188.37,100.0,...,189.60,100.0,187.57,50.0,189.65,305.0,187.53,500.0,189.66,100.0
12369,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
12376,188.58,2.0,188.75,100.0,188.49,31.0,188.82,100.0,188.37,100.0,...,189.58,900.0,187.57,50.0,189.60,100.0,187.53,500.0,189.65,305.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973110,188.47,8320.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973113,188.47,8330.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973114,188.47,8430.0,188.50,200.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0
1973115,188.47,8430.0,188.50,100.0,188.46,300.0,188.51,400.0,188.45,2200.0,...,188.67,4600.0,188.29,2661.0,188.68,5850.0,188.28,400.0,188.69,156.0


In [None]:
# # process book
# price_levels = 500 # n_price_series

# # mid-price rounded to nearest tick
# p_ref = ((itch_book.iloc[:, 0] + itch_book.iloc[:, 2]) / 2).mul(100).round().astype(int)
# b_indices = itch_book.iloc[:, ::2].mul(100).sub(p_ref, axis=0).astype(int)
# b_indices = b_indices + price_levels // 2 # make tick differences fit between span of 0 to price_levels
# b_indices.columns = list(range(b_indices.shape[1])) # reset col indices
# vol_book = itch_book.iloc[:, 1::2].copy().astype(int)
# # convert sell volumes (ask side) to negative
# vol_book.iloc[:, 1::2] = vol_book.iloc[:, 1::2].mul(-1)
# vol_book.columns = list(range(vol_book.shape[1])) # reset col indices

# # convert to book representation with volume at each price level relative to reference price (mid)
# # whilst preserving empty levels to maintain sparse representation of book
# # i.e. at each time we have a fixed width snapshot around the mid price
# # therefore movement of the mid price needs to be a separate feature (e.g. relative to previous price)

# mybook = np.zeros((len(itch_book), price_levels), dtype=np.int32)

# a = b_indices.values
# for i in range(a.shape[0]):
#     for j in range(a.shape[1]):
#         price = a[i, j]
#         # remove prices outside of price_levels range
#         if price >= 0 and price < price_levels:
#             mybook[i, price] = vol_book.values[i, j]

# # prepend column with best bid changes (in ticks)
# mid_diff = p_ref.diff().fillna(0).astype(int).values
# mybook = np.concatenate([mid_diff[:, None], mybook], axis=1)

# mybook

In [18]:
# Convert the filtered ITCH book frame with the project helper (the manual,
# commented-out version of this step is kept in the cell above for reference).
# NOTE(review): the shape printed in the next cell has price_levels + 1 = 501
# columns -- presumably one mid-price-change column followed by 500 volume
# levels; confirm against itch_preproc.process_book.
mybook = itch_preproc.process_book(itch_book, price_levels=500)
mybook

array([[ 0,  0,  0, ...,  0,  0,  0],
       [-8,  0,  0, ...,  0,  0,  0],
       [-2,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0]])

In [19]:
print(mybook.shape)

# Look up the numeric ID of the current ticker in the symbol mapping and put it
# in front of every book row as a new first column.
# NOTE: re-running this cell prepends yet another ID column.
ticker_id = d[ticker]
mybook = np.concatenate(
    (np.full(shape=(len(mybook), 1), fill_value=ticker_id), mybook),
    axis=1,
)
mybook

(1720464, 501)


array([[40,  0,  0, ...,  0,  0,  0],
       [40, -8,  0, ...,  0,  0,  0],
       [40, -2,  0, ...,  0,  0,  0],
       ...,
       [40,  0,  0, ...,  0,  0,  0],
       [40,  0,  0, ...,  0,  0,  0],
       [40,  1,  0, ...,  0,  0,  0]])

### Dataloading

In [3]:
# Build the LOBSTER dataset object from the pre-processed files on disk.
# NOTE(review): this cell relies on `parent_folder_path` from the import cell
# at the top of the notebook.

# Set params...
# NOTE(review): msg_seq_len is presumably the number of messages per sample
# (the vars() dump in the next cell reports n_messages=500) -- confirm.
msg_seq_len = 500
use_book_data = True # False
# use_simple_book = False
# book_transform = True # False

# folder holding the processed LOBSTER files
data_dir = parent_folder_path + '/dataset/proc/LOBSTER/'
name = 'lobster'

# Only the arguments below are passed explicitly; the commented-out keyword
# arguments are left unset.
dataset_obj = lobster_dataloader.LOBSTER(
    name,
    data_dir=data_dir,
    mask_fn=lobster_dataloader.LOBSTER_Dataset.causal_mask,
    msg_seq_len=msg_seq_len,
    use_book_data=use_book_data,
    # use_simple_book=use_simple_book,
    # book_transform=book_transform,
    # book_depth=book_depth,
    # n_cache_files=1e7,  # large number to keep everything in cache
    # return_raw_msgs=return_raw_msgs,
)
# setup() must run before the dataset attributes (e.g. dataset_train, seed)
# used in later cells are read.
dataset_obj.setup()

n_test_files= 1


In [4]:
# Dump every instance attribute of the dataset object to inspect what setup() populated.
vars(dataset_obj)

{'data_dir': PosixPath('/home/aaron/Documents/Github/MarketSimT/dataset/proc/LOBSTER'),
 'val_split': 0.1,
 'test_split': 0.1,
 'seed': 42,
 'mask_fn': <function equities.data_processing.lobster.lobster_dataloader.LOBSTER_Dataset.causal_mask(seq, rng)>,
 'use_book_data': True,
 'use_simple_book': False,
 'book_transform': False,
 'n_cache_files': 0,
 'book_depth': 500,
 'return_raw_msgs': False,
 'msg_seq_len': 500,
 'dataset_train': <equities.data_processing.lobster.lobster_dataloader.LOBSTER_Dataset at 0x7fd74bf2f500>,
 'dataset_val': None,
 'dataset_test': None,
 'n_messages': 500,
 'train_files': ('/home/aaron/Documents/Github/MarketSimT/dataset/proc/LOBSTER/AAPL_2012-06-21_34200000_57600000_message_10_proc.npy',),
 'rng': <random.Random at 0x74dc070>,
 'train_book_files': ('/home/aaron/Documents/Github/MarketSimT/dataset/proc/LOBSTER/AAPL_2012-06-21_34200000_57600000_orderbook_10_proc.npy',),
 'd_input': 12011,
 'd_output': 12011,
 'L': 11000,
 'L_book': 500,
 'd_book': 501}

In [16]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top so all dependencies are visible in one place.
from equities.data_processing.base import make_data_loader

# batch size
bsz = 16
# use sampler to only get individual samples and automatic batching from dataloader
# The loader wraps the training split and reuses the dataset's own seed.
trn_loader = make_data_loader(
    dataset_obj.dataset_train,
    dataset_obj,
    seed=dataset_obj.seed,
    batch_size=bsz)

trn_loader

<torch.utils.data.dataloader.DataLoader at 0x7fe599c46e40>

In [61]:
# Pull the first batch from the training loader, describe it, then stop.
for batch_idx, batch in enumerate(tqdm(trn_loader)):
    print(f"batch_idx: {batch_idx}")
    print(f"len(batch): {len(batch)}")

    # A batch unpacks into model inputs, prediction targets and a mapping of
    # auxiliary data.
    inputs, targets, aux_data = batch
    book_data = aux_data.get("book_data", None)
    timestep_msg = aux_data.get("timesteps_msg", None)    # noted as None by the author
    timestep_book = aux_data.get("timesteps_book", None)  # noted as None by the author

    # Shapes noted by the author: inputs (bsz, 22*seq_len), targets (bsz, 1),
    # book_data (bsz, seq_len, P + 1).
    print(f"inputs.shape: {inputs.shape}")
    print(f"targets.shape: {targets.shape}")
    print(f"book_data.shape: {book_data.shape}")

    # 22 tokens per message; the author's open question is whether the
    # prediction target is the masked token.
    print(f"inputs: {inputs}")
    print(f"targets: {targets}")

    # Book snapshots ("volume images"): P separate volume features around the
    # mid-price, coupled with one feature representing mid-price changes from
    # the previous observation.
    print(f"book_data: {book_data}")

    break

  0%|          | 0/48 [00:01<?, ?it/s]

batch_idx: 0
len(batch): 3
inputs.shape: (16, 11000)
targets.shape: (16, 1)
book_data.shape: (16, 500, 501)
inputs: [[ 1003 12009 12008 ...     1     1     1]
 [ 1003 12010 12007 ...     1     1     1]
 [ 1005 12010 12007 ...     1     1     1]
 ...
 [ 1003 12009 12008 ...     1     1     1]
 [ 1003 12009 12008 ...     2     2     0]
 [ 1005 12010 12007 ...     1     1     1]]
targets: [[12007]
 [ 1015]
 [ 1024]
 [    2]
 [  602]
 [    2]
 [ 1107]
 [12010]
 [    3]
 [ 1003]
 [12007]
 [    3]
 [12008]
 [   46]
 [    2]
 [12009]]
book_data: [[[-2  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  




In [73]:
# Decode the first message of the first sample in the batch back into its
# field values. `inputs` comes from the batch-inspection cell above.
vocab = lobster_encoding.Vocab()
# bare expression: not displayed, because it is not the last line of the cell
vocab.ENCODING

# The first 22 tokens are taken as one message (the batch cell notes
# "22 tokens per message"); print them one per line.
for i in inputs[0][0:22]:
    print(i)
# bare expression: not displayed either
inputs[0][0:22]
# decode the same 22 tokens using the vocabulary's encoding
print(lobster_encoding.decode_msg(inputs[0][0:22], vocab.ENCODING))

1003
12009
12008
11014
1011
3
3
39
148
55
902
949
59
64
2
2
2
2
2
2
2
2
[    -9999         1         0     -9999         7         4         0
     36145     52899 946056061     -9999     -9999     -9999     -9999]


In [76]:
# Create a fresh LOBSTER tokenizer to inspect its configuration.
lobster_tok = lobster_encoding.Message_Tokenizer()

# bare expression: not displayed, because it is not the last line of the cell
vars(lobster_tok)

# Mapping from field name to field position; rendered as the cell output.
lobster_tok.FIELD_I

{'event_type': 0,
 'direction': 1,
 'price': 2,
 'size': 3,
 'delta_t_s': 4,
 'delta_t_ns': 5,
 'time_s': 6,
 'time_ns': 7,
 'price_ref': 8,
 'size_ref': 9,
 'time_s_ref': 10,
 'time_ns_ref': 11}

In [46]:
# Scratch cell: re-create one training sample by hand from the first processed
# message file. Only the final expression of the cell is displayed; the other
# bare names below are no-ops kept from interactive exploration.

# memory-map the file so that only the requested rows are read
X = np.load(dataset_obj.train_files[0], mmap_mode='r')
# materialise the first 500 rows as a regular in-memory array
# NOTE(review): 500 matches msg_seq_len set in the dataset cell -- presumably intentional.
X_raw = np.array(X[0:500])
print(X_raw.shape)

X_raw

# NOTE: `X` is rebound here from the memory-mapped raw file to the encoded messages.
X = lobster_encoding.encode_msgs(X_raw, vocab.ENCODING)

X

# seq = X_raw.copy()
seq = X.copy()

# take the value at position 11 of the last message as the target ...
# NOTE(review): why index 11 is the position to predict is not visible here -- confirm.
y = seq[-1][11]
y

# ... and overwrite it in the copied sequence with the placeholder -10000
seq[-1][11] = -10000

seq

# flatten the target and the encoded messages to 1-D
# NOTE: it is `X` (unmasked) that gets flattened and displayed, not `seq`.
y = y.reshape(-1)
X = X.reshape(-1)
X

(500, 14)


array([[16113584,        1,        1, ...,    -9999,    -9999,    -9999],
       [16113594,        1,        1, ...,    -9999,    -9999,    -9999],
       [16120456,        1,        0, ...,    -9999,    -9999,    -9999],
       ...,
       [17325121,        1,        0, ...,    -9999,    -9999,    -9999],
       [17325702,        1,        0, ...,    -9999,    -9999,    -9999],
       [17329817,        1,        0, ...,    -9999,    -9999,    -9999]])