In [1]:
import glob
import os
from datetime import datetime
from typing import List

import dask
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import vaex

from src.microprice import get_micro_adjustment
from src.preprocessing import extract_features, symmetrize_data

In [2]:
# Root folder of the raw datasets. Set the FBD_DATA_PATH environment variable to
# point at another copy of the data; when it is unset, the original location is used.
DATA_PATH = os.environ.get('FBD_DATA_PATH', '/Users/mac/Desktop/Repos/FBD_Project/datasets/')

# Sorted lists of the gzipped CSV files under btcusdt/, one list per data type.
# os.path.join works whether or not DATA_PATH ends with a path separator.
orderbook_list = sorted(glob.glob(os.path.join(DATA_PATH, 'btcusdt', 'orderbook', '*.csv.gz')))
quote_list = sorted(glob.glob(os.path.join(DATA_PATH, 'btcusdt', 'quotes', '*.csv.gz')))
trade_list = sorted(glob.glob(os.path.join(DATA_PATH, 'btcusdt', 'trades', '*.csv.gz')))

In [3]:
# process raw data to get features for calculation.
%time
all_features = [extract_features(path) for path in orderbook_list[:5]] 
df_feat = dask.compute(all_features)[0]
df_feat = pd.concat(df_feat)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 69.9 µs


In [9]:
# Build both variants of the feature table, in this order:
#   df_sym - symmetrized data, used to obtain the micro price
#   df_sig - non-symmetrized data, used for the trading signals
df_sym, df_sig = (
    symmetrize_data(df_feat, symmetrize=flag) for flag in (True, False)
)

In [6]:
# Compute the micro-price adjustment table from the symmetrized data.
# As displayed below, the result is keyed by (ba_spread, imbalance) and carries a
# g_star column, which is later added to the mid price to obtain the micro price.
df_micro = get_micro_adjustment(df_sym)

In [10]:
# Inspect the adjustment table: one g_star value per (ba_spread, imbalance) pair.
df_micro

Unnamed: 0_level_0,Unnamed: 1_level_0,g_star
ba_spread,imbalance,Unnamed: 2_level_1
1,1,-0.016683
1,2,-0.004311
1,3,0.004311
1,4,0.016683
2,1,-0.054286
2,2,-0.007619
2,3,0.007619
2,4,0.054286
3,1,-0.061538
3,2,0.010651


In [22]:
# Look up the adjustment (g_star) for every signal row by its
# (ba_spread, imbalance) state; the left join keeps every signal row.
merge_keys = ['ba_spread', 'imbalance']
df_sig = (
    df_sig[['mid_price', *merge_keys]]
    .reset_index()
    .merge(df_micro.reset_index(), on=merge_keys, how='left')
    .set_index('timestamp')
)

# micro price = mid price shifted by the state-dependent adjustment g_star
df_sig = df_sig.assign(micro_price=df_sig['mid_price'] + df_sig['g_star'])

In [27]:
# Show the mid price next to the micro price (mid_price + g_star) for comparison.
df_sig[['mid_price', 'micro_price']]

Unnamed: 0_level_0,mid_price,micro_price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-02 00:00:00,20122.55,20122.533317
2022-09-02 00:00:01,20122.55,20122.533317
2022-09-02 00:00:03,20120.75,20120.733317
2022-09-02 00:00:08,20124.85,20124.866683
2022-09-02 00:00:09,20124.85,20124.854311
...,...,...
2022-09-06 23:59:48,18779.95,18779.933317
2022-09-06 23:59:49,18779.95,18779.933317
2022-09-06 23:59:51,18780.55,18780.566683
2022-09-06 23:59:54,18784.35,18784.354311


To do 
- check bid-ask imbalance against buy and sell volume (pre-trades only)
- eth, btc, ada spread distribution
- upgrade microprice prediction (bitcoin is too volatile)

In [4]:
# NOTE: disabled draft - this whole cell is commented out and does not run.
# It sketches a trades + quotes pipeline: load both, outer-join them on timestamp,
# then derive ba_spread and imbalance and forward-fill them.
# extract_trades / extract_quotes are not imported in the import cell at the top of
# this notebook, so they must be imported before re-enabling it.
# NOTE(review): if re-enabled, `%time` should become `%%time` on the first line of
# the cell, otherwise nothing is timed.
# # process raw data to get features for calculation.
# %time
# trades = [extract_trades(path) for path in trade_list[:5]] 
# quotes = [extract_quotes(path) for path in quote_list[:5]] 

# df_trade = dask.compute(trades)[0]
# df_trade = pd.concat(df_trade)

# df_quote = dask.compute(quotes)[0]
# df_quote = pd.concat(df_quote)

# # join trade and quotes
# df_tq = pd.merge(df_quote.reset_index(), df_trade.reset_index(),how='outer', on='timestamp')
# df_tq = df_tq.set_index('timestamp').sort_index()

# df_tq['ba_spread'] = df_tq['ask_price'] - df_tq['bid_price']
# df_tq['imbalance'] = df_tq['bid_amount']/(df_tq['bid_amount'] + df_tq['ask_amount'])
# df_tq[['ba_spread', 'imbalance']] = df_tq[['ba_spread', 'imbalance']].ffill()