In [4]:
import os
import glob
import dask

import numpy as np
import pandas as pd

from typing import List
from datetime import datetime
from src.preprocessing import extract_features, symmetrize_data
from src.microprice import get_micro_adjustment

In [6]:
def get_file_date(path: str) -> str:
    """Return the second-to-last ``_``-separated token of the file's base name.

    Only the final path component is inspected; directory names are ignored.
    The callers in this notebook treat the returned token as the file's date
    (presumably the file names end in ``..._<date>_<suffix>`` -- confirm
    against the raw data naming scheme).

    Args:
        path: Path to a data file.

    Returns:
        The token located just before the last underscore-separated token.

    Raises:
        IndexError: If the base name contains no underscore.
    """
    base_name = os.path.basename(path)
    # Splitting from the right at most twice is enough to isolate the
    # second-to-last token.
    tokens = base_name.rsplit("_", 2)
    return tokens[-2]

In [12]:
# Configuration: the asset pair to process and the raw / processed data roots.
#
# The data roots default to the original local checkout, but can be overridden
# with the FBD_RAW_DATA_PATH / FBD_PRC_DATA_PATH environment variables so the
# notebook can run on another machine without editing this cell.
ASSET = 'ethusdt'
RAW_DATA_PATH = os.environ.get(
    'FBD_RAW_DATA_PATH',
    '/Users/mac/Desktop/Repos/FBD_Project/datasets/raw/',
)
PRC_DATA_PATH = os.environ.get(
    'FBD_PRC_DATA_PATH',
    '/Users/mac/Desktop/Repos/FBD_Project/datasets/processed/',
)

# File paths are built from these roots by plain string concatenation, so make
# sure each root ends with a path separator. os.path.join(p, '') appends one
# only when it is missing, leaving the defaults unchanged.
RAW_DATA_PATH = os.path.join(RAW_DATA_PATH, '')
PRC_DATA_PATH = os.path.join(PRC_DATA_PATH, '')

# Order book files for the selected asset, sorted by path so that consecutive
# list entries can be used as a rolling window.
orderbook_list = sorted(glob.glob(RAW_DATA_PATH + ASSET + '/orderbook/*.csv.gz'))
# quote_list = sorted(glob.glob(RAW_DATA_PATH + ASSET + '/quotes/*.csv.gz'))
# trade_list = sorted(glob.glob(RAW_DATA_PATH + ASSET + '/trades/*.csv.gz'))

In [13]:
win_len = 5

# Rolling estimation of the micro-price adjustment.
#
# For every file index i >= win_len, the win_len files immediately preceding
# file i are processed, and the resulting adjustment table is saved under the
# date parsed from file i's name.
out_dir = PRC_DATA_PATH + f'microprice/{ASSET}/'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)

n_files = len(orderbook_list)
print(f'Start microprice calculation. (Asset {ASSET})')
if n_files <= win_len:
    # Without this message an empty glob or a too-short history is a silent no-op.
    print(f'Nothing to process: found {n_files} order book file(s), need more than {win_len}.')

for i in range(win_len, n_files):
    next_date = get_file_date(orderbook_list[i])
    # The progress counter is offset by win_len (not a literal 5) so it stays
    # correct when the window length is changed.
    print(f'Processing... (date: {next_date}, progress: {i - win_len}/{n_files - 1 - win_len})')

    # extract raw data for the win_len files preceding file i
    all_features = [extract_features(path) for path in orderbook_list[i - win_len:i]]
    # dask.compute returns a tuple with one entry per positional argument;
    # entry 0 is the evaluated list of per-file results.
    df_feat = pd.concat(dask.compute(all_features)[0])

    # symmetrized data (for obtaining microprice)
    df_sym = symmetrize_data(df_feat, symmetrize=True)

    # get micro adjustment and save micro adjustment matrix.
    df_micro = get_micro_adjustment(df_sym)
    df_micro.to_csv(out_dir + f'micro_adjustment_{next_date}.csv')

Start microprice calculation. (Asset ethusdt)
Processing... (date: 2022-09-07, progress: 0/114)
Processing... (date: 2022-09-08, progress: 1/114)
Processing... (date: 2022-09-09, progress: 2/114)
Processing... (date: 2022-09-10, progress: 3/114)
Processing... (date: 2022-09-11, progress: 4/114)
Processing... (date: 2022-09-12, progress: 5/114)
Processing... (date: 2022-09-13, progress: 6/114)
Processing... (date: 2022-09-14, progress: 7/114)
Processing... (date: 2022-09-15, progress: 8/114)
Processing... (date: 2022-09-16, progress: 9/114)
Processing... (date: 2022-09-17, progress: 10/114)
Processing... (date: 2022-09-18, progress: 11/114)
Processing... (date: 2022-09-19, progress: 12/114)
Processing... (date: 2022-09-20, progress: 13/114)
Processing... (date: 2022-09-21, progress: 14/114)
Processing... (date: 2022-09-22, progress: 15/114)
Processing... (date: 2022-09-23, progress: 16/114)
Processing... (date: 2022-09-24, progress: 17/114)
Processing... (date: 2022-09-25, progress: 18/

In [9]:
# Build two views of the same feature frame in one pass:
#   df_sym -- symmetrize=True, used for obtaining the microprice
#   df_sig -- symmetrize=False, used for trading signals
# NOTE(review): `df_feat` is not defined in this cell; it is presumably the
# frame left over from the last window of the processing loop -- confirm.
df_sym, df_sig = [
    symmetrize_data(df_feat, symmetrize=flag) for flag in (True, False)
]

In [22]:
# Attach the micro-adjustment table to the signal frame by matching on the
# (ba_spread, imbalance) pair. A left join keeps every row of df_sig; rows
# with no matching pair in df_micro get missing values in the joined columns.
join_keys = ['ba_spread', 'imbalance']
df_sig = (
    df_sig[['mid_price', *join_keys]]
    .reset_index()
    .merge(df_micro.reset_index(), how='left', on=join_keys)
    .set_index('timestamp')
)

# micro price = mid price + adjustment (g_star)
df_sig['micro_price'] = df_sig['mid_price'].add(df_sig['g_star'])

In [27]:
# Show the mid price next to the derived micro price.
df_sig.loc[:, ['mid_price', 'micro_price']]

Unnamed: 0_level_0,mid_price,micro_price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-02 00:00:00,20122.55,20122.533317
2022-09-02 00:00:01,20122.55,20122.533317
2022-09-02 00:00:03,20120.75,20120.733317
2022-09-02 00:00:08,20124.85,20124.866683
2022-09-02 00:00:09,20124.85,20124.854311
...,...,...
2022-09-06 23:59:48,18779.95,18779.933317
2022-09-06 23:59:49,18779.95,18779.933317
2022-09-06 23:59:51,18780.55,18780.566683
2022-09-06 23:59:54,18784.35,18784.354311
