In [206]:
import glob
import vaex
import dask
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt

from datetime import datetime 

In [2]:
PROJECT_PATH = 'D:/work/personal/FBD_Project/'
orderbook_list = sorted(glob.glob(PROJECT_PATH + 'datasets/btcusdt/orderbooks/*.csv.gz'))
quote_list = sorted(glob.glob(PROJECT_PATH + 'datasets/btcusdt/quotes/*.csv.gz'))

In [195]:
@dask.delayed
def extract_features(orderbook_file_path:str) -> pd.DataFrame:
    df = pd.read_csv(orderbook_file_path)[['timestamp', 'asks[0].price', 'bids[0].price', 'asks[0].amount', 'bids[0].amount']]
                           
   # calculate mid price and bidask spread
    df['mid_price'] = (df['asks[0].price'] + df['bids[0].price'])/2
    df['ba_spread'] = np.round((df['asks[0].price'] - df['bids[0].price'])/2,2)
    df['imbalance'] = df['bids[0].amount']/(df['bids[0].amount'] + df['asks[0].amount'])
    df['timestamp'] = pd.to_datetime(df['timestamp']/1000, unit='ms')

    # convert timestamp to datetime format
    df = df[['timestamp','mid_price', 'ba_spread', 'imbalance']].set_index('timestamp')
    
    # resample by 1second frequency
    df = df.resample('1s').last().ffill()
    return df

def symmetrize_data(df_feature: pd.DataFrame, tick_size:float=0.05, numImbalance:int=10) -> pd.DataFrame:
    df_signal = df_feature.copy(deep=True)
    
    # discretize bidask spread then get next time's bidask spread
    # discretize imbalance and get next imbalance
    # cap spread values that goes over 0.2 as 0.25 (group 5 means spread is over 0.2)
    df_signal = df_signal[df_signal.ba_spread <= 0.2]
    df_signal['ba_spread'] = np.round(df_signal['ba_spread'].div(tick_size)).astype(int)
    df_signal['imbalance'] = pd.cut(df_feature['imbalance'], bins=np.arange(numImbalance+1)/numImbalance, labels=np.arange(1,numImbalance+1)).astype(int)

    # state variable x = (i,s) and next x
    df_signal['x_now'] = list(zip(df_signal['imbalance'],df_signal['ba_spread']))
    df_signal['x_next'] = df_signal['x_now'].shift(-1)

    # calculate change in mid price
    # include data that bidask spread is within 0.2, same goes for
    # mid price change
    df_signal['mid_chg'] = np.round(df_signal['mid_price'].diff(),2).shift(-1,)
    df_signal = df_signal[abs(df_signal.mid_chg) <= 0.1]

    # drop na rows then change dtype
    df_signal = df_signal.dropna()

    # make symmetric data
    df_symmetric = df_signal.copy(deep=True)
    df_symmetric['imbalance'] = numImbalance - df_signal['imbalance'] + 1
    df_symmetric['x_now'] = list(zip(df_symmetric['imbalance'], df_symmetric['ba_spread']))
    df_symmetric['x_next'] = df_symmetric['x_now'].shift(-1)
    df_symmetric['mid_chg'] = -df_signal['mid_chg']

    df = pd.concat([
            df_signal[['x_now', 'x_next','mid_chg']], 
            df_symmetric[['x_now', 'x_next','mid_chg']]
    ])
    return df.dropna()

In [None]:
def prep_data_sym(T,n_imb,dt,n_spread):
    spread=T.ask-T.bid
    ticksize=np.round(min(spread.loc[spread>0])*100)/100
    T.spread=T.ask-T.bid
    # adds the spread and mid prices
    T['spread']=np.round((T['ask']-T['bid'])/ticksize)*ticksize
    T['mid']=(T['bid']+T['ask'])/2
    #filter out spreads >= n_spread
    T = T.loc[(T.spread <= n_spread*ticksize) & (T.spread>0)]
    T['imb']=T['bs']/(T['bs']+T['as'])
    #discretize imbalance into percentiles
    T['imb_bucket'] = pd.qcut(T['imb'], n_imb, labels=False)
    T['next_mid']=T['mid'].shift(-dt)
    #step ahead state variables
    T['next_spread']=T['spread'].shift(-dt)
    T['next_time']=T['time'].shift(-dt)
    T['next_imb_bucket']=T['imb_bucket'].shift(-dt)
    # step ahead change in price
    T['dM']=np.round((T['next_mid']-T['mid'])/ticksize*2)*ticksize/2
    T = T.loc[(T.dM <= ticksize*1.1) & (T.dM>=-ticksize*1.1)]
    # symetrize data
    T2 = T.copy(deep=True)
    T2['imb_bucket']=n_imb-1-T2['imb_bucket']
    T2['next_imb_bucket']=n_imb-1-T2['next_imb_bucket']
    T2['dM']=-T2['dM']
    T2['mid']=-T2['mid']
    T3=pd.concat([T,T2])
    T3.index = pd.RangeIndex(len(T3.index)) 
    return T3,ticksize

In [184]:
%time
all_features = [extract_features(path) for path in orderbook_list[:20]] 

df = dask.compute(all_features)[0]
df = pd.concat(df)

CPU times: total: 0 ns
Wall time: 0 ns


imbalance 

$ I_t = \Sigma_{j=1}^{n} j \mathbf{1}_{\frac{j-1}{n} < I < \frac{j}{n}} $

imbalance 값을 discretize 했다고 보면 됨. 10등분 한다면 $n=10$ 이 되는셈. 많이 쪼갤수록 좋을듯 

bid-ask spread $S_t$ 는 tick 단위로 count 함. 만약 tick 단위가 0.05 이고 bid-ask spread 가 0.1 이면 $S_t =2$

Discretized Markov process 사용. state space 아래와 같이 정의

$X_t = (I_t, S_t)$

In [205]:
df_sig = symmetrize_data(df, numImbalance=4)

In [209]:
unique_x = list(itertools.product(np.arange(1,4+1),np.arange(1,4+1)))
K = np.array([-0.1, -0.05, 0, 0.05, 0.1])

# Q_xy: transition prob matrix for cases dM = 0
Q_xy = df_sig[df_sig.mid_chg == 0].groupby(['x_now', 'x_next']).count().unstack().fillna(0)
Q_xy = Q_xy/Q_xy.sum(axis=1).values.reshape(-1,1)
Q_xy.columns = Q_xy.columns.droplevel(0)

# T_xy: transition prob matrix for cases dM != 0
T_xy = df_sig[df_sig.mid_chg != 0].groupby(['x_now', 'x_next']).count().unstack().fillna(0)
T_xy = T_xy/T_xy.sum(axis=1).values.reshape(-1,1)
T_xy.columns = T_xy.columns.droplevel(0)

# R_xk: transient state matrix 
R_xk = df_sig.groupby(['x_now', 'mid_chg']).count().unstack().fillna(0)
R_xk = R_xk/R_xk.sum(axis=1).values.reshape(-1,1)

# ensure Q and T have shape mn x mn
Q = pd.DataFrame(0, index=unique_x, columns=unique_x, dtype=float)
T = Q.copy(deep=True)

Q.loc[Q_xy.columns,Q_xy.columns] = Q_xy
T.loc[T_xy.columns,T_xy.columns] = T_xy

Q, T = Q.fillna(0), T.fillna(0)

In [213]:
Q_xy

x_next,"(1, 1)","(1, 2)","(1, 3)","(1, 4)","(2, 1)","(2, 2)","(2, 3)","(2, 4)","(3, 1)","(3, 2)","(3, 3)","(3, 4)","(4, 1)","(4, 2)","(4, 3)","(4, 4)"
x_now,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
"(1, 1)",0.664324,3.3e-05,3.2e-05,2e-06,0.161844,1.5e-05,8e-06,2e-06,0.08551,2.4e-05,1.2e-05,0.0,0.088124,4.4e-05,2.3e-05,5e-06
"(1, 2)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(1, 3)",0.272727,0.0,0.090909,0.0,0.181818,0.0,0.0,0.0,0.272727,0.0,0.0,0.0,0.181818,0.0,0.0,0.0
"(2, 1)",0.271321,5.7e-05,1.5e-05,2e-06,0.415652,1.5e-05,1.2e-05,0.0,0.177168,2e-05,1e-05,0.0,0.13565,5.2e-05,2.7e-05,0.0
"(2, 3)",0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
"(3, 1)",0.135808,3e-05,2e-05,2e-06,0.176939,7e-06,1e-05,0.0,0.415512,1.7e-05,1.5e-05,0.0,0.271567,4.9e-05,2.2e-05,2e-06
"(3, 3)",0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
"(4, 1)",0.087649,5.5e-05,2.1e-05,0.0,0.084905,1.2e-05,6e-06,2e-06,0.161468,2.3e-05,9e-06,2e-06,0.665768,4.8e-05,2.6e-05,8e-06
"(4, 2)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4, 3)",0.454545,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.272727,0.0,0.0,0.0,0.181818,0.0,0.0,0.0


In [212]:
np.linalg.inv(Q_xy)

LinAlgError: Last 2 dimensions of the array must be square

In [211]:
g_1   = np.linalg.inv(1-Q) @ R_xk @ K 
B_mat = np.linalg.inv(1-Q) @ T_xy

LinAlgError: Singular matrix

In [153]:
R_xk

Unnamed: 0_level_0,x_next,x_next,x_next,x_next,x_next,x_next,x_next,x_next,x_next
mid_chg,-0.20,-0.15,-0.10,-0.05,0.00,0.05,0.10,0.15,0.20
x_now,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
"(1, 1)",0.019769,4.2e-05,0.04288,4.2e-05,0.930508,3.7e-05,0.004409,1.8e-05,0.002295
"(1, 2)",0.0,0.239766,0.005848,0.432749,0.0,0.25731,0.0,0.064327,0.0
"(1, 3)",0.197368,0.0,0.473684,0.0,0.078947,0.0,0.210526,0.0,0.039474
"(1, 4)",0.0,0.28,0.0,0.04,0.0,0.16,0.0,0.52,0.0
"(2, 1)",0.005777,8e-06,0.013597,4.2e-05,0.969443,5.5e-05,0.007783,2.1e-05,0.003274
"(2, 2)",0.0,0.193548,0.0,0.403226,0.0,0.306452,0.016129,0.080645,0.0
"(2, 3)",0.12,0.0,0.2,0.0,0.08,0.04,0.48,0.0,0.08
"(2, 4)",0.0,0.454545,0.0,0.0,0.0,0.181818,0.0,0.363636,0.0
"(3, 1)",0.003274,2.1e-05,0.007783,5.5e-05,0.969443,4.2e-05,0.013597,8e-06,0.005777
"(3, 2)",0.0,0.080645,0.016129,0.306452,0.0,0.403226,0.0,0.193548,0.0


In [111]:
micro_adjustments = []
for i in range(1,20):
    micro_adjustments.append(np.linalg.matrix_power(B_mat, i) @  g_1)

In [114]:
g_1 + np.linalg.matrix_power(B_mat, 20) @  g_1

0   -1.868313e+82
1    1.034851e+88
2   -2.486767e+82
3   -2.466386e+87
4    2.462531e+82
5    2.466392e+87
6    1.844079e+82
7   -1.034851e+88
dtype: float64

In [113]:
np.array(micro_adjustments)

array([[ 3.16368358e+04, -1.72311355e+10,  4.14447507e+04,
         4.08741689e+09, -4.05255976e+04, -4.08745404e+09,
        -3.07177947e+04,  1.72311713e+10],
       [-4.10624156e+08,  2.13876552e+14, -5.38276676e+08,
        -5.09670748e+13,  4.84465666e+08,  5.09675451e+13,
         3.56813846e+08, -2.13876941e+14],
       [ 4.97423420e+12, -2.65317081e+18,  6.55977948e+12,
         6.32334519e+17, -6.12928023e+12, -6.32336823e+17,
        -4.54373353e+12,  2.65317247e+18],
       [-5.89935662e+16,  3.29124631e+22, -7.86628395e+16,
        -7.84412021e+21,  7.87449224e+16,  7.84411827e+21,
         5.90756015e+16, -3.29124613e+22],
       [ 7.18506699e+20, -4.08277260e+26,  9.62503462e+20,
         9.73058749e+25, -9.90131368e+20, -9.73058782e+25,
        -7.46134576e+20,  4.08277305e+26],
       [-9.01968982e+24,  5.06465570e+30, -1.20464599e+25,
        -1.20707267e+30,  1.21758675e+25,  1.20707540e+30,
         9.14910431e+24, -5.06465863e+30],
       [ 1.13862860e+29, -6.282676