In [1]:
import glob
import vaex
import dask
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime 

In [2]:
PROJECT_PATH = 'D:/work/personal/FBD_Project/'
orderbook_list = sorted(glob.glob(PROJECT_PATH + 'datasets/btcusdt/orderbooks/*.csv.gz'))
quote_list = sorted(glob.glob(PROJECT_PATH + 'datasets/btcusdt/quotes/*.csv.gz'))

In [3]:
@dask.delayed
def extract_features(orderbook_file_path:str) -> pd.DataFrame:
    df = pd.read_csv(orderbook_file_path)[['timestamp', 'asks[0].price', 'bids[0].price', 'asks[0].amount', 'bids[0].amount']]
                           
   # calculate mid price and bidask spread
    df['mid_price'] = (df['asks[0].price'] + df['bids[0].price'])/2
    df['ba_spread'] = np.round((df['asks[0].price'] - df['bids[0].price'])/2,2)
    df['imbalance'] = df['bids[0].amount']/(df['bids[0].amount'] + df['asks[0].amount'])
    df['timestamp'] = pd.to_datetime(df['timestamp']/1000, unit='ms')

    # convert timestamp to datetime format
    df = df[['timestamp','mid_price', 'ba_spread', 'imbalance']].set_index('timestamp')
    
    # resample by 1second frequency
    df = df.resample('1s').last().ffill()
    return df

In [121]:
%time
all_features = [extract_features(path) for path in orderbook_list[:5]] 

df = dask.compute(all_features)[0]
df = pd.concat(df)

CPU times: total: 0 ns
Wall time: 0 ns


imbalance 

$ I_t = \Sigma_{j=1}^{n} j \mathbf{1}_{\frac{j-1}{n} < I < \frac{j}{n}} $

imbalance 값을 discretize 했다고 보면 됨. 10등분 한다면 $n=10$ 이 되는셈. 많이 쪼갤수록 좋을듯 

bid-ask spread $S_t$ 는 tick 단위로 count 함. 만약 tick 단위가 0.05 이고 bid-ask spread 가 0.1 이면 $S_t =2$

Discretized Markov process 사용. state space 아래와 같이 정의

$X_t = (I_t, S_t)$

In [115]:
def symmetrize_data(df_feature: pd.DataFrame, tick_size:float=0.05, numParition:int=10) -> pd.DataFrame:
    df_signal = df_feature.copy(deep=True)

    # discretize bidask spread then get next time's bidask spread
    # discretize imbalance and get next imbalance
    # cap spread values that goes over 0.2 as 0.25 (group 5 means spread is over 0.2)
    df_signal.loc[df_signal.ba_spread > 0.2,'ba_spread'] = 0.25
    df_signal['ba_spread'] = np.round(df_signal['ba_spread'].div(tick_size)).astype(int)
    df_signal['imbalance'] = pd.cut(df_feature['imbalance'], bins=np.arange(numParition+1)/numParition, labels=np.arange(1,numParition+1)).astype(int)
    
    # state variable x = (i,s) and next x
    df_signal['x_now'] = list(zip(df_signal['imbalance'],df_signal['ba_spread']))
    df_signal['x_next'] = df_signal['x_now'].shift(-1)
    
    # calculate change in mid price
    df_signal['mid_chg'] = np.round(df_signal['mid_price'].diff(),2).shift(-1,)
    
    # drop na rows then change dtype
    df_signal = df_signal.dropna()
    
    # make symmetric data
    df_symmetric = df_signal.copy(deep=True)
    df_symmetric['imbalance'] = numParition - df_signal['imbalance'] + 1
    df_symmetric['x_now'] = list(zip(df_symmetric['imbalance'], df_symmetric['ba_spread']))
    df_symmetric['x_next'] = df_symmetric['x_now'].shift(-1)
    df_symmetric['mid_chg'] = -df_signal['mid_chg']
    
    df = pd.concat([
            df_signal[['x_now', 'x_next','mid_chg']], 
            df_symmetric[['x_now', 'x_next','mid_chg']]
    ])
    return df.dropna()

In [126]:
df_sig = symmetrize_data(df, numParition=4)

In [127]:
K_min = df_sig.loc[df_sig.mid_chg < -0.2, 'mid_chg'].mean() 
K_max = df_sig.loc[df_sig.mid_chg > 0.2, 'mid_chg'].mean() 
K = np.array([K_min, -0.2, -0.15, -0.1, -0.05, 0, 0.05, 0.1, 0.15, 0.2, K_max])

df_sig.loc[df_sig.mid_chg < -0.2, 'mid_chg'] = -0.25
df_sig.loc[df_sig.mid_chg > 0.2, 'mid_chg'] = 0.25

In [128]:
# Q_xy: transition prob matrix for cases dM = 0
Q_xy = df_sig[df_sig.mid_chg == 0].groupby(['x_now', 'x_next']).count().unstack().fillna(0)
Q_xy = Q_xy/Q_xy.sum(axis=1).values.reshape(-1,1)

# T_xy: transition prob matrix for cases dM != 0
T_xy = df_sig[df_sig.mid_chg != 0].groupby(['x_now', 'x_next']).count().unstack().fillna(0)
T_xy = T_xy/T_xy.sum(axis=1).values.reshape(-1,1)

# R_xk: transient state matrix 
R_xk = df_sig.groupby(['x_now', 'mid_chg']).count().unstack().fillna(0)
R_xk = R_xk/R_xk.sum(axis=1).values.reshape(-1,1)

print(Q_xy.shape, T_xy.shape, R_xk.shape)

(12, 10) (20, 20) (20, 11)


In [132]:
Q = T_xy.copy()
Q.iloc[:,:] = 0
Q.loc[Q_xy.index, Q_xy.columns] = Q_xy

In [135]:
np.linalg.inv(1-Q)

LinAlgError: Singular matrix

In [119]:
np.linalg.inv(1 - Q)

LinAlgError: Singular matrix

In [8]:
q_mat = df_sig[df_sig.mid_chg != 0]
pd.crosstab(q_mat.x_now, q_mat.x_next)

KeyboardInterrupt: 

In [21]:
q_mat.x_now.to_dict()

{Timestamp('2022-09-03 00:00:02'): [1.0, 1.0],
 Timestamp('2022-09-03 00:00:03'): [3.0, 1.0],
 Timestamp('2022-09-03 00:00:04'): [1.0, 1.0],
 Timestamp('2022-09-03 00:00:05'): [1.0, 1.0],
 Timestamp('2022-09-03 00:00:09'): [10.0, 1.0],
 Timestamp('2022-09-03 00:00:10'): [10.0, 1.0],
 Timestamp('2022-09-03 00:00:20'): [9.0, 1.0],
 Timestamp('2022-09-03 00:00:27'): [2.0, 1.0],
 Timestamp('2022-09-03 00:00:30'): [1.0, 1.0],
 Timestamp('2022-09-03 00:00:35'): [1.0, 1.0],
 Timestamp('2022-09-03 00:00:42'): [10.0, 1.0],
 Timestamp('2022-09-03 00:00:50'): [2.0, 1.0],
 Timestamp('2022-09-03 00:00:58'): [5.0, 1.0],
 Timestamp('2022-09-03 00:00:59'): [2.0, 1.0],
 Timestamp('2022-09-03 00:01:03'): [1.0, 1.0],
 Timestamp('2022-09-03 00:01:04'): [1.0, 2.0],
 Timestamp('2022-09-03 00:01:05'): [1.0, 1.0],
 Timestamp('2022-09-03 00:01:07'): [8.0, 1.0],
 Timestamp('2022-09-03 00:01:08'): [10.0, 1.0],
 Timestamp('2022-09-03 00:01:09'): [8.0, 1.0],
 Timestamp('2022-09-03 00:01:10'): [10.0, 1.0],
 Timesta