### 

In [167]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import dask
import statsmodels.formula.api as sm


# Data Preprocessing

In [97]:
DATA_PATH = '/Users/mac/Desktop/Repos/FBD_Project/datasets/'
DATA_PATH = '/Users/zakarysouid/Downloads/'
orderbook_list = sorted(glob.glob(DATA_PATH + 'btcusdt/orderbook/*.csv.gz'))
quote_list = sorted(glob.glob(DATA_PATH + 'btcusdt/quotes/*.csv.gz'))
trade_list = sorted(glob.glob(DATA_PATH + 'btcusdt/trades/*.csv.gz'))

In [119]:
def discretize(
    df_feature: pd.DataFrame,
    numSpreads: int = 4,
    numImbalance: int = 4,
    numdM: int = 2,):
    df_signal = df_feature.copy(deep=True)
    #df_signal['ba_spread']=df_signal['ba_spread'].where(df_signal['side']>=0.1,0.1)
    tick_size = df_signal.ba_spread[df_signal.ba_spread != 0].min()
    # discretize bidask spread then get next time's bidask spread
    # discretize imbalance and get next imbalance
    df_signal = df_signal[df_signal.ba_spread <= numSpreads * tick_size]
    df_signal["ba_spread"] = np.round(df_signal["ba_spread"].div(tick_size)).astype(int)
    df_signal["imbalance"] = pd.cut(
        df_feature["imbalance"],
        bins=np.arange(numImbalance+1) / numImbalance,
        labels=np.arange(1, numImbalance+1),
        include_lowest=True
    )
    return df_signal

def signal(df : pd.DataFrame):
    df['signal'] = np.where(df['g_star'] > 0, 1, -1)
    #df['sum_signal'] = np.where(df['cum_g_star'] > 1, 1, -1)
    return df

# output a boolean if the 'mid_price_change' is greater than 0
def mid_price_change(df : pd.DataFrame):
    df['mid_price_change'] = np.where(df['mid_price'].diff() == 0, 0, 1)
    return df

def number_price_change(df : pd.DataFrame):
    df['number_price_change'] = df['mid_price_change'].cumsum()
    return df

def bid_ask_rebuild(df : pd.DataFrame):
    df['bid_price'] = df['mid_price'] - df['ba_spread']*0.05
    df['ask_price'] = df['mid_price'] + df['ba_spread']*0.05
    return df

def smth_is_happening(df : pd.DataFrame) :
    #df2=pd.DataFrame()
    #df2['smth_is_happening'] = np.where(df['cum_g_star']>= (df['ba_spread']*0.05))
    df2=df.copy(deep=True)
    df2=df2.loc[df2['cum_g_star']>= (df['ba_spread']*0.05)]
    return df2

def ratio(df : pd.DataFrame):
    df['ratio'] = df['g_star']/(df['ba_spread']*0.05)
    return df

def ratio_sum(df : pd.DataFrame):
    df['ratio_sum'] = abs(df['cum_g_star'])/(df['ba_spread']*0.05)
    return df

def time_since_price_change(df : pd.DataFrame):
    df['time_since_price_change'] = df.groupby('number_price_change').cumcount()
    return df

def imbalance(df : pd.DataFrame, horizon : int = 10,memory : bool = False):
    df['bid_amount']= df['amount'].where(df['side']==1,0)
    df['volume']=df['amount'].rolling(min_periods=1, window=horizon).sum()
    df['imbalance']=df['bid_amount'].rolling(min_periods=1,window=horizon).sum()/df['volume']
    df.drop(['bid_amount','signed_amount','volume'], axis=1, inplace=True)
    return df


def s_profit(df : pd.DataFrame):
    df['s_profit'] = (df['next_ask_price']- df['bid_price']).where(df['side']==1,df['next_bid_price'] - df['ask_price'])
    return df

In [120]:
# process raw data to get features for calculation.
%time
#all_features = [extract_features(path) for path in quote_list[:5]] 
df_trades_or = pd.concat((pd.read_csv(f) for f in trade_list[:1]))
print(len(trade_list))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.11 µs
120


In [140]:
df_trades = df_trades_or.copy()
df_trades['timestamp'] = pd.to_datetime(df_trades['timestamp'] / 1000, unit='ms')
df_trades.drop(['id', 'exchange','local_timestamp'], axis=1, inplace=True)
df_trades.replace({'side': {'buy': 1, 'sell': -1}}, inplace=True)

df_trades['bid_price']=df_trades['price'].where(df_trades['side']==1, np.nan)
df_trades['ask_price']=df_trades['price'].where(df_trades['side']==-1, df_trades['bid_price']-0.1)
df_trades['bid_price'].fillna(df_trades['ask_price']+0.1, inplace=True)
df_trades['mid_price']=(df_trades['bid_price']+df_trades['ask_price'])/2


df_trades = mid_price_change(df_trades)
df_trades['next_bid_price']=df_trades['price'].where( (df_trades['side']==1) & (df_trades['mid_price_change']==1), np.nan)
df_trades['next_bid_price'].fillna(method='bfill', inplace=True)
df_trades['next_bid_price'].fillna(method='ffill', inplace=True) #get rid of nan at the end

df_trades['next_ask_price']=df_trades['price'].where((df_trades['side']==-1) & (df_trades['mid_price_change']==1), np.nan)
df_trades['next_ask_price'].fillna(method='bfill', inplace=True)
df_trades['next_ask_price'].fillna(method='ffill', inplace=True) #get rid of nan at the end


df_trades['ba_spread']=0.1

df_trades = s_profit(df_trades)



In [141]:
df_trades[00:100]

Unnamed: 0,symbol,timestamp,side,price,amount,bid_price,ask_price,mid_price,mid_price_change,next_bid_price,next_ask_price,ba_spread,s_profit
0,BTCUSDT,2022-09-02 00:00:00.038000128,1,20122.6,0.001,20122.6,20122.5,20122.55,1,20122.6,20122.4,0.1,-0.2
1,BTCUSDT,2022-09-02 00:00:00.078000128,1,20122.6,0.016,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-0.2
2,BTCUSDT,2022-09-02 00:00:03.544999936,-1,20122.5,0.857,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-1.1
3,BTCUSDT,2022-09-02 00:00:03.544999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-1.1
4,BTCUSDT,2022-09-02 00:00:03.544999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,BTCUSDT,2022-09-02 00:00:03.676000000,1,20121.8,0.600,20121.8,20121.7,20121.75,0,20122.0,20121.6,0.1,-0.2
96,BTCUSDT,2022-09-02 00:00:03.676000000,1,20122.0,0.400,20122.0,20121.9,20121.95,1,20122.0,20121.6,0.1,-0.4
97,BTCUSDT,2022-09-02 00:00:03.676000000,1,20122.0,0.002,20122.0,20121.9,20121.95,0,20122.0,20121.6,0.1,-0.4
98,BTCUSDT,2022-09-02 00:00:03.678000128,1,20122.0,0.088,20122.0,20121.9,20121.95,0,20122.0,20121.6,0.1,-0.4


In [142]:
#rolling imbalances
#df_trades = mid_price_change(df_trades)
df_trades = number_price_change(df_trades)
df_trades = time_since_price_change(df_trades)

df_trades['signed_amount'] = df_trades['amount'] * df_trades['side']
#df_trades['cum_size']= df_trades.groupby('number_price_change')['signed_amount'].cumsum()
df_trades = imbalance(df_trades, horizon=10, memory=True)
df_trades.drop(index=df_trades.index[:50], axis=0, inplace=True)
df_trades.drop(['number_price_change','time_since_price_change','mid_price_change','symbol'],axis=1, inplace=True)

In [143]:
df_trades['imbalance'].where(df_trades['imbalance']>0,0.1,inplace=True)
df_trades['imbalance'].where(df_trades['imbalance']<1,1,inplace=True)



In [144]:
#loading data

#df_feat = pd.read_csv('df_feat.csv')
df_micro = pd.read_csv('df_micro.csv')
#df_sig = pd.read_csv('df_sig.csv')

In [145]:
# add ba_spread=5 by duplicating the values for ba_spread=4
#del df_temp
df_temp =(df_micro[df_micro.ba_spread == 4])
df_temp = df_temp.assign(ba_spread=5)
df_micro = df_micro.append(df_temp)


In [146]:
df_trades.tail()

Unnamed: 0,timestamp,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance
3954193,2022-09-02 23:59:59.724999936,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,-0.9,1.0
3954194,2022-09-02 23:59:59.726000128,1,19941.1,0.05,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,-0.9,1.0
3954195,2022-09-02 23:59:59.734000128,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,-0.9,1.0
3954196,2022-09-02 23:59:59.736999936,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,0.1,0.992243
3954197,2022-09-02 23:59:59.840000000,-1,19941.0,0.003,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,0.1,0.990431


In [147]:
df_dis=discretize(df_trades)

In [148]:
df_dis

Unnamed: 0,timestamp,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance
50,2022-09-02 00:00:03.600000000,1,20122.6,0.014,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4
51,2022-09-02 00:00:03.604000000,1,20122.6,0.008,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4
52,2022-09-02 00:00:03.604999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-1.1,4
53,2022-09-02 00:00:03.604999936,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4
54,2022-09-02 00:00:03.614000128,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4
...,...,...,...,...,...,...,...,...,...,...,...,...
3954193,2022-09-02 23:59:59.724999936,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4
3954194,2022-09-02 23:59:59.726000128,1,19941.1,0.050,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4
3954195,2022-09-02 23:59:59.734000128,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4
3954196,2022-09-02 23:59:59.736999936,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,1,0.1,4


In [149]:
df_dis_g = pd.merge(
            df_dis.reset_index(),
            df_micro.reset_index(),
            how='left',
            left_on=['ba_spread','imbalance'], 
            right_on=['ba_spread','imbalance']
        ).set_index('timestamp')

In [150]:
# micro price calculation: mid_price + g_star
df_dis_g['micro_price'] = df_dis_g['mid_price'] + df_dis_g['g_star'] 
df_dis_g

Unnamed: 0_level_0,index_x,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance,index_y,g_star,micro_price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-09-02 00:00:03.600000000,50,1,20122.6,0.014,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,3,0.016683,20122.566683
2022-09-02 00:00:03.604000000,51,1,20122.6,0.008,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,3,0.016683,20122.566683
2022-09-02 00:00:03.604999936,52,-1,20122.5,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-1.1,4,3,0.016683,20122.566683
2022-09-02 00:00:03.604999936,53,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,3,0.016683,20122.566683
2022-09-02 00:00:03.614000128,54,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,3,0.016683,20122.566683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-02 23:59:59.724999936,3954193,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,3,0.016683,19941.066683
2022-09-02 23:59:59.726000128,3954194,1,19941.1,0.050,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,3,0.016683,19941.066683
2022-09-02 23:59:59.734000128,3954195,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,3,0.016683,19941.066683
2022-09-02 23:59:59.736999936,3954196,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,1,0.1,4,3,0.016683,19941.066683


In [155]:
df_signal = signal(df_dis_g)
df_signal = mid_price_change(df_signal)
#df_signal = bid_ask_rebuild(df_signal)
df_signal = number_price_change(df_signal)
df_signal = time_since_price_change(df_signal)
df_signal = ratio(df_signal)



#delete useless columns
#add sum_g_star to df_signal until mid_price_change = 1
df_signal['weighted_g_star'] = df_signal['g_star'] * (df_signal['time_since_price_change']+1/df_signal['time_since_price_change'].mean())
df_signal['cum_g_star'] = df_signal.groupby('number_price_change').cumsum()['weighted_g_star'] 
df_signal = ratio_sum(df_signal)
df_signal = df_signal.drop(['index_x','index_y','signal','number_price_change','mid_price_change'], axis=1)
#df_test = smth_is_happening(df_signal)
#df_test
df_signal

Unnamed: 0_level_0,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance,g_star,micro_price,time_since_price_change,ratio,weighted_g_star,cum_g_star,ratio_sum
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2022-09-02 00:00:03.600000000,1,20122.6,0.014,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.016683,20122.566683,0,0.333654,0.000600,0.000600,0.011997
2022-09-02 00:00:03.604000000,1,20122.6,0.008,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.016683,20122.566683,1,0.333654,0.017283,0.017882,0.357647
2022-09-02 00:00:03.604999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-1.1,4,0.016683,20122.566683,2,0.333654,0.033965,0.051848,1.036951
2022-09-02 00:00:03.604999936,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.016683,20122.566683,3,0.333654,0.050648,0.102495,2.049909
2022-09-02 00:00:03.614000128,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.016683,20122.566683,4,0.333654,0.067331,0.169826,3.396520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-02 23:59:59.724999936,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.016683,19941.066683,94,0.333654,1.568771,22.706409,454.128188
2022-09-02 23:59:59.726000128,1,19941.1,0.050,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.016683,19941.066683,95,0.333654,1.585454,24.291863,485.837269
2022-09-02 23:59:59.734000128,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.016683,19941.066683,96,0.333654,1.602137,25.894000,517.880003
2022-09-02 23:59:59.736999936,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,1,0.1,4,0.016683,19941.066683,97,0.333654,1.618819,27.512820,550.256392


In [170]:
df_signal_light = df_signal[['s_profit','g_star','ratio','weighted_g_star','cum_g_star','ratio_sum']]

result_ratio = sm.ols(formula="s_profit ~ ratio ", data=df_signal_light).fit()
result_g_star = sm.ols(formula="s_profit ~ g_star ", data=df_signal_light).fit()
result_weighted_g_star = sm.ols(formula="s_profit ~ weighted_g_star ", data=df_signal_light).fit()
result_cum_g_star = sm.ols(formula="s_profit ~ cum_g_star ", data=df_signal_light).fit()
result_ratio_sum = sm.ols(formula="s_profit ~ ratio_sum ", data=df_signal_light).fit()

print(result_ratio.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                 9.098e+04
Date:                Sun, 29 Jan 2023   Prob (F-statistic):               0.00
Time:                        13:23:26   Log-Likelihood:            -9.7465e+06
No. Observations:             3954148   AIC:                         1.949e+07
Df Residuals:                 3954146   BIC:                         1.949e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0200      0.001     13.995      0.0

In [171]:
print(result_g_star.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                 9.098e+04
Date:                Sun, 29 Jan 2023   Prob (F-statistic):               0.00
Time:                        13:23:36   Log-Likelihood:            -9.7465e+06
No. Observations:             3954148   AIC:                         1.949e+07
Df Residuals:                 3954146   BIC:                         1.949e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0200      0.001     13.995      0.0

In [172]:
print(result_weighted_g_star.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                 1.538e+04
Date:                Sun, 29 Jan 2023   Prob (F-statistic):               0.00
Time:                        13:23:47   Log-Likelihood:            -9.7838e+06
No. Observations:             3954148   AIC:                         1.957e+07
Df Residuals:                 3954146   BIC:                         1.957e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0180      0.001     

In [173]:
print(result_cum_g_star.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     53.52
Date:                Sun, 29 Jan 2023   Prob (F-statistic):           2.56e-13
Time:                        13:23:52   Log-Likelihood:            -9.7914e+06
No. Observations:             3954148   AIC:                         1.958e+07
Df Residuals:                 3954146   BIC:                         1.958e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0185      0.001     12.753      0.0

In [174]:
print(result_ratio_sum.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     385.6
Date:                Sun, 29 Jan 2023   Prob (F-statistic):           7.50e-86
Time:                        13:23:57   Log-Likelihood:            -9.7913e+06
No. Observations:             3954148   AIC:                         1.958e+07
Df Residuals:                 3954146   BIC:                         1.958e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0205      0.001     14.102      0.0

In [18]:
df_mvt = df_signal.copy(deep=True)
df_mvt = df_mvt.loc[df_mvt['time_since_price_change']==0]
df_signal['ask_change']=-df_signal['ask_price'].diff(-1)
df_signal['bid_change']=-df_signal['bid_price'].diff(-1)
df_signal['ask_next_change']=df_signal['ask_change'].replace(0,method='bfill')
df_signal['bid_next_change']=df_signal['bid_change'].replace(0,method='bfill')
df_signal_light = df_signal.drop(['index_x','index_y','signal','number_price_change','mid_price','ba_spread','micro_price','bid_price','ask_price','mid_price_change'], axis=1)

df_signal_light.corr()

Unnamed: 0,imbalance,g_star,time_since_price_change,ratio,weighted_g_star,cum_g_star,ratio_sum,ask_change,bid_change,ask_next_change,bid_next_change
imbalance,1.0,0.983937,0.01195,0.997336,0.494438,0.139164,0.017748,0.226754,0.222004,0.248945,0.246648
g_star,0.983937,1.0,0.011895,0.991576,0.483818,0.135557,0.018023,0.240529,0.240573,0.252865,0.252908
time_since_price_change,0.01195,0.011895,1.0,0.012054,0.066219,0.162801,0.705664,0.000979,0.000807,0.005354,0.0059
ratio,0.997336,0.991576,0.012054,1.0,0.492165,0.138017,0.018334,0.241841,0.241911,0.255584,0.255657
weighted_g_star,0.494438,0.483818,0.066219,0.492165,1.0,0.49798,0.109688,0.054827,0.054866,0.09932,0.099325
cum_g_star,0.139164,0.135557,0.162801,0.138017,0.49798,1.0,0.404153,0.008587,0.008609,0.022192,0.022244
ratio_sum,0.017748,0.018023,0.705664,0.018334,0.109688,0.404153,1.0,0.000256,0.000447,-0.001303,-0.001128
ask_change,0.226754,0.240529,0.000979,0.241841,0.054827,0.008587,0.000256,1.0,0.986681,0.645918,0.645899
bid_change,0.222004,0.240573,0.000807,0.241911,0.054866,0.008609,0.000447,0.986681,1.0,0.637313,0.653838
ask_next_change,0.248945,0.252865,0.005354,0.255584,0.09932,0.022192,-0.001303,0.645918,0.637313,1.0,0.993836


In [13]:
df_mvt['ask_change']=-df_mvt['ask_price'].diff(-1)
df_mvt['bid_change']=-df_mvt['bid_price'].diff(-1)
df_mvt_light = df_mvt.drop(['index_x','index_y','signal','number_price_change','mid_price','ba_spread','micro_price','bid_price','ask_price','mid_price_change', 'time_since_price_change'], axis=1)



df_mvt_light.corr()

Unnamed: 0,imbalance,g_star,ratio,weighted_g_star,cum_g_star,ratio_sum,ask_change,bid_change
imbalance,1.0,0.959446,0.995365,0.959446,0.959446,-0.002767,0.275173,0.269304
g_star,0.959446,1.0,0.97761,1.0,1.0,-0.001697,0.283395,0.283337
ratio,0.995365,0.97761,1.0,0.97761,0.97761,-0.002152,0.29273,0.292711
weighted_g_star,0.959446,1.0,0.97761,1.0,1.0,-0.001697,0.283395,0.283337
cum_g_star,0.959446,1.0,0.97761,1.0,1.0,-0.001697,0.283395,0.283337
ratio_sum,-0.002767,-0.001697,-0.002152,-0.001697,-0.001697,1.0,0.000566,0.000771
ask_change,0.275173,0.283395,0.29273,0.283395,0.283395,0.000566,1.0,0.986682
bid_change,0.269304,0.283337,0.292711,0.283337,0.283337,0.000771,0.986682,1.0


In [18]:
df_test=smth_is_happening(df_signal)
df_test

Unnamed: 0_level_0,index_x,mid_price,ba_spread,imbalance,index_y,g_star,micro_price,signal,mid_price_change,bid_price,ask_price,number_price_change,time_since_price_change,ratio,weighted_g_star,cum_g_star,ratio_sum,ask_change,bid_change
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2022-09-02 00:00:12,12,20124.85,1,4,3.0,0.016683,20124.866683,1,0,20124.8,20124.9,7,4,0.333654,0.069875,0.073831,1.476626,-0.0,-0.0
2022-09-02 00:00:13,13,20124.85,1,2,1.0,-0.004311,20124.845689,-1,0,20124.8,20124.9,7,5,-0.086217,-0.022367,0.051465,1.029293,-0.0,-0.0
2022-09-02 00:00:40,40,20109.25,1,4,3.0,0.016683,20109.266683,1,0,20109.2,20109.3,21,2,0.333654,0.036509,0.057149,1.142972,-0.3,-0.3
2022-09-02 00:00:56,56,20106.75,1,4,3.0,0.016683,20106.766683,1,0,20106.7,20106.8,29,3,0.333654,0.053192,0.066937,1.338741,-0.0,-0.0
2022-09-02 00:00:57,57,20106.75,1,4,3.0,0.016683,20106.766683,1,0,20106.7,20106.8,29,4,0.333654,0.069875,0.136812,2.736237,-0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-06 23:56:49,431809,18800.05,1,4,3.0,0.016683,18800.066683,1,0,18800.0,18800.1,131016,2,0.333654,0.036509,0.053192,1.063842,-0.0,-0.0
2022-09-06 23:56:50,431810,18800.05,1,4,3.0,0.016683,18800.066683,1,0,18800.0,18800.1,131016,3,0.333654,0.053192,0.106384,2.127684,1.3,1.3
2022-09-06 23:58:15,431895,18800.05,1,4,3.0,0.016683,18800.066683,1,0,18800.0,18800.1,131068,7,0.333654,0.119923,0.088934,1.778690,0.3,0.3
2022-09-06 23:58:19,431899,18800.35,1,4,3.0,0.016683,18800.366683,1,0,18800.3,18800.4,131069,3,0.333654,0.053192,0.097969,1.959377,0.2,0.2


Ratio seems to be the best candidate for our trading bot, with the highest correlation with the price changes.

Should check next ask change and next bid change against all values too bc ultimately it is whats most interesting to us

# Trading algorithm

In [None]:
print(len(g1.transpose()))
def S(N):
 return sum((np.matmul(matrix_power(B,i),g1)) for i in range(1, N + 1))
g_star=g1+S(10)
print(len(g_star))

g_star=g_star.reshape(4,4)

def v_to_s_imbalance(value):
    if value < 0.25:
        return 0
    elif value < 0.5:
        return 1
    elif value < 0.75:
        return 2
    else:
        return 3

def v_to_s_spread(value,tick_size=0.01):
    if value/tick_size <= 1:
        return 0
    elif value/tick_size <= 2:
        return 1
    elif value/tick_size <= 3:
        return 2
    else:
        return 3

def g_value(I,S):
    I=v_to_s_imbalance(I)
    S=v_to_s_spread(S)
    return (g_star[I][S])

print(g_value(0.1,0.1))
