### 

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import dask
import statsmodels.formula.api as sm



In [7]:
from tqdm.notebook import trange, tqdm
from time import sleep

pip install ipywidgets \
jupyter nbextension enable --py widgetsnbextension \
pip install tqdm 

# Data Preprocessing

In [8]:
DATA_PATH = '/Users/mac/Desktop/Repos/FBD_Project/datasets/'
DATA_PATH = '/Users/zakarysouid/Downloads/'
orderbook_list = sorted(glob.glob(DATA_PATH + 'btcusdt/orderbook/*.csv.gz'))
quote_list = sorted(glob.glob(DATA_PATH + 'btcusdt/quotes/*.csv.gz'))
trade_list = sorted(glob.glob(DATA_PATH + 'btcusdt/trades/*.csv.gz'))

In [9]:
def discretize(
    df_feature: pd.DataFrame,
    numSpreads: int = 4,
    numImbalance: int = 4,
    numdM: int = 2,):
    df_signal = df_feature.copy(deep=True)
    #df_signal['ba_spread']=df_signal['ba_spread'].where(df_signal['side']>=0.1,0.1)
    tick_size = df_signal.ba_spread[df_signal.ba_spread != 0].min()
    # discretize bidask spread then get next time's bidask spread
    # discretize imbalance and get next imbalance
    df_signal = df_signal[df_signal.ba_spread <= numSpreads * tick_size]
    df_signal["ba_spread"] = np.round(df_signal["ba_spread"].div(tick_size)).astype(int)
    df_signal["imbalance"] = pd.cut(
        df_feature["imbalance"],
        bins=np.arange(numImbalance+1) / numImbalance,
        labels=np.arange(1, numImbalance+1),
        include_lowest=True
    )
    return df_signal

def signal(df : pd.DataFrame):
    df['signal'] = np.where(df['g_star'] > 0, 1, -1)
    #df['sum_signal'] = np.where(df['cum_g_star'] > 1, 1, -1)
    return df

# output a boolean if the 'mid_price_change' is greater than 0
def mid_price_change(df : pd.DataFrame):
    df['mid_price_change'] = np.where(df['mid_price'].diff() == 0, 0, 1)
    return df

def number_price_change(df : pd.DataFrame):
    df['number_price_change'] = df['mid_price_change'].cumsum()
    return df

def bid_ask_rebuild(df : pd.DataFrame):
    df['bid_price'] = df['mid_price'] - df['ba_spread']*0.05
    df['ask_price'] = df['mid_price'] + df['ba_spread']*0.05
    return df

def smth_is_happening(df : pd.DataFrame) :
    #df2=pd.DataFrame()
    #df2['smth_is_happening'] = np.where(df['cum_g_star']>= (df['ba_spread']*0.05))
    df2=df.copy(deep=True)
    df2=df2.loc[df2['cum_g_star']>= (df['ba_spread']*0.05)]
    return df2

def ratio(df : pd.DataFrame):
    df['ratio'] = df['g_star']/(df['ba_spread']*0.05)
    return df

def ratio_sum(df : pd.DataFrame):
    df['ratio_sum'] = abs(df['cum_g_star'])/(df['ba_spread']*0.05)
    return df

def time_since_price_change(df : pd.DataFrame):
    df['time_since_price_change'] = df.groupby('number_price_change').cumcount()
    return df

def imbalance(df : pd.DataFrame, horizon : int = 10,memory : bool = False):
    df['bid_amount']= df['amount'].where(df['side']==1,0)
    df['volume']=df['amount'].rolling(min_periods=1, window=horizon).sum()
    df['imbalance']=df['bid_amount'].rolling(min_periods=1,window=horizon).sum()/df['volume']
    df['imbalance_centered'] = df['imbalance'] - 0.5
    df.drop(['bid_amount','signed_amount','volume'], axis=1, inplace=True)
    return df


def s_profit(df : pd.DataFrame):
    df['s_profit'] = (df['next_ask_price']- df['bid_price']).where(df['side']==1,df['next_bid_price'] - df['ask_price'])
    return df

In [10]:
# process raw data to get features for calculation.
%time
#all_features = [extract_features(path) for path in quote_list[:5]] 
df_trades_or = pd.concat((pd.read_csv(f) for f in trade_list[:1]))
print(len(trade_list))

CPU times: user 3 µs, sys: 7 µs, total: 10 µs
Wall time: 21.9 µs
120


In [11]:
df_trades = df_trades_or.copy()
df_trades['timestamp'] = pd.to_datetime(df_trades['timestamp'] / 1000, unit='ms')
df_trades.drop(['id', 'exchange','local_timestamp'], axis=1, inplace=True)
df_trades.replace({'side': {'buy': 1, 'sell': -1}}, inplace=True)

df_trades['bid_price']=df_trades['price'].where(df_trades['side']==1, np.nan)
df_trades['ask_price']=df_trades['price'].where(df_trades['side']==-1, df_trades['bid_price']-0.1)
df_trades['bid_price'].fillna(df_trades['ask_price']+0.1, inplace=True)
df_trades['mid_price']=(df_trades['bid_price']+df_trades['ask_price'])/2


df_trades = mid_price_change(df_trades)
df_trades['next_bid_price']=df_trades['price'].where( (df_trades['side']==1) & (df_trades['mid_price_change']==1), np.nan)
df_trades['next_bid_price'].fillna(method='bfill', inplace=True)
df_trades['next_bid_price'].fillna(method='ffill', inplace=True) #get rid of nan at the end

df_trades['next_ask_price']=df_trades['price'].where((df_trades['side']==-1) & (df_trades['mid_price_change']==1), np.nan)
df_trades['next_ask_price'].fillna(method='bfill', inplace=True)
df_trades['next_ask_price'].fillna(method='ffill', inplace=True) #get rid of nan at the end


df_trades['ba_spread']=0.1

df_trades = s_profit(df_trades)



In [12]:
df_trades[00:100]

Unnamed: 0,symbol,timestamp,side,price,amount,bid_price,ask_price,mid_price,mid_price_change,next_bid_price,next_ask_price,ba_spread,s_profit
0,BTCUSDT,2022-09-02 00:00:00.038000128,1,20122.6,0.001,20122.6,20122.5,20122.55,1,20122.6,20122.4,0.1,-0.2
1,BTCUSDT,2022-09-02 00:00:00.078000128,1,20122.6,0.016,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-0.2
2,BTCUSDT,2022-09-02 00:00:03.544999936,-1,20122.5,0.857,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-1.1
3,BTCUSDT,2022-09-02 00:00:03.544999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-1.1
4,BTCUSDT,2022-09-02 00:00:03.544999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,0,20121.4,20122.4,0.1,-1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,BTCUSDT,2022-09-02 00:00:03.676000000,1,20121.8,0.600,20121.8,20121.7,20121.75,0,20122.0,20121.6,0.1,-0.2
96,BTCUSDT,2022-09-02 00:00:03.676000000,1,20122.0,0.400,20122.0,20121.9,20121.95,1,20122.0,20121.6,0.1,-0.4
97,BTCUSDT,2022-09-02 00:00:03.676000000,1,20122.0,0.002,20122.0,20121.9,20121.95,0,20122.0,20121.6,0.1,-0.4
98,BTCUSDT,2022-09-02 00:00:03.678000128,1,20122.0,0.088,20122.0,20121.9,20121.95,0,20122.0,20121.6,0.1,-0.4


In [13]:
#rolling imbalances
#df_trades = mid_price_change(df_trades)
df_trades = number_price_change(df_trades)
df_trades = time_since_price_change(df_trades)

df_trades['signed_amount'] = df_trades['amount'] * df_trades['side']
#df_trades['cum_size']= df_trades.groupby('number_price_change')['signed_amount'].cumsum()
df_trades = imbalance(df_trades, horizon=10, memory=True)
df_trades.drop(index=df_trades.index[:50], axis=0, inplace=True)
df_trades.drop(['number_price_change','time_since_price_change','mid_price_change','symbol'],axis=1, inplace=True)

In [14]:
df_trades['imbalance'].where(df_trades['imbalance']>0,0.1,inplace=True)
df_trades['imbalance'].where(df_trades['imbalance']<1,1,inplace=True)



In [15]:
#loading data

#df_feat = pd.read_csv('df_feat.csv')
df_micro = pd.read_csv('df_micro.csv')
#df_sig = pd.read_csv('df_sig.csv')

In [16]:
# add ba_spread=5 by duplicating the values for ba_spread=4
#del df_temp
df_temp =(df_micro[df_micro.ba_spread == 4])
df_temp = df_temp.assign(ba_spread=5)
df_micro = df_micro.append(df_temp)


In [17]:
df_trades.tail()

Unnamed: 0,timestamp,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance,imbalance_centered
3954193,2022-09-02 23:59:59.724999936,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,-0.9,1.0,0.5
3954194,2022-09-02 23:59:59.726000128,1,19941.1,0.05,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,-0.9,1.0,0.5
3954195,2022-09-02 23:59:59.734000128,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,-0.9,1.0,0.5
3954196,2022-09-02 23:59:59.736999936,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,0.1,0.992243,0.492243
3954197,2022-09-02 23:59:59.840000000,-1,19941.0,0.003,19941.1,19941.0,19941.05,19941.1,19940.2,0.1,0.1,0.990431,0.490431


In [18]:
df_dis=discretize(df_trades)

In [19]:
df_dis

Unnamed: 0,timestamp,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance,imbalance_centered
50,2022-09-02 00:00:03.600000000,1,20122.6,0.014,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.479740
51,2022-09-02 00:00:03.604000000,1,20122.6,0.008,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.436073
52,2022-09-02 00:00:03.604999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-1.1,4,0.355769
53,2022-09-02 00:00:03.604999936,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.342105
54,2022-09-02 00:00:03.614000128,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.335165
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3954193,2022-09-02 23:59:59.724999936,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000
3954194,2022-09-02 23:59:59.726000128,1,19941.1,0.050,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000
3954195,2022-09-02 23:59:59.734000128,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000
3954196,2022-09-02 23:59:59.736999936,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,1,0.1,4,0.492243


In [20]:
df_dis_g = pd.merge(
            df_dis.reset_index(),
            df_micro.reset_index(),
            how='left',
            left_on=['ba_spread','imbalance'], 
            right_on=['ba_spread','imbalance']
        ).set_index('timestamp')

In [21]:
# micro price calculation: mid_price + g_star
df_dis_g['micro_price'] = df_dis_g['mid_price'] + df_dis_g['g_star'] 
df_dis_g

Unnamed: 0_level_0,index_x,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance,imbalance_centered,index_y,g_star,micro_price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-09-02 00:00:03.600000000,50,1,20122.6,0.014,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.479740,3,0.016683,20122.566683
2022-09-02 00:00:03.604000000,51,1,20122.6,0.008,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.436073,3,0.016683,20122.566683
2022-09-02 00:00:03.604999936,52,-1,20122.5,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-1.1,4,0.355769,3,0.016683,20122.566683
2022-09-02 00:00:03.604999936,53,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.342105,3,0.016683,20122.566683
2022-09-02 00:00:03.614000128,54,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.335165,3,0.016683,20122.566683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-02 23:59:59.724999936,3954193,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000,3,0.016683,19941.066683
2022-09-02 23:59:59.726000128,3954194,1,19941.1,0.050,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000,3,0.016683,19941.066683
2022-09-02 23:59:59.734000128,3954195,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000,3,0.016683,19941.066683
2022-09-02 23:59:59.736999936,3954196,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,1,0.1,4,0.492243,3,0.016683,19941.066683


In [22]:
df_signal = signal(df_dis_g)
df_signal = mid_price_change(df_signal)
#df_signal = bid_ask_rebuild(df_signal)
df_signal = number_price_change(df_signal)
df_signal = time_since_price_change(df_signal)
df_signal = ratio(df_signal)



#delete useless columns
#add sum_g_star to df_signal until mid_price_change = 1
df_signal['weighted_g_star'] = df_signal['g_star'] * (df_signal['time_since_price_change']+1/df_signal['time_since_price_change'].mean())
df_signal['cum_g_star'] = df_signal.groupby('number_price_change').cumsum()['weighted_g_star'] 
df_signal = ratio_sum(df_signal)
df_signal = df_signal.drop(['index_x','index_y','signal','number_price_change','mid_price_change'], axis=1)
#df_test = smth_is_happening(df_signal)
#df_test
df_signal

Unnamed: 0_level_0,side,price,amount,bid_price,ask_price,mid_price,next_bid_price,next_ask_price,ba_spread,s_profit,imbalance,imbalance_centered,g_star,micro_price,time_since_price_change,ratio,weighted_g_star,cum_g_star,ratio_sum
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2022-09-02 00:00:03.600000000,1,20122.6,0.014,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.479740,0.016683,20122.566683,0,0.333654,0.000600,0.000600,0.011997
2022-09-02 00:00:03.604000000,1,20122.6,0.008,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.436073,0.016683,20122.566683,1,0.333654,0.017283,0.017882,0.357647
2022-09-02 00:00:03.604999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-1.1,4,0.355769,0.016683,20122.566683,2,0.333654,0.033965,0.051848,1.036951
2022-09-02 00:00:03.604999936,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.342105,0.016683,20122.566683,3,0.333654,0.050648,0.102495,2.049909
2022-09-02 00:00:03.614000128,1,20122.6,0.001,20122.6,20122.5,20122.55,20121.4,20122.4,1,-0.2,4,0.335165,0.016683,20122.566683,4,0.333654,0.067331,0.169826,3.396520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-02 23:59:59.724999936,1,19941.1,0.007,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000,0.016683,19941.066683,94,0.333654,1.568771,22.706409,454.128188
2022-09-02 23:59:59.726000128,1,19941.1,0.050,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000,0.016683,19941.066683,95,0.333654,1.585454,24.291863,485.837269
2022-09-02 23:59:59.734000128,1,19941.1,0.001,19941.1,19941.0,19941.05,19941.1,19940.2,1,-0.9,4,0.500000,0.016683,19941.066683,96,0.333654,1.602137,25.894000,517.880003
2022-09-02 23:59:59.736999936,-1,19941.0,0.025,19941.1,19941.0,19941.05,19941.1,19940.2,1,0.1,4,0.492243,0.016683,19941.066683,97,0.333654,1.618819,27.512820,550.256392


In [23]:
df_signal_light = df_signal[['s_profit','g_star','ratio','weighted_g_star','cum_g_star','ratio_sum']]
abs(df_signal_light['s_profit']).cumsum()


timestamp
2022-09-02 00:00:03.600000000    2.000000e-01
2022-09-02 00:00:03.604000000    4.000000e-01
2022-09-02 00:00:03.604999936    1.500000e+00
2022-09-02 00:00:03.604999936    1.700000e+00
2022-09-02 00:00:03.614000128    1.900000e+00
                                     ...     
2022-09-02 23:59:59.724999936    5.232690e+06
2022-09-02 23:59:59.726000128    5.232690e+06
2022-09-02 23:59:59.734000128    5.232691e+06
2022-09-02 23:59:59.736999936    5.232691e+06
2022-09-02 23:59:59.840000000    5.232692e+06
Name: s_profit, Length: 3954148, dtype: float64

In [27]:
result_ratio = sm.ols(formula="s_profit ~ ratio ", data=df_signal_light).fit()
result_g_star = sm.ols(formula="s_profit ~ g_star ", data=df_signal_light).fit()
result_weighted_g_star = sm.ols(formula="s_profit ~ weighted_g_star ", data=df_signal_light).fit()
result_cum_g_star = sm.ols(formula="s_profit ~ cum_g_star ", data=df_signal_light).fit()
result_ratio_sum = sm.ols(formula="s_profit ~ ratio_sum ", data=df_signal_light).fit()
result_imbalance = sm.ols(formula="s_profit ~ imbalance ", data=df_trades).fit()
result_imbalance_centered = sm.ols(formula="s_profit ~ imbalance_centered ", data=df_trades).fit()

print(result_ratio.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                 9.098e+04
Date:                Thu, 02 Feb 2023   Prob (F-statistic):               0.00
Time:                        22:22:06   Log-Likelihood:            -9.7465e+06
No. Observations:             3954148   AIC:                         1.949e+07
Df Residuals:                 3954146   BIC:                         1.949e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0200      0.001     13.995      0.0

In [None]:
print(result_g_star.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                 9.098e+04
Date:                Sun, 29 Jan 2023   Prob (F-statistic):               0.00
Time:                        13:23:36   Log-Likelihood:            -9.7465e+06
No. Observations:             3954148   AIC:                         1.949e+07
Df Residuals:                 3954146   BIC:                         1.949e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0200      0.001     13.995      0.0

In [172]:
print(result_weighted_g_star.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                 1.538e+04
Date:                Sun, 29 Jan 2023   Prob (F-statistic):               0.00
Time:                        13:23:47   Log-Likelihood:            -9.7838e+06
No. Observations:             3954148   AIC:                         1.957e+07
Df Residuals:                 3954146   BIC:                         1.957e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0180      0.001     

In [173]:
print(result_cum_g_star.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     53.52
Date:                Sun, 29 Jan 2023   Prob (F-statistic):           2.56e-13
Time:                        13:23:52   Log-Likelihood:            -9.7914e+06
No. Observations:             3954148   AIC:                         1.958e+07
Df Residuals:                 3954146   BIC:                         1.958e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0185      0.001     12.753      0.0

In [174]:
print(result_ratio_sum.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     385.6
Date:                Sun, 29 Jan 2023   Prob (F-statistic):           7.50e-86
Time:                        13:23:57   Log-Likelihood:            -9.7913e+06
No. Observations:             3954148   AIC:                         1.958e+07
Df Residuals:                 3954146   BIC:                         1.958e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0205      0.001     14.102      0.0

In [37]:
print(result_imbalance.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                 9.591e+04
Date:                Sun, 29 Jan 2023   Prob (F-statistic):               0.00
Time:                        21:01:38   Log-Likelihood:            -9.7441e+06
No. Observations:             3954148   AIC:                         1.949e+07
Df Residuals:                 3954146   BIC:                         1.949e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.4892      0.002   -224.877      0.0

In [42]:
print(result_imbalance_centered.summary())

                            OLS Regression Results                            
Dep. Variable:               s_profit   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                 9.617e+04
Date:                Sun, 29 Jan 2023   Prob (F-statistic):               0.00
Time:                        21:02:48   Log-Likelihood:            -9.7439e+06
No. Observations:             3954148   AIC:                         1.949e+07
Df Residuals:                 3954146   BIC:                         1.949e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               0.0202    

In [18]:
df_mvt = df_signal.copy(deep=True)
df_mvt = df_mvt.loc[df_mvt['time_since_price_change']==0]
df_signal['ask_change']=-df_signal['ask_price'].diff(-1)
df_signal['bid_change']=-df_signal['bid_price'].diff(-1)
df_signal['ask_next_change']=df_signal['ask_change'].replace(0,method='bfill')
df_signal['bid_next_change']=df_signal['bid_change'].replace(0,method='bfill')
df_signal_light = df_signal.drop(['index_x','index_y','signal','number_price_change','mid_price','ba_spread','micro_price','bid_price','ask_price','mid_price_change'], axis=1)

df_signal_light.corr()

Unnamed: 0,imbalance,g_star,time_since_price_change,ratio,weighted_g_star,cum_g_star,ratio_sum,ask_change,bid_change,ask_next_change,bid_next_change
imbalance,1.0,0.983937,0.01195,0.997336,0.494438,0.139164,0.017748,0.226754,0.222004,0.248945,0.246648
g_star,0.983937,1.0,0.011895,0.991576,0.483818,0.135557,0.018023,0.240529,0.240573,0.252865,0.252908
time_since_price_change,0.01195,0.011895,1.0,0.012054,0.066219,0.162801,0.705664,0.000979,0.000807,0.005354,0.0059
ratio,0.997336,0.991576,0.012054,1.0,0.492165,0.138017,0.018334,0.241841,0.241911,0.255584,0.255657
weighted_g_star,0.494438,0.483818,0.066219,0.492165,1.0,0.49798,0.109688,0.054827,0.054866,0.09932,0.099325
cum_g_star,0.139164,0.135557,0.162801,0.138017,0.49798,1.0,0.404153,0.008587,0.008609,0.022192,0.022244
ratio_sum,0.017748,0.018023,0.705664,0.018334,0.109688,0.404153,1.0,0.000256,0.000447,-0.001303,-0.001128
ask_change,0.226754,0.240529,0.000979,0.241841,0.054827,0.008587,0.000256,1.0,0.986681,0.645918,0.645899
bid_change,0.222004,0.240573,0.000807,0.241911,0.054866,0.008609,0.000447,0.986681,1.0,0.637313,0.653838
ask_next_change,0.248945,0.252865,0.005354,0.255584,0.09932,0.022192,-0.001303,0.645918,0.637313,1.0,0.993836


In [13]:
df_mvt['ask_change']=-df_mvt['ask_price'].diff(-1)
df_mvt['bid_change']=-df_mvt['bid_price'].diff(-1)
df_mvt_light = df_mvt.drop(['index_x','index_y','signal','number_price_change','mid_price','ba_spread','micro_price','bid_price','ask_price','mid_price_change', 'time_since_price_change'], axis=1)



df_mvt_light.corr()

Unnamed: 0,imbalance,g_star,ratio,weighted_g_star,cum_g_star,ratio_sum,ask_change,bid_change
imbalance,1.0,0.959446,0.995365,0.959446,0.959446,-0.002767,0.275173,0.269304
g_star,0.959446,1.0,0.97761,1.0,1.0,-0.001697,0.283395,0.283337
ratio,0.995365,0.97761,1.0,0.97761,0.97761,-0.002152,0.29273,0.292711
weighted_g_star,0.959446,1.0,0.97761,1.0,1.0,-0.001697,0.283395,0.283337
cum_g_star,0.959446,1.0,0.97761,1.0,1.0,-0.001697,0.283395,0.283337
ratio_sum,-0.002767,-0.001697,-0.002152,-0.001697,-0.001697,1.0,0.000566,0.000771
ask_change,0.275173,0.283395,0.29273,0.283395,0.283395,0.000566,1.0,0.986682
bid_change,0.269304,0.283337,0.292711,0.283337,0.283337,0.000771,0.986682,1.0


# Trading algorithm

In [69]:
df_trading_bot = df_dis_g.copy(deep=True)
df_trading_bot.drop(['index_x','index_y','next_bid_price','next_ask_price','signal','mid_price_change','number_price_change','time_since_price_change','weighted_g_star','cum_g_star','ratio_sum','s_profit',],axis=1,inplace=True)
max_pos=0.10
df_trading_bot['g_star_change']= np.sign(df_trading_bot['g_star']).diff()
df_trading_bot['g_star_change'].fillna(0, inplace=True)

In [70]:
# df_trading_bot['money']=0
# df_trading_bot['buy_position']=0
# df_trading_bot['buy_position_money']=0
# df_trading_bot['buy_position_invested']=0

# df_trading_bot['sell_position']=0
# df_trading_bot['sell_position']=0
# df_trading_bot['sell_position_money']=0
# df_trading_bot['profit']=0
# df_trading_bot['cum_profit']=0
# df_trading_bot['money']=100000
# df_trading_bot.head()


# #access previous row data


In [71]:
df_trading_bot.head()

Unnamed: 0_level_0,side,price,amount,bid_price,ask_price,mid_price,ba_spread,imbalance,imbalance_centered,g_star,micro_price,ratio,g_star_change
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-09-02 00:00:03.600000000,1,20122.6,0.014,20122.6,20122.5,20122.55,1,4,0.47974,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.604000000,1,20122.6,0.008,20122.6,20122.5,20122.55,1,4,0.436073,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.604999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,1,4,0.355769,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.604999936,1,20122.6,0.001,20122.6,20122.5,20122.55,1,4,0.342105,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.614000128,1,20122.6,0.001,20122.6,20122.5,20122.55,1,4,0.335165,0.016683,20122.566683,0.333654,0.0


In [72]:
# #trading logic
# """if g_star > 0 and money > 0:
# place bid order at bid_price, add amount to position, substract money by bid_price * position
# if g_star < 0 and position > 0 and bid_price < ask_price: 
# sell position at ask_price, substract position, add money by ask_price * position
# profit = position * (aks_price - bid_price))
     
# if g_star < 0 and money > 0:
# place sell order at ask_price, add position
# if g_star > 0 and position > 0 and bid_price < ask_price:
# buy position at bid_price, substract position, add money by bid_price * position
    
# """

# #conditions to open a buy trade : g_star > 0 and money > 0
# df_trading_bot['buy_position'] = np.where((df_trading_bot['g_star'] > 0) & (df_trading_bot['money'] > 0), max_pos*df_trading_bot['money']/df_trading_bot['bid_price'], 0)
# df_trading_bot['buy_position_money'] = np.where((df_trading_bot['buy_position'] > 0),max_pos*df_trading_bot['money'], 0)
# df_trading_bot['buy_position_invested'] = df_trading_bot['buy_position_money'].cumsum()

# # conditions to close a buy trade : position > 0 and g_star_change < 0


In [75]:
class trade :
    def __init__(self, side, price, amount):
        self.side = side
        self.price = price
        self.amount = amount
        self.invested = price*amount
        self.realized = 0
        self.profit = 0
        
    def close(self, ask_price, bid_price) : 
        if self.side == 'buy' :
            #self.profit=(ask_price-self.price)*self.amount
            self.realized=ask_price*self.amount
        else : 
            #self.profit=(self.price-bid_price)*self.amount
            self.realized=bid_price*self.amount
            #self.profit = self.realized - self.invested
        
        
    # def __str__(self):
    #     return self.side + " " + str(self.price) + " " + str(self.amount) + " " + str(self.profit)
    # def __repr__(self):
    #     return self.side + " " + str(self.price) + " " + str(self.amount)  + " " + str(self.profit)
    
            
class portfolio : 
    def __init__(self, size, max_pos):
        self.size = size
        self.trades = []
        self.profit = 0
        self.g_star = 0 
        self.ask_price = 0
        self.bid_price = 0
        self.g_star_change = 0
        self.time = 0
        self.max_pos = max_pos*self.size
        self.to_invest = size
 
    def choose_trade(self):
        if self.g_star_change != 0:
            self.close_all()
        elif self.to_invest > 0 :
            if self.g_star>0 : self.open_trade('buy', self.bid_price, self.max_pos/self.bid_price)
            if self.g_star<0 : self.open_trade('sell', self.ask_price, self.max_pos/self.ask_price)
        
    def open_trade(self, side, price, amount):
        self.trades.append(trade(side, price, amount))
        self.to_invest -= self.trades[-1].invested
        
    def close_trades(self, trade):
        trade.close(self.bid_price, self.ask_price)
        self.to_invest += trade.realized
        #self.profit += trade.profit
        
    def close_all(self):
        for trade in self.trades:
            self.close_trades(trade)
        self.trades.clear()

    
    def update(self, df):
        self.g_star = df['g_star']
        self.ask_price = df['ask_price']
        self.bid_price = df['bid_price']
        self.g_star_change = df['g_star_change']
        self.choose_trade()
    

# rewrite the code to use a dataframe instead of a list of trades



In [80]:
df_trading_bot.iloc[0]['g_star']
portfolio1=portfolio(200, 0.1)
for i in range(0, int(len(df_trading_bot)/10)):
    portfolio1.update(df_trading_bot.iloc[i])
portfolio1.close_all()

In [79]:
portfolio1.profit

0

In [78]:
portfolio1.to_invest

200.7754258398991

In [50]:
portfolio1.to_invest+portfolio1.profit

257.0894251966955

In [4]:
#### dataframe test 2 ####      

df_trading_bot2 = df_trading_bot.copy(deep=True)
df_trading_bot2['trades'].iloc[1]=trade('buy',2000,1)


NameError: name 'df_trading_bot' is not defined

In [None]:
class trade :
    def __init__(self, side, price, amount):
        self.side = side
        self.price = price
        self.amount = amount
        self.invested = price*amount
        self.realized = 0
        self.profit = 0
        
def close(self, ask_price, bid_price) : 
        if self.side == 'buy' :
            self.profit=(ask_price-self.price)*self.amount
            self.realized=ask_price*self.amount
        else : 
            self.profit=(self.price-bid_price)*self.amount
            self.realized=bid_price*self.amount
    
            
def __init__(self, size, max_pos):
        self.size = size
        self.trades = []
        self.profit = 0
        self.g_star = 0 
        self.ask_price = 0
        self.bid_price = 0
        self.g_star_change = 0
        self.time = 0
        self.max_pos = max_pos
        self.to_invest = size
 
def choose_trade(self):
        if self.g_star_change != 0:
            self.close_all()
        elif self.g_star>0 and self.to_invest > 0 :
            self.open_trade('buy', self.bid_price, self.max_pos*self.size/self.bid_price)
        elif self.g_star<0 and self.to_invest > 0 :
            self.open_trade('sell', self.ask_price, self.max_pos*self.size/self.ask_price)
            
            ### then put a value 1 in a column called 'close' to mark where the trade is closed
        
def open_trade(self, side, price, amount):
        self.trades.append(trade(side, price, amount))
        self.to_invest -= self.trades[-1].invested
        
def close_trades(self, trade):
        trade.close(self.bid_price, self.ask_price)
        self.to_invest += trade.realized
        self.profit += trade.profit
        
def close_all(self):
        for trade in self.trades:
            self.close_trades(trade)
        self.trades.clear()


portfolio1=portfolio(200, 0.1)

# rewrite the code to use a dataframe instead of a list of trades



def create_desired_output(SBT):
    if SBT > 3.6:
        return 2
    if SBT >= 2.95 and SBT <= 3.6:
        return 3
    if SBT >= 2.6 and SBT <= 2.95:
        return 4
    if SBT >= 2.05 and SBT <= 2.6:
        return 5
    if SBT >= 1.31 and SBT <= 2.05:
        return 6
    if SBT < 1.31:
        return 7
    return ''
df['desired_output'] = df['SBT'].apply(lambda x: create_desired_output(x))
df


In [56]:
import numpy as np
import pandas as pd

In [63]:
df = df_trading_bot.copy(deep=True)
df.head()

Unnamed: 0_level_0,side,price,amount,bid_price,ask_price,mid_price,ba_spread,imbalance,imbalance_centered,g_star,micro_price,ratio,g_star_change
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-09-02 00:00:03.600000000,1,20122.6,0.014,20122.6,20122.5,20122.55,1,4,0.47974,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.604000000,1,20122.6,0.008,20122.6,20122.5,20122.55,1,4,0.436073,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.604999936,-1,20122.5,0.001,20122.6,20122.5,20122.55,1,4,0.355769,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.604999936,1,20122.6,0.001,20122.6,20122.5,20122.55,1,4,0.342105,0.016683,20122.566683,0.333654,0.0
2022-09-02 00:00:03.614000128,1,20122.6,0.001,20122.6,20122.5,20122.55,1,4,0.335165,0.016683,20122.566683,0.333654,0.0


In [68]:
# Create a sample dataframe with trade data


%time
# Set the investment budget
budget = 200

# Initialize the dataframe to keep track of investments and profits
results = pd.DataFrame({'investment': np.zeros(len(df)), 'profit': np.zeros(len(df))}, index=df.index)

%time 

# Find the indices where the indicator is 1 (indicating a buy signal)
buy_signals = df.index[df['g_star'] > 0].tolist()

# Find the indices where the indicator is -1 (indicating a sell signal)
sell_signals = df.index[df['g_star_change'] != 0].tolist()

%time

# Combine the buy and sell signals into a single list of trades
trades = sorted(buy_signals + sell_signals)

# Allocate the investment budget to the buy signals
#results.loc[buy_signals, 'investment'] = budget / len(buy_signals)

%time
# Calculate the profits of the sell signals
results.loc[sell_signals, 'profit'] = df.loc[sell_signals, 'price'] - results.loc[sell_signals, 'investment']
print('here')
# Reinvest the profits into the next buy signals
investment_budget = budget
for i, trade in tqdm(enumerate(trades)):
    if df.loc[trade, 'indicator'] == -1:
        investment_budget += results.loc[trade, 'profit']
    results.loc[trade, 'investment'] = investment_budget / len(buy_signals[i:])

# Print the final results
final_budget = budget - results['investment'].sum() + results['profit'].sum()
print("Final budget:", final_budget)
print("Investments and profits:")
print(results)


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 7.15 µs
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 4.77 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs
here


0it [00:00, ?it/s]

KeyError: 'indicator'