In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import os
from func import calc_vpin, imbalance
from data_load import load_data
from data_processing import transform_buy_sell_volume

In [3]:
# Đường dẫn tới thư mục cần kiểm tra
folder_path = "req_files"
# Kiểm tra nếu thư mục chưa tồn tại thì tạo mới
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print(f"Đã tạo thư mục: {folder_path}")
else:
    print(f"Thư mục '{folder_path}' đã tồn tại.")

Thư mục 'req_files' đã tồn tại.


In [4]:
df={}; sec_trades = {}
sym = ['STB', 'SAB','MWG', 'VCB','TCB']

In [5]:
## Load data
data_tick = load_data(folder="tick")
data_orderbook = load_data(folder="orderbook")

In [6]:
# Transform data
for s in sym:
    data = data_tick[s].copy()
    data.rename(columns = {"Gia KL": "PRICE", "KL": "SIZE"}, inplace = True)
    data.set_index("Date", inplace = True)
    data = data.resample("T").agg({
            'SIZE': 'sum',  # Cột volume tính tổng
            'PRICE': 'mean'    # Cột price tính trung bình
        })
    data.to_csv(f"req_files/{s}price.csv")
    sec_trades[s] = data

In [7]:
# Cal vpin
volume = {}
for key, val in sec_trades.items():
    volume[key] = int(val['SIZE'].resample("D").sum().mean()/50) # Sum của từng ngày, rồi lấy mean, rồi chia 50

for s in sym:
    print('Calculating VPIN')
    df[s] = calc_vpin(sec_trades[s],volume[s],50)
    df[s].to_csv(f"req_files/{s}VPIN.csv",index = True)
    print(s+' '+str(df[s].shape))

Calculating VPIN
STB (2700, 4)
Calculating VPIN
SAB (2709, 4)
Calculating VPIN
MWG (2700, 4)
Calculating VPIN
VCB (2700, 4)
Calculating VPIN
TCB (2700, 4)


In [8]:
## 
avg = pd.DataFrame()
print(avg.shape)
metric = 'CDF'
avg[metric] = np.nan
for stock,frame in df.items():
    frame = frame[[metric]].reset_index().drop_duplicates(subset='Time', keep='last').set_index('Time')
    avg = avg.merge(frame[[metric]],left_index=True,right_index=True,how='outer',suffixes=('',stock))
    print(avg.shape)
avg = avg.dropna(axis=0,how='all').fillna(method='ffill')
avg.to_csv('req_files/CDF.csv')

(0, 0)
(2032, 2)
(2995, 3)
(4106, 4)
(4784, 5)
(5276, 6)


In [9]:
fields = ['Time','CDFSTB','CDFSAB','CDFMWG','CDFVCB','CDFTCB']
df = pd.read_csv('req_files/CDF.csv',parse_dates=['Time'],index_col=[0],usecols = fields)

# rolling_pariwise_corr = pd.rolling_corr(df,window=50,pairwise=True)
rolling_pariwise_corr = df.rolling(window=50).corr()

thres = pd.DataFrame()
thres['AvgCorrAssets'] = rolling_pariwise_corr.groupby(by=['Time']).sum().sum(axis=1)/((len(fields)-1)**2)
thres.to_csv('req_files/AvgCorrAssets.csv')

In [10]:
STB = transform_buy_sell_volume(data_dict=data_orderbook, key1 = "STB")
STB["EX"] = "HNX"

In [11]:
STB.rename(columns = {
    "Gia_Mua": "BID",
    "KL_mua": "BIDSIZ",
    "Gia_Ban": "ASK",
    "KL_ban": "ASKSIZ"
    }, inplace = True)
STB.set_index("Date", inplace=True)

In [12]:
quote_imb=imbalance(STB)
quote_imb.to_csv('req_files/imbalance.csv')

In [13]:
sec_trades = data_tick["STB"]
sec_trades["EX"] = "HNX"
print('File read complete')

File read complete


In [14]:
# exchanges = sec_trades['EX'].unique()
# exchanges = exchanges[:-2]
# bucketsize_standard = 100000
# rolling_window = 50
# df_list = list()
# df_vpin_list = list()
# volume_exchanges = list()
# bucketsize = list()
# for i in range(len(exchanges)-1):
#     df_list.append(sec_trades[sec_trades['EX'] == exchanges[i]])
# for i in range(len(exchanges)):
#     volume_exchanges.append(df_list[i]['SIZE'].sum())
# nbuckets = 6574
# for i in range(len(exchanges)):
#     bucketsize = int(volume_exchanges[i]/nbuckets)
#     df_vpin_list.append(calc_vpin(df_list[i],bucketsize,rolling_window))
    
    
# avg = pd.DataFrame()
# metric = 'VPIN'
# avg[metric] = np.nan
# for i in range(len(exchanges)):
#     print(exchanges[i])
#     frame = df_vpin_list[i]
#     frame = frame[[metric]].reset_index().drop_duplicates(subset='Time', keep='last').set_index('Time')
#     avg = avg.merge(frame[[metric]],left_index=True,right_index=True,how='outer',suffixes=('',exchanges[i]))
#     print(avg.shape)
# avg = avg.dropna(axis=0,how='all').fillna(method='ffill')
# del avg['VPIN']
# avg = avg.dropna()
# print(avg)
# # rolling_pariwise_corr = pd.rolling_corr(avg,window=50,pairwise=True)
# rolling_pariwise_corr = avg.rolling(window=50).corr()
# thres = pd.DataFrame()
# thres['AvgCorrEx'] = rolling_pariwise_corr.groupby(by=['Time']).sum().sum(axis=1)/(len(exchanges)**2)
# print(thres.tail())
# thres.to_csv('AvgCorrEx.csv')

## Applying Theory Based and ML Based Trading Strategies

In [15]:
df_corr_assets = pd.read_csv('req_files/AvgCorrAssets.csv',parse_dates=['Time'],index_col='Time')
# df_corr_ex = pd.read_csv('req_files/AvgCorrEx.csv',parse_dates=['Time'],index_col='Time') # Chứng khoán VN mỗi mã cổ phiếu chỉ có 1 exchange nên không cần phần này
df_vpin = pd.read_csv('req_files/STBVPIN.csv',parse_dates=['Time'],usecols=['Time','CDF'],index_col='Time')
df_quote_imb = pd.read_csv('req_files/imbalance.csv',parse_dates=['Date'],index_col='Date')
df_quote_imb.index.name = 'TIME'
df_price = pd.read_csv('req_files/STBprice.csv')[["Date", "PRICE"]]
df_price.index.name = 'Date'

In [16]:
total_df = pd.DataFrame()
frame = df_corr_assets[[df_corr_assets.columns[0]]].reset_index().drop_duplicates(subset='Time', keep='last').set_index('Time')
total_df = total_df.merge(frame[[frame.columns[0]]],left_index=True,right_index=True,how='outer')
print (total_df.shape)

frame = df_vpin[[df_vpin.columns[0]]].reset_index().drop_duplicates(subset='Time', keep='last').set_index('Time')
total_df = total_df.merge(frame[[frame.columns[0]]],left_index=True,right_index=True,how='outer')
print (total_df.shape)

frame = df_quote_imb[[df_quote_imb.columns[0]]].reset_index().drop_duplicates(subset='TIME', keep='last').set_index('TIME')
total_df = total_df.merge(frame[[frame.columns[0]]],left_index=True,right_index=True,how='outer')
print (total_df.shape)

frame = df_price.drop_duplicates(subset='Date', keep='last').set_index('Date')
total_df = total_df.merge(frame[[frame.columns[0]]],left_index=True,right_index=True,how='outer')
print (total_df.shape)

total_df = total_df.dropna(axis=0,how='all').fillna(method='ffill').fillna(method='bfill')
total_df = total_df.dropna(how='any')
print (total_df.shape)

(5204, 1)
(5220, 2)
(76410, 3)
(152820, 4)
(83212, 4)


In [24]:
total_df.to_csv('req_files/data_qlearner.csv')

In [25]:
df = total_df.copy()
df.index.name = "Time"
df.head()

Unnamed: 0_level_0,AvgCorrAssets,CDF,quote_imb,PRICE
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-09-23 13:00:00,0.0,0.028291,-391120.0,30.941667
2024-09-23 13:01:00,0.0,0.028291,-68310.0,30.941667
2024-09-23 13:02:00,0.0,0.028291,-623890.0,30.941667
2024-09-23 13:03:00,0.0,0.028291,-623890.0,30.941667
2024-09-23 13:04:00,0.0,0.028291,-124940.0,30.941667


In [26]:
df

Unnamed: 0_level_0,AvgCorrAssets,CDF,quote_imb,PRICE
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-09-23 13:00:00,0.000000,0.028291,-391120.0,30.941667
2024-09-23 13:01:00,0.000000,0.028291,-68310.0,30.941667
2024-09-23 13:02:00,0.000000,0.028291,-623890.0,30.941667
2024-09-23 13:03:00,0.000000,0.028291,-623890.0,30.941667
2024-09-23 13:04:00,0.000000,0.028291,-124940.0,30.941667
...,...,...,...,...
2024-11-15 14:21:00,0.217215,0.542437,-38730.0,32.550000
2024-11-15 14:22:00,0.217215,0.542437,-38730.0,32.500000
2024-11-15 14:23:00,0.217215,0.542437,-38730.0,32.500000
2024-11-15 14:25:00,0.217215,0.542437,-38730.0,32.650000


In [35]:
df['returns']=df['PRICE'].pct_change()
df['returns']=df['returns'].shift(-1)
df=df.replace([np.inf, -np.inf], np.nan)
df=df.ffill()
df = df[:-1]
df.index = pd.to_datetime(df.index)
df=df.between_time('8:00','16:00')
df=df[df.index.dayofweek < 5]
# df=df[df.index<pd.to_datetime('2016-12-30')]
df.to_csv('new_data.csv')

savg=df[['AvgCorrAssets','quote_imb']].rolling(window=100).mean()
sstd=df[['AvgCorrAssets','quote_imb']].rolling(window=100).std()

In [36]:
zcorras=(df['AvgCorrAssets']-savg['AvgCorrAssets'])/sstd['AvgCorrAssets']
zimb=(df['quote_imb']-savg['quote_imb'])/sstd['quote_imb']

# zcorrex[zcorrex>3]=3
# zcorrex[zcorrex<-3]=-3
zcorras[zcorras>3]=3
zcorras[zcorras<-3]=-3
zimb[zimb>3]=3
zimb[zimb<-3]=-3


df_trading=pd.DataFrame({'price':df['PRICE'][100:],'cdf':df['CDF'][100:],'zcorras':zcorras[100:],'zimb':zimb[100:],'returns':df['returns'][100:], 'Time':df.index[100:]}
                        ).set_index('Time')
df_trading.to_csv('final.csv')