In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from scipy.stats import skew, kurtosis

import warnings
warnings.filterwarnings('ignore')

In [2]:
symbol = "VN30"
if Path("Users").exists():  # Windows
    input_path = r"C:\Users\phamhoa\Downloads\thesis\data\Binance\agg\500"
    file_path = rf"{input_path}\{symbol}.csv"
else:  # Macbook
    input_path = "/Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500"
    file_path = f"{input_path}/{symbol}.csv"

In [3]:

file_path = "/Users/hoapham/Documents/Learning/thesis/data/orderbook/41I1FB000.parquet"

df = pd.read_parquet(file_path)
df = df.drop(columns=["Unnamed: 0"], errors='ignore')
df = df.drop_duplicates()

In [4]:
df["Time"] = df.index
df["price"] = df["price"]/1000
df["quantity"] = df["vol"]
df = df.sort_values('Time').reset_index(drop=True)
df['datetime'] = pd.to_datetime(df['Time'], unit='ms', utc=False)
df = df[["datetime", "price", "quantity", "side"]]
df['datetime'] = df['datetime'].dt.floor('S')
df["side"] = df["side"].replace({"bu": 'buy', "sd": 'sell'})
agg = (df.groupby(['datetime', 'side'])
         .agg(price_mean=('price', 'mean'),
              qty_sum=('quantity', 'sum'))
         .reset_index()
         )
price_wide = agg.pivot(index='datetime', columns='side', values='price_mean').add_prefix('price_')
qty_wide   = agg.pivot(index='datetime', columns='side', values='qty_sum').add_prefix('qty_')
out_df = pd.concat([price_wide, qty_wide], axis=1).fillna(0.0)
out_df = out_df[['price_buy', 'price_sell', 'qty_buy', 'qty_sell']].sort_index()


# Tính toán các tham số đầu vào

In [5]:
window = 50

## Chuẩn bị bộ data: Từ time bar biến đổi thành volume bar

## Tính V - Dùng để chia Volume Bucket

In [6]:
def calc_V(out_df, number_bucket=50):
    out_df["total_qty"] = out_df["qty_buy"] + out_df["qty_sell"]
    # resample theo ngày
    daily_vol = out_df["total_qty"].resample("D").sum()

    # bỏ ngày đầu và ngày cuối vì không đủ dữ liệu
    daily_vol = daily_vol.iloc[1:-1]
    # Tính Volume Bucket
    V = int(daily_vol.mean() / number_bucket)
    out_df = out_df.drop(columns=["total_qty"])
    return V

## Chia Volume Buckets

In [7]:
def get_buckets(df, bucketSize: float) -> pd.DataFrame:
    d = df.copy()
    buckets = []
    BV = SV = filled = 0.0  # Buy Vol, Sell Vol, đã lấp đầy trong bucket hiện tại

    # tích lũy cho trung bình giá mua/bán
    bid_price_num = 0.0   # sum(alloc_buy * price_buy)
    ask_price_num = 0.0   # sum(alloc_sell * price_sell)
    total_price_num = 0.0 # sum((alloc_buy*price_buy + alloc_sell*price_sell))

    for ts, row in d.iterrows():
        buy_remain  = float(row['qty_buy'])
        sell_remain = float(row['qty_sell'])
        total_remain = buy_remain + sell_remain

        while total_remain > 0:
            space = bucketSize - filled
            take = min(space, total_remain)

            # phân bổ theo tỷ lệ buy/sell còn lại
            buy_share = (buy_remain / total_remain) if total_remain > 0 else 0.0
            alloc_buy = take * buy_share
            alloc_sell = take - alloc_buy

            # cộng dồn volume
            BV += alloc_buy
            SV += alloc_sell

            # cộng dồn cho từng loại giá
            if alloc_buy > 0:
                bid_price_num += alloc_buy * float(row['price_buy'])
                total_price_num += alloc_buy * float(row['price_buy'])
            if alloc_sell > 0:
                ask_price_num += alloc_sell * float(row['price_sell'])
                total_price_num += alloc_sell * float(row['price_sell'])

            # cập nhật trạng thái
            filled += take
            buy_remain  -= alloc_buy
            sell_remain -= alloc_sell
            total_remain = buy_remain + sell_remain

            # đủ bucket → ghi lại
            if filled >= bucketSize - 1e-12:
                total_vol = BV + SV
                bid_mean = (bid_price_num / BV) if BV > 0 else np.nan
                ask_mean = (ask_price_num / SV) if SV > 0 else np.nan
                avg_price = (total_price_num / total_vol) if total_vol > 0 else np.nan

                buckets.append({
                    'Time': ts,
                    'Buy': BV,
                    'Sell': SV,
                    'Price': avg_price,    # giá chung (VWAP toàn bucket)
                    'BidPrice': bid_mean,  # giá mua trung bình
                    'AskPrice': ask_mean   # giá bán trung bình
                })

                # reset cho bucket mới
                BV = SV = filled = 0.0
                bid_price_num = ask_price_num = total_price_num = 0.0

    return pd.DataFrame(buckets)


## 1. Tính VPIN

In [8]:

def calc_vpin(df, bucketSize, window):
    df_buckets = get_buckets(df, bucketSize)
    df_buckets["Volume"] = df_buckets["Buy"] + df_buckets["Sell"]

    # VPIN: rolling mean của |Buy - Sell| / V
    df_buckets['VPIN'] = abs(df_buckets['Buy'] - df_buckets['Sell']).rolling(window).mean() / bucketSize
    #CDF
    df_buckets['CDF'] = df_buckets['VPIN'].rank(pct=True)
    return df_buckets


# Lưu dữ liệu xuống

In [None]:
def save_data(data, number_bucket, V):
    if Path("C:").exists():  # Windows
        vpin_path = rf"{input_path}\VPIN"
    else:  # Macbook        
        vpin_path = f"{input_path}/VPIN"

    if not os.path.exists(vpin_path):
        os.makedirs(vpin_path)
    data.to_csv(f"{vpin_path}/{symbol}/{number_bucket}.csv", index=False)
    print(f"Saved VPIN data to {vpin_path}/{symbol}/{number_bucket}_{V}.csv")

In [10]:
def wrap_func(number_bucket):
    V = calc_V(out_df, number_bucket=number_bucket)
    data = calc_vpin(out_df, V, window)
    save_data(data,number_bucket, V)

In [11]:
number_buckets = list(range(10, 100, 5))
print(number_buckets)

[10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]


In [12]:
for nb in number_buckets:
    wrap_func(nb)

Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/10_26795.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/15_17863.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/20_13397.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/25_10718.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/30_8931.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/35_7655.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/40_6698.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/45_5954.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30/50_5359.csv
Saved VPIN data to /Users/hoapham/Documents/Learning/thesis/data/Bina