In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from scipy.stats import skew, kurtosis

import warnings
warnings.filterwarnings('ignore')

In [2]:
number_bucket = 50
symbol = "VN30"
if Path("Users").exists():  # Windows
    input_path = r"C:\Users\phamhoa\Downloads\thesis\data\Binance\agg\500"
    file_path = rf"{input_path}\{symbol}.csv"
else:  # Macbook
    input_path = "/Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500"
    file_path = f"{input_path}/{symbol}.csv"

In [3]:

file_path = "/Users/hoapham/Documents/Learning/thesis/data/orderbook/41I1FB000.parquet"

df = pd.read_parquet(file_path)
df = df.drop(columns=["Unnamed: 0"], errors='ignore')
# data = pd.read_csv(f"{input_path}/{symbol}_28M.csv")
# data = data.drop(columns=["Unnamed: 0"], errors='ignore')
# df = pd.concat([df, data], ignore_index=True)
df = df.drop_duplicates()

In [4]:
# Dữ liệu thô khi tải từ Binance về
df.head()

Unnamed: 0_level_0,price,vol,side
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-09-24 09:05:29.752000+07:00,1811500,10,bu
2025-09-24 09:05:29.752000+07:00,1813900,10,bu
2025-09-24 09:05:29.752000+07:00,1815900,10,bu
2025-09-24 09:05:29.753000+07:00,1819900,10,bu
2025-09-24 09:05:29.753000+07:00,1816000,40,bu


In [5]:
df.tail()

Unnamed: 0_level_0,price,vol,side
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-11-20 14:45:00.304000+07:00,1898100,520,sd
2025-11-20 14:45:00.330000+07:00,1898100,660,sd
2025-11-20 14:45:00.336000+07:00,1898100,450,sd
2025-11-20 14:45:00.340000+07:00,1898100,410,sd
2025-11-20 14:45:00.353000+07:00,1898100,1810,sd


In [6]:
df["Time"] = df.index

In [7]:
df.columns

Index(['price', 'vol', 'side', 'Time'], dtype='object')

In [8]:
df["price"] = df["price"]/1000

In [9]:
df["quantity"] = df["vol"]

In [10]:
# cols = [
#     "price",
#     "quantity",
#     "firstTradeId",
#     "lastTradeId",
#     "timestamp",
#     "buyerMaker",
#     "bestPriceMatch",
# ]
# df.columns = cols
# df = df[["timestamp", "price", "quantity", "buyerMaker"]]
# df.head()


In [11]:
df = df.sort_values('Time').reset_index(drop=True)

In [12]:
df['datetime'] = pd.to_datetime(df['Time'], unit='ms', utc=False)
df = df[["datetime", "price", "quantity", "side"]]
df.head()

Unnamed: 0,datetime,price,quantity,side
0,2025-09-24 09:05:29.752000+07:00,1811.5,10,bu
1,2025-09-24 09:05:29.752000+07:00,1813.9,10,bu
2,2025-09-24 09:05:29.752000+07:00,1815.9,10,bu
3,2025-09-24 09:05:29.753000+07:00,1819.9,10,bu
4,2025-09-24 09:05:29.753000+07:00,1816.0,40,bu


In [13]:
df.tail()

Unnamed: 0,datetime,price,quantity,side
92436,2025-11-20 14:45:00.304000+07:00,1898.1,520,sd
92437,2025-11-20 14:45:00.330000+07:00,1898.1,660,sd
92438,2025-11-20 14:45:00.336000+07:00,1898.1,450,sd
92439,2025-11-20 14:45:00.340000+07:00,1898.1,410,sd
92440,2025-11-20 14:45:00.353000+07:00,1898.1,1810,sd


In [14]:
df['datetime'] = df['datetime'].dt.floor('S')
df.head()

Unnamed: 0,datetime,price,quantity,side
0,2025-09-24 09:05:29+07:00,1811.5,10,bu
1,2025-09-24 09:05:29+07:00,1813.9,10,bu
2,2025-09-24 09:05:29+07:00,1815.9,10,bu
3,2025-09-24 09:05:29+07:00,1819.9,10,bu
4,2025-09-24 09:05:29+07:00,1816.0,40,bu


- buyerMaker cho biết buyer là maker hay taker:
    - buyerMaker = False → buyer là taker (mua chủ động ăn ask) ⇒ buy-initiated, giá khớp ở ask side
    - buyerMaker = True  → buyer là maker (đặt bid chờ, bị sell chủ động đập vào) ⇒ sell-initiated, giá khớp ở bid side
- => Hướng trade luôn xác định theo bên chủ động (taker).

In [15]:
# df['side'] = np.where(df['buyerMaker'], 'sell', 'buy')
# df.head()

In [16]:
df["side"] = df["side"].replace({"bu": 'buy', "sd": 'sell'})

In [17]:
agg = (df.groupby(['datetime', 'side'])
         .agg(price_mean=('price', 'mean'),
              qty_sum=('quantity', 'sum'))
         .reset_index()
         )

In [18]:
price_wide = agg.pivot(index='datetime', columns='side', values='price_mean').add_prefix('price_')
qty_wide   = agg.pivot(index='datetime', columns='side', values='qty_sum').add_prefix('qty_')


In [19]:
out_df = pd.concat([price_wide, qty_wide], axis=1).fillna(0.0)
out_df = out_df[['price_buy', 'price_sell', 'qty_buy', 'qty_sell']].sort_index()
out_df.head()

side,price_buy,price_sell,qty_buy,qty_sell
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-09-24 09:05:29+07:00,1815.44,0.0,80.0,0.0
2025-09-24 09:06:59+07:00,1811.2,0.0,10.0,0.0
2025-09-24 09:07:10+07:00,0.0,1811.0,0.0,10.0
2025-09-24 09:08:11+07:00,0.0,1811.2,0.0,10.0
2025-09-24 09:09:07+07:00,0.0,1810.0,0.0,10.0


# Tính toán các tham số đầu vào

In [20]:
window = 50
h = 50

## Chuẩn bị bộ data: Từ time bar biến đổi thành volume bar

## Tính V - Dùng để chia Volume Bucket

In [21]:
# Tính V
out_df["total_qty"] = out_df["qty_buy"] + out_df["qty_sell"]
# resample theo ngày
daily_vol = out_df["total_qty"].resample("D").sum()

# bỏ ngày đầu và ngày cuối vì không đủ dữ liệu
daily_vol = daily_vol.iloc[1:-1]
# Tính Volume Bucket
V = int(daily_vol.mean() / number_bucket)
out_df = out_df.drop(columns=["total_qty"])
V

5359

## Chia Volume Buckets

In [22]:
def get_buckets(df, bucketSize: float) -> pd.DataFrame:
    d = df.copy()
    buckets = []
    BV = SV = filled = 0.0  # Buy Vol, Sell Vol, đã lấp đầy trong bucket hiện tại

    # tích lũy cho trung bình giá mua/bán
    bid_price_num = 0.0   # sum(alloc_buy * price_buy)
    ask_price_num = 0.0   # sum(alloc_sell * price_sell)
    total_price_num = 0.0 # sum((alloc_buy*price_buy + alloc_sell*price_sell))

    for ts, row in d.iterrows():
        buy_remain  = float(row['qty_buy'])
        sell_remain = float(row['qty_sell'])
        total_remain = buy_remain + sell_remain

        while total_remain > 0:
            space = bucketSize - filled
            take = min(space, total_remain)

            # phân bổ theo tỷ lệ buy/sell còn lại
            buy_share = (buy_remain / total_remain) if total_remain > 0 else 0.0
            alloc_buy = take * buy_share
            alloc_sell = take - alloc_buy

            # cộng dồn volume
            BV += alloc_buy
            SV += alloc_sell

            # cộng dồn cho từng loại giá
            if alloc_buy > 0:
                bid_price_num += alloc_buy * float(row['price_buy'])
                total_price_num += alloc_buy * float(row['price_buy'])
            if alloc_sell > 0:
                ask_price_num += alloc_sell * float(row['price_sell'])
                total_price_num += alloc_sell * float(row['price_sell'])

            # cập nhật trạng thái
            filled += take
            buy_remain  -= alloc_buy
            sell_remain -= alloc_sell
            total_remain = buy_remain + sell_remain

            # đủ bucket → ghi lại
            if filled >= bucketSize - 1e-12:
                total_vol = BV + SV
                bid_mean = (bid_price_num / BV) if BV > 0 else np.nan
                ask_mean = (ask_price_num / SV) if SV > 0 else np.nan
                avg_price = (total_price_num / total_vol) if total_vol > 0 else np.nan

                buckets.append({
                    'Time': ts,
                    'Buy': BV,
                    'Sell': SV,
                    'Price': avg_price,    # giá chung (VWAP toàn bucket)
                    'BidPrice': bid_mean,  # giá mua trung bình
                    'AskPrice': ask_mean   # giá bán trung bình
                })

                # reset cho bucket mới
                BV = SV = filled = 0.0
                bid_price_num = ask_price_num = total_price_num = 0.0

    return pd.DataFrame(buckets)


## 1. Tính VPIN

In [23]:

def calc_vpin(df, bucketSize, window):
    df_buckets = get_buckets(df, bucketSize)
    df_buckets["Volume"] = df_buckets["Buy"] + df_buckets["Sell"]

    # VPIN: rolling mean của |Buy - Sell| / V
    df_buckets['VPIN'] = abs(df_buckets['Buy'] - df_buckets['Sell']).rolling(window).mean() / bucketSize

    # CDF:
    # # Cũ
    # df_buckets['CDF'] = df_buckets['VPIN'].rank(pct=True)
    # Mới
    df_buckets['Time'] = pd.to_datetime(df_buckets['Time'], utc=False)
    df_buckets['CDF'] = (
        df_buckets.groupby(df_buckets['Time'].dt.normalize())['VPIN']
                .rank(pct=True)
    )
    return df_buckets


In [24]:

data = calc_vpin(out_df, V, window)
data.tail()

Unnamed: 0,Time,Buy,Sell,Price,BidPrice,AskPrice,Volume,VPIN,CDF
2826,2025-11-20 14:25:15+07:00,1636.0,3723.0,1900.313249,1901.568888,1899.761483,5359.0,0.372114,0.83871
2827,2025-11-20 14:27:26+07:00,4440.0,919.0,1898.997114,1898.993093,1899.01654,5359.0,0.370606,0.806452
2828,2025-11-20 14:28:37+07:00,3381.0,1978.0,1898.462829,1898.492902,1898.411426,5359.0,0.364575,0.451613
2829,2025-11-20 14:29:51+07:00,3749.0,1610.0,1898.272905,1898.291411,1898.229814,5359.0,0.367785,0.709677
2830,2025-11-20 14:45:00+07:00,0.0,5359.0,1898.103545,,1898.103545,5359.0,0.384878,1.0


# Lưu dữ liệu xuống

In [25]:
if Path("C:").exists():  # Windows
    vpin_path = rf"{input_path}\VPIN"
else:  # Macbook        
    vpin_path = f"{input_path}/VPIN"

if not os.path.exists(vpin_path):
    os.makedirs(vpin_path)
data.to_csv(f"{vpin_path}/{symbol}_{number_bucket}.csv", index=False)

In [26]:
print(f"{vpin_path}/{symbol}_{number_bucket}.csv")

/Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/VN30_50.csv
