In [1]:

import pandas as pd
from pathlib import Path
import warnings
import os
import pandas as pd
import matplotlib.pyplot as plt

import datetime
warnings.filterwarnings("ignore")


In [2]:
# Bucket counts to process: every multiple of 5 from 10 up to and including 95.
number_buckets = list(range(10, 100, 5))
# Instrument code used by read_data() to build the input file name.
symbol = "VN30"

In [3]:
def read_data(number_bucket):
    """Load the VPIN table for the module-level ``symbol``.

    Parameters
    ----------
    number_bucket : int
        Bucket setting; on non-Windows hosts it selects
        ``<input_path>/<symbol>/<number_bucket>.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``Time`` parsed as datetimes and the
        ``Unnamed: 0`` / ``KyleLambda`` columns removed when present.
    """
    # Decide by operating system. The previous check, Path("Users").exists(),
    # was relative to the current working directory, so the branch taken
    # depended on where the kernel was started rather than on the platform.
    if os.name == "nt":  # Windows
        input_path = r"C:\Users\phamhoa\Downloads\thesis\data\Binance\agg\500\VPIN"
        # NOTE(review): this branch ignores number_bucket, so every bucket
        # setting reads the same file -- confirm the Windows directory layout.
        file_path = rf"{input_path}\{symbol}.csv"
    else:  # Macbook
        input_path = "/Users/hoapham/Documents/Learning/thesis/data/Binance/agg/500/VPIN/"
        file_path = f"{input_path}/{symbol}/{number_bucket}.csv"

    data = pd.read_csv(file_path, parse_dates=["Time"])

    # errors="ignore" keeps this a no-op for files that lack either column.
    return data.drop(columns=["Unnamed: 0", "KyleLambda"], errors="ignore")

In [4]:
def get_output_path(number_bucket):
    """Ensure the chart folder for ``number_bucket`` exists and return its name.

    The folder is ``charts_robust_CDF_<number_bucket>``, relative to the
    current working directory; an already existing folder is reused.
    """
    chart_dir = "charts_robust_CDF_{}".format(number_bucket)
    os.makedirs(chart_dir, exist_ok=True)
    return chart_dir

In [5]:

def _padded_price_limits(p_min, p_max, targets=(50, 60, 80, 100, 120, 150)):
    """Return ``(y_min, y_max)`` for the price axis, centred on the day's range.

    The spread ``p_max - p_min`` is widened symmetrically to the first value in
    ``targets`` (ascending) that is strictly greater than it. When the spread is
    at least the largest target, a 5 % margin is added on each side instead.
    """
    spread = p_max - p_min
    pad = 0.05 * spread
    for target in targets:
        if spread < target:
            pad = (target - spread) / 2.0
            break
    return p_min - pad, p_max + pad


def _contiguous_true_spans(xs, flags):
    """Return ``(first_x, last_x)`` for every run of consecutive truthy ``flags``.

    ``xs`` and ``flags`` are iterated in parallel. A run made of a single point
    yields a zero-width span ``(x, x)``.
    """
    spans = []
    start = last = None
    for x, flag in zip(xs, flags):
        if flag:
            if start is None:
                start = x
            last = x
        elif start is not None:
            spans.append((start, last))
            start = last = None
    if start is not None:
        spans.append((start, last))
    return spans


def _plot_one_day(df_day, save_path, vpin_ylim=None, bucket_ymax=None):
    """Draw one day's Price / VPIN / CDF chart plus hourly row counts and save it.

    Parameters
    ----------
    df_day : pandas.DataFrame
        Rows of a single calendar day, indexed by timestamp, with the columns
        ``Price``, ``VPIN`` and ``CDF``.
    save_path : str
        Destination image path.
    vpin_ylim : tuple of float or None
        ``(min, max)`` applied to the VPIN/CDF axis; ``None`` leaves autoscale.
    bucket_ymax : float or None
        Upper limit of the count axis; ``None`` leaves autoscale.

    Nothing is drawn when the day has no rows inside the trading sessions
    (09:00 <= t < 11:30 or 13:00 <= t < 15:00).
    """
    df_day = df_day.sort_index()
    if df_day.empty:
        return

    # ==== Trading sessions ====
    t9 = datetime.time(9, 0)
    t1130 = datetime.time(11, 30)
    t13 = datetime.time(13, 0)
    t15 = datetime.time(15, 0)

    times = df_day.index.time

    # Keep the morning and afternoon sessions only.
    in_session = ((times >= t9) & (times < t1130)) | ((times >= t13) & (times < t15))
    df_trade = df_day[in_session].copy()
    if df_trade.empty:
        return

    # ===========================
    # new_index = trading minutes since 09:00, with the lunch break removed
    # ===========================
    df_trade["orig_time"] = df_trade.index

    day_date = df_trade.index[0].normalize()
    t9_dt = day_date + pd.Timedelta(hours=9)
    t1130_dt = day_date + pd.Timedelta(hours=11, minutes=30)
    t13_dt = day_date + pd.Timedelta(hours=13)

    morning_len = (t1130_dt - t9_dt).total_seconds() / 60.0

    def to_trading_min(ts):
        # Afternoon timestamps continue right after the morning session.
        if ts < t1130_dt:
            return (ts - t9_dt).total_seconds() / 60.0
        return morning_len + (ts - t13_dt).total_seconds() / 60.0

    df_trade["new_index"] = df_trade["orig_time"].apply(to_trading_min)

    time_to_new = dict(zip(df_trade["orig_time"], df_trade["new_index"]))
    df_trade = df_trade.set_index("new_index").sort_index()

    # ===========================
    # Create the figure
    # ===========================
    fig, (ax_price, ax_cnt) = plt.subplots(
        2, 1, figsize=(14, 7),
        sharex=True,
        gridspec_kw={"height_ratios": [3, 1]}
    )
    try:
        ax_vpin = ax_price.twinx()

        # ===========================
        # Shade the stretches where CDF > 0.9
        # ===========================
        for s, e in _contiguous_true_spans(df_trade.index, df_trade["CDF"] > 0.9):
            ax_price.axvspan(s, e, color="red", alpha=0.08)
            ax_cnt.axvspan(s, e, color="red", alpha=0.08)

        # ===========================
        # Draw Price + VPIN + CDF
        # ===========================
        ax_price.plot(df_trade.index, df_trade["Price"], lw=1.8, label="Price", color="tab:blue")
        ax_vpin.plot(df_trade.index, df_trade["VPIN"], lw=1.3, label="VPIN", color="tab:green")
        ax_vpin.plot(df_trade.index, df_trade["CDF"], lw=1.3, ls="--", label="CDF(VPIN)", color="tab:red")

        # ===========================
        # Price axis: per-day range padded to a fixed span (50 points at least)
        # ===========================
        y_min, y_max = _padded_price_limits(df_trade["Price"].min(), df_trade["Price"].max())
        ax_price.set_ylim(y_min, y_max)

        # VPIN/CDF keep one shared scale so days are comparable with each other.
        if vpin_ylim is not None:
            ax_vpin.set_ylim(*vpin_ylim)

        # ===========================
        # Ticks labelled with real clock time
        # ===========================
        xticks = []
        xtick_labels = []
        for start, end in (("09:00", "11:30"), ("13:00", "15:00")):
            session_times = [t for t in df_day.between_time(start, end).index if t in time_to_new]
            if session_times:
                xticks += [time_to_new[session_times[0]], time_to_new[session_times[-1]]]
                xtick_labels += [start, end]

        if len(xticks) == 4:
            # Both sessions present: merge "end of morning" and "start of
            # afternoon" into a single tick halfway between them.
            mid_break = (xticks[1] + xticks[2]) / 2.0
            xticks = [xticks[0], mid_break, xticks[3]]
            xtick_labels = [xtick_labels[0], "11:30/13:00", xtick_labels[3]]

        if xticks:
            ax_price.set_xticks(xticks)
            ax_price.set_xticklabels(xtick_labels, ha="center")

        # Dotted separator at the second tick (the session break when both
        # sessions are present).
        if len(xticks) >= 2:
            for ax in (ax_price, ax_cnt):
                ax.axvline(xticks[1], ls=":", color="grey", alpha=0.5)

        # ===========================
        # Row count per clock hour
        # ===========================
        df_trade["hour"] = df_trade["orig_time"].dt.floor("h")
        per_hour = df_trade.reset_index().groupby("hour")["new_index"]
        buckets_vals = per_hour.size().to_numpy()
        # Each bar sits midway between the first and last row of its hour.
        bar_x = ((per_hour.min() + per_hour.max()) / 2.0).to_numpy()

        bar_w = 10
        max_idx = buckets_vals.argmax()
        colors = ["orange" if i == max_idx else "lightgrey" for i in range(len(buckets_vals))]

        ax_cnt.bar(bar_x, buckets_vals, width=bar_w, color=colors, edgecolor="none", alpha=0.95)

        # Annotate only the busiest hour.
        ax_cnt.text(bar_x[max_idx], buckets_vals[max_idx] * 1.03,
                    str(int(buckets_vals[max_idx])),
                    ha="center", va="bottom",
                    fontsize=10, fontweight="bold", color="darkorange")

        if bucket_ymax is not None:
            ax_cnt.set_ylim(0, bucket_ymax)

        ax_price.set_ylabel("Price")
        ax_price.grid(ls="--", alpha=0.3)
        ax_vpin.set_ylabel("VPIN / CDF(VPIN)")
        ax_cnt.set_ylabel("Buckets / giờ")
        ax_cnt.grid(ls="--", alpha=0.3)

        lines1, labels1 = ax_price.get_legend_handles_labels()
        lines2, labels2 = ax_vpin.get_legend_handles_labels()
        ax_price.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

        # tight_layout recomputes hspace, so the explicit panel gap has to be
        # applied after it or it is silently discarded.
        fig.tight_layout()
        fig.subplots_adjust(hspace=0.08)
        fig.savefig(save_path, dpi=200)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)


def wrap(number_bucket, exclude_dates=(datetime.date(2025, 10, 20),)):
    """Render one chart per calendar day for the given bucket setting.

    Reads the data with ``read_data(number_bucket)``, drops rows containing
    NaN, and writes ``<YYYY-MM-DD>.png`` for every day into the folder returned
    by ``get_output_path(number_bucket)``. Each path is printed before the
    chart is drawn.

    Parameters
    ----------
    number_bucket : int
        Bucket setting forwarded to ``read_data`` and ``get_output_path``.
    exclude_dates : datetime.date, iterable of datetime.date, or None
        Days left out when computing the axis limits shared by all charts.
        They are still plotted. Defaults to 2025-10-20.
    """
    data = read_data(number_bucket)
    output_dir = get_output_path(number_bucket)
    # After this: index = Time (datetime), columns Price, VPIN, CDF.
    df = data.dropna().sort_values("Time").set_index("Time")
    if df.empty:
        return

    if exclude_dates is None:
        exclude_dates = ()
    elif isinstance(exclude_dates, datetime.date):
        exclude_dates = (exclude_dates,)

    # Leave the excluded days out of the SCALE computation only.
    keep_for_scale = ~pd.Index(df.index.date).isin(list(exclude_dates))
    df_scale = df.loc[keep_for_scale]

    # Trading hours only.
    trading_scale = pd.concat([
        df_scale.between_time("09:00", "11:30"),
        df_scale.between_time("13:00", "15:00"),
    ])

    # Shared Y range for VPIN/CDF.
    vpin_lo = min(trading_scale["VPIN"].min(), trading_scale["CDF"].min())
    vpin_hi = max(trading_scale["VPIN"].max(), trading_scale["CDF"].max())

    # Shared Y ceiling for the hourly counts: busiest hour plus 40 % headroom.
    bucket_ymax = trading_scale.index.floor("h").value_counts().max() * 1.4

    # With an empty scaling sample the limits are NaN and set_ylim would raise;
    # fall back to matplotlib's autoscale in that case.
    vpin_ylim = (vpin_lo, vpin_hi) if pd.notna(vpin_lo) and pd.notna(vpin_hi) else None
    if pd.isna(bucket_ymax):
        bucket_ymax = None

    for day, df_day in df.groupby(df.index.date):
        date_str = day.strftime("%Y-%m-%d")
        save_path = os.path.join(output_dir, f"{date_str}.png")
        print(save_path)
        _plot_one_day(df_day, save_path, vpin_ylim=vpin_ylim, bucket_ymax=bucket_ymax)


In [6]:
# Render the per-day charts for every configured bucket count.
# wrap() prints the path of each image it writes, which produces the listing below.
for number_bucket in number_buckets:
    wrap(number_bucket)

charts_robust_CDF_10/2025-10-20.png
charts_robust_CDF_10/2025-10-21.png
charts_robust_CDF_10/2025-10-22.png
charts_robust_CDF_10/2025-10-23.png
charts_robust_CDF_10/2025-10-24.png
charts_robust_CDF_10/2025-10-27.png
charts_robust_CDF_10/2025-10-28.png
charts_robust_CDF_10/2025-10-29.png
charts_robust_CDF_10/2025-10-30.png
charts_robust_CDF_10/2025-10-31.png
charts_robust_CDF_10/2025-11-03.png
charts_robust_CDF_10/2025-11-04.png
charts_robust_CDF_10/2025-11-05.png
charts_robust_CDF_10/2025-11-06.png
charts_robust_CDF_10/2025-11-07.png
charts_robust_CDF_10/2025-11-10.png
charts_robust_CDF_10/2025-11-11.png
charts_robust_CDF_10/2025-11-12.png
charts_robust_CDF_10/2025-11-13.png
charts_robust_CDF_10/2025-11-14.png
charts_robust_CDF_10/2025-11-17.png
charts_robust_CDF_10/2025-11-18.png
charts_robust_CDF_10/2025-11-19.png
charts_robust_CDF_10/2025-11-20.png
charts_robust_CDF_15/2025-10-17.png
charts_robust_CDF_15/2025-10-20.png
charts_robust_CDF_15/2025-10-21.png
charts_robust_CDF_15/2025-10