**HƯỚNG DẪN CHẠY**

*Nhóm chạy code theo thứ tự từng cell từ trên xuống xuống dưới*

**Một số điểm lưu ý:**

- *Thời gian chạy các block đa số lâu (Khoảng 10 phút riêng block 6 khoảng 25 phút)*

- *Các block 8,9,10,11 có lưu kết quả file csv* 

- **Các file csv kết quả nhóm có upload lên github**

**Tải các thư viện cần thiết** 

In [None]:
!pip install pandas numpy torch scikit-learn matplotlib
!pip install --extra-index-url https://fiinquant.github.io/fiinquantx/simple fiinquantx
!pip install --upgrade --extra-index-url https://fiinquant.github.io/fiinquantx/simple fiinquantx

**Block 1: tải dữ liệu lịch sử và realtime**

In [1]:
# Block 1 — Login & Lấy dữ liệu tất cả HOSE/HNX/UPCOM
import pandas as pd
from FiinQuantX import FiinSession, BarDataUpdate
# --- Login ---
username = "DSTC_18@fiinquant.vn"
password = "Fiinquant0606"

client = FiinSession(
    username=username,
    password=password
).login()

# --- Lấy danh sách cổ phiếu từng sàn ---
tickers_hose  = list(client.TickerList(ticker="VNINDEX"))     # HOSE
print(f"Số mã HOSE: {len(tickers_hose)}")

# --- Lấy dữ liệu lịch sử toàn bộ ---
event_history = client.Fetch_Trading_Data(
    realtime=False,
    tickers=tickers_hose,
    fields=['open','high','low','close','volume','bu','sd','fs','fn'], 
    adjusted=True,
    by="1d",
    from_date="2023-01-01"   # Lấy dữ liệu từ 2023 tới nay
)

df_all = event_history.get_data()
print("History ban đầu:", df_all.head())

# --- Callback realtime ---
def onDataUpdate(data: BarDataUpdate):
    global df_all
    df_update = data.to_dataFrame()
    df_all = pd.concat([df_all, df_update])
    df_all = df_all.drop_duplicates()
    print("Realtime update:")
    print(df_update.head())

# --- Bật realtime nối tiếp dữ liệu ---
event_realtime = client.Fetch_Trading_Data(
    realtime=True,
    tickers=tickers_hose,
    fields=['open','high','low','close','volume','bu','sd','fs','fn'], 
    adjusted=True,
    by="1d",
    period=1,
    callback=onDataUpdate
)


Số mã HOSE: 413
Fetching data, it may take a while. Please wait...
History ban đầu:   ticker         timestamp      open      high       low     close     volume  \
0    AAA  2023-01-03 00:00  6539.643  6866.145  6539.643  6866.145  1543984.0   
1    AAA  2023-01-04 00:00  6866.145  7000.587  6827.733  6827.733  1302505.0   
2    AAA  2023-01-05 00:00  6866.145  6904.557  6808.527  6885.351   980473.0   
3    AAA  2023-01-06 00:00  6885.351  6990.984  6818.130  6856.542  1431699.0   
4    AAA  2023-01-09 00:00  6914.160  6962.175  6760.512  6789.321  1121385.0   

         bu        sd           fs           fn  
0  938600.0  504700.0   40579000.0  899404000.0  
1  462900.0  780600.0  151639000.0   36850000.0  
2  487200.0  473700.0  343911000.0  -59103000.0  
3  564300.0  828300.0  345999000.0 -294312000.0  
4  414000.0  631800.0  514557000.0 -483197000.0  


**Block 2: lấy dữ liệu FA, lọc các mã không hợp lệ**

In [3]:
# Block 2 — Lấy dữ liệu FA theo quý (VNINDEX) với retry + backoff, skip ticker không hợp lệ và lưu incremental

import os
import re
import time
import random
import pandas as pd
from vnstock import Finance

# --- Dùng tickers_hose từ Block 1 nếu có, nếu không lấy lại từ client ---
try:
    tickers_vnindex = tickers_hose
except NameError:
    tickers_vnindex = list(client.TickerList(ticker="VNINDEX"))

print(f"Số mã VNINDEX tổng: {len(tickers_vnindex)}")

# --- Output files ---
master_file = "vnindex_fa_quarterly_vnstock.csv"
per_ticker_dir = "vnindex_fa_by_ticker"
invalid_file = "vnindex_invalid_tickers.txt"
os.makedirs(per_ticker_dir, exist_ok=True)

# --- Retry / backoff parameters ---
MAX_RETRIES = 6
BASE_DELAY = 5
BACKOFF_BASE = 2.0
JITTER = 1.0
RATE_LIMIT_PATTERNS = [
    r"rate limit exceeded",
    r"you have sent too many requests",
    r"too many requests",
    r"rate limit",
    r"vci.*rate",
]
INVALID_TICKER_PATTERNS = [
    r"mã chứng khoán không hợp lệ",
    r"chỉ cổ phiếu mới có thông tin",
    r"không phải mã cổ phiếu",
    r"invalid symbol",
    r"symbol is invalid",
]

def parse_wait_seconds_from_msg(msg):
    if not msg:
        return None
    m = re.search(r"after\s+(\d+)\s*seconds?", msg, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))
    m2 = re.search(r"after\s+(\d+)\s*s\b", msg, flags=re.IGNORECASE)
    if m2:
        return int(m2.group(1))
    m3 = re.search(r"(\d+)", msg)
    if m3:
        return int(m3.group(1))
    return None

# --- Resume: đọc danh sách tickers đã xử lý trong master file nếu có ---
processed = set()
if os.path.exists(master_file):
    try:
        df_exist = pd.read_csv(master_file, usecols=['ticker'])
        processed.update(df_exist['ticker'].astype(str).unique().tolist())
        print(f"Resume: đã phát hiện {len(processed)} ticker đã xử lý trong '{master_file}'.")
    except Exception:
        pass

# --- Resume: đọc danh sách ticker invalid đã lưu trước đó ---
invalid_set = set()
if os.path.exists(invalid_file):
    try:
        with open(invalid_file, "r", encoding="utf-8") as f:
            for line in f:
                s = line.strip()
                if s:
                    invalid_set.add(s)
        processed.update(invalid_set)  # coi invalid như đã xử lý để skip lần sau
        print(f"Resume: {len(invalid_set)} ticker đã được đánh dấu INVALID và sẽ bị bỏ qua.")
    except Exception:
        pass

# --- Thu thập dữ liệu ---
fa_list = []
successful = []
skipped_invalid = []
failed = []

for t in tickers_vnindex:
    if t in processed:
        print(f"Skip {t} (đã có trong processed/master/invalid).")
        continue

    attempt = 0
    success = False
    last_exception_msg = None

    while attempt < MAX_RETRIES and not success:
        attempt += 1
        try:
            finance = Finance(symbol=t, source='VCI')
            df = finance.ratio(period='quarterly')
            df = pd.DataFrame(df) if df is not None else pd.DataFrame()

            if df.empty:
                print(f"[{t}] ⚠️ Không có dữ liệu FA (vnstock). Bỏ qua (không lưu).")
                # không ghi master nếu trống; đánh dấu là đã thử (không add vào processed để có thể thử lại sau nếu muốn)
                success = True
                break

            # Đảm bảo có cột ticker
            if 'ticker' not in df.columns:
                df['ticker'] = t
            else:
                df['ticker'] = df['ticker'].fillna(t)

            # Nếu có ReportDate (meta) thì drop đi
            if 'ReportDate' in df.columns:
                df = df.drop(columns=['ReportDate'])

            # Coerce numeric cho các cột (nếu có thể)
            for c in df.columns:
                if c != 'ticker':
                    df[c] = pd.to_numeric(df[c], errors='ignore')

            # --- Lưu per-ticker CSV ---
            per_file = os.path.join(per_ticker_dir, f"{t}_fa.csv")
            df.to_csv(per_file, index=False, encoding='utf-8-sig')

            # --- Append vào master CSV ---
            write_header = not os.path.exists(master_file)
            df.to_csv(master_file, mode='a', header=write_header, index=False, encoding='utf-8-sig')

            print(f"[{t}] ✓ Lấy xong và lưu (rows={len(df)}).")
            fa_list.append(df)
            successful.append(t)
            processed.add(t)  # mark processed so resume will skip next time
            success = True

            # nhẹ nhàng sleep giữa requests
            time.sleep(BASE_DELAY + random.uniform(0, JITTER))

        except Exception as e:
            msg = str(e)
            last_exception_msg = msg
            lower_msg = msg.lower()

            # --- Nếu lỗi do ticker không hợp lệ -> bỏ luôn, không retry ---
            is_invalid = any(re.search(pat, lower_msg) for pat in INVALID_TICKER_PATTERNS)
            if is_invalid:
                print(f"[{t}] ❌ Mã không hợp lệ — BỎ QUA (không retry). Message: {msg}")
                skipped_invalid.append(t)
                processed.add(t)
                # lưu vào file invalid để lần sau không thử lại
                try:
                    with open(invalid_file, "a", encoding="utf-8") as f:
                        f.write(t + "\n")
                except Exception:
                    pass
                break

            # --- Nếu nghi ngờ rate-limit hoặc server báo wait time ---
            is_rate_limit = any(pat in lower_msg for pat in RATE_LIMIT_PATTERNS)
            parsed_wait = parse_wait_seconds_from_msg(msg)

            if is_rate_limit or parsed_wait:
                wait_sec = parsed_wait if parsed_wait and parsed_wait > 0 else (BACKOFF_BASE ** attempt)
                wait_sec = wait_sec + random.uniform(0, JITTER)
                print(f"[{t}] ⚠️ Rate limit detected (attempt {attempt}/{MAX_RETRIES}). "
                      f"Đợi {wait_sec:.1f}s rồi thử lại. Message: {msg}")
                time.sleep(wait_sec)
                continue
            else:
                backoff = (BACKOFF_BASE ** attempt) + random.uniform(0, JITTER)
                print(f"[{t}] ⚠️ Lỗi khi fetch (attempt {attempt}/{MAX_RETRIES}): {msg}. "
                      f"Đợi {backoff:.1f}s rồi thử lại.")
                time.sleep(backoff)
                continue

    # nếu vòng retry kết thúc mà không success và không thuộc invalid -> mark failed
    if not success and t not in skipped_invalid:
        print(f"[{t}] ❌ Không lấy được dữ liệu sau {MAX_RETRIES} lần. Bỏ qua ticker này. Lỗi cuối: {last_exception_msg}")
        failed.append(t)

# --- Kết quả tổng kết ---
print("=== Hoàn tất Block 2 (vnstock) ===")
print(f"Tổng tickers xử lý thành công: {len(successful)}")
print(f"Tổng tickers bị đánh dấu INVALID và bỏ qua: {len(skipped_invalid)}")
print(f"Tổng tickers thất bại (retry hết nhưng không invalid): {len(failed)}")

if fa_list:
    sample_df = pd.concat(fa_list, ignore_index=True, sort=False).head(10)
    display(sample_df)
else:
    print("❗ Không có dữ liệu FA thu được cho các ticker đã chạy.")


Số mã VNINDEX tổng: 413
Resume: đã phát hiện 3 ticker đã xử lý trong 'vnindex_fa_quarterly_vnstock.csv'.
Skip CCC (đã có trong processed/master/invalid).
Skip SBG (đã có trong processed/master/invalid).
[FUCTVGF3] ❌ Mã không hợp lệ — BỎ QUA (không retry). Message: Mã chứng khoán không hợp lệ. Chỉ cổ phiếu mới có thông tin.
[FUEIP100] ❌ Mã không hợp lệ — BỎ QUA (không retry). Message: Mã chứng khoán không hợp lệ. Chỉ cổ phiếu mới có thông tin.
[GMH] ✓ Lấy xong và lưu (rows=18).
[FUEKIV30] ❌ Mã không hợp lệ — BỎ QUA (không retry). Message: Mã chứng khoán không hợp lệ. Chỉ cổ phiếu mới có thông tin.
[NO1] ✓ Lấy xong và lưu (rows=17).
[FUCTVGF4] ❌ Mã không hợp lệ — BỎ QUA (không retry). Message: Mã chứng khoán không hợp lệ. Chỉ cổ phiếu mới có thông tin.
[RYG] ✓ Lấy xong và lưu (rows=7).
[FUEDCMID] ❌ Mã không hợp lệ — BỎ QUA (không retry). Message: Mã chứng khoán không hợp lệ. Chỉ cổ phiếu mới có thông tin.
[FUEKIVFS] ❌ Mã không hợp lệ — BỎ QUA (không retry). Message: Mã chứng khoán không 

Unnamed: 0_level_0,Meta,Meta,Meta,Chỉ tiêu cơ cấu nguồn vốn,Chỉ tiêu cơ cấu nguồn vốn,Chỉ tiêu cơ cấu nguồn vốn,Chỉ tiêu cơ cấu nguồn vốn,Chỉ tiêu hiệu quả hoạt động,Chỉ tiêu hiệu quả hoạt động,Chỉ tiêu hiệu quả hoạt động,...,Chỉ tiêu định giá,Chỉ tiêu định giá,Chỉ tiêu định giá,Chỉ tiêu định giá,Chỉ tiêu định giá,Chỉ tiêu định giá,Chỉ tiêu định giá,Chỉ tiêu định giá,Chỉ tiêu định giá,ticker
Unnamed: 0_level_1,ticker,yearReport,lengthReport,(ST+LT borrowings)/Equity,Debt/Equity,Fixed Asset-To-Equity,Owners' Equity/Charter Capital,Asset Turnover,Fixed Asset Turnover,Days Sales Outstanding,...,Market Capital (Bn. VND),Outstanding Share (Mil. Shares),P/E,P/B,P/S,P/Cash Flow,EPS (VND),BVPS (VND),EV/EBITDA,Unnamed: 21_level_1
0,GMH,2025,2,0.0,0.051588,0.12465,1.090538,0.562538,4.422492,63.351078,...,145200000000.0,16500000.0,11.480752,0.806941,1.362896,3.795689,356.816813,10905.381178,11.204889,GMH
1,GMH,2025,1,0.022473,0.051838,0.130708,1.089856,0.495824,3.693597,71.229634,...,131175000000.0,16500000.0,16.68135,0.729454,1.406508,3.940808,121.619694,10898.564366,20.561323,GMH
2,GMH,2024,4,0.0,0.038005,0.138373,1.077694,0.461186,3.248087,69.708499,...,122925000000.0,16500000.0,20.639945,0.691291,1.395969,10.732768,120.501797,10776.944672,19.378872,GMH
3,GMH,2024,3,0.038418,0.106987,0.146877,1.065434,0.456444,3.178684,66.693418,...,131340000000.0,16500000.0,21.487381,0.747113,1.491061,7.664694,167.561987,10654.338632,31.526449,GMH
4,GMH,2024,2,0.015499,0.063503,0.156407,1.048888,0.438959,2.972644,64.985075,...,136620000000.0,16500000.0,19.196756,0.789407,1.585955,11.360902,66.896642,10488.880887,18.004344,GMH
5,GMH,2024,1,0.009567,0.032987,0.150374,1.142198,0.473768,3.133843,60.882722,...,148500000000.0,16500000.0,16.702805,0.787954,1.568495,14.30098,5.990141,11421.984245,15.23697,GMH
6,GMH,2023,4,0.0,0.047413,0.157043,1.142922,0.572311,3.624814,53.195105,...,165825000000.0,16500000.0,12.102004,0.880344,1.460188,-104.265001,130.001223,11415.994104,12.30996,GMH
7,GMH,2023,3,0.040551,0.115587,0.165952,1.128155,0.637397,4.058792,50.114334,...,160050000000.0,16500000.0,9.127335,0.859811,1.234861,59.962717,228.434875,11281.546389,9.337824,GMH
8,GMH,2023,2,0.0,0.088011,0.175449,1.105311,0.682272,4.248544,47.441126,...,159225000000.0,16500000.0,8.101948,0.873057,1.143669,24.127825,174.4054,11053.111514,9.445047,GMH
9,GMH,2023,1,0.043288,0.101775,0.169373,1.137871,0.724659,4.420403,40.66549,...,166650000000.0,16500000.0,6.932984,0.887623,1.121519,12.31126,293.15301,11378.706114,7.042915,GMH


**Block 3: Chuẩn hóa FA và gộp dữ liệu với giá**

In [4]:
# Block 3 — Chuẩn hoá FA (từ file per-ticker có 2 dòng header) + Merge với giá và lưu CSV

import os
import pandas as pd

# --- Các cột FA cần lấy ---
fa_fields = ["Debt/Equity", "Net Profit Margin (%)", "P/E", "P/B"]
need_cols = ["ticker","yearReport","lengthReport"] + fa_fields

# --- Đọc toàn bộ FA từ thư mục ---
folder = "vnindex_fa_by_ticker"
fa_list = []

for file in os.listdir(folder):
    if file.endswith("_fa.csv"):
        path = os.path.join(folder, file)
        try:
            # ⚡ Bỏ dòng header đầu (Meta...), lấy dòng thứ 2 làm header
            df = pd.read_csv(path, header=1)

            # Đảm bảo đủ cột cần thiết
            for col in need_cols:
                if col not in df.columns:
                    df[col] = pd.NA

            df = df[need_cols]
            fa_list.append(df)
        except Exception as e:
            print(f"⚠️ Lỗi đọc {file}: {e}")

# --- Gộp toàn bộ FA ---
fa_data = pd.concat(fa_list, ignore_index=True)
print(f"Đọc dữ liệu FA từ {len(fa_list)} ticker, tổng {len(fa_data)} dòng")

# --- Chuẩn hoá giá (df_all từ Block 1) ---
df_price = df_all[df_all["ticker"].isin(fa_data["ticker"].unique())].copy()
df_price["timestamp"] = pd.to_datetime(df_price["timestamp"])
df_price = df_price.sort_values(["ticker","timestamp"])

# tạo key (fa_year, fa_quarter) = quý trước để tránh data leak
pi = df_price["timestamp"].dt.to_period("Q")
prev_pi = pi - 1
df_price["fa_year"] = prev_pi.dt.year.astype(int)
df_price["fa_quarter"] = prev_pi.dt.quarter.astype(int)

# --- Chuẩn hoá FA ---
fa_clean = fa_data.rename(columns={
    "yearReport": "fa_year",
    "lengthReport": "fa_quarter"
})
# ép kiểu int cho chắc
fa_clean["fa_year"] = fa_clean["fa_year"].astype("Int64")
fa_clean["fa_quarter"] = fa_clean["fa_quarter"].astype("Int64")

fa_clean = (
    fa_clean.sort_values(["ticker","fa_year","fa_quarter"])
            .drop_duplicates(subset=["ticker","fa_year","fa_quarter"], keep="last")
)

# --- Merge giá + FA ---
df_merged = df_price.merge(
    fa_clean,
    on=["ticker","fa_year","fa_quarter"],
    how="left"
)

# FFill để lấp chỗ trống
df_merged = df_merged.sort_values(["ticker","timestamp"])
df_merged[fa_fields] = df_merged.groupby("ticker")[fa_fields].ffill()

# --- Lưu ra CSV ---
output_file = "vnindex_price_fa_merged.csv"
df_merged.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"✅ Đã lưu dữ liệu merged giá + FA vào: {output_file}")

# --- Preview ---
print("Sample merged:")
print(df_merged.head())
print("Số mã merge thành công:", df_merged['ticker'].nunique())


Đọc dữ liệu FA từ 391 ticker, tổng 17586 dòng
✅ Đã lưu dữ liệu merged giá + FA vào: vnindex_price_fa_merged.csv
Sample merged:
  ticker  timestamp      open      high       low     close     volume  \
0    AAA 2023-01-03  6539.643  6866.145  6539.643  6866.145  1543984.0   
1    AAA 2023-01-04  6866.145  7000.587  6827.733  6827.733  1302505.0   
2    AAA 2023-01-05  6866.145  6904.557  6808.527  6885.351   980473.0   
3    AAA 2023-01-06  6885.351  6990.984  6818.130  6856.542  1431699.0   
4    AAA 2023-01-09  6914.160  6962.175  6760.512  6789.321  1121385.0   

         bu        sd           fs           fn  fa_year  fa_quarter  \
0  938600.0  504700.0   40579000.0  899404000.0     2022           4   
1  462900.0  780600.0  151639000.0   36850000.0     2022           4   
2  487200.0  473700.0  343911000.0  -59103000.0     2022           4   
3  564300.0  828300.0  345999000.0 -294312000.0     2022           4   
4  414000.0  631800.0  514557000.0 -483197000.0     2022           4

**Xóa biến df_all không cần thiết nữa để giảm dung lượng RAM**

**Block 4: Tính các chỉ số TA dựa vào thư viện FiinQuant và ghép dữ liệu**

In [31]:
# Block 4 — Tính các chỉ số TA cho từng mã (KHÔNG có regime)

import pandas as pd
import numpy as np

# --- Load dữ liệu merge từ Block 3 ---
df_merged = pd.read_csv("vnindex_price_fa_merged.csv", parse_dates=["timestamp"])

# --- Khởi tạo Indicator ---
fi = client.FiinIndicator()

# --- Hàm tính TA cho từng ticker ---
def add_ta_indicators(df):
    df = df.sort_values("timestamp").copy().reset_index(drop=True)

    # EMA
    df['ema_5']  = fi.ema(df['close'], window=5)
    df['ema_20'] = fi.ema(df['close'], window=20)
    df['ema_50'] = fi.ema(df['close'], window=50)

    # MACD
    df['macd']        = fi.macd(df['close'], window_fast=12, window_slow=26)
    df['macd_signal'] = fi.macd_signal(df['close'], window_fast=12, window_slow=26, window_sign=9)
    df['macd_diff']   = fi.macd_diff(df['close'], window_fast=12, window_slow=26, window_sign=9)

    # RSI
    df['rsi'] = fi.rsi(df['close'], window=14)

    # Bollinger Bands
    df['bollinger_hband'] = fi.bollinger_hband(df['close'], window=20, window_dev=2)
    df['bollinger_lband'] = fi.bollinger_lband(df['close'], window=20, window_dev=2)

    # ATR
    df['atr'] = fi.atr(df['high'], df['low'], df['close'], window=14)

    # OBV
    df['obv'] = fi.obv(df['close'], df['volume'])

    # VWAP
    df['vwap'] = fi.vwap(df['high'], df['low'], df['close'], df['volume'], window=14)

    return df

# --- Áp dụng TA cho toàn bộ df_merged ---
df_with_ta = df_merged.groupby("ticker", group_keys=False).apply(add_ta_indicators)

# --- Lưu ra CSV (không còn regime nữa) ---
output_file = "vnindex_price_fa_ta.csv"
df_with_ta.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"✅ Đã lưu dữ liệu có TA (từng mã, KHÔNG regime) vào: {output_file}")

# --- Preview ---
print("Sample with TA:")
print(df_with_ta[['ticker','timestamp','close','ema_20','ema_50','macd','rsi']].head())
print("Shape sau khi thêm TA:", df_with_ta.shape)

✅ Đã lưu dữ liệu có TA (từng mã, KHÔNG regime) vào: vnindex_price_fa_ta.csv
Sample with TA:
  ticker  timestamp     close  ema_20  ema_50  macd  rsi
0    AAA 2023-01-03  6866.145     NaN     NaN   NaN  NaN
1    AAA 2023-01-04  6827.733     NaN     NaN   NaN  NaN
2    AAA 2023-01-05  6885.351     NaN     NaN   NaN  NaN
3    AAA 2023-01-06  6856.542     NaN     NaN   NaN  NaN
4    AAA 2023-01-09  6789.321     NaN     NaN   NaN  NaN
Shape sau khi thêm TA: (264721, 29)


**Xóa bớt biến df_merged không còn cần thiết để giảm dung lượng RAM**

In [5]:
import gc
del df_merged
gc.collect()

1802

**Block 5: Chuẩn hóa dữ liệu FA và TA**

In [32]:
# Block 5 — Feature engineering & scaling (NO regime, short version)

import numpy as np

# --- Danh sách cột FA & TA ---
fa_features = ["Debt/Equity", "Net Profit Margin (%)", "P/E", "P/B"]

ta_features = [
    "ema_5","ema_20","ema_50","macd","macd_signal","macd_diff",
    "rsi","bollinger_hband","bollinger_lband","atr","obv","vwap"
]

# --- Chuẩn hoá FA: cross-section min-max scaling theo ngày ---
def scale_fa_minmax(df):
    df_scaled = df.copy()
    for f in fa_features:
        vals = pd.to_numeric(df[f], errors="coerce")
        vmin, vmax = vals.min(), vals.max()
        if np.isfinite(vmin) and np.isfinite(vmax) and vmax > vmin:
            df_scaled[f] = (vals - vmin) / (vmax - vmin)
        else:
            df_scaled[f] = np.nan
    return df_scaled

df_scaled_fa = df_with_ta.groupby("timestamp", group_keys=False).apply(scale_fa_minmax)

# --- Chuẩn hoá TA: rolling z-score theo từng ticker ---
def zscore_rolling(series, window=60):
    return (series - series.rolling(window).mean()) / series.rolling(window).std()

df_scaled = df_scaled_fa.groupby("ticker", group_keys=False).apply(
    lambda g: g.assign(**{f"{col}_z": zscore_rolling(g[col], 60) for col in ta_features})
)

# --- Drop các cột gốc TA, giữ bản z-score ---
keep_cols = ["ticker","timestamp"] + fa_features + [f"{col}_z" for col in ta_features]
df_features = df_scaled[keep_cols].dropna().reset_index(drop=True)

print("Sample features:")
print(df_features.head())
print("Shape sau khi scaling & dropna:", df_features.shape)

Sample features:
  ticker  timestamp  Debt/Equity  Net Profit Margin (%)       P/E       P/B  \
0    AAA 2023-06-14     0.302753               0.976223  0.581524  0.119378   
1    AAM 2023-06-14     0.274571               0.976478  0.567593  0.081723   
2    AAT 2023-06-14     0.289872               0.976190  0.566385  0.088712   
3    ABR 2023-06-14     0.281477               0.979266  0.568159  0.136659   
4    ABS 2023-06-14     0.302868               0.977524  0.572447  0.092493   

    ema_5_z  ema_20_z  ema_50_z    macd_z  macd_signal_z  macd_diff_z  \
0  1.335124  1.662545  1.799046 -0.008320       0.591579    -1.164222   
1 -0.860284 -0.664630 -0.242035 -0.764137      -0.817184    -0.109792   
2  3.147082  3.376253  3.548276  2.967265       3.142393     2.231773   
3  0.348835  1.002783  1.511919 -0.803966      -0.276326    -1.417017   
4  2.772696  3.133127  3.354561  2.500178       2.788283     1.191728   

      rsi_z  bollinger_hband_z  bollinger_lband_z     atr_z     obv_z

**Xóa các biến df_with_ta, df_scaled, df_scaled_fa không cần thiết nữa**

In [12]:
del df_with_ta, df_scaled, df_scaled_fa
gc.collect()


0

**Block 6: Giảm chiều dữ liệu bằng t-SNE và phân cụm bằng DBSCAN**

In [7]:
# Block 6 — Giảm chiều dữ liệu & phân cụm (t-SNE + DBSCAN)

from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN

# --- Chọn các cột features để phân cụm ---
# FA features: khớp với Block 3 & Block 5
fa_features = ["Debt/Equity", "Net Profit Margin (%)", "P/E", "P/B"]

# TA features đã được chuẩn hoá z-score ở Block 5
ta_features_z = [c for c in df_features.columns if c.endswith("_z")]

feature_cols = fa_features + ta_features_z

# --- Thêm cột tháng để snapshot ---
df_features["month"] = df_features["timestamp"].dt.to_period("M")

cluster_results = []

for (month, g) in df_features.groupby("month"):
    if len(g) < 10:   # quá ít cổ phiếu thì bỏ
        continue

    X = g[feature_cols].values

    # --- t-SNE giảm chiều còn 2D ---
    tsne = TSNE(n_components=2, perplexity=30, learning_rate="auto", init="random", random_state=42)
    X_emb = tsne.fit_transform(X)

    # --- DBSCAN phân cụm ---
    db = DBSCAN(eps=0.5, min_samples=5).fit(X_emb)
    labels = db.labels_

    temp = g[["ticker","timestamp"]].copy()
    temp["cluster"] = labels
    temp["tsne_x"] = X_emb[:,0]
    temp["tsne_y"] = X_emb[:,1]
    temp["month"]  = str(month)

    cluster_results.append(temp)

df_clusters = pd.concat(cluster_results, ignore_index=True)

print("Cluster sample:")
print(df_clusters.head())
print("Số cụm mỗi tháng:")
print(df_clusters.groupby("month")["cluster"].nunique())


  File "c:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 255, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Cluster sample:
  ticker  timestamp  cluster     tsne_x     tsne_y    month
0    AAA 2023-06-14       -1  17.444849 -43.739433  2023-06
1    AAA 2023-06-15       -1  17.398275 -43.821537  2023-06
2    AAA 2023-06-16       -1   5.181763 -58.317707  2023-06
3    AAA 2023-06-19       -1   4.829109 -57.954765  2023-06
4    AAA 2023-06-20       -1   4.363069 -57.532513  2023-06
Số cụm mỗi tháng:
month
2023-06     28
2023-07    127
2023-08    118
2023-09     61
2023-10    115
2023-11     74
2023-12    110
2024-01    126
2024-02     47
2024-03     96
2024-04     68
2024-05     75
2024-06    109
2024-07    110
2024-08    100
2024-09     97
2024-10    151
2024-11    108
2024-12    107
2025-01     62
2025-02     97
2025-03    113
2025-04     62
2025-05     99
2025-06    135
2025-07    141
2025-08    110
2025-09     77
2025-10      1
Name: cluster, dtype: int64


**Block 7: Xây tensors (clusters mapping) và masks (active stocks)**

In [36]:
# Block 7.2 — Tensors & Masks (fix: lưu ra folder khác)

import numpy as np
import pandas as pd
import os, gc, json

LOOKBACK = 64   # window size
DATA_DIR = "C:/tensors_out"   # đổi sang C ổ cho đơn giản
os.makedirs(DATA_DIR, exist_ok=True)
print("📂 Tensor data directory:", DATA_DIR)

feature_cols = [c for c in df_features.columns if c not in ["ticker","timestamp","cluster","month"]]

tensor_index = []

for c_id, g in df_clusters.groupby("cluster"):
    if c_id == -1:   # noise bỏ qua
        continue

    tickers = sorted(g["ticker"].unique())
    g_feat = df_features[df_features["ticker"].isin(tickers)].copy()

    # Pivot: index = timestamp, columns = (ticker, feature)
    pivoted = g_feat.pivot(index="timestamp", columns="ticker", values=feature_cols)
    pivoted.columns = pd.MultiIndex.from_product([tickers, feature_cols])

    # Mask
    mask_df = ~pivoted.isna()
    pivoted_filled = pivoted.ffill().bfill()

    T, N, F = len(pivoted_filled.index), len(tickers), len(feature_cols)
    X = pivoted_filled.values.reshape(T, N, F)
    M = mask_df.values.reshape(T, N, F).astype(np.int8)

    cluster_tensors, cluster_masks, cluster_dates = [], [], []
    for i in range(LOOKBACK, T):
        cluster_tensors.append(X[i-LOOKBACK:i])
        cluster_masks.append(M[i-LOOKBACK:i])
        cluster_dates.append(pivoted_filled.index[i])  # ngày cuối của window

    if cluster_tensors:
        X_arr = np.array(cluster_tensors, dtype=np.float16)  # tiết kiệm RAM
        M_arr = np.array(cluster_masks, dtype=np.int8)

        tensor_file = f"cluster_{c_id}_tensor.npy"
        mask_file   = f"cluster_{c_id}_mask.npy"

        np.save(os.path.join(DATA_DIR, tensor_file), X_arr)
        np.save(os.path.join(DATA_DIR, mask_file), M_arr)

        tensor_index.append({
            "cluster": int(c_id),
            "tickers": tickers,
            "dates": [str(d) for d in cluster_dates],
            "dates_shifted": [str(d+pd.Timedelta(days=1)) for d in cluster_dates],
            "tensor_file": tensor_file,
            "mask_file": mask_file
        })

        print(f"✅ Cluster {c_id}: tensor {X_arr.shape}, mask {M_arr.shape} saved.")

    del g_feat, pivoted, pivoted_filled, mask_df, X, M, cluster_tensors, cluster_masks
    gc.collect()

# Save metadata
with open(os.path.join(DATA_DIR, "tensor_index.json"), "w") as f:
    json.dump(tensor_index, f, indent=2)

print("🎯 Done Block 7.2: tensors + masks saved for all clusters.")

📂 Tensor data directory: C:/tensors_out
✅ Cluster 0: tensor (512, 64, 15, 16), mask (512, 64, 15, 16) saved.
✅ Cluster 1: tensor (512, 64, 14, 16), mask (512, 64, 14, 16) saved.
✅ Cluster 2: tensor (512, 64, 20, 16), mask (512, 64, 20, 16) saved.
✅ Cluster 3: tensor (512, 64, 21, 16), mask (512, 64, 21, 16) saved.
✅ Cluster 4: tensor (512, 64, 28, 16), mask (512, 64, 28, 16) saved.
✅ Cluster 5: tensor (512, 64, 25, 16), mask (512, 64, 25, 16) saved.
✅ Cluster 6: tensor (512, 64, 25, 16), mask (512, 64, 25, 16) saved.
✅ Cluster 7: tensor (512, 64, 22, 16), mask (512, 64, 22, 16) saved.
✅ Cluster 8: tensor (512, 64, 22, 16), mask (512, 64, 22, 16) saved.
✅ Cluster 9: tensor (512, 64, 25, 16), mask (512, 64, 25, 16) saved.
✅ Cluster 10: tensor (512, 64, 30, 16), mask (512, 64, 30, 16) saved.
✅ Cluster 11: tensor (512, 64, 26, 16), mask (512, 64, 26, 16) saved.
✅ Cluster 12: tensor (512, 64, 27, 16), mask (512, 64, 27, 16) saved.
✅ Cluster 13: tensor (512, 64, 25, 16), mask (512, 64, 25, 1

**Block 7.5: Chuẩn bị dữ liệu backtest(loại bỏ các cột dữ liệu không cần thiết nữa)**

In [44]:
# Block 7.5 — Chuẩn bị dữ liệu backtest cho reward thật (từ CSV Block 3)
import gc
import pandas as pd

# Load lại từ file đã lưu ở Block 3
df_price = pd.read_csv("vnindex_price_fa_merged.csv", parse_dates=["timestamp"])

# Chỉ giữ OHLC cần thiết
df_backtest = df_price[["ticker","timestamp","open","high","low","close"]].copy()
df_backtest = df_backtest.sort_values(["timestamp","ticker"]).reset_index(drop=True)

print("✅ Done Block 7.5: df_backtest sẵn sàng cho reward.")
print("Kích thước df_backtest:", df_backtest.shape)
print("Số tickers unique:", df_backtest['ticker'].nunique())
print("Khoảng thời gian:", df_backtest['timestamp'].min(), "→", df_backtest['timestamp'].max())

# Lưu lại để Block 10x dùng cho SL/TP backtest
df_backtest.to_csv("./backtest_ddpg/df_backtest.csv", index=False)


✅ Done Block 7.5: df_backtest sẵn sàng cho reward.
Kích thước df_backtest: (264721, 6)
Số tickers unique: 391
Khoảng thời gian: 2023-01-03 00:00:00 → 2025-10-02 00:00:00


**Block 8: Huấn luyện A3C theo từng cụm**

In [None]:
# Block 8 — A3C multi-stock per-cluster (clean version)

import os, gc, json, csv
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim



DATA_DIR   = "./tensors/"
SIG_DIR    = "./signals/"
MODEL_DIR  = "./models/"
os.makedirs(SIG_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

SIG_FILE = os.path.join(SIG_DIR, "a3c_signals.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reset signals file
if os.path.exists(SIG_FILE):
    os.remove(SIG_FILE)
with open(SIG_FILE, "w", newline="") as f:
    csv.writer(f).writerow(["date","ticker","signal"])

# Load metadata
with open(os.path.join(DATA_DIR, "tensor_index.json"), "r") as f:
    tensor_index = json.load(f)

# --- Model ---
class A3CNet(nn.Module):
    def __init__(self, n_features, hidden=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size=n_features, hidden_size=hidden, batch_first=True)
        self.actor = nn.Linear(hidden, 3)   # short, flat, long
        self.critic = nn.Linear(hidden, 1)
    def forward(self, x):
        out, _ = self.lstm(x)
        h = out[:, -1, :]
        return self.actor(h), self.critic(h)

# --- Loss ---
def a3c_loss(logits, values, actions, rewards, beta=0.01):
    adv = rewards - values.squeeze(-1)
    critic = adv.pow(2).mean()
    logp = torch.log_softmax(logits, dim=-1)
    actor = -(logp.gather(1, actions.unsqueeze(1)).squeeze(1) * adv.detach()).mean()
    entropy = -(torch.softmax(logits, dim=-1) * logp).sum(-1).mean()
    return actor + 0.5*critic - beta*entropy

# --- Training & inference ---
def process_cluster(meta, epochs=3, lr=1e-3, batch_size=256):
    c_id, tickers = meta["cluster"], meta["tickers"]
    dates = pd.to_datetime(meta["dates"])
    dates_shifted = pd.to_datetime(meta["dates_shifted"])

    X = np.load(os.path.join(DATA_DIR, meta["tensor_file"]), mmap_mode="r")
    M = np.load(os.path.join(DATA_DIR, meta["mask_file"]), mmap_mode="r")
    if X.size == 0:
        return
    B, T, N, F = X.shape
    print(f"Cluster {c_id} | X={X.shape}")

    # --- Lấy giá để tính reward ---
    px = []
    for tk in tickers:
        s = df_backtest[df_backtest["ticker"]==tk].set_index("timestamp")["close"]
        s = s.reindex(dates_shifted).ffill().bfill().values
        px.append(s)
    px = np.stack(px, axis=1)  # (B,N)
    r = np.zeros_like(px, dtype=np.float32)
    r[1:] = np.log(px[1:] / np.maximum(px[:-1], 1e-9))  # daily log-return

    # --- Model + optimizer ---
    model = A3CNet(F).to(device)
    opt = optim.Adam(model.parameters(), lr=lr)

    # Mini-batch generator
    total = B*N
    def iterator():
        for start in range(0, total, batch_size):
            end = min(total, start+batch_size)
            xb, mb, rb, idx = [], [], [], []
            for s in range(start,end):
                b, n = divmod(s, N)
                xb.append(X[b,:,n,:])
                mb.append(M[b,:,n,:])
                rb.append(r[b,n])
                idx.append((b,n))
            yield np.stack(xb), np.stack(mb), np.array(rb), idx

    # --- Train ---
    for ep in range(epochs):
        loss_ep = 0
        for xb, mb, rb, _ in iterator():
            xb = torch.tensor(xb, dtype=torch.float32).to(device)
            rb = torch.tensor(rb, dtype=torch.float32).to(device)
            xb = xb * torch.tensor(mb, dtype=torch.float32).to(device)

            logits, vals = model(xb)
            dist = torch.distributions.Categorical(logits=logits)
            act = dist.sample()

            # Mapping: 0=short, 1=flat, 2=long
            reward = torch.where(act==2, rb, torch.where(act==0, -rb, torch.zeros_like(rb)))
            loss = a3c_loss(logits, vals, act, reward)

            opt.zero_grad(); loss.backward(); opt.step()
            loss_ep += loss.item()
        print(f"  Epoch {ep+1}/{epochs}, Loss={loss_ep:.4f}")
        gc.collect(); torch.cuda.empty_cache()

    # --- Save model ---
    model_path = os.path.join(MODEL_DIR, f"a3c_cluster_{c_id}.pt")
    torch.save(model.state_dict(), model_path)
    print(f"  ✅ Saved model checkpoint: {model_path}")

    # --- Inference & save signals ---
    with open(SIG_FILE,"a",newline="") as f:
        w = csv.writer(f)
        with torch.no_grad():
            for xb, mb, _, idx in iterator():
                xb = torch.tensor(xb, dtype=torch.float32).to(device)
                xb = xb * torch.tensor(mb, dtype=torch.float32).to(device)
                raw_acts = torch.argmax(model(xb)[0], dim=-1).cpu().numpy()  # 0,1,2
                acts = np.where(raw_acts==2, 1, np.where(raw_acts==0, -1, 0))  # map -> -1,0,1
                for k,(b,n) in enumerate(idx):
                    w.writerow([dates_shifted[b], tickers[n], int(acts[k])])
                del xb, acts
                gc.collect(); torch.cuda.empty_cache()

    del X, M, px, r, model, opt
    gc.collect(); torch.cuda.empty_cache()

# --- Run all clusters ---
for meta in tensor_index:
    process_cluster(meta)

print(f"✅ Done Block 8: signals saved to {SIG_FILE}, models in {MODEL_DIR}")

Cluster 0 | X=(512, 64, 15, 19)
  Epoch 1/3, Loss=0.5582
  Epoch 2/3, Loss=-0.0633
  Epoch 3/3, Loss=-0.3345
  ✅ Saved model checkpoint: ./models/a3c_cluster_0.pt
Cluster 1 | X=(512, 64, 14, 19)
  Epoch 1/3, Loss=-0.3479
  Epoch 2/3, Loss=-0.1559
  Epoch 3/3, Loss=-0.3015
  ✅ Saved model checkpoint: ./models/a3c_cluster_1.pt
Cluster 2 | X=(512, 64, 20, 19)
  Epoch 1/3, Loss=-0.5230
  Epoch 2/3, Loss=-0.3853
  Epoch 3/3, Loss=-0.4694
  ✅ Saved model checkpoint: ./models/a3c_cluster_2.pt
Cluster 3 | X=(512, 64, 21, 19)
  Epoch 1/3, Loss=-0.4557
  Epoch 2/3, Loss=-0.4514
  Epoch 3/3, Loss=-0.4496
  ✅ Saved model checkpoint: ./models/a3c_cluster_3.pt
Cluster 4 | X=(512, 64, 28, 19)
  Epoch 1/3, Loss=-1.1942
  Epoch 2/3, Loss=-0.5445
  Epoch 3/3, Loss=-0.6573
  ✅ Saved model checkpoint: ./models/a3c_cluster_4.pt
Cluster 5 | X=(512, 64, 25, 19)
  Epoch 1/3, Loss=0.3239
  Epoch 2/3, Loss=-0.4962
  Epoch 3/3, Loss=-0.5282
  ✅ Saved model checkpoint: ./models/a3c_cluster_5.pt
Cluster 6 | X=(512

**Block 9: Suy luận từ mô hình A3C**

In [40]:
# Block 9 — Inference từ checkpoint A3C

import os, gc, json, csv
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

DATA_DIR = "./tensors/"
MODEL_DIR = "./models/"
SIG_DIR   = "./signals/"
os.makedirs(SIG_DIR, exist_ok=True)

SIG_FILE = os.path.join(SIG_DIR, "a3c_signals_infer.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reset signals file
if os.path.exists(SIG_FILE):
    os.remove(SIG_FILE)
with open(SIG_FILE, "w", newline="") as f:
    csv.writer(f).writerow(["date","ticker","signal"])

# Load metadata
with open(os.path.join(DATA_DIR, "tensor_index.json"), "r") as f:
    tensor_index = json.load(f)

# --- Model định nghĩa lại (giống Block 8) ---
class A3CNet(nn.Module):
    def __init__(self, n_features, hidden=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size=n_features, hidden_size=hidden, batch_first=True)
        self.actor = nn.Linear(hidden, 3)   # short, flat, long
        self.critic = nn.Linear(hidden, 1)
    def forward(self, x):
        out, _ = self.lstm(x)
        h = out[:, -1, :]
        return self.actor(h), self.critic(h)

# --- Inference function ---
def infer_cluster(meta, batch_size=256):
    c_id, tickers, dates, dates_shifted = meta["cluster"], meta["tickers"], meta["dates"], meta["dates_shifted"]
    X = np.load(os.path.join(DATA_DIR, meta["tensor_file"]), mmap_mode="r")
    M = np.load(os.path.join(DATA_DIR, meta["mask_file"]), mmap_mode="r")
    if X.size == 0:
        return
    B, T, N, F = X.shape
    print(f"[Inference] Cluster {c_id} | X={X.shape}")

    # Load model checkpoint
    model_path = os.path.join(MODEL_DIR, f"a3c_cluster_{c_id}.pt")
    if not os.path.exists(model_path):
        print(f"⚠️ Model checkpoint not found: {model_path}, skip")
        return
    model = A3CNet(F).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Inference & save signals
    total = B * N
    with open(SIG_FILE, "a", newline="") as f:
        w = csv.writer(f)
        with torch.no_grad():
            for start in range(0, total, batch_size):
                end = min(total, start+batch_size)
                xb, mb, idx = [], [], []
                for s in range(start, end):
                    b, n = divmod(s, N)
                    xb.append(X[b, :, n, :])
                    mb.append(M[b, :, n, :])
                    idx.append((b, n))
                xb = torch.tensor(np.stack(xb), dtype=torch.float32).to(device)
                mb = torch.tensor(np.stack(mb), dtype=torch.float32).to(device)

                # Áp dụng mask
                xb = xb * mb

                acts = torch.argmax(model(xb)[0], dim=-1).cpu().numpy() - 1  # (-1,0,1)
                for k,(b,n) in enumerate(idx):
                    # dùng dates[b] (ngày cuối window), nhưng reward tính T+1 (dates_shifted)
                    w.writerow([dates[b], tickers[n], int(acts[k])])
                del xb, mb, acts
                gc.collect(); torch.cuda.empty_cache()

    del X, M, model
    gc.collect(); torch.cuda.empty_cache()

# --- Run inference all clusters ---
for meta in tensor_index:
    infer_cluster(meta)

print(f"✅ Done Block 9: inference signals saved to {SIG_FILE}")

[Inference] Cluster 0 | X=(512, 64, 15, 19)
[Inference] Cluster 1 | X=(512, 64, 14, 19)
[Inference] Cluster 2 | X=(512, 64, 20, 19)
[Inference] Cluster 3 | X=(512, 64, 21, 19)
[Inference] Cluster 4 | X=(512, 64, 28, 19)
[Inference] Cluster 5 | X=(512, 64, 25, 19)
[Inference] Cluster 6 | X=(512, 64, 25, 19)
[Inference] Cluster 7 | X=(512, 64, 22, 19)
[Inference] Cluster 8 | X=(512, 64, 22, 19)
[Inference] Cluster 9 | X=(512, 64, 25, 19)
[Inference] Cluster 10 | X=(512, 64, 30, 19)
[Inference] Cluster 11 | X=(512, 64, 26, 19)
[Inference] Cluster 12 | X=(512, 64, 27, 19)
[Inference] Cluster 13 | X=(512, 64, 25, 19)
[Inference] Cluster 14 | X=(512, 64, 27, 19)
[Inference] Cluster 15 | X=(512, 64, 32, 19)
[Inference] Cluster 16 | X=(512, 64, 29, 19)
[Inference] Cluster 17 | X=(512, 64, 31, 19)
[Inference] Cluster 18 | X=(512, 64, 29, 19)
[Inference] Cluster 19 | X=(512, 64, 31, 19)
[Inference] Cluster 20 | X=(512, 64, 26, 19)
[Inference] Cluster 21 | X=(512, 64, 33, 19)
[Inference] Cluster 

**Block 10 : Huấn luyện Cluster DDPG (chỉ với trường hợp vị thế long)**

In [41]:
# Block 10-prep — Chuẩn bị dữ liệu DDPG (state & reward arrays)

import os, json
import numpy as np
import pandas as pd

DATA_DIR   = "./tensors/"
SIG_DIR    = "./signals/"
OUTPUT_DIR = "./ddpg_prep/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ===== Hyper-params =====
EXECUTION_LAG  = 2
STATE_LKBK     = 10
MIN_NAMES_PER_CLUSTER = 2

# ===== Load artifacts =====
# 1. A3C signals (Block 9 output)
signals = pd.read_csv(os.path.join(SIG_DIR, "a3c_signals_infer.csv"))
signals["date"] = pd.to_datetime(signals["date"])

# 2. Giá (Block 7.5 output)
df_px = df_backtest.rename(columns={"timestamp": "date"}).copy()
df_px["date"] = pd.to_datetime(df_px["date"])
px_wide = df_px.pivot(index="date", columns="ticker", values="close").sort_index()
ret_wide = px_wide.pct_change().fillna(0.0)

# 3. Map ticker → cluster
with open(os.path.join(DATA_DIR, "tensor_index.json"), "r") as f:
    tensor_index = json.load(f)
ticker2cluster = {}
for meta in tensor_index:
    for tk in meta["tickers"]:
        ticker2cluster.setdefault(tk, meta["cluster"])
ticker2cluster = pd.Series(ticker2cluster)

# ===== Align signals & returns =====
sig_wide_raw = signals.pivot_table(index="date", columns="ticker", values="signal", aggfunc="last").sort_index()
idx_all = ret_wide.index.union(sig_wide_raw.index)
ret_wide = ret_wide.reindex(idx_all).fillna(0.0)
sig_wide = sig_wide_raw.reindex(idx_all).fillna(0.0)
sig_wide_lag = sig_wide.shift(EXECUTION_LAG)

tickers = [t for t in ret_wide.columns if t in ticker2cluster.index]
ret_wide = ret_wide[tickers].astype("float32")
sig_wide_lag = sig_wide_lag[tickers].astype("float32")
cluster_of = ticker2cluster.loc[tickers]

clusters = sorted(cluster_of.unique().tolist())
cluster_members = {c: cluster_of[cluster_of == c].index.tolist() for c in clusters}
C = len(clusters)

# ===== Helper: build state arrays =====
def build_state_arrays(ret_w, sig_lag, start, end, K=STATE_LKBK):
    R = ret_w.loc[start:end]
    S = sig_lag.loc[start:end]
    dates = R.index

    act_cols, ret_cols = [], []
    ACTIVE_masks = {}
    for c in clusters:
        tks = cluster_members[c]
        if not tks:
            act_c = pd.Series(0.0, index=dates, name=c)
            ret_c = pd.Series(0.0, index=dates, name=c)
            ACTIVE_masks[c] = pd.DataFrame(0.0, index=dates, columns=tks)
        else:
            S_c, R_c = S[tks], R[tks]
            active_mask = (S_c > 0).astype("float32")
            ACTIVE_masks[c] = active_mask
            denom = active_mask.sum(axis=1).replace(0, np.nan)
            w = active_mask.div(denom, axis=0).fillna(0.0)
            act_c = active_mask.mean(axis=1).astype("float32")
            ret_c = (R_c * w).sum(axis=1).astype("float32")
        act_cols.append(act_c.rename(c))
        ret_cols.append(ret_c.rename(c))

    act_df = pd.concat(act_cols, axis=1).astype("float32")
    cret_df = pd.concat(ret_cols, axis=1).astype("float32")

    def stack_lookback(df, K):
        mats = []
        for k in range(K):
            mats.append(df.shift(k).fillna(0.0))
        return np.concatenate([m.values[:, :, None] for m in mats], axis=2)

    A3 = stack_lookback(act_df, K)
    R3 = stack_lookback(cret_df, K)
    valid = np.arange(A3.shape[0]) >= (K - 1)
    dates2 = dates[valid]
    S_mat = np.concatenate([A3[valid].reshape(len(dates2), -1),
                            R3[valid].reshape(len(dates2), -1)], axis=1).astype("float32")
    R_mat = cret_df.loc[dates2].values.astype("float32")
    ACTIVE_masks = {c: ACTIVE_masks[c].loc[dates2] for c in clusters}
    return S_mat, R_mat, dates2, ACTIVE_masks

# ===== Train/Test split =====
TRAIN_START = pd.Timestamp("2023-01-01")
TRAIN_END   = pd.Timestamp("2024-12-31")
TEST_START  = pd.Timestamp("2025-01-01")
TEST_END    = ret_wide.index.max()

S_train, R_train, d_train, ACTIVE_train = build_state_arrays(ret_wide, sig_wide_lag, TRAIN_START, TRAIN_END)
S_test,  R_test,  d_test,  ACTIVE_test  = build_state_arrays(ret_wide, sig_wide_lag, TEST_START, TEST_END)

# ===== Save to disk =====
np.save(os.path.join(OUTPUT_DIR,"S_train.npy"), S_train)
np.save(os.path.join(OUTPUT_DIR,"R_train.npy"), R_train)
np.save(os.path.join(OUTPUT_DIR,"S_test.npy"),  S_test)
np.save(os.path.join(OUTPUT_DIR,"R_test.npy"),  R_test)

pd.Series(d_train).to_csv(os.path.join(OUTPUT_DIR,"dates_train.csv"), index=False)
pd.Series(d_test).to_csv(os.path.join(OUTPUT_DIR,"dates_test.csv"), index=False)

print("✅ Block 10-prep done.")
print("Train:", S_train.shape, R_train.shape, "from", d_train.min(), "to", d_train.max())
print("Test:",  S_test.shape,  R_test.shape,  "from", d_test.min(),  "to", d_test.max())

✅ Block 10-prep done.
Train: (490, 1560) (490, 78) from 2023-01-16 00:00:00 to 2024-12-31 00:00:00
Test: (176, 1560) (176, 78) from 2025-01-15 00:00:00 to 2025-10-02 00:00:00


In [42]:
# Block 10a — Train DDPG Actor–Critic

import os, gc
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

PREP_DIR   = "./ddpg_prep/"
MODEL_DIR  = "./ddpg_models/"
os.makedirs(MODEL_DIR, exist_ok=True)

# ===== Hyper-params =====
EPOCHS     = 60
BATCH_SIZE = 64
LR_ACTOR   = 1e-4
LR_CRITIC  = 5e-4
GAMMA      = 0.95
TAU        = 1e-2
NOISE_STD  = 0.05
HIDDEN     = 128
SEED       = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED); np.random.seed(SEED)

# ===== Load data =====
S_train = np.load(os.path.join(PREP_DIR,"S_train.npy"))
R_train = np.load(os.path.join(PREP_DIR,"R_train.npy"))
dates_train = np.loadtxt(os.path.join(PREP_DIR,"dates_train.csv"), dtype=str, delimiter=",", skiprows=1)

s_dim = S_train.shape[1]
a_dim = R_train.shape[1]

print("Train set:", S_train.shape, R_train.shape)

# ===== Define networks =====
class Actor(nn.Module):
    def __init__(self, s_dim, a_dim, hidden=HIDDEN):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, a_dim)
        )
    def forward(self, s):
        return torch.softmax(self.net(s), dim=-1)  # simplex

class Critic(nn.Module):
    def __init__(self, s_dim, a_dim, hidden=HIDDEN):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim+a_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1)
        )
    def forward(self, s, a):
        return self.net(torch.cat([s, a], dim=-1))

class Buffer:
    def __init__(self, maxlen=20000):
        self.buf=[]; self.maxlen=maxlen
    def push(self, s,a,r,s2):
        if len(self.buf)>=self.maxlen: self.buf.pop(0)
        self.buf.append((s,a,r,s2))
    def sample(self, bs):
        n=min(bs,len(self.buf))
        idx=np.random.choice(len(self.buf), n, replace=False)
        s,a,r,s2 = zip(*[self.buf[i] for i in idx])
        return (np.array(s,np.float32), np.array(a,np.float32),
                np.array(r,np.float32).reshape(-1,1), np.array(s2,np.float32))

def soft_update(src, tgt, tau):
    with torch.no_grad():
        for p, tp in zip(src.parameters(), tgt.parameters()):
            tp.data.mul_(1-tau); tp.data.add_(tau*p.data)

def port_reward(w, r):
    return float(np.dot(w, r))

# ===== Init models =====
actor = Actor(s_dim, a_dim).to(device)
critic = Critic(s_dim, a_dim).to(device)
t_actor = Actor(s_dim, a_dim).to(device); t_actor.load_state_dict(actor.state_dict())
t_critic= Critic(s_dim, a_dim).to(device); t_critic.load_state_dict(critic.state_dict())

optA = optim.Adam(actor.parameters(), lr=LR_ACTOR)
optC = optim.Adam(critic.parameters(), lr=LR_CRITIC)
mse = nn.MSELoss()
buf = Buffer()

# ===== Training loop =====
for ep in range(EPOCHS):
    c_loss=a_loss=0.0
    for t in range(len(S_train)-1):
        s  = torch.from_numpy(S_train[t]).float().to(device).unsqueeze(0)
        s2 = torch.from_numpy(S_train[t+1]).float().to(device).unsqueeze(0)
        with torch.no_grad():
            w = actor(s).cpu().numpy()[0]
        # exploration: add noise in logit space
        logits = np.log(w+1e-9)+np.random.normal(0,NOISE_STD,size=a_dim)
        w_e = np.exp(logits); w_e = (w_e/w_e.sum()).astype("float32")

        r = port_reward(w_e, R_train[t])
        buf.push(S_train[t], w_e, r, S_train[t+1])

        if len(buf.buf) >= BATCH_SIZE:
            sb, ab, rb, s2b = buf.sample(BATCH_SIZE)
            sb,ab,rb,s2b = (torch.tensor(sb).to(device),
                            torch.tensor(ab).to(device),
                            torch.tensor(rb).to(device),
                            torch.tensor(s2b).to(device))

            with torch.no_grad():
                a2 = t_actor(s2b)
                q2 = t_critic(s2b,a2)
                y  = rb + GAMMA*q2

            q = critic(sb,ab)
            lc = mse(q,y)
            optC.zero_grad(); lc.backward(); optC.step()

            ap = actor(sb)
            la = -critic(sb,ap).mean()
            optA.zero_grad(); la.backward(); optA.step()

            soft_update(actor,t_actor,TAU)
            soft_update(critic,t_critic,TAU)

            c_loss += lc.item(); a_loss += la.item()

    print(f"[Epoch {ep+1}/{EPOCHS}] Critic={c_loss:.4f} | Actor={a_loss:.4f}")
    gc.collect(); torch.cuda.empty_cache()

# ===== Save checkpoint =====
torch.save(actor.state_dict(), os.path.join(MODEL_DIR,"actor.pt"))
torch.save(critic.state_dict(), os.path.join(MODEL_DIR,"critic.pt"))
print(f"✅ Block 10a done. Saved models to {MODEL_DIR}")

Train set: (490, 1560) (490, 78)
[Epoch 1/60] Critic=0.0305 | Actor=-23.3010
[Epoch 2/60] Critic=0.0398 | Actor=-26.1175
[Epoch 3/60] Critic=0.0699 | Actor=-23.5888
[Epoch 4/60] Critic=0.0645 | Actor=-21.8119
[Epoch 5/60] Critic=0.0579 | Actor=-20.4562
[Epoch 6/60] Critic=0.0515 | Actor=-18.6667
[Epoch 7/60] Critic=0.0443 | Actor=-16.4401
[Epoch 8/60] Critic=0.0324 | Actor=-14.2746
[Epoch 9/60] Critic=0.0313 | Actor=-13.3598
[Epoch 10/60] Critic=0.0282 | Actor=-12.9135
[Epoch 11/60] Critic=0.0269 | Actor=-11.8459
[Epoch 12/60] Critic=0.0287 | Actor=-11.3130
[Epoch 13/60] Critic=0.0289 | Actor=-12.0125
[Epoch 14/60] Critic=0.0283 | Actor=-9.9463
[Epoch 15/60] Critic=0.0271 | Actor=-8.0240
[Epoch 16/60] Critic=0.0358 | Actor=-8.6355
[Epoch 17/60] Critic=0.0420 | Actor=-8.6133
[Epoch 18/60] Critic=0.0409 | Actor=-6.7488
[Epoch 19/60] Critic=0.0379 | Actor=-5.4266
[Epoch 20/60] Critic=0.0351 | Actor=-4.7666
[Epoch 21/60] Critic=0.0345 | Actor=-3.9104
[Epoch 22/60] Critic=0.0292 | Actor=-3.

In [45]:
# Block 10b — Backtest & Inference (grid-search SL/TP) — FULL (robust version)
import os, gc, json, csv, math
import numpy as np
import pandas as pd
from datetime import timedelta
import torch, torch.nn as nn

# ----------------- CONFIG -----------------
DATA_DIR     = "./tensors/"
SIG_DIR      = "./signals/"
MODEL_DIR    = "./ddpg_models/"
OUTPUT_DIR   = "./backtest_grid/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SIG_FILE     = os.path.join(SIG_DIR, "a3c_signals_infer.csv")   # from Block 9
DF_PRICE     = "./backtest_ddpg/df_backtest.csv"               # expected path (Open/High/Low/Close)
FALLBACK_PRICE = "vnindex_price_fa_merged.csv"                # fallback if DF_PRICE missing or incomplete
ACTOR_PATH   = os.path.join(MODEL_DIR, "actor.pt")             # actor checkpoint from 10a
TENSOR_INDEX = os.path.join(DATA_DIR, "tensor_index.json")

# grid for SL/TP (as fraction, e.g. 0.01 = 1%)
GRID_SL = [0.01, 0.02, 0.03, 0.05]   # stop-loss distances
GRID_TP = [0.02, 0.04, 0.06, 0.10]   # take-profit distances

# other backtest params
ENTRY_LAG = 1         # signal at day T -> entry at open T+ENTRY_LAG
INIT_CAP   = 10_000.0
TOP_K_PER_CLUSTER = 5
MIN_NAMES_PER_CLUSTER = 1
MAX_HOLD_DAYS = 30     # optional max hold to avoid infinite long holds
ENTRY_ON = "open"
USE_LOG_RET = False    # returns additive or multiplicative

# test split
TEST_START = pd.Timestamp("2025-01-01")
# TEST_END inferred from price
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----------------- HELPERS -----------------
def load_signals(path):
    df = pd.read_csv(path)
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"])
    elif "timestamp" in df.columns:
        df["date"] = pd.to_datetime(df["timestamp"])
    else:
        raise ValueError("signals file must have 'date' or 'timestamp' column")
    return df[["date","ticker","signal"]]

def compute_stats_from_series(port_series):
    ret = port_series.pct_change().dropna()
    if len(ret)==0:
        return {"ROI":0.0,"Vol":0.0,"Sharpe":0.0,"MaxDD":0.0,"WinRate":0.0}
    roi = port_series.iloc[-1]/port_series.iloc[0]-1.0
    vol = ret.std() * (252**0.5)
    sharpe = (ret.mean()/ret.std()) * (252**0.5) if ret.std()>0 else 0.0
    dd = (port_series/port_series.cummax()-1.0).min()
    winrate = (ret>0).mean()
    return {"ROI":roi, "Vol":vol, "Sharpe":sharpe, "MaxDD":dd, "WinRate":winrate}

def intraday_check(entry_price, day_high, day_low, sl_pct, tp_pct):
    sl_price = entry_price * (1 - sl_pct)
    tp_price = entry_price * (1 + tp_pct)
    # conservative: stop-first if both hit
    if day_low <= sl_price:
        return sl_price, "sl"
    if day_high >= tp_price:
        return tp_price, "tp"
    return None, "hold"

# ----------------- Load price data (with fallback) -----------------
print("Loading price data...")
if not os.path.exists(DF_PRICE):
    print(f"Warning: {DF_PRICE} not found, trying fallback {FALLBACK_PRICE}")
    if not os.path.exists(FALLBACK_PRICE):
        raise FileNotFoundError(f"Neither {DF_PRICE} nor fallback {FALLBACK_PRICE} found. Put OHLC data into {DF_PRICE}.")
    df_px = pd.read_csv(FALLBACK_PRICE, parse_dates=["timestamp"])
else:
    df_px = pd.read_csv(DF_PRICE, parse_dates=["timestamp"])

# standardize column name
if "timestamp" in df_px.columns:
    df_px = df_px.rename(columns={"timestamp":"date"})
df_px["date"] = pd.to_datetime(df_px["date"])

# ensure required OHLC columns
for c in ["open","high","low","close"]:
    if c not in df_px.columns:
        raise ValueError(f"Price Data must contain '{c}' column (found columns: {df_px.columns.tolist()})")

# pivot wide
px_wide_open = df_px.pivot(index="date", columns="ticker", values="open").sort_index()
px_wide_high = df_px.pivot(index="date", columns="ticker", values="high").sort_index()
px_wide_low  = df_px.pivot(index="date", columns="ticker", values="low").sort_index()
px_wide_close= df_px.pivot(index="date", columns="ticker", values="close").sort_index()

# align full calendar and fill small gaps
idx_all = px_wide_close.index
px_wide_open  = px_wide_open.reindex(idx_all).ffill().bfill()
px_wide_high  = px_wide_high.reindex(idx_all).ffill().bfill()
px_wide_low   = px_wide_low.reindex(idx_all).ffill().bfill()
px_wide_close = px_wide_close.reindex(idx_all).ffill().bfill()

TEST_END = px_wide_close.index.max()
print("Test range available:", TEST_START.date(), "->", TEST_END.date())

# ----------------- Load signals & tensor_index -----------------
print("Loading signals...")
signals = load_signals(SIG_FILE)

with open(TENSOR_INDEX,"r") as fh:
    tensor_index = json.load(fh)

# build ticker->cluster mapping (first seen)
ticker2cluster = {}
for meta in tensor_index:
    c = int(meta["cluster"])
    for tk in meta["tickers"]:
        if tk not in ticker2cluster:
            ticker2cluster[tk] = c
ticker2cluster = pd.Series(ticker2cluster)

# restrict to tickers present in price & mapping
tickers = sorted(list(set(px_wide_close.columns).intersection(set(ticker2cluster.index))))
print(f"Tickers available in price & mapping: {len(tickers)}")
px_wide_open  = px_wide_open.reindex(columns=tickers)
px_wide_high  = px_wide_high.reindex(columns=tickers)
px_wide_low   = px_wide_low.reindex(columns=tickers)
px_wide_close = px_wide_close.reindex(columns=tickers)

# signals -> wide (last signal per ticker/day)
sig_wide = signals.pivot_table(index="date", columns="ticker", values="signal", aggfunc="last").reindex(idx_all).fillna(0.0)
# shift for entry lag
sig_entry = sig_wide.shift(ENTRY_LAG).fillna(0.0)

# build clusters
clusters = sorted(list(set(ticker2cluster.loc[tickers].values)))
cluster_members = {c: [tk for tk in tickers if ticker2cluster.get(tk)==c] for c in clusters}

# ---------- Build cluster-level state arrays (for actor input) ----------
STATE_LKBK = 10

def build_state_arrays_for_backtest(sig_lag, ret_wide, clusters, cluster_members, K=STATE_LKBK, start=None, end=None):
    R = ret_wide.loc[start:end]
    S = sig_lag.loc[start:end]
    dates = R.index

    act_cols, ret_cols = [], []
    ACTIVE_masks = {}
    for c in clusters:
        tks = cluster_members.get(c, [])
        if len(tks)==0:
            act_c = pd.Series(0.0, index=dates, name=c)
            ret_c = pd.Series(0.0, index=dates, name=c)
            ACTIVE_masks[c] = pd.DataFrame(0.0,index=dates,columns=tks)
        else:
            S_c = S.reindex(columns=tks).fillna(0.0)
            R_c = R.reindex(columns=tks).fillna(0.0)
            active_mask = (S_c > 0).astype("float32")
            ACTIVE_masks[c] = active_mask
            denom = active_mask.sum(axis=1).replace(0, np.nan)
            w = active_mask.div(denom, axis=0).fillna(0.0)
            act_c = active_mask.mean(axis=1).astype("float32")
            ret_c = (R_c * w).sum(axis=1).astype("float32")
        act_cols.append(act_c.rename(c))
        ret_cols.append(ret_c.rename(c))

    act_df = pd.concat(act_cols, axis=1).astype("float32")
    cret_df = pd.concat(ret_cols, axis=1).astype("float32")

    def stack_lookback(df, K):
        mats = []
        for k in range(K):
            mats.append(df.shift(k).fillna(0.0))
        return np.concatenate([m.values[:,:,None] for m in mats], axis=2)

    A3 = stack_lookback(act_df, K)
    R3 = stack_lookback(cret_df, K)

    valid = np.arange(A3.shape[0]) >= (K-1)
    A3 = A3[valid]; R3 = R3[valid]
    dates2 = dates[valid]

    S_mat = np.concatenate([A3.reshape(len(dates2), -1), R3.reshape(len(dates2), -1)], axis=1).astype("float32")
    R_mat = cret_df.loc[dates2].values.astype("float32")
    ACTIVE_masks = {c: ACTIVE_masks[c].loc[dates2] for c in clusters}
    return S_mat, R_mat, dates2, ACTIVE_masks

# compute returns wide
ret_wide = px_wide_close.pct_change().fillna(0.0)

S_all, R_all, dates_all, ACTIVE_all = build_state_arrays_for_backtest(sig_entry, ret_wide, clusters, cluster_members, K=STATE_LKBK, start=TEST_START, end=TEST_END)
print("State matrix prepared:", S_all.shape, "dates:", len(dates_all))
s_dim = S_all.shape[1]; a_dim = len(clusters)

# ---------- Load actor (cluster-level) ----------
if not os.path.exists(ACTOR_PATH):
    raise FileNotFoundError(f"Actor model not found at {ACTOR_PATH}. Run Block 10a first.")
print("Loading actor model...")
class ActorNet(nn.Module):
    def __init__(self, s_dim, a_dim, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, a_dim)
        )
    def forward(self, s):
        return torch.softmax(self.net(s), dim=-1)

actor = ActorNet(s_dim, a_dim).to(DEVICE)
actor.load_state_dict(torch.load(ACTOR_PATH, map_location=DEVICE))
actor.eval()

# ---------- Grid-search loop ----------
summary_rows = []
cfg_id = 0

for sl in GRID_SL:
    for tp in GRID_TP:
        if tp <= sl: continue
        cfg_id += 1
        cfg_name = f"SL{int(sl*100)}_TP{int(tp*100)}"
        print(f"\n=== Running config {cfg_id}: {cfg_name} (sl={sl:.3f}, tp={tp:.3f}) ===")

        out_dir = os.path.join(OUTPUT_DIR, cfg_name)
        os.makedirs(out_dir, exist_ok=True)

        dates = dates_all
        n_days = len(dates)
        capital = INIT_CAP
        port_vals = pd.Series(index=dates, dtype="float64")
        port_vals.iloc[0] = capital

        prev_weights = pd.Series(0.0,index=tickers)
        pos_info = {}  # ticker -> {entry_price, entry_date, weight, hold_days}
        turnover_series = pd.Series(0.0,index=dates)
        daily_returns = pd.Series(0.0,index=dates)

        # compute cluster weights
        cluster_weights = []
        with torch.no_grad():
            for i in range(n_days):
                s = torch.from_numpy(S_all[i:i+1]).float().to(DEVICE)
                w_c = actor(s).cpu().numpy()[0].astype("float32")
                w_c = np.clip(w_c, 0, None)
                if w_c.sum() <= 1e-9:
                    w_c = np.ones_like(w_c)/len(w_c)
                else:
                    w_c = w_c / w_c.sum()
                cluster_weights.append(w_c)
        cluster_weights = np.stack(cluster_weights, axis=0)  # (n_days, C)

        momentum_window = 5
        for i, dt in enumerate(dates):
            w_c = cluster_weights[i]

            # build target ticker weights
            target_w_ticker = pd.Series(0.0, index=tickers, dtype="float32")
            for j, c in enumerate(clusters):
                members = cluster_members.get(c, [])
                if not members: continue
                # candidates: active signal > 0 (sig_entry)
                candidates = [tk for tk in members if (tk in sig_entry.columns and sig_entry.loc[dt, tk] > 0)]
                if len(candidates) < MIN_NAMES_PER_CLUSTER:
                    continue
                # momentum ranking
                mom_scores = {}
                for tk in candidates:
                    try:
                        idx_dt = px_wide_close.index.get_loc(dt)
                        start_idx = max(0, idx_dt-momentum_window)
                        start_date = px_wide_close.index[start_idx]
                        vstart = px_wide_close.loc[start_date, tk]
                        vend = px_wide_close.loc[dt, tk]
                        r = (vend / vstart - 1.0) if vstart>0 else 0.0
                    except Exception:
                        r = 0.0
                    mom_scores[tk] = r
                topk = sorted(mom_scores.keys(), key=lambda x: mom_scores[x], reverse=True)[:TOP_K_PER_CLUSTER]
                if len(topk)==0: continue
                share = float(w_c[j]) / len(topk)
                for tk in topk:
                    target_w_ticker[tk] += share

            # normalize
            ssum = target_w_ticker.sum()
            if ssum <= 1e-12:
                target_w_ticker[:] = 1.0/len(target_w_ticker)
            else:
                target_w_ticker = target_w_ticker / ssum

            turnover = float(np.sum(np.abs(target_w_ticker.values - prev_weights.values)))
            turnover_series.iloc[i] = turnover

            # Intraday SL/TP check for existing positions
            realized_pnl = 0.0
            closed_tickers = []
            for tk, info in list(pos_info.items()):
                entry_price = info["entry_price"]
                day_high = px_wide_high.loc[dt, tk]
                day_low  = px_wide_low.loc[dt, tk]
                exit_price, exit_type = intraday_check(entry_price, day_high, day_low, sl, tp)
                if exit_price is not None:
                    wpos = info["weight"]
                    pnl = wpos * ((exit_price / entry_price) - 1.0)
                    realized_pnl += pnl
                    closed_tickers.append(tk)
                    del pos_info[tk]

            # New entries / rebalancing executed at open today
            new_entries = []
            for tk in tickers:
                new_w = float(target_w_ticker[tk])
                old_w = float(prev_weights.get(tk, 0.0))
                if new_w > 0 and (tk not in pos_info or abs(new_w - old_w) > 1e-6):
                    entry_price = px_wide_open.loc[dt, tk]
                    pos_info[tk] = {"entry_price": entry_price, "entry_date": dt, "weight": new_w, "hold_days": 0}
                    new_entries.append(tk)
                else:
                    if tk in pos_info:
                        pos_info[tk]["weight"] = new_w

            prev_weights = target_w_ticker.copy()

            # Compute day return:
            # - for new entries: open->close
            # - for existing: close / prev_close -1
            day_ret = 0.0
            prev_close_all = px_wide_close.shift(1).loc[dt]
            close_today = px_wide_close.loc[dt]
            for tk in tickers:
                w = float(prev_weights[tk])
                if math.isclose(w, 0.0): continue
                if tk in new_entries:
                    op = px_wide_open.loc[dt, tk]
                    cl = close_today[tk]
                    ret = (cl / op - 1.0) if op>0 else 0.0
                else:
                    prev_c = prev_close_all[tk]
                    cl = close_today[tk]
                    ret = (cl / prev_c - 1.0) if prev_c>0 else 0.0
                day_ret += w * ret

            total_ret = day_ret + realized_pnl
            COST_BPS = 30
            fee = (COST_BPS / 1e4) * turnover
            total_ret_net = total_ret - fee

            capital = capital * (1.0 + total_ret_net)
            port_vals.iloc[i] = capital
            daily_returns.iloc[i] = total_ret_net

            # update hold days & forced close if exceed MAX_HOLD_DAYS
            for tk in list(pos_info.keys()):
                pos_info[tk]["hold_days"] = pos_info[tk].get("hold_days",0) + 1
                if pos_info[tk]["hold_days"] >= MAX_HOLD_DAYS:
                    # we close at close price (already mark-to-market applied), remove position
                    del pos_info[tk]

            if (i % 50) == 0:
                gc.collect()

        # end daily loop

        stats = compute_stats_from_series(port_vals)
        port_vals.to_csv(os.path.join(out_dir, "portfolio_value.csv"))
        daily_returns.to_csv(os.path.join(out_dir, "daily_returns.csv"))
        turnover_series.to_csv(os.path.join(out_dir, "turnover.csv"))

        summary_rows.append({
            "config": cfg_name,
            "sl": sl,
            "tp": tp,
            "ROI": stats["ROI"],
            "Vol": stats["Vol"],
            "Sharpe": stats["Sharpe"],
            "MaxDD": stats["MaxDD"],
            "WinRate": stats["WinRate"],
            "final_capital": float(port_vals.dropna().iloc[-1]) if port_vals.dropna().size>0 else float(capital),
            "n_days": int(port_vals.dropna().shape[0])
        })

        print(f"Finished {cfg_name}: ROI={stats['ROI']:.3%}, Sharpe={stats['Sharpe']:.3f}, MaxDD={stats['MaxDD']:.3%}")
        pd.DataFrame(summary_rows).to_csv(os.path.join(OUTPUT_DIR, "grid_summary_partial.csv"), index=False)

# final summary
summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values(by="Sharpe", ascending=False).reset_index(drop=True)
summary_df.to_csv(os.path.join(OUTPUT_DIR, "grid_summary.csv"), index=False)
print("\n=== Grid search complete. Summary saved to:", os.path.join(OUTPUT_DIR, "grid_summary.csv"))
print(summary_df.head(10))

Loading price data...
Test range available: 2025-01-01 -> 2025-10-02
Loading signals...
Tickers available in price & mapping: 388
State matrix prepared: (176, 1560) dates: 176
Loading actor model...

=== Running config 1: SL1_TP2 (sl=0.010, tp=0.020) ===
Finished SL1_TP2: ROI=127.583%, Sharpe=5.801, MaxDD=-7.540%

=== Running config 2: SL1_TP4 (sl=0.010, tp=0.040) ===
Finished SL1_TP4: ROI=274.212%, Sharpe=8.173, MaxDD=-6.566%

=== Running config 3: SL1_TP6 (sl=0.010, tp=0.060) ===
Finished SL1_TP6: ROI=371.965%, Sharpe=8.442, MaxDD=-6.307%

=== Running config 4: SL1_TP10 (sl=0.010, tp=0.100) ===
Finished SL1_TP10: ROI=401.417%, Sharpe=7.862, MaxDD=-7.590%

=== Running config 5: SL2_TP4 (sl=0.020, tp=0.040) ===
Finished SL2_TP4: ROI=171.099%, Sharpe=5.930, MaxDD=-12.056%

=== Running config 6: SL2_TP6 (sl=0.020, tp=0.060) ===
Finished SL2_TP6: ROI=257.940%, Sharpe=6.689, MaxDD=-10.943%

=== Running config 7: SL2_TP10 (sl=0.020, tp=0.100) ===
Finished SL2_TP10: ROI=313.261%, Sharpe=6.71

**code mới nhất nằm ngày trên**

**BLOCK 10.5**

**Block 11: Thống kê kết quả và vẽ biểu đồ**

In [None]:
# Block 11a — Hiệu suất & Stress Test (Auto chọn Sharpe tốt nhất từ Grid)
import os, numpy as np, pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

OUTPUT_DIR   = "./backtest_grid/"
BEST_DIR     = "./backtest_best/"
os.makedirs(BEST_DIR, exist_ok=True)

GRID_SUMMARY = os.path.join(OUTPUT_DIR, "grid_summary.csv")
if not os.path.exists(GRID_SUMMARY):
    raise FileNotFoundError("Chưa có grid_summary.csv từ Block 10B")

# ---------- Helpers ----------
def max_drawdown(series: pd.Series) -> float:
    peak = series.cummax()
    dd = (series / peak) - 1.0
    return float(dd.min())

def compute_stats(port_val: pd.Series, bench_val: pd.Series | None = None):
    port_ret = port_val.pct_change().fillna(0.0)
    stats = {
        "Ngày bắt đầu": port_val.index.min().strftime("%Y-%m-%d"),
        "Ngày kết thúc": port_val.index.max().strftime("%Y-%m-%d"),
        "Giá trị cuối": float(port_val.iloc[-1]),
        "ROI (%)": float((port_val.iloc[-1] / port_val.iloc[0] - 1.0) * 100),
        "Biến động (năm, %)": float(port_ret.std() * np.sqrt(252) * 100),
        "Sharpe": float((port_ret.mean() / port_ret.std()) * np.sqrt(252)) if port_ret.std() > 0 else 0,
        "Sortino": float((port_ret.mean() / port_ret[port_ret < 0].std()) * np.sqrt(252)) if port_ret[port_ret < 0].std() > 0 else 0,
        "MaxDrawdown (%)": float(max_drawdown(port_val) * 100),
        "Tỷ lệ phiên thắng (%)": float((port_ret > 0).mean() * 100),
        "Số ngày": int(len(port_ret))
    }
    if bench_val is not None and len(bench_val) > 1:
        stats.update({
            "Giá trị cuối (VNINDEX)": float(bench_val.iloc[-1]),
            "ROI VNINDEX (%)": float((bench_val.iloc[-1] / bench_val.iloc[0] - 1.0) * 100),
            "Chênh lệch so với VNINDEX (pp)": stats["ROI (%)"] - ((bench_val.iloc[-1] / bench_val.iloc[0] - 1.0) * 100)
        })
    return stats

def plot_equity(port_val, bench_val, title, path):
    plt.figure(figsize=(10,6))
    plt.plot(port_val, label="Chiến lược", linewidth=1.6)
    if bench_val is not None:
        plt.plot(bench_val, label="VNINDEX", linewidth=1.2)
    plt.title(title); plt.legend(); plt.grid(True, alpha=0.3)
    plt.savefig(path, dpi=150); plt.close()

# ---------- 1. Chọn config Sharpe cao nhất ----------
summary = pd.read_csv(GRID_SUMMARY)
best_cfg = summary.sort_values(by="Sharpe", ascending=False).iloc[0]
best_name = best_cfg["config"]
print(f"🏆 Config tốt nhất: {best_name} | Sharpe={best_cfg['Sharpe']:.3f}, ROI={best_cfg['ROI']:.2%}")

cfg_dir = os.path.join(OUTPUT_DIR, best_name)
port_path = os.path.join(cfg_dir, "portfolio_value.csv")
daily_path = os.path.join(cfg_dir, "daily_returns.csv")

# ---------- 2. Load dữ liệu ----------
port_val = pd.read_csv(port_path, index_col=0, parse_dates=True).iloc[:,0].sort_index()
daily_returns = pd.read_csv(daily_path, index_col=0, parse_dates=True).iloc[:,0].sort_index()

# benchmark từ df_backtest
df_backtest_path = "./backtest_ddpg/df_backtest.csv"
if not os.path.exists(df_backtest_path):
    raise FileNotFoundError("df_backtest.csv chưa sẵn sàng")
df_backtest = pd.read_csv(df_backtest_path, parse_dates=["timestamp"])
df_backtest = df_backtest.groupby(["timestamp","ticker"],as_index=False).agg({"close":"last"})
px = df_backtest.pivot(index="timestamp", columns="ticker", values="close").sort_index()
bench_val = px["VNINDEX"].reindex(port_val.index).ffill().bfill() if "VNINDEX" in px.columns else None
bench_ret = bench_val.pct_change().fillna(0.0) if bench_val is not None else None

# ---------- 3. Compute Stats ----------
stats_test = compute_stats(port_val, bench_val)

# ---------- 4. Plot cơ bản ----------
plot_equity(port_val, bench_val, f"Equity Curve ({best_name})", os.path.join(BEST_DIR,"equity.png"))

plt.figure(figsize=(9,5))
plt.hist(daily_returns.dropna(), bins=50, alpha=0.6, label="Chiến lược")
if bench_ret is not None:
    plt.hist(bench_ret.dropna(), bins=50, alpha=0.6, label="VNINDEX")
plt.title("Histogram lợi nhuận ngày"); plt.legend(); plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(BEST_DIR,"hist.png")); plt.close()

# ---------- 5. Stress Test (ví dụ Trump 46%) ----------
stress_start, stress_end = pd.Timestamp("2025-03-26"), pd.Timestamp("2025-04-15")
sub_port = port_val.loc[stress_start:stress_end]
sub_bench = bench_val.loc[stress_start:stress_end] if bench_val is not None else None
stats_stress = {}
if len(sub_port) > 1:
    stats_stress = compute_stats(sub_port, sub_bench)
    plot_equity(sub_port, sub_bench, f"Stress Test ({best_name})", os.path.join(BEST_DIR,"equity_stress.png"))

# ---------- 6. Save ----------
all_stats={"BestConfig":stats_test}
if stats_stress: all_stats["StressTest"]=stats_stress
pd.DataFrame(all_stats).T.to_csv(os.path.join(BEST_DIR,"stats.csv"))

print("\n📊 Kết quả config tốt nhất:"); print(pd.Series(stats_test))
if stats_stress: 
    print("\n📊 Stress Test:"); print(pd.Series(stats_stress))
print(f"\n✅ Block 11a hoàn tất. Lưu kết quả tại {BEST_DIR}")

In [47]:
# Block 11 — Hiệu suất & Stress Test (VNINDEX Benchmark)
import os, numpy as np, pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from FiinQuantX import FiinSession   # <-- cần package này

GRID_DIR   = "./backtest_grid/"
OUTPUT_DIR = "./backtest_ddpg/"
os.makedirs(OUTPUT_DIR, exist_ok=True)
STATS_FILE = os.path.join(OUTPUT_DIR, "stats_test.csv")

INIT_CAPITAL = 10_000.0
BENCHMARK_TKR = "VNINDEX"

# ---------- Helpers ----------
def max_drawdown(series: pd.Series) -> float:
    peak = series.cummax()
    dd = (series / peak) - 1.0
    return float(dd.min())

def compute_stats(port_val: pd.Series, bench_val: pd.Series | None = None):
    port_ret = port_val.pct_change().fillna(0.0)
    stats = {
        "Ngày bắt đầu": port_val.index.min().strftime("%Y-%m-%d"),
        "Ngày kết thúc": port_val.index.max().strftime("%Y-%m-%d"),
        "Giá trị cuối": float(port_val.iloc[-1]),
        "ROI (%)": float((port_val.iloc[-1] / port_val.iloc[0] - 1.0) * 100),
        "Biến động (năm, %)": float(port_ret.std() * np.sqrt(252) * 100),
        "Sharpe": float((port_ret.mean() / port_ret.std()) * np.sqrt(252)) if port_ret.std() > 0 else 0,
        "Sortino": float((port_ret.mean() / port_ret[port_ret < 0].std()) * np.sqrt(252)) if port_ret[port_ret < 0].std() > 0 else 0,
        "MaxDrawdown (%)": float(max_drawdown(port_val) * 100),
        "Tỷ lệ phiên thắng (%)": float((port_ret > 0).mean() * 100),
        "Số ngày": int(len(port_ret))
    }
    if bench_val is not None and len(bench_val) > 1:
        stats.update({
            "Giá trị cuối (VNINDEX)": float(bench_val.iloc[-1]),
            "ROI VNINDEX (%)": float((bench_val.iloc[-1] / bench_val.iloc[0] - 1.0) * 100),
            "Chênh lệch so với VNINDEX (pp)": stats["ROI (%)"] - ((bench_val.iloc[-1] / bench_val.iloc[0] - 1.0) * 100)
        })
    return stats

# ---------- 1. Chọn config Sharpe cao nhất ----------
summary_path = os.path.join(GRID_DIR, "grid_summary.csv")
if not os.path.exists(summary_path):
    raise FileNotFoundError("grid_summary.csv chưa có. Hãy chạy Block 10B trước.")

summary = pd.read_csv(summary_path)
best_cfg = summary.sort_values(by="Sharpe", ascending=False).iloc[0]
best_name = best_cfg["config"]
print(f"🏆 Config Sharpe cao nhất: {best_name} | Sharpe={best_cfg['Sharpe']:.3f}, ROI={best_cfg['ROI']:.2%}")

cfg_dir = os.path.join(GRID_DIR, best_name)
port_val = pd.read_csv(os.path.join(cfg_dir, "portfolio_value.csv"), index_col=0, parse_dates=True).iloc[:,0].sort_index()
returns = port_val.pct_change().fillna(0.0)

# ---------- 2. Fetch Benchmark VNINDEX ----------
print("📈 Fetching VNINDEX for benchmark (buy & hold)...")
client = FiinSession(username="DSTC_18@fiinquant.vn", password="Fiinquant0606").login()
bench = client.Fetch_Trading_Data(
    realtime=False, tickers=BENCHMARK_TKR, fields=['close'],
    adjusted=True, by="1d", from_date=str(port_val.index.min().date())
).get_data()
bench["date"] = pd.to_datetime(bench["timestamp"])
bench = bench.set_index("date")["close"].sort_index().reindex(port_val.index).ffill().bfill()
bench_ret = bench.pct_change().fillna(0.0)
bench_val = (1 + bench_ret).cumprod() * INIT_CAPITAL

# ---------- 3. Stats toàn kỳ ----------
stats_test = compute_stats(port_val, bench_val)

# Equity
plt.figure(figsize=(10,6))
plt.plot(port_val, label="Chiến lược", linewidth=1.6)
plt.plot(bench_val, label="VNINDEX (Buy&Hold)", linewidth=1.2)
plt.title("Equity Curve (Test)"); plt.legend(); plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, "equity_test.png")); plt.close()

# Histogram
plt.figure(figsize=(9,5))
plt.hist(returns.dropna(), bins=50, alpha=0.6, label="Chiến lược")
plt.hist(bench_ret.dropna(), bins=50, alpha=0.6, label="VNINDEX")
plt.title("Histogram lợi nhuận ngày (Test)"); plt.xlabel("Daily Return"); plt.ylabel("Tần suất")
plt.legend(); plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, "hist_test.png")); plt.close()

# ---------- 4. Stress Test (Trump 46%) ----------
stress_start, stress_end = pd.Timestamp("2025-03-26"), pd.Timestamp("2025-04-15")
stress_name = "Trump thông báo đánh thuế 46% hàng hóa Việt Nam"
sub_port, sub_bench = port_val.loc[stress_start:stress_end], bench_val.loc[stress_start:stress_end]
stats_stress = {}
if len(sub_port)>1:
    stats_stress = compute_stats(sub_port, sub_bench)

# ---------- Save Stats ----------
all_stats={"Test":stats_test}
if stats_stress: all_stats["Stress Test"]=stats_stress
pd.DataFrame(all_stats).T.to_csv(STATS_FILE)

print("📊 Kết quả Test:"); print(pd.Series(stats_test))
if stats_stress: print(f"\n📊 Stress Test ({stress_name}):"); print(pd.Series(stats_stress))
print(f"\n✅ Block 11 hoàn tất. Stats lưu tại: {STATS_FILE}, Charts in {OUTPUT_DIR}")

🏆 Config Sharpe cao nhất: SL1_TP6 | Sharpe=8.442, ROI=371.97%
📈 Fetching VNINDEX for benchmark (buy & hold)...
Fetching data, it may take a while. Please wait...
📊 Kết quả Test:
Ngày bắt đầu                        2025-01-15
Ngày kết thúc                       2025-10-02
Giá trị cuối                      47054.913636
ROI (%)                             371.965031
Biến động (năm, %)                    26.95445
Sharpe                                8.411143
Sortino                              21.632261
MaxDrawdown (%)                      -6.306809
Tỷ lệ phiên thắng (%)                64.204545
Số ngày                                    176
Giá trị cuối (VNINDEX)            13369.493116
ROI VNINDEX (%)                      33.694931
Chênh lệch so với VNINDEX (pp)        338.2701
dtype: object

📊 Stress Test (Trump thông báo đánh thuế 46% hàng hóa Việt Nam):
Ngày bắt đầu                        2025-03-26
Ngày kết thúc                       2025-04-15
Giá trị cuối                      132

**Block 12A: gửi tín hiệu lên telegram với dữ liệu quá khứ**

`Lưu ý` tuy quá khứ nhưng nhóm không để bị nhìn trước tương lai 

In [48]:
#Block 12a — Gửi tín hiệu Telegram (Offline replay, simple version)

import os, json, requests, time
import pandas as pd
import matplotlib.pyplot as plt

# ==== Đường dẫn ====
OUTPUT_DIR = "./backtest_grid/"
SUMMARY_PATH = os.path.join(OUTPUT_DIR, "grid_summary.csv")

# ==== Load config ====
with open("config.json","r") as f:
    cfg = json.load(f)

TG_TOKEN = cfg["telegram"]["bot_token"]
TG_CHAT_ID = cfg["telegram"]["chat_id"]
TG_THREAD_ID = cfg["telegram"]["message_thread_id"]

# ==== Chọn config tốt nhất (theo Sharpe) ====
summary = pd.read_csv(SUMMARY_PATH)
best_cfg = summary.sort_values(by="Sharpe", ascending=False).iloc[0]["config"]
cfg_dir = os.path.join(OUTPUT_DIR, best_cfg)
print(f"Using best config: {best_cfg}")

def send_daily_report(report_date: str | pd.Timestamp, sleep_sec:int=2):
    """
    Gửi báo cáo Telegram cho 1 ngày trong backtest (offline replay).
    """
    report_date = pd.Timestamp(report_date)

    # --- NAV ---
    pv = pd.read_csv(os.path.join(cfg_dir,"portfolio_value.csv"), index_col=0, parse_dates=True).iloc[:,0]
    if report_date not in pv.index:
        print(f"⚠️ {report_date.date()} không có trong NAV index")
        return
    nav_today = pv.loc[:report_date].iloc[-1]

    # --- Snapshot danh mục ---
    snap_path = os.path.join(cfg_dir, f"positions_snapshot_{report_date.date()}.csv")
    positions_txt = ""
    if os.path.exists(snap_path):
        pos = pd.read_csv(snap_path, parse_dates=["snapshot_date","entry_date"])
        pos = pos[pos["position_size_pct"] > 0]  # chỉ giữ vị thế còn vốn
        if len(pos)>0:
            for _,r in pos.iterrows():
                positions_txt += (
                    f"— {r['ticker']}: Mua {r['entry_price']:.2f} ngày {r['entry_date'].date()}, "
                    f"SL {r['sl_level']:.2f}, TP {r['tp_level']:.2f}, "
                    f"Giá hiện {r['last_price']:.2f}, "
                    f"Lãi/lỗ {r['current_unrealized_pct']:.2f}%, "
                    f"Tỷ trọng {r['position_size_pct']*100:.1f}%\n"
                )
        else:
            positions_txt = "— Không có vị thế nào đang mở\n"
    else:
        positions_txt = "— (Không tìm thấy snapshot)\n"

    # --- Tín hiệu hôm đó ---
    sig_path = os.path.join(cfg_dir, f"signals_today_{report_date.date()}.csv")
    signals_txt, closed_txt, opened_txt = "", "", ""
    if os.path.exists(sig_path):
        sigs = pd.read_csv(sig_path, parse_dates=["entry_date","exit_date"])
        if len(sigs)>0:
            # Lệnh mở
            opened = sigs[sigs["action"]=="BUY"].copy()
            if len(opened)>0:
                for _,r in opened.iterrows():
                    pos_size = r.get("position_size_pct", 0.0) * 100
                    opened_txt += (
                        f"— {r['ticker']}: Mua {r['entry_price']:.2f}, "
                        f"TP {r['tp_level']:.2f}, SL {r['sl_level']:.2f}, "
                        f"Tỷ trọng {pos_size:.1f}%\n"
                    )
            else:
                opened_txt = "— Không có lệnh mở nào\n"

            # Lệnh đóng
            closed = sigs[sigs["action"]=="SELL"].copy()
            if len(closed)>0:
                for _,r in closed.iterrows():
                    entry_price = r.get("entry_price", None)
                    exit_price  = r.get("exit_price", None)
                    pos_size    = r.get("position_size_pct", 0.0) * 100
                    if pd.notna(entry_price) and pd.notna(exit_price) and entry_price>0:
                        pnl_pct = (exit_price/entry_price - 1)*100
                    else:
                        pnl_pct = float("nan")
                    closed_txt += (
                        f"— {r['ticker']}: Mua {entry_price:.2f} → "
                        f"Bán {exit_price:.2f}, "
                        f"{'Lãi' if pnl_pct>0 else 'Lỗ'} {pnl_pct:.2f}%, "
                        f"Tỷ trọng {pos_size:.1f}%\n"
                    )
            else:
                closed_txt = "— Không có lệnh đóng nào\n"

            # Tín hiệu gốc
            for _,r in sigs.iterrows():
                if r["action"]=="BUY":
                    signals_txt += (
                        f"🟢 MUA {r['ticker']} giá {r['entry_price']:.2f}, "
                        f"TP {r['tp_level']:.2f}, SL {r['sl_level']:.2f}\n"
                    )
                else:
                    exit_price = r["exit_price"] if pd.notna(r["exit_price"]) else 0.0
                    signals_txt += (
                        f"🔴 BÁN {r['ticker']} giá {exit_price:.2f}, "
                        f"loại {r.get('exit_type','NA')}\n"
                    )
        else:
            signals_txt = "— Không có tín hiệu giao dịch hôm nay\n"
    else:
        signals_txt = "— (Không tìm thấy file tín hiệu)\n"
        opened_txt = "— (Không tìm thấy file tín hiệu)\n"
        closed_txt = "— (Không tìm thấy file tín hiệu)\n"

    # --- Chart equity đến ngày đó ---
    plt.figure(figsize=(8,5))
    plt.plot(pv.loc[:report_date], label="Chiến lược")
    plt.title(f"Equity đến {report_date.date()}")
    plt.grid(True, alpha=0.3); plt.legend()
    chart_path = os.path.join(cfg_dir, f"equity_until_{report_date.date()}.png")
    plt.savefig(chart_path, dpi=150); plt.close()

    # --- Compose message ---
    msg = f"""
📅 Ngày {report_date.date()}

💰 Giá trị tài sản: {nav_today:,.0f}

📊 Danh mục hiện tại:
{positions_txt}

💡 Lệnh đóng hôm nay:
{closed_txt}

🟢 Lệnh mở hôm nay:
{opened_txt}

📌 Tín hiệu trong ngày:
{signals_txt}
""".strip()

    # --- Gửi text ---
    send_url = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
    requests.post(send_url, data={
        "chat_id": TG_CHAT_ID,
        "message_thread_id": TG_THREAD_ID,
        "text": msg
    })
    time.sleep(sleep_sec)

    # --- Gửi chart ---
    photo_url = f"https://api.telegram.org/bot{TG_TOKEN}/sendPhoto"
    with open(chart_path,"rb") as f:
        requests.post(photo_url, data={
            "chat_id": TG_CHAT_ID,
            "message_thread_id": TG_THREAD_ID,
            "caption": f"Equity Curve đến {report_date.date()}"
        }, files={"photo":f})
    time.sleep(sleep_sec)

    print(f"✅ Đã gửi báo cáo Telegram cho ngày {report_date.date()}")

Using best config: SL1_TP6


In [None]:
# Cell runner — gửi báo cáo Telegram từ 26/03 đến 16/04/2025

import pandas as pd

start = pd.Timestamp("2025-03-26")
end   = pd.Timestamp("2025-04-16")

for d in pd.date_range(start, end, freq="D"):
    try:
        send_daily_report(d, sleep_sec=10)  # sleep 2 giây giữa mỗi tin nhắn
    except Exception as e:
        print(f"⚠️ Lỗi khi gửi ngày {d.date()}: {e}")