In [None]:
from __future__ import annotations

# ── 标准库（stdlib） ─────────────────────────────────────────────────────────────
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"

import time
import random
from pathlib import Path
from collections import deque

# ── 第三方（third-party） ───────────────────────────────────────────────────────
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.dataset as ds
import torch
import lightning as L
from torch.utils.data import DataLoader, IterableDataset

from lightning.pytorch.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    LearningRateMonitor,
)
from lightning.pytorch.loggers import TensorBoardLogger

from pytorch_forecasting import (
    TimeSeriesDataSet,
    TemporalFusionTransformer,
)
from pytorch_forecasting.metrics import MAE, RMSE
from pytorch_forecasting.data.encoders import (
    NaNLabelEncoder,
)

# 你的工程工具
from pipeline.io import cfg, P, fs, storage_options, ensure_dir_local, ensure_dir_az

def _now(): return time.strftime("%Y-%m-%d %H:%M:%S")
print(f"[{_now()}] imports ok")

SyntaxError: invalid syntax (572149434.py, line 4)

In [2]:
# Load the selected feature names from the feature-importance report:
# /mnt/data/js/exp/v1/reports/fi/features__fs__1600-1690__cv2-g2-r4__seed42__top1000__1758551023.txt
input_file = "/mnt/data/js/exp/v1/reports/fi/features__fs__1600-1690__cv2-g2-r4__seed42__top1000__1758551023.txt"

# Read with an explicit encoding so the result does not depend on the machine locale.
with open(input_file, 'r', encoding="utf-8") as file:
    data = file.read()
lines = data.split('\n')
# The feature name is the first whitespace-separated token of each line.
# Lines are stripped BEFORE the comment test so that an indented "# ..." line is
# skipped instead of contributing "#" as a feature name; blank lines are skipped too.
features = [
    entry.split()[0]
    for entry in (raw.strip() for raw in lines)
    if entry and not entry.startswith('#')
]
print(f"[{_now()}] loaded {len(features)} features from {input_file}")

# ── 本地（local） ────────────────────────────────────────────────────────────────


[2025-10-03 17:16:03] loaded 787 features from /mnt/data/js/exp/v1/reports/fi/features__fs__1600-1690__cv2-g2-r4__seed42__top1000__1758551023.txt


In [3]:
# Display the loaded feature names as this cell's output.
features

['time_pos',
 'time_sin',
 'time_cos',
 'time_bucket',
 'feature_36',
 'feature_06',
 'feature_08__ewm5',
 'responder_5_prevday_std',
 'responder_3_prevday_std',
 'responder_4_prev_tail_d1',
 'feature_53__rstd3',
 'feature_16__ewm5',
 'feature_01__ewm5',
 'feature_21__rz3',
 'responder_7_prevday_std',
 'responder_8_prevday_mean',
 'feature_38__ewm5',
 'feature_05__ewm5',
 'responder_1_close_roll3_std',
 'responder_3_prevday_mean',
 'feature_37__ewm5',
 'responder_4_prevday_std',
 'responder_3_prev_tail_d1',
 'responder_6_prevday_std',
 'responder_2_prevday_mean',
 'responder_0_prevday_std',
 'responder_3_prev2day_close',
 'responder_8_prevday_std',
 'responder_2_prev_tail_d1',
 'responder_4_prevday_mean',
 'feature_04',
 'feature_66__ewm5',
 'responder_8_close_roll3_std',
 'responder_1_prev_tail_d1',
 'feature_16',
 'responder_7_prevday_mean',
 'responder_5_prev2day_close',
 'responder_5_close_roll3_mean',
 'responder_8_prev2day_close',
 'feature_07__ewm5',
 'responder_0_close_roll3_me

In [4]:
import re

# ---- Key configuration (adjust as needed) ----
target_col = cfg["target"]                 # e.g. "responder_6"
g_sym, g_date, g_time = cfg["keys"]        # e.g. ("symbol_id","date_id","time_id")
weight_col = cfg["weight"]

time_features = ["time_pos", "time_sin", "time_cos", "time_bucket"]

# Base features: exactly "feature_" followed by two digits, with no suffix.
base_features = [
    name for name in features
    if re.fullmatch(r"feature_\d{2}", name) is not None
]

# Responder history features: every name that starts with "responder_".
resp_his_feats = list(filter(lambda name: name.startswith("responder_"), features))

# Derived feature_* columns (those carrying a suffix): start with "feature_"
# but are not one of the plain base features.
feat_his_feats = [
    name for name in features
    if name.startswith("feature_") and re.fullmatch(r"feature_\d{2}", name) is None
]

In [5]:

# Model feature columns, de-duplicated while keeping first-seen order.
feature_cols = list(dict.fromkeys([*base_features, *resp_his_feats, *feat_his_feats]))

# Every column that has to be read from the panel: keys, weight, target, time features, features.
need_cols = list(
    dict.fromkeys(
        [g_sym, g_date, g_time, weight_col, target_col, *time_features, *feature_cols]
    )
)

# ---- Cross-validation & training settings ----
N_SPLITS = 2
GAP_DAYS = 7
TRAIN_TO_VAL = 4  # train : validation = 4 : 1
ENC_LEN = 10
PRED_LEN = 1
BATCH_SIZE = 1024
LR = 1e-3
HIDDEN = 64
HEADS = 2
DROPOUT = 0.1
MAX_EPOCHS_PER_SHARD = 1
CHUNK_DAYS = 20  # training shards: number of days per shard

print("config ready")

config ready


In [6]:
# Locate the panel shard parquet files and open all of them as a single LazyFrame.
panel_dir = P("az", cfg["paths"].get("panel_shards", "panel_shards"))
data_path  = fs.glob(f"{panel_dir}/*.parquet")
if not data_path:
    # Fail early with a clear message instead of passing an empty path list to the scanner.
    raise RuntimeError(f"no parquet shards found under {panel_dir}")
# The glob results are prefixed with the az:// scheme before being handed to Polars.
az_path = [f"az://{p}" for p in data_path]
lf_data = pl.scan_parquet(az_path, storage_options=storage_options)

In [7]:
# Build the global time_idx exactly once: one consecutive integer per distinct
# (date, time) pair, numbered in sorted order.
lf_grid = (
    lf_data
    .select([g_date, g_time])
    .unique()
    .sort([g_date, g_time])
    .with_row_index("time_idx")
    .with_columns(pl.col("time_idx").cast(pl.Int64))
)

# Persist the grid locally so later steps can scan it lazily.
grid_path = P("local", "tft/panel/grid_timeidx.parquet")
ensure_dir_local(Path(grid_path).parent.as_posix())

lf_grid.collect(streaming=True).write_parquet(grid_path, compression="zstd")

In [8]:
grid_lazy = pl.scan_parquet(grid_path)

# Attach time_idx and keep only the required columns (everything stays lazy).
lf0 = (
    lf_data
    .join(grid_lazy, on=[g_date, g_time], how="left")
    .select([*need_cols, "time_idx"])
    .sort([g_date, g_time, g_sym])
)

print(f"[{_now()}] lazyframe ready")

[2025-10-03 17:16:03] lazyframe ready


In [9]:
# Sorted array of every distinct day present in the panel.
all_days = (
    lf0
    .select(pl.col(g_date))
    .unique()
    .sort(by=g_date)
    .collect(streaming=True)
    .get_column(g_date)
    .to_numpy()
)

In [10]:
# Build sliding time-window CV folds
def make_sliding_cv_by_days(
    all_days: np.ndarray, *, n_splits: int, gap_days: int, train_to_val: int
) -> list[tuple[np.ndarray, np.ndarray]]:
    """Split an ordered array of days into sliding train/validation folds.

    The days are used positionally (they are flattened but not sorted here), so the
    caller is expected to pass them in chronological order.

    Layout: with ``usable = len(all_days) - gap_days`` and
    ``V_base = usable // (train_to_val + n_splits)``, every training window has
    ``T = train_to_val * V_base`` days, is followed by ``gap_days`` skipped days and
    then by a validation window. Validation windows are consecutive; the first
    ``usable % (train_to_val + n_splits)`` folds (at most ``n_splits``) get one
    extra validation day. Each training window slides forward so that it always
    ends ``gap_days`` days before its validation window starts.

    Args:
        all_days: Day identifiers in chronological order.
        n_splits: Number of folds to build.
        gap_days: Number of days left out between training and validation.
        train_to_val: Ratio of training length to (base) validation length.

    Returns:
        A list of ``(train_days, val_days)`` array pairs. The list is empty when the
        arguments are invalid (non-positive ``n_splits``/``train_to_val``, negative
        ``gap_days``) or when there are too few days to build a fold.
    """
    all_days = np.asarray(all_days).ravel()
    K, R, G = n_splits, train_to_val, gap_days
    # A negative gap would make the training window overlap the validation window
    # (leakage), so it is treated like the other invalid arguments.
    if K <= 0 or R <= 0 or G < 0:
        return []
    usable = len(all_days) - G
    if usable <= 0:
        return []
    V_base, rem = divmod(usable, R + K)
    if V_base <= 0:
        return []
    T = R * V_base
    v_lens = [V_base + 1 if i < rem else V_base for i in range(K)]
    folds, v_lo = [], T + G
    for V_i in v_lens:
        v_hi, tr_hi, tr_lo = v_lo + V_i, v_lo - G, v_lo - G - T
        # Defensive bound check: stop as soon as a window would fall outside the data.
        if tr_lo < 0 or v_hi > len(all_days):
            break
        folds.append((all_days[tr_lo:tr_hi], all_days[v_lo:v_hi]))
        v_lo = v_hi
    return folds

# Build the day-level folds from the notebook's CV settings and stop if none could be made.
folds_by_day = make_sliding_cv_by_days(
    all_days,
    n_splits=N_SPLITS,
    gap_days=GAP_DAYS,
    train_to_val=TRAIN_TO_VAL,
)
assert len(folds_by_day) > 0, "no CV folds constructed"

In [11]:
# Display the constructed (train_days, val_days) folds as this cell's output.
folds_by_day

[(array([1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615,
         1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626,
         1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637,
         1638, 1639, 1640, 1641, 1642, 1643, 1644, 1645, 1646, 1647, 1648,
         1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659,
         1660], dtype=int32),
  array([1668, 1669, 1670, 1671, 1672, 1673, 1674, 1675, 1676, 1677, 1678,
         1679, 1680, 1681, 1682], dtype=int32)),
 (array([1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630,
         1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641,
         1642, 1643, 1644, 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652,
         1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663,
         1664, 1665, 1666, 1667, 1668, 1669, 1670, 1671, 1672, 1673, 1674,
         1675], dtype=int32),
  array([1683, 1684, 1685, 1686, 1687, 1688, 1689, 1690, 1691, 169

In [12]:
# Upper bound of the normalisation-statistics window: the latest training day of
# fold 1 (validation days excluded). np.max is used rather than the last element so
# the bound does not depend on the fold's days being stored in sorted order.
stats_hi = int(np.max(folds_by_day[0][0]))
print(f"stats_hi (for global z-score) = {stats_hi}; first-fold train days end at this day.")

stats_hi (for global z-score) = 1660; first-fold train days end at this day.


In [13]:
# ========== 1) Continuous features: non-finite -> null, missing flags, per-symbol ffill, fallback 0 ==========
# The flags must be computed from the original missingness, so they are built before
# filling. Everything is expressed as three batched with_columns calls rather than
# modifying columns one by one inside a Python loop.

# Replace every non-finite value (+inf, -inf AND NaN) with null, in place (no new column).
# NaN is included because a float NaN is not a Polars null: it would not be flagged or
# filled below, and it would turn the per-symbol mean/std computed later into NaN.
# A null input makes is_finite() null, which falls through to the otherwise() branch.
inf2null_exprs = [pl.when(pl.col(c).is_finite()).then(pl.col(c)).otherwise(None).alias(c)
                for c in feature_cols]
# Missing-value indicator columns (new columns named "<feature>__isna").
flags_exprs    = [pl.col(c).is_null().cast(pl.Int8).alias(f"{c}__isna")
                for c in feature_cols]
# Forward-fill within each symbol, then fall back to 0.0; overwrites the original column.
fill_exprs     = [pl.col(c).forward_fill().over(g_sym).fill_null(0.0).alias(c)
                for c in feature_cols]

lf_clean = (
    lf0.with_columns(inf2null_exprs)      # non-finite -> null
    .with_columns(flags_exprs)            # missing flags (based on original missingness)
    .with_columns(fill_exprs)             # per-symbol ffill + fallback 0
)


In [14]:
# Prepare the data for standardisation.

# ========== 2) Per-symbol mu/std over the statistics window (date <= stats_hi), computed once ==========
lf_stats_sym = (
    lf_clean.filter(pl.col(g_date) <= stats_hi)
    .group_by(g_sym)
    .agg([pl.col(c).mean().alias(f"mu_{c}") for c in feature_cols] +
        [pl.col(c).std(ddof=0).alias(f"std_{c}") for c in feature_cols])
)

# Global statistics over the same window (used as the fallback).
lf_stats_glb = (
    lf_clean.filter(pl.col(g_date) <= stats_hi)
    .select([pl.col(c).mean().alias(f"mu_{c}_glb") for c in feature_cols] +
            [pl.col(c).std(ddof=0).alias(f"std_{c}_glb") for c in feature_cols])
)

# 3) Attach the global statistics to every row as constant columns (cross join).
lf_z = lf_clean.join(lf_stats_glb, how="cross")

# 4) Attach the per-symbol statistics.
lf_z = lf_z.join(lf_stats_sym, on=g_sym, how="left")

# Build the fallback + z-score expression for every feature first and apply them in a
# single with_columns call. Adding three plan nodes per feature inside the loop makes
# the lazy query plan grow with the number of features; the resulting columns are the same.
eps = 1e-6
z_cols = []
z_exprs = []
stat_cols = []
for c in feature_cols:
    mu_c_sym, std_c_sym = f"mu_{c}", f"std_{c}"
    mu_c_glb, std_c_glb = f"mu_{c}_glb", f"std_{c}_glb"
    c_z = f"{c}_z"
    # Use the per-symbol mean unless it is missing (symbol absent from the statistics window).
    mu_use = (
        pl.when(pl.col(mu_c_sym).is_null())
        .then(pl.col(mu_c_glb))
        .otherwise(pl.col(mu_c_sym))
    )
    # Use the per-symbol std unless it is missing or exactly zero.
    std_use = (
        pl.when(pl.col(std_c_sym).is_null() | (pl.col(std_c_sym) == 0))
        .then(pl.col(std_c_glb))
        .otherwise(pl.col(std_c_sym))
    )
    # eps keeps the division finite when the chosen std is zero.
    z_exprs.append(((pl.col(c) - mu_use) / (std_use + eps)).alias(c_z))
    stat_cols.extend([mu_c_glb, std_c_glb, mu_c_sym, std_c_sym])
    z_cols.append(c_z)

# Add all z-score columns at once, then drop the helper statistic columns.
lf_z = lf_z.with_columns(z_exprs).drop(stat_cols)


# 5) Output columns (z-scored features + isna flags + keys/time/target/weight).
namark_cols = [f"{c}__isna" for c in feature_cols]
out_cols = [g_sym, g_date, g_time, "time_idx", weight_col, target_col] + time_features + z_cols + namark_cols

lf_out = lf_z.select(out_cols).sort([g_date, g_time, g_sym])


In [15]:

# ========== 4) ==========
# Key point: do not collect day by day. Collect a batch of days at a time and write
# them in one go, partitioned by day, which greatly reduces the number of I/O operations.


tft_root = P("az", "tft")
ensure_dir_az(tft_root)
clean_dir = f"{tft_root}/clean"
ensure_dir_az(clean_dir)


In [None]:
CHUNK_DAYS = 30  # tune to the machine's memory/speed, e.g. 10-30 days per chunk
# NOTE(review): this rebinds CHUNK_DAYS, which the config cell set to 20.
day_list = [int(day) for day in all_days]
day_chunks = [
    day_list[start:start + CHUNK_DAYS]
    for start in range(0, len(day_list), CHUNK_DAYS)
]


for ci, chunk in enumerate(day_chunks, 1):
    # Materialise only the rows of this chunk's days, then hand them to Arrow.
    df_chunk = lf_out.filter(pl.col(g_date).is_in(chunk)).collect()
    table = df_chunk.to_arrow()

    # One output directory per day, keyed by the date column.
    day_partitioning = ds.partitioning(pa.schema([(g_date, pa.int32())]))  # or pa.int64()
    ds.write_dataset(
        data=table,
        base_dir=clean_dir,
        filesystem=fs,
        format="parquet",
        partitioning=day_partitioning,
        # Change as needed: "delete_matching" / "overwrite_or_ignore"
        existing_data_behavior="overwrite_or_ignore",
    )
    first_day, last_day = chunk[0], chunk[-1]
    print(f"[{_now()}] chunk {ci}/{len(day_chunks)} -> days {first_day}..{last_day} written")

print(fs.ls(clean_dir)[:5])

In [16]:
# === Reproducibility and local output directories ===

# Seed every RNG handled by Lightning (including dataloader workers).
L.seed_everything(int(cfg.get("seed", 42)), workers=True)

# TensorBoard logs.
logs_root = P("local", "tft/logs")
ensure_dir_local(Path(logs_root).as_posix())
# Model checkpoints.
ckpts_root = P("local", "tft/ckpts")
ensure_dir_local(Path(ckpts_root).as_posix())


class ShardedBatchStream(IterableDataset):
    """Stream training batches one day-shard at a time.

    For every day in ``shard_days`` the day's parquet files under ``clean_dir`` are
    loaded, turned into a ``TimeSeriesDataSet`` that reuses the encoders/settings of
    ``template_tsd``, and iterated with that dataset's own dataloader. The yielded
    items are therefore already complete batches, so the outer ``DataLoader`` must be
    created with ``batch_size=None``.

    Args:
        template_tsd: Fitted ``TimeSeriesDataSet`` used as the template for every shard.
        shard_days: Day identifiers to stream; converted to a list of ``int``.
        clean_dir: Root directory holding one sub-directory of parquet files per day.
        g_sym: Name of the symbol (group id) column.
        batch_size: Batch size of the per-shard dataloader.
        num_workers: Worker count of the per-shard dataloader.
        shuffle_within_shard: Whether the per-shard dataloader shuffles its samples.
        buffer_batches: Size of the cross-shard shuffle buffer in batches; ``0`` disables it.
        seed: Base seed for the day order and the buffer shuffling.
    """

    def __init__(
        self,
        template_tsd,
        shard_days,
        clean_dir: str,
        g_sym: str,
        batch_size: int = 1024,
        num_workers: int = 8,
        shuffle_within_shard: bool = True,
        buffer_batches: int = 0,
        seed: int = 42,
    ):
        super().__init__()
        self.template = template_tsd
        self.days = list(map(int, shard_days))
        self.clean_dir = clean_dir
        self.g_sym = g_sym
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle_within_shard = shuffle_within_shard
        self.buffer_batches = buffer_batches
        self.seed = seed
        # Number of completed calls to __iter__; mixed into the seed so that every
        # pass over the data uses a different (but still reproducible) order.
        self._epoch = 0

    def _epoch_rng(self) -> random.Random:
        """Return the RNG for the next pass and advance the pass counter.

        The first pass uses ``seed`` itself; later passes use ``seed + 1``,
        ``seed + 2`` and so on. Reusing the very same seed on every pass would
        replay the identical day order and buffer shuffling in every epoch.
        """
        rng = random.Random(self.seed + self._epoch)
        self._epoch += 1
        return rng

    def __iter__(self):
        """Yield ready-made batches, shard by shard, in a shuffled day order.

        Raises:
            RuntimeError: If a day has no parquet files or its data is empty.
        """
        rng = self._epoch_rng()
        days = self.days[:]
        rng.shuffle(days)

        # Cross-shard shuffle buffer (only used when buffer_batches > 0).
        buf = deque()

        for d in days:
            paths = fs.glob(f"{self.clean_dir}/{d}/*.parquet")
            if not paths:
                raise RuntimeError(f"no data files found for day {d} in {self.clean_dir}")
            paths = [f"az://{p}" for p in paths]
            pdf = pl.scan_parquet(paths, storage_options=storage_options).collect().to_pandas()
            if pdf.empty:
                raise RuntimeError(f"empty data for day {d} in {self.clean_dir}")
            # The group id is handled as a categorical of strings.
            pdf[self.g_sym] = pdf[self.g_sym].astype("str").astype("category")

            tsds = TimeSeriesDataSet.from_dataset(
                self.template,
                data=pdf.sort_values([self.g_sym, "time_idx"]),
                stop_randomization=False,
            )

            dl = tsds.to_dataloader(
                train=True,
                batch_size=self.batch_size,
                num_workers=self.num_workers,
                shuffle=self.shuffle_within_shard,
                pin_memory=True,
                persistent_workers=False,
            )

            for batch in dl:
                if self.buffer_batches > 0:
                    buf.append(batch)
                    if len(buf) >= self.buffer_batches:
                        # Once the buffer is full, emit a randomly chosen buffered batch:
                        # rotate it to the front, then pop it.
                        k = rng.randrange(len(buf))
                        if k:
                            buf.rotate(-k)
                        yield buf.popleft()
                else:
                    yield batch

        # Flush whatever is still buffered after the last shard.
        while buf:
            yield buf.popleft()


best_ckpt_paths = []
fold_metrics = []

# Worker count shared by the validation loader and the per-shard training loaders.
# os.cpu_count() may return None, so fall back to a single CPU in that case.
loader_workers = min(8, max(1, (os.cpu_count() or 1) - 2))

# ---- Training & validation folds ----
for fold_id, (train, val) in enumerate(folds_by_day, start=1):
    print(f"[fold {fold_id}] train {train[0]}..{train[-1]} ({len(train)} days), "
          f"val {val[0]}..{val[-1]} ({len(val)} days)")

    days_sorted = np.sort(train)

    # ---- Template: built from the first TEMPLATE_DAYS training days; it fixes the
    # encoders/scalers that every training shard and the validation set reuse. ----
    TEMPLATE_DAYS = min(10, len(days_sorted))   # adjust as needed, e.g. 5 / 7 / all days

    tmpl_paths = []
    for d in days_sorted[:TEMPLATE_DAYS]:
        tmpl_paths.extend(fs.glob(f"{clean_dir}/{d}/*.parquet"))
    if not tmpl_paths:
        raise RuntimeError(f"no template data files found for fold {fold_id} in {clean_dir}")
    tmpl_paths = [f"az://{p}" for p in tmpl_paths]

    pdf_tmpl = pl.scan_parquet(tmpl_paths, storage_options=storage_options).collect().to_pandas()
    pdf_tmpl[g_sym] = pdf_tmpl[g_sym].astype("str").astype("category")

    print(f"[fold {fold_id}] template days={list(map(int, days_sorted[:TEMPLATE_DAYS]))}, "
    f"template shape={pdf_tmpl.shape}")

    # Validation set: all validation days are loaded into memory at once.
    val_paths = []
    for d in val:
        val_paths.extend(fs.glob(f"{clean_dir}/{d}/*.parquet"))
    if not val_paths:
        raise RuntimeError(f"no validation data files found for fold {fold_id} in {clean_dir}")
    val_paths = [f"az://{p}" for p in val_paths]
    pdf_val = pl.scan_parquet(val_paths, storage_options=storage_options).collect().to_pandas()
    pdf_val[g_sym] = pdf_val[g_sym].astype("str").astype("category")

    print(f"template {pdf_tmpl.shape}, val {pdf_val.shape}")

    unknown_reals = time_features + z_cols + namark_cols

    # All continuous inputs are unknown reals; None is passed as the scaler for each of
    # them (the z-scoring was already applied upstream).
    identity_scalers = {name: None for name in unknown_reals}
    template = TimeSeriesDataSet(
        pdf_tmpl.sort_values([g_sym, "time_idx"]),
        time_idx="time_idx",
        target=target_col,
        group_ids=[g_sym],
        weight=weight_col,
        max_encoder_length=ENC_LEN,
        max_prediction_length=PRED_LEN,

        static_categoricals=[g_sym],
        time_varying_unknown_reals=unknown_reals,

        lags=None,  # no automatic lags

        categorical_encoders={g_sym: NaNLabelEncoder(add_nan=True)},
        add_relative_time_idx=False,
        add_target_scales=False,
        add_encoder_length=False,

        allow_missing_timesteps=True,

        target_normalizer=None,
        scalers=identity_scalers,
    )

    validation = TimeSeriesDataSet.from_dataset(
        template, data=pdf_val.sort_values([g_sym, "time_idx"]), stop_randomization=True
    )
    val_loader = validation.to_dataloader(
        train=False, batch_size=BATCH_SIZE, num_workers=loader_workers
    )

    train_stream = ShardedBatchStream(
        template_tsd=template,
        shard_days=days_sorted,
        clean_dir=clean_dir,
        g_sym=g_sym,
        batch_size=BATCH_SIZE,
        num_workers=loader_workers,
        shuffle_within_shard=True,
        buffer_batches=16,   # 0 disables the cross-shard shuffle buffer; 8-64 can be tuned
        seed=int(cfg.get("seed", 42)),  # same seed source as L.seed_everything
    )
    # The stream already yields complete batches, so the outer DataLoader neither
    # batches nor uses worker processes.
    train_loader = DataLoader(train_stream, batch_size=None, num_workers=0)

    # Each fold gets its own checkpoint directory (created before the callbacks).
    ckpt_dir_fold = Path(ckpts_root) / f"fold_{fold_id}"
    ensure_dir_local(ckpt_dir_fold.as_posix())

    callbacks = [
        EarlyStopping(monitor="val_RMSE", mode="min", patience=5),
        ModelCheckpoint(
            monitor="val_RMSE",
            mode="min",
            save_top_k=1,
            dirpath=ckpt_dir_fold.as_posix(),            # per-fold directory
            filename=f"fold{fold_id}-tft-best-{{epoch:02d}}-{{val_RMSE:.5f}}",  # file name includes the fold
        ),
        LearningRateMonitor(logging_interval="step"),
    ]

    # The logger name also carries the fold id so runs are easy to tell apart.
    logger = TensorBoardLogger(save_dir=logs_root, name=f"tft_f{fold_id}", default_hp_metric=False)

    VAL_EVERY_STEPS = 50
    trainer = L.Trainer(
        accelerator="gpu", devices=1, precision="bf16-mixed",
        max_epochs=5,
        val_check_interval=VAL_EVERY_STEPS,
        num_sanity_val_steps=0,
        gradient_clip_val=0.5,
        log_every_n_steps=50,
        callbacks=callbacks,
        logger=logger,
        default_root_dir=ckpts_root,
    )

    # Build the model. Values from cfg["tft"] win; otherwise the notebook's own
    # LR / HIDDEN / HEADS / DROPOUT constants are used, so that editing the config
    # cell actually changes the model instead of being silently ignored.
    tft_cfg = cfg.get("tft", {})
    tft = TemporalFusionTransformer.from_dataset(
        template,
        loss=MAE(),
        logging_metrics=[RMSE()],
        learning_rate=float(tft_cfg.get("lr", LR)),
        hidden_size=int(tft_cfg.get("hidden_size", HIDDEN)),
        attention_head_size=int(tft_cfg.get("heads", HEADS)),
        dropout=float(tft_cfg.get("dropout", DROPOUT)),
        reduce_on_plateau_patience=4,
    )

    trainer.fit(tft, train_dataloaders=train_loader, val_dataloaders=val_loader)
    # After each fold:
    es_cb = callbacks[0]  # EarlyStopping
    ckpt_cb = callbacks[1]

    # best_model_score stays None when no checkpoint was ever saved; report that
    # explicitly instead of failing inside float(None).
    if ckpt_cb.best_model_score is None:
        raise RuntimeError(
            f"fold {fold_id}: no checkpoint was saved, so no val_RMSE is available"
        )
    best_score = float(ckpt_cb.best_model_score)

    print("epoch_end_at   :", trainer.current_epoch)              # epoch index at stop (0-based)
    print("global_step    :", trainer.global_step)                # number of training steps run
    print("val_best_score :", best_score)                         # best val_RMSE
    print("es_stopped_ep  :", getattr(es_cb, "stopped_epoch", None))  # epoch at which early stopping fired
    print("es_wait_count  :", getattr(es_cb, "wait_count", None))     # consecutive validations without improvement

    best_ckpt_paths.append(ckpt_cb.best_model_path)
    fold_metrics.append(best_score)  # the monitored val_RMSE

# CV aggregation over all folds (simple mean; could be weighted by validation size).
# Done once after the loop so the printed value really covers every fold.
cv_rmse = np.mean(fold_metrics)
print(f"[CV] mean val_RMSE = {cv_rmse:.6f}")
    
    
    


Seed set to 42


[fold 1] train 1605..1660 (56 days), val 1668..1682 (15 days)
[fold 1] template days=[1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614], template shape=(376552, 1571)
template (376552, 1571), val (552728, 1571)


Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/admin_ml/Jackson/projects/js/JS/.venv/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:210: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/home/admin_ml/Jackson/projects/js/JS/.venv/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:751: Checkpoint directory /mnt/data/js/exp/v1/tft/ckpts/fold_1 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/admin_ml/Jackson/projects/js/JS/.venv/lib/python3.10/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.

   | Name              

Epoch 0: |          | 0/? [00:00<?, ?it/s] 

RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_() INTERNAL ASSERT FAILED at "/pytorch/c10/cuda/CUDACachingAllocator.cpp":1098, please report a bug to PyTorch. 