# 初始化

In [1]:
# 环境与依赖

# 基础包
import tempfile

import os, gc, glob, json, yaml, time
from pathlib import Path
import numpy as np, pandas as pd, polars as pl
import lightgbm as lgb
from dataclasses import dataclass
import pyarrow.parquet as pq
from typing import Sequence, Optional, Union, List, Tuple, Iterable, Mapping

import matplotlib.pyplot as plt
# Azure & 文件系统
import fsspec
from getpass import getpass
from dotenv import load_dotenv
load_dotenv()  # 默认会加载当前目录下的 .env 文件



# Connect to Azure Blob Storage

# Credentials come from the environment (populated by load_dotenv above).
ACC, KEY = (
    os.getenv(env_name)
    for env_name in ("AZURE_STORAGE_ACCOUNT_NAME", "AZURE_STORAGE_ACCOUNT_KEY")
)
if not (ACC and KEY):
    raise RuntimeError("Azure credentials not found. Please set them in .env")

# The same options dict is reused for fsspec and for polars readers/writers.
storage_options = dict(account_name=ACC, account_key=KEY)
fs = fsspec.filesystem("az", **storage_options)




# 定义路径辅助函数

# Load the configuration (the single source of truth for paths/parameters).
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection; YAML files are read as UTF-8.
with open("config/data.yaml", encoding="utf-8") as _cfg_file:
    cfg = yaml.safe_load(_cfg_file)

# Path helper
def P(kind: str, subpath: str = "") -> str:
    """Build a path below the experiment root for one of three targets.

    Args:
        kind: ``"az"`` for an ``az://`` URL, ``"np"`` for the same blob path
            without the protocol prefix, ``"local"`` for a path under the
            configured local root.
        subpath: Optional path below the experiment root; leading and
            trailing slashes are stripped.

    Returns:
        The root path when ``subpath`` is empty, otherwise ``root/subpath``.

    Raises:
        ValueError: If ``kind`` is not one of the three supported values.
    """
    # All config values are read up front, whatever ``kind`` is requested.
    blob_cfg = cfg["blob"]
    container = str(blob_cfg["container"]).strip("/")
    prefix = str(blob_cfg["prefix"]).strip("/")
    version = str(cfg["exp_root"]).strip("/")
    local_root = Path(cfg["local"]["root"])

    tail = str(subpath).strip("/")

    if kind in ("az", "np"):
        scheme = "az://" if kind == "az" else ""
        prefix_part = f"/{prefix}" if prefix else ""
        root = f"{scheme}{container}{prefix_part}/{version}"
    elif kind == "local":
        root = (local_root / version).as_posix()
    else:
        raise ValueError("kind must be 'az', 'np', or 'local'")

    if not tail:
        return root
    return f"{root}/{tail}"



In [2]:
# Global constants: canonical column names used throughout the notebook.

# Zero-padded feature columns: feature_00 ... feature_78.
FEATURE_ALL = [f"feature_{idx:02d}" for idx in range(79)]
# Responder columns: responder_0 ... responder_8.
RESP_COLS = [f"responder_{idx}" for idx in range(9)]

In [3]:
# Load data

# Start with a small demo: only the partitions matched by the glob
# character class in partition_id=[4,5].
np_paths = fs.glob("az://jackson/js_exp/raw/train.parquet/partition_id=[4,5]/*.parquet")

# Prepend the az:// scheme to every globbed path before scanning lazily.
paths = ["az://" + blob_path for blob_path in np_paths]
lb = pl.scan_parquet(paths, storage_options=storage_options)


# EDA

In [None]:
# EDA: symbol coverage per trading day — does every day contain the full
# symbol universe?
lf_s_d = lb.select(['date_id', 'symbol_id'])

# Distinct symbol count per date, ordered by date.
per_date = (
    lf_s_d
    .group_by("date_id")
    .agg(pl.col("symbol_id").n_unique().alias("n_symbols"))
    .sort("date_id")
)

# The largest per-day symbol count serves as the "full universe" size.
max_n = per_date.select(pl.col("n_symbols").max()).collect().item()
summary = per_date.with_columns(
    max_n=pl.lit(max_n),
    is_full_universe=pl.col("n_symbols") == max_n,
)

# Dates on which at least one symbol is absent.
dates_missing = summary.filter(~pl.col("is_full_universe")).select("date_id")

dates_missing.collect()

In [None]:
# Take a sample that is small both cross-sectionally (few symbols) and in
# time (a couple of date_ids) for quick experiments.
ls = lb.filter(
    pl.col('symbol_id').is_in([1, 2, 3, 4, 5]),
    pl.col('date_id').is_in([1400, 1420]),
)


# 数据预处理

添加time bucket, 将日分片

In [None]:
# Add the time_bucket feature: split a day of T ticks into B parts.
B = cfg['trading']['bucket_size']
T = cfg['trading']['ticks']


def clip_upper(expr: pl.Expr, ub: int) -> pl.Expr:
    """Return ``expr`` with every value greater than ``ub`` replaced by ``ub``."""
    cap = pl.lit(ub)
    return pl.when(expr > cap).then(cap).otherwise(expr)


# T is treated as a global constant here, not computed per group.
lb = (
    lb.with_columns(bucket_raw=pl.col('time_id') * pl.lit(B) // pl.lit(T))
    # time_ids at or beyond T would land in bucket B or above; cap at B - 1.
    .with_columns(time_bucket=clip_upper(pl.col('bucket_raw'), B - 1).cast(pl.UInt8))
    .drop(pl.col('bucket_raw'))
)


In [None]:
# Show the current column names of `lb`.
lb.collect_schema().names()

Clip

In [None]:
def rolling_sigma_clip(
    lf: pl.LazyFrame,
    clip_features: Sequence[str],
    over_cols: Sequence[str],
    *,
    is_sorted: bool = False,
    window: int = 50,
    k: float = 3.0,
    ddof: int = 1,
    min_valid: int = 10,
    cast_float32: bool = True,
    sanitize: bool = True,
) -> pl.LazyFrame:
    """Clip features to a rolling ``mean ± k * std`` band built from earlier rows.

    For every feature the band is computed from the value shifted by one row,
    so the current row never contributes to its own bounds. A row is clipped
    only when the rolling window holds at least ``max(min_valid, ddof + 1)``
    non-null shifted values; otherwise it is passed through unchanged.

    Args:
        lf: Input frame containing ``symbol_id``, ``date_id``, ``time_id``,
            ``time_bucket`` and every column in ``clip_features``.
        clip_features: Columns to clip (overwritten in place by name).
        over_cols: Grouping columns for the rolling statistics.
        is_sorted: Must be ``True``; the caller asserts that ``lf`` is sorted
            by ``['symbol_id', 'date_id', 'time_id']``.
        window: Rolling window size in rows.
        k: Band half-width in standard deviations.
        ddof: Delta degrees of freedom for the rolling standard deviation.
        min_valid: Minimum number of non-null history values required to clip.
        cast_float32: Cast each feature to ``Float32`` before processing.
        sanitize: Replace non-finite values with null before processing.

    Returns:
        A LazyFrame with the four key columns plus the clipped features; all
        other input columns are dropped.

    Raises:
        ValueError: If ``is_sorted`` is false.
        KeyError: If a required column is missing from ``lf``.
    """
    if not is_sorted:
        raise ValueError("Input LazyFrame must be pre-sorted by ['symbol_id','date_id','time_id']")

    key_cols = ["symbol_id", "date_id", "time_id", "time_bucket"]
    required = set(key_cols) | set(clip_features)
    names = set(lf.collect_schema().names())
    missing = list(required - names)
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    base = lf.select(pl.col(key_cols + list(clip_features)))

    # Computed once and shared by every feature (previously recomputed inline).
    min_samp = ddof + 1                   # fewest samples for which std is defined
    min_need = max(min_valid, min_samp)   # fewest history points needed to clip

    exprs = []
    for c in clip_features:
        x = pl.col(c)
        if cast_float32:
            x = x.cast(pl.Float32)
        if sanitize:
            # Non-finite values become null.
            x = pl.when(x.is_finite()).then(x).otherwise(None)

        # History excludes the current row. `.over()` is applied to the whole
        # shift -> rolling chain below, not to the shift on its own.
        xlag = x.shift(1)

        # Number of non-null history values inside the window.
        cnt = (
            xlag.is_not_null()
                .cast(pl.Int32)
                .rolling_sum(window_size=window, min_samples=min_samp)
        ).over(over_cols)

        mu = xlag.rolling_mean(window_size=window, min_samples=min_samp).over(over_cols)
        sd = xlag.rolling_std(window_size=window, ddof=ddof, min_samples=min_samp).over(over_cols)

        lo, hi = mu - k * sd, mu + k * sd
        exprs.append(
            pl.when(cnt >= min_need)
            .then(x.clip(lo, hi))
            .otherwise(x)
            .alias(c)
        )

    return base.with_columns(exprs)


# rolling_sigma_clip requires rows ordered by (symbol_id, date_id, time_id).
lb = lb.sort(['symbol_id', 'date_id', 'time_id'])

# All winsorization settings come from the config; `sanitize` is optional
# there and defaults to True.
lf_clip = rolling_sigma_clip(
    lb,
    FEATURE_ALL,
    cfg['winsorization']['groupby'],
    is_sorted=True,
    window=cfg['winsorization']['window'],
    k=cfg['winsorization']['z_k'],
    ddof=cfg['winsorization']['ddof'],
    min_valid=cfg['winsorization']['min_valid'],
    cast_float32=cfg['winsorization']['cast_float32'],
    sanitize=cfg['winsorization'].get('sanitize', True),
)

In [None]:
# Target file for the clipped sample inside the local cache directory.
clip_out = Path(P("local", cfg["paths"]["cache"])) / "sample_clipped.parquet"

In [None]:
from pathlib import Path

# Target file for the clipped sample; create the cache directory if needed.
clip_out = Path(P("local", cfg["paths"]["cache"])).joinpath("sample_clipped.parquet")
clip_out.parent.mkdir(exist_ok=True, parents=True)

# Non-streaming: the full plan is executed and materialized in memory first.
df = lf_clip.collect()
# `use_pyarrow=True` could be added to this call if desired.
df.write_parquet(str(clip_out), compression="zstd")



In [None]:
# Show the column names of the materialized clipped frame.
df.columns

Impute

In [None]:
def _ttl_ffill(
    lf: pl.LazyFrame,
    cols: Sequence[str],
    group_cols: Union[str, Sequence[str]],
    ref_col: str,
    max_gap: int,
    extra_mask: Optional[pl.Expr] = None,
) -> pl.LazyFrame:
    """Forward-fill nulls within ``group_cols`` when the source value is recent enough.

    A null is replaced by the last non-null value of its group only if the
    distance in ``ref_col`` to the row carrying that value is ``<= max_gap``
    (and ``extra_mask`` holds, when given).
    """
    exprs = []
    for c in cols:
        col = pl.col(c)
        # ref_col value of the most recent non-null observation in the group.
        last_ref = (
            pl.when(col.is_not_null()).then(pl.col(ref_col))
            .forward_fill().over(group_cols)
        )
        candidate = col.forward_fill().over(group_cols)
        gap = (pl.col(ref_col) - last_ref).cast(pl.Int32)
        # No earlier observation -> gap is null -> treated as too old.
        fillable = col.is_null() & (gap.fill_null(max_gap + 1) <= max_gap)
        if extra_mask is not None:
            fillable = extra_mask & fillable
        exprs.append(pl.when(fillable).then(candidate).otherwise(col).alias(c))
    return lf.with_columns(exprs)


def _intraday_ffill(
    lf: pl.LazyFrame,
    cols: Sequence[str],
    max_gap_ticks: Optional[int],
) -> pl.LazyFrame:
    """Forward-fill within each (symbol_id, date_id); ``None`` means no gap limit."""
    day_grp = ["symbol_id", "date_id"]
    if max_gap_ticks is None:
        return lf.with_columns(
            [pl.col(c).forward_fill().over(day_grp).alias(c) for c in cols]
        )
    return _ttl_ffill(lf, cols, day_grp, "time_id", max_gap_ticks)


def causal_impute(
    lf: pl.LazyFrame,
    impute_cols: Sequence[str],
    *,
    open_tick_window: Tuple[int, int] = (0, 10),
    ttl_days_open: int = 5,
    intra_ffill_max_gap_ticks: Optional[int] = 100,
    ttl_days_same_tick: Optional[int] = 5,
    is_sorted: bool = False,
) -> pl.LazyFrame:
    """Impute nulls using only earlier observations, in four passes.

    1. Opening ticks (``time_id`` in ``[t0, t1)``): carry the last value of the
       same symbol forward if it is at most ``ttl_days_open`` days old.
    2. Intraday forward fill per (symbol, date), optionally limited to
       ``intra_ffill_max_gap_ticks`` ticks.
    3. Optional: carry the last value of the same (symbol, time_id) forward if
       it is at most ``ttl_days_same_tick`` days old.
    4. Repeat pass 2 so values introduced by pass 3 propagate within the day.

    Args:
        lf: Input frame with ``symbol_id``, ``date_id``, ``time_id`` and all
            ``impute_cols``.
        impute_cols: Columns to impute; they are cast to ``Float32``.
        open_tick_window: ``(t0, t1)`` half-open ``time_id`` range that counts
            as the opening.
        ttl_days_open: Maximum age in days for pass 1.
        intra_ffill_max_gap_ticks: Maximum gap in ticks for passes 2 and 4;
            ``None`` disables the limit.
        ttl_days_same_tick: Maximum age in days for pass 3; ``None`` skips it.
        is_sorted: Must be ``True``; the caller asserts that ``lf`` is sorted
            by ``['symbol_id', 'date_id', 'time_id']``.

    Returns:
        A LazyFrame with the three key columns followed by ``impute_cols``.

    Raises:
        ValueError: If ``is_sorted`` is false or a gap/TTL limit is negative.
    """
    if not is_sorted:
        raise ValueError("Input LazyFrame must be pre-sorted by ['symbol_id','date_id','time_id']")

    # Explicit raises instead of `assert`, which is stripped under `python -O`.
    if intra_ffill_max_gap_ticks is not None and intra_ffill_max_gap_ticks < 0:
        raise ValueError("intra_ffill_max_gap_ticks must be None or >= 0")
    if ttl_days_same_tick is not None and ttl_days_same_tick < 0:
        raise ValueError("ttl_days_same_tick must be None or >= 0")

    # Unify the dtype up front so every pass works on Float32.
    lf = lf.with_columns([pl.col(c).cast(pl.Float32) for c in impute_cols])

    t0, t1 = open_tick_window
    is_open = pl.col("time_id").is_between(t0, t1, closed="left")  # [t0, t1)

    # 1) Opening ticks: cross-day carry-over within a symbol (TTL in days).
    lf1 = _ttl_ffill(
        lf, impute_cols, "symbol_id", "date_id", ttl_days_open, extra_mask=is_open
    )

    # 2) Intraday forward fill per (symbol, date).
    lf2 = _intraday_ffill(lf1, impute_cols, intra_ffill_max_gap_ticks)

    # 3) Same time_id across days (TTL in days), optional.
    lf3 = lf2
    if ttl_days_same_tick is not None:
        lf3 = _ttl_ffill(
            lf2, impute_cols, ["symbol_id", "time_id"], "date_id", ttl_days_same_tick
        )

    # 4) Intraday forward fill again, same rule as pass 2.
    lf4 = _intraday_ffill(lf3, impute_cols, intra_ffill_max_gap_ticks)

    KEYS = ["symbol_id", "date_id", "time_id"]
    return lf4.select([*KEYS, *impute_cols])


In [None]:
# Re-read the clipped sample from the local cache and restore the row order
# that causal_impute requires.
clip_path = Path(P("local", cfg["paths"]["cache"])).joinpath("sample_clipped.parquet")
lf_clip = pl.scan_parquet(str(clip_path)).sort(['symbol_id', 'date_id', 'time_id'])

# All fill limits are taken from the `fill` section of the config.
lf_imp = causal_impute(
    lf_clip,
    FEATURE_ALL,
    is_sorted=True,
    open_tick_window=cfg['fill']['open_tick_window'],
    ttl_days_open=cfg['fill']['ttl_days_open'],
    intra_ffill_max_gap_ticks=cfg['fill']['intra_ffill_max_gap_ticks'],
    ttl_days_same_tick=cfg['fill']['ttl_days_same_tick'],
)

In [None]:
# Inspect null rates before and after imputation.

pre_null = lf_clip.select([pl.col(c).is_null().mean().alias(c) for c in FEATURE_ALL]).collect()
post_null = lf_imp.select([pl.col(c).is_null().mean().alias(c) for c in FEATURE_ALL]).collect()
# Reshape wide -> long (one row per feature) and join to compare the two.
# `unpivot` is the current name of the deprecated `DataFrame.melt`.
pre_m = pre_null.unpivot(variable_name="feature", value_name="pre_null")
post_m = post_null.unpivot(variable_name="feature", value_name="post_null")
summary = pre_m.join(post_m, on="feature").with_columns(
    (pl.col("pre_null") - pl.col("post_null")).alias("filled_delta")
).sort("post_null", descending=True)
summary  # the features with a high post_null are essentially those ~20 columns

In [None]:
# Per date: is missingness higher on early dates? (Used to rule out
# cold-start features.)
cols = ["feature_21", "feature_26"]  # replace with your highest-missing columns
by_date = (
    lf_imp
    .group_by("date_id")
    .agg([pl.col(name).is_null().mean().alias(name) for name in cols])
    .sort("date_id")
    .collect()
)
by_date  # cold-start columns are higher on early date_ids, then level off


In [None]:
# For each day, what share of symbols is null for the entire day?
c = "feature_21"

daily_all_null_share = (
    lf_imp
    .group_by(["date_id", "symbol_id"])
    .agg(null_rate=pl.col(c).is_null().mean())
    # A (date, symbol) pair counts as "all null" when every tick is null.
    .with_columns(all_null_day=pl.col("null_rate") == 1)
    .group_by("date_id")
    .agg(share_symbols_all_null=pl.mean("all_null_day"))
    .sort("date_id")
    .collect()
)
daily_all_null_share.filter(pl.col("share_symbols_all_null") != 0)

# Cause found: on some dates in the middle of the range, 1-2 symbol_ids have
# certain feature columns missing, i.e. shares of 1/32, 1/33, 1/34 (~0.029-0.032).

In [None]:
# Sanity check 1: imputation must not change the number of rows.
assert lb.select(pl.len()).collect().item() == lf_imp.select(pl.len()).collect().item()
# Sanity check 2: (symbol_id, date_id, time_id) must stay unique.
assert (
    lf_imp
    .group_by(["symbol_id", "date_id", "time_id"])
    .len()
    .filter(pl.col("len") > 1)
    .collect()
    .is_empty()
)

In [None]:
# Whatever is still null after the causal passes is set to 0.0.
lf_imp = lf_imp.with_columns(pl.col(FEATURE_ALL).fill_null(0.0))

# Check the null rates once more.
post_null2 = lf_imp.select(pl.col(FEATURE_ALL).is_null().mean()).collect()
post_null2  # should be all zeros now

In [None]:
# Show the current column names of `lb`.
lb.collect_schema().names()

In [None]:
# Merge the response variables: settings read from the config.

KEYS, WEIGHT = cfg['keys'], cfg['weight']
batch_size = cfg['fill']['batch_size']

In [None]:
# Right-hand table: de-duplicate on the keys and align the key dtypes.
rhs = (
    lb
    .select([*KEYS, WEIGHT, 'time_bucket', *RESP_COLS])
    .unique(subset=KEYS, keep='last')
    # Keys are cast to Int32 so they match the left table in the join.
    .with_columns(pl.col(KEYS).cast(pl.Int32))
)

rhs.collect_schema().names()

In [None]:
# Preview the first five rows of the right-hand table.
rhs.limit(5).collect()

In [None]:


# Left-hand table: the imputed sample, with keys cast to Int32 like `rhs`.
imp_path = Path(P("local", cfg["paths"]["cache"])).joinpath("sample_imputed.parquet")
lf_imp = pl.scan_parquet(str(imp_path)).with_columns(pl.col(KEYS).cast(pl.Int32))

# Date range covered by the left table; it drives the shard loop.
dmin, dmax = (
    lf_imp
    .select(dmin=pl.col('date_id').min(), dmax=pl.col('date_id').max())
    .collect()
    .row(0)
)

# Output directory for the cleaned shards on blob storage.
path = P('az', cfg['paths']['clean_shards'])
fs.makedirs(path, exist_ok=True)

print(f"Processing date range: {dmin} to {dmax}")

In [None]:
# Preview the first five rows of `lb`.
lb.limit(5).collect()

In [None]:
# Columns that come from the keys or the right table; every other column of
# the joined frame is a feature. Built once rather than once per column name.
reserved_cols = {*KEYS, WEIGHT, 'time_bucket', *RESP_COLS}

# The joined schema does not depend on the date window, so resolve it a
# single time instead of on every loop iteration.
joined_names = lf_imp.join(rhs, on=KEYS, how='left').collect_schema().names()
feature_cols = [c for c in joined_names if c not in reserved_cols]
out_cols = [*KEYS, WEIGHT, 'time_bucket', *feature_cols, *RESP_COLS]

# Write one parquet shard per window of `batch_size` consecutive date_ids.
for lo in range(dmin, dmax + 1, batch_size):
    hi = min(lo + batch_size, dmax + 1)  # exclusive upper bound

    in_window = pl.col('date_id').is_between(lo, hi, closed='left')
    left = lf_imp.filter(in_window)
    right = rhs.filter(in_window)

    part = (
        left.join(right, on=KEYS, how='left')
        .sort(KEYS)
        .select(out_cols)
    )

    # `hi` is exclusive, so the file name uses hi - 1.
    out_lo, out_hi = lo, hi - 1
    part.sink_parquet(
        f"{path}/clean_{out_lo:04d}_{out_hi:04d}.parquet",
        compression="zstd",
        statistics=True,                 # write column statistics into the file
        storage_options=storage_options,
    )



# 特征工程函数

In [31]:
# Feature engineering

# A: previous-day tail values / daily summaries of the responder columns
def fe_resp_daily(
    lf: pl.LazyFrame,
    *,
    keys: Tuple[str, str, str] = ("symbol_id","date_id","time_id"),
    rep_cols: Sequence[str],
    is_sorted: bool = False,
    prev_soft_days: Optional[int] = None,
    cast_f32: bool = True,
    tail_lags: Sequence[int] = (1,),
    tail_diffs: Sequence[int] = (1,),
    rolling_windows: Sequence[int] | None = (3,),
) -> pl.LazyFrame:
    """Attach previous-day responder summaries to every tick row.

    The function aggregates once per (symbol, date), derives day-level
    quantities, replaces each of them by the most recent non-null value from
    an *earlier* day of the same symbol (optionally limited by a TTL), and
    left-joins the result back onto the tick-level frame.

    Args:
        lf: Tick-level frame containing ``keys`` and ``rep_cols``.
        keys: ``(symbol, date, time)`` column names.
        rep_cols: Responder columns to summarize.
        is_sorted: If false, ``lf`` is sorted by ``keys`` first.
        prev_soft_days: ``None`` for an unlimited TTL; otherwise a historical
            value is used only if it is at most this many days old.
        cast_f32: Cast the resolved columns to ``Float32``.
        tail_lags: For each ``L``, keep the L-th value from the end of the day.
        tail_diffs: For each ``K``, add ``last - (K+1)-th from the end``.
        rolling_windows: Windows (> 1) for rolling mean/std of the daily close;
            falsy to skip.

    Returns:
        ``lf`` with the day-level columns joined on (symbol, date), sorted by
        ``keys``.
    """
    g_symbol, g_date, g_time = keys

    # Sort the tick frame if the caller has not; the daily table is sorted
    # separately by (symbol, date) below.
    if not is_sorted:
        lf = lf.sort([g_symbol, g_date, g_time])

    # --- single daily aggregation ---
    need_L = sorted(set(tail_lags) | {k+1 for k in tail_diffs} | {1})
    agg_exprs: list[pl.Expr] = []
    for r in rep_cols:
        # L-th value from the end of the day (null when the day has < L rows).
        for L in need_L:
            agg_exprs.append(
                pl.when(pl.len() >= L)
                .then(pl.col(r).sort_by(pl.col(g_time)).tail(L).first())
                .otherwise(None)
                .alias(f"{r}_prev_tail_lag{L}")
            )
        # Day statistics, including an explicit close (last value of the day).
        agg_exprs += [
            pl.col(r).sort_by(pl.col(g_time)).last().alias(f"{r}_prevday_close"),
            pl.col(r).mean().alias(f"{r}_prevday_mean"),
            pl.col(r).std(ddof=1).alias(f"{r}_prevday_std"),
        ]

    daily = (
        lf.group_by([g_symbol, g_date])
        .agg(agg_exprs)
        .sort([g_symbol, g_date])     # required by the shift/ffill steps below
    )

    # Same-day tail difference dK: last - (K+1)-th from the end.
    # The schema is resolved once, not once per (responder, K) pair.
    daily_names = set(daily.collect_schema().names())
    daily = daily.with_columns([
        (pl.col(f"{r}_prev_tail_lag1") - pl.col(f"{r}_prev_tail_lag{K+1}")).alias(f"{r}_prev_tail_d{K}")
        for r in rep_cols for K in tail_diffs
        if f"{r}_prev_tail_lag{K+1}" in daily_names
    ])

    # Close of the day before, close-minus-mean and close-to-close gap
    # (all still relative to the row's own day at this point).
    daily = daily.with_columns([
        pl.col(f"{r}_prevday_close").shift(1).over(g_symbol).alias(f"{r}_prev2day_close")
        for r in rep_cols
    ]).with_columns(
        [
            (pl.col(f"{r}_prevday_close") - pl.col(f"{r}_prevday_mean")).alias(f"{r}_prevday_close_minus_mean")
            for r in rep_cols
        ] + [
            (pl.col(f"{r}_prevday_close") - pl.col(f"{r}_prev2day_close")).alias(f"{r}_overnight_gap")
            for r in rep_cols
        ]
    )

    if rolling_windows:
        wins = sorted({int(w) for w in rolling_windows if int(w) > 1})
        roll_exprs: list[pl.Expr] = []
        for r in rep_cols:
            base = pl.col(f"{r}_prevday_close")
            for w in wins:
                roll_exprs += [
                    base.rolling_mean(window_size=w, min_samples=1).over(g_symbol)
                        .alias(f"{r}_close_roll{w}_mean"),
                    base.rolling_std(window_size=w, ddof=1, min_samples=2).over(g_symbol)
                        .alias(f"{r}_close_roll{w}_std"),
                ]
        daily = daily.with_columns(roll_exprs)

    # === Core step: turn the same-day columns into "historical value that is
    # valid for day d", i.e. the latest non-null value from an earlier day. ===
    prev_cols = [c for c in daily.collect_schema().names() if c not in (g_symbol, g_date)]
    exprs: list[pl.Expr] = []
    for c in prev_cols:
        # Date and value of the latest non-null observation strictly before
        # the current day. `shift(1)` must sit INSIDE `.over(g_symbol)`:
        # applied after the window it shifts across the whole frame, and the
        # first day of each symbol would inherit the previous symbol's values.
        last_non_null_day = (
            pl.when(pl.col(c).is_not_null()).then(pl.col(g_date)).otherwise(None)
            .forward_fill()
            .shift(1)
            .over(g_symbol)
        )
        last_non_null_val = pl.col(c).forward_fill().shift(1).over(g_symbol)

        if prev_soft_days is None:
            resolved = last_non_null_val  # unlimited TTL
        else:
            gap_days = (pl.col(g_date) - last_non_null_day).cast(pl.Int32)
            resolved = pl.when(gap_days.is_not_null() & (gap_days <= int(prev_soft_days))) \
                        .then(last_non_null_val) \
                        .otherwise(None)

        if cast_f32:
            resolved = resolved.cast(pl.Float32)
        # Column name is kept; its meaning is now "history valid for day d".
        exprs.append(resolved.alias(c))

    daily_prev = daily.with_columns(exprs)

    # Left-join back to tick level and restore a deterministic row order.
    out = lf.join(daily_prev, on=[g_symbol, g_date], how="left")
    out = out.sort([g_symbol, g_date, g_time])
    return out



# B: same-time_id values from previous days (prev{k}) plus statistics
def fe_resp_same_tick_xday(
    lf: pl.LazyFrame,
    *,
    keys: Tuple[str,str,str] = ("symbol_id","date_id","time_id"),
    rep_cols: Sequence[str],
    is_sorted: bool = False,
    prev_soft_days: Optional[int] = None,   # None = strict d-k; int = TTL
    cast_f32: bool = True,
    ndays: int = 5,
    stats_rep_cols: Optional[Sequence[str]] = None,
    add_prev1_multirep: bool = True,
    batch_size: int = 5,
) -> pl.LazyFrame:
    """Add responder values observed at the same time_id on earlier days.

    For every responder ``r`` and ``k = 1..ndays`` the column
    ``{r}_same_t_prev{k}`` holds the k-th previous row of the same
    (symbol, time_id). With ``prev_soft_days=None`` the value is kept only if
    it is exactly ``k`` days old; otherwise it is kept if its age in days is
    positive and at most ``prev_soft_days``. On top of these columns the
    function adds a null-ignoring mean/std, a slope-like statistic and,
    optionally, the mean/std of ``prev1`` across all responders.

    Columns are added in batches of ``batch_size`` responders. The result is
    sorted by ``keys``.
    """
    g_symbol, g_date, g_time = keys
    tick_grp = [g_symbol, g_time]
    day_offsets = range(1, ndays + 1)

    # Within each (symbol, time) group rows must be in ascending date order
    # for shift(k) to look backwards — note the order is time before date.
    if not is_sorted:
        lf = lf.sort([g_symbol, g_time, g_date])

    if stats_rep_cols is None:
        stats_rep_cols = list(rep_cols)

    def _batched(items):
        """Split ``items`` into consecutive slices of ``batch_size``."""
        return [items[start:start + batch_size]
                for start in range(0, len(items), batch_size)]

    def _prev_name(rep, k):
        return f"{rep}_same_t_prev{k}"

    def _maybe_f32(expr):
        return expr.cast(pl.Float32) if cast_f32 else expr

    # Responders that take part in the statistics (must also be in rep_cols).
    stat_reps = [r for r in stats_rep_cols if r in rep_cols]

    out = lf

    # 1) prev{k}, either strict (gap == k) or with a TTL
    for group in _batched(list(rep_cols)):
        new_cols = []
        for rep in group:
            for k in day_offsets:
                shifted_val = pl.col(rep).shift(k).over(tick_grp)
                shifted_day = pl.col(g_date).shift(k).over(tick_grp)
                day_gap = (pl.col(g_date) - shifted_day).cast(pl.Int32)

                if prev_soft_days is None:
                    valid = day_gap.is_not_null() & (day_gap == k)
                else:
                    valid = (
                        day_gap.is_not_null()
                        & (day_gap > 0)
                        & (day_gap <= int(prev_soft_days))
                    )

                masked = pl.when(valid).then(shifted_val).otherwise(None)
                new_cols.append(_maybe_f32(masked).alias(_prev_name(rep, k)))
        out = out.with_columns(new_cols)

    # 2) mean / std over the prev{k} columns, ignoring nulls
    for group in _batched(stat_reps):
        new_cols = []
        for rep in group:
            history = pl.concat_list(
                [pl.col(_prev_name(rep, k)) for k in day_offsets]
            ).list.drop_nulls()
            avg = _maybe_f32(history.list.mean())
            spread = _maybe_f32(history.list.std(ddof=1))  # ddof matches the other statistics
            new_cols.append(avg.alias(f"{rep}_same_t_last{ndays}_mean"))
            new_cols.append(spread.alias(f"{rep}_same_t_last{ndays}_std"))
        out = out.with_columns(new_cols)

    # 3) slope: standardized time weights, largest for the most recent day
    #    (positive slope = rising recently)
    x = np.arange(ndays, 0, -1, dtype=np.float64)
    x = (x - x.mean()) / (x.std() + 1e-9)
    x_lits = [pl.lit(float(v)) for v in x]

    for group in _batched(stat_reps):
        new_cols = []
        for rep in group:
            mean_ref = pl.col(f"{rep}_same_t_last{ndays}_mean")
            std_ref = pl.col(f"{rep}_same_t_last{ndays}_std")

            weighted = []
            present_flags = []
            for weight, k in zip(x_lits, day_offsets):
                prev = pl.col(_prev_name(rep, k))
                term = ((prev - mean_ref) / (std_ref + 1e-9)) * weight
                usable = prev.is_not_null() & mean_ref.is_not_null() & std_ref.is_not_null()
                # Missing inputs contribute an explicit 0 so the horizontal
                # sum cannot turn null.
                weighted.append(pl.when(usable).then(term).otherwise(pl.lit(0.0)))
                present_flags.append(prev.is_not_null().cast(pl.Int32))

            n_eff = pl.sum_horizontal(present_flags).cast(pl.Float32)
            den = pl.when(n_eff > 0).then(n_eff).otherwise(pl.lit(1.0))
            slope = _maybe_f32(pl.sum_horizontal(weighted) / den)
            new_cols.append(slope.alias(f"{rep}_same_t_last{ndays}_slope"))
        out = out.with_columns(new_cols)

    # 4) row-wise statistics of prev1 across all responders (optional)
    if add_prev1_multirep and len(rep_cols) > 0:
        n_rep = len(rep_cols)
        prev1_values = pl.concat_list(
            [pl.col(_prev_name(rep, 1)) for rep in rep_cols]
        ).list.drop_nulls()
        out = out.with_columns([
            _maybe_f32(prev1_values.list.mean()).alias(f"prev1_same_t_mean_{n_rep}rep"),
            _maybe_f32(prev1_values.list.std(ddof=1)).alias(f"prev1_same_t_std_{n_rep}rep"),
        ])

    # Leave in key order so later shift/rolling stages can rely on it.
    return out.sort([g_symbol, g_date, g_time])




# C series: per-symbol history features of the raw feature columns

def fe_feat_history(
    *,
    lf: pl.LazyFrame,
    keys: Tuple[str,str,str] = ("symbol_id","date_id","time_id"),
    feature_cols: Sequence[str],
    is_sorted: bool = False,
    prev_soft_days: Optional[int] = None,
    cast_f32: bool = True,
    batch_size: int = 10,
    lags: Iterable[int] = (1, 3),
    ret_periods: Iterable[int] = (1,),
    diff_periods: Iterable[int] = (1,),
    rz_windows: Iterable[int] = (5,),
    ewm_spans: Iterable[int] = (10,),
    keep_rmean_rstd: bool = True,
    cs_cols: Optional[Sequence[str]] = None,
) -> pl.LazyFrame:
    """Derive lag / return / diff / rolling / EWM / cross-section features.

    Only ``keys`` and ``feature_cols`` are kept from ``lf``; derived columns
    are appended in this order, each stage in batches of ``batch_size``:

    * C1 ``{c}__lag{L}``: value ``L`` rows earlier within the symbol.
    * C2 ``{c}__ret{k}``: ``cur / prev - 1`` (null if ``|prev| <= 1e-12``).
    * C3 ``{c}__diff{k}``: ``cur - prev``.
    * C4 ``{c}__rmean{w}`` / ``__rstd{w}`` / ``__rz{w}``: rolling statistics
      of the t-1 value (mean/std only when ``keep_rmean_rstd``).
    * C5 ``{c}__ewm{s}``: exponentially weighted mean of the t-1 value.
    * C6 ``{c}__cs_z`` / ``__csrank``: z-score and rank of the t-1 value
      across each (date, time), for ``cs_cols`` only.

    When ``prev_soft_days`` is set, a shifted value is kept only if the row
    it comes from is more than 0 and at most ``prev_soft_days`` days older.
    A stage whose parameter is ``None`` or empty is skipped.

    Raises:
        KeyError: If a key or feature column is missing from ``lf``.
    """
    g_sym, g_date, g_time = keys

    by_grp = [g_sym]            # time-series direction
    by_cs = [g_date, g_time]    # cross-section direction

    need_cols = [*keys, *feature_cols]
    available = lf.collect_schema().names()
    miss = [c for c in need_cols if c not in available]
    if miss:
        raise KeyError(f"Columns not found: {miss}")

    lf_out = lf.select(need_cols)
    if not is_sorted:
        lf_out = lf_out.sort(list(keys))

    # ---------------- local helpers ----------------
    def _batches():
        """Yield consecutive slices of ``feature_cols`` of ``batch_size``."""
        for start in range(0, len(feature_cols), batch_size):
            yield feature_cols[start:start + batch_size]

    def _normalize(values):
        """None/empty -> (); otherwise sorted unique ints that are >= 1."""
        if not values:
            return tuple()
        return tuple(sorted({int(v) for v in values if int(v) >= 1}))

    def _ttl_ok(k):
        """Mask: the row ``k`` steps back is 1..prev_soft_days days older."""
        earlier_date = pl.col(g_date).shift(k).over(by_grp)
        gap = (pl.col(g_date) - earlier_date).cast(pl.Int32)
        return gap.is_not_null() & (gap > 0) & (gap <= pl.lit(int(prev_soft_days)))

    def _lagged(col_name, k):
        """Value ``k`` rows back within the symbol, TTL-masked if configured."""
        shifted = pl.col(col_name).shift(k).over(by_grp)
        if prev_soft_days is None:
            return shifted
        return pl.when(_ttl_ok(k)).then(shifted).otherwise(None)

    def _maybe_f32(expr):
        return expr.cast(pl.Float32) if cast_f32 else expr

    def _base_name(col_name):
        return f"{col_name}__tminus1_base"

    def _add_tminus1(frame, cols):
        """Materialize the temporary t-1 base column for every name in ``cols``."""
        return frame.with_columns([_lagged(c, 1).alias(_base_name(c)) for c in cols])

    def _drop_tminus1(frame, cols):
        return frame.drop([_base_name(c) for c in cols])

    LAGS = _normalize(lags)
    K_RET = _normalize(ret_periods)
    K_DIFF = _normalize(diff_periods)
    RZW = _normalize(rz_windows)
    SPANS = _normalize(ewm_spans)

    def _previous(col_name, k):
        """Reuse the C1 lag column (already TTL-masked) when it exists."""
        if k in LAGS:
            return pl.col(f"{col_name}__lag{k}")
        return _lagged(col_name, k)

    # ---------------- C1: lags ----------------
    if LAGS:
        for batch in _batches():
            lf_out = lf_out.with_columns([
                _maybe_f32(_lagged(c, L)).alias(f"{c}__lag{L}")
                for L in LAGS
                for c in batch
            ])

    # ---------------- C2: returns (optional) ----------------
    if K_RET:
        for batch in _batches():
            new_cols = []
            for c in batch:
                for k in K_RET:
                    prev = _previous(c, k)
                    safe = prev.is_not_null() & (prev.abs() > 1e-12)
                    ret = pl.when(safe).then(pl.col(c) / prev - 1.0).otherwise(None)
                    new_cols.append(_maybe_f32(ret).alias(f"{c}__ret{k}"))
            lf_out = lf_out.with_columns(new_cols)

    # ---------------- C3: diffs (optional) ----------------
    if K_DIFF:
        for batch in _batches():
            new_cols = []
            for c in batch:
                for k in K_DIFF:
                    prev = _previous(c, k)
                    delta = pl.when(prev.is_not_null()).then(pl.col(c) - prev).otherwise(None)
                    new_cols.append(_maybe_f32(delta).alias(f"{c}__diff{k}"))
            lf_out = lf_out.with_columns(new_cols)

    # ---------------- C4: rolling mean / std / z ----------------
    if RZW:
        for batch in _batches():
            lf_out = _add_tminus1(lf_out, batch)

            new_cols = []
            for c in batch:
                base = pl.col(_base_name(c))
                for w in RZW:
                    roll_mean = base.rolling_mean(window_size=w, min_samples=1).over(by_grp)
                    roll_std = base.rolling_std(window_size=w, ddof=1, min_samples=2).over(by_grp)
                    # The z-score uses the uncast statistics; a null std
                    # counts as 0 and the epsilon avoids division by zero.
                    rz = (base - roll_mean) / (roll_std.fill_null(0.0) + 1e-9)
                    if keep_rmean_rstd:
                        new_cols.append(_maybe_f32(roll_mean).alias(f"{c}__rmean{w}"))
                        new_cols.append(_maybe_f32(roll_std).alias(f"{c}__rstd{w}"))
                    new_cols.append(_maybe_f32(rz).alias(f"{c}__rz{w}"))
            lf_out = lf_out.with_columns(new_cols)
            lf_out = _drop_tminus1(lf_out, batch)

    # ---------------- C5: EWM (optional) ----------------
    if SPANS:
        for batch in _batches():
            lf_out = _add_tminus1(lf_out, batch)

            new_cols = []
            for c in batch:
                base = pl.col(_base_name(c))
                for span in SPANS:
                    ema = base.ewm_mean(span=int(span), adjust=False, ignore_nulls=True).over(by_grp)
                    new_cols.append(_maybe_f32(ema).alias(f"{c}__ewm{span}"))
            lf_out = lf_out.with_columns(new_cols)

            # Remove the temporary columns.
            lf_out = _drop_tminus1(lf_out, batch)

    # ---------------- C6: cross-section z / rank (optional) ----------------
    if cs_cols:
        cs_cols = [c for c in cs_cols if c in feature_cols]
        if cs_cols:
            out_dtype = pl.Float32 if cast_f32 else pl.Float64
            lf_out = _add_tminus1(lf_out, cs_cols)

            new_cols = []
            for c in cs_cols:
                base = pl.col(_base_name(c))

                # Cross-section statistics of this column's t-1 values.
                n_valid = base.is_not_null().cast(pl.Int32).sum().over(by_cs)
                cs_mean = base.mean().over(by_cs)
                cs_std = base.std(ddof=1).over(by_cs)

                z = ((base - cs_mean) / (cs_std.fill_null(0.0) + 1e-9)).cast(out_dtype)

                # Percentile rank: null stays null; a single valid value -> 0.5.
                raw_rank = base.rank(method="average").over(by_cs)
                pct = (
                    pl.when(n_valid > 1)
                    .then((raw_rank - 0.5) / n_valid.cast(pl.Float32))
                    .otherwise(pl.lit(0.5))
                )
                csrank = pl.when(base.is_null()).then(None).otherwise(pct).cast(out_dtype)

                new_cols.append(z.alias(f"{c}__cs_z"))
                new_cols.append(csrank.alias(f"{c}__csrank"))

            lf_out = lf_out.with_columns(new_cols)

            # Remove the temporary columns.
            lf_out = _drop_tminus1(lf_out, cs_cols)
    return lf_out

   
@dataclass
class StageA:
    """Settings for stage A of run_staged_engineering.

    Every field is passed through, under the same keyword name, to
    fe_resp_daily. The per-field notes below are inferred from the field
    names only; confirm them against fe_resp_daily.
    """
    tail_lags: Sequence[int]  # presumably lag offsets applied to previous-day tail values — TODO confirm
    tail_diffs: Sequence[int]  # presumably difference offsets on the same tail values — TODO confirm
    rolling_windows: Optional[Sequence[int]]  # presumably rolling window lengths; required argument, may be None
    prev_soft_days: Optional[int] = None  # presumably the max day gap tolerated when looking back — TODO confirm
    is_sorted: bool = False  # presumably tells the callee the input is already sorted by keys — TODO confirm
    cast_f32: bool = True  # presumably selects Float32 output columns — TODO confirm

@dataclass
class StageB:
    """Settings for stage B of run_staged_engineering.

    Every field is passed through, under the same keyword name, to
    fe_resp_same_tick_xday. The per-field notes below are inferred from the
    field names only; confirm them against fe_resp_same_tick_xday.
    """
    ndays: int  # presumably how many previous days to look back at the same time_id — TODO confirm
    stats_rep_cols: Optional[Sequence[str]] = None  # presumably the responders to summarise; None defers to the callee
    add_prev1_multirep: bool = True  # NOTE(review): meaning not visible here — see fe_resp_same_tick_xday
    batch_size: int = 5  # presumably the number of columns processed per batch — TODO confirm
    prev_soft_days: Optional[int] = None  # presumably the max day gap tolerated when looking back — TODO confirm
    is_sorted: bool = False  # presumably tells the callee the input is already sorted by keys — TODO confirm
    cast_f32: bool = True  # presumably selects Float32 output columns — TODO confirm

# Each stage-C operation is optional; None / [] means that operation is skipped
@dataclass
class StageC:
    """Settings for stage C of run_staged_engineering.

    run_staged_engineering runs one fe_feat_history pass per non-empty
    operation field (lags, ret_periods, diff_periods, rz_windows, ewm_spans,
    cs_cols) and writes each pass to its own stage_c_<op>.parquet file. The
    remaining fields are forwarded to every pass.
    """
    lags: Optional[Iterable[int]] = None  # written as stage_c_lags.parquet
    ret_periods: Optional[Iterable[int]] = None  # written as stage_c_ret.parquet
    diff_periods: Optional[Iterable[int]] = None  # written as stage_c_diff.parquet
    rz_windows: Optional[Iterable[int]] = None  # written as stage_c_rz.parquet
    ewm_spans: Optional[Iterable[int]] = None  # written as stage_c_ewm.parquet
    cs_cols: Optional[Sequence[str]] = None  # written as stage_c_csrank.parquet
    keep_rmean_rstd: bool = True  # NOTE(review): presumably keeps rolling mean/std next to the z-score — confirm
    prev_soft_days: Optional[int] = None  # presumably the max day gap tolerated for t-1 values — TODO confirm
    batch_size: Optional[int] = 10  # presumably the number of feature columns per batch — TODO confirm
    is_sorted: bool = False  # presumably tells the callee the input is already sorted by keys — TODO confirm
    cast_f32: bool = True  # presumably selects Float32 output columns — TODO confirm
    



def run_staged_engineering(
    lf_base: pl.LazyFrame,
    *,
    keys: Sequence[str],
    rep_cols: Sequence[str],
    feature_cols: Sequence[str],
    out_dir: str,
    A: StageA | None = None,
    B: StageB | None = None,
    C: StageC | None = None,
    write_date_between: tuple[int, int] | None = None,
):
    """Run the enabled feature-engineering stages and write one parquet per output.

    Args:
        lf_base: Lazy input containing the key, responder and feature columns.
        keys: Exactly three key columns; the second one is used as the date key.
        rep_cols: Responder columns fed to stages A and B.
        feature_cols: Feature columns fed to stage C.
        out_dir: Destination directory, opened through the module-level `fs`.
        A: Stage A settings, or None to skip the stage.
        B: Stage B settings, or None to skip the stage.
        C: Stage C settings, or None to skip the stage; each non-empty
            operation is computed and written separately.
        write_date_between: Inclusive (lo, hi) range of the date key that is
            actually written; rows outside it only serve as look-back input.

    Raises:
        ValueError: If a stage is about to write while `write_date_between`
            is None.
    """
    # Unpacking enforces the three-key layout; only the date key is used here.
    _, date_key, _ = keys

    def _write_core(lf_out: pl.LazyFrame, path: str) -> None:
        """Collect the rows inside the write range and store them at `path`."""
        if write_date_between is None:
            raise ValueError("write_date_between must be specified to avoid date overlap")
        lo, hi = write_date_between
        in_core = pl.col(date_key).is_between(lo, hi)
        frame = lf_out.filter(in_core).collect()
        with fs.open(path, "wb") as sink:
            frame.write_parquet(sink, compression="zstd")

    def _write_engineered(lf_full: pl.LazyFrame, exclude: set, path: str) -> None:
        """Keep the keys plus every column not listed in `exclude`, then write."""
        new_cols = [name for name in lf_full.collect_schema().names() if name not in exclude]
        _write_core(lf_full.select([*keys, *new_cols]), path)

    # ---------- A ----------
    if A is not None:
        lf_a_full = fe_resp_daily(
            lf_base.select([*keys, *rep_cols]),
            keys=tuple(keys),
            rep_cols=rep_cols,
            tail_lags=A.tail_lags,
            tail_diffs=A.tail_diffs,
            rolling_windows=A.rolling_windows,
            prev_soft_days=A.prev_soft_days,
            is_sorted=A.is_sorted,
            cast_f32=A.cast_f32,
        )
        _write_engineered(lf_a_full, set(keys) | set(rep_cols), f"{out_dir}/stage_a.parquet")

    # ---------- B ----------
    if B is not None:
        lf_b_full = fe_resp_same_tick_xday(
            lf_base.select([*keys, *rep_cols]),
            keys=tuple(keys),
            rep_cols=rep_cols,
            ndays=B.ndays,
            stats_rep_cols=B.stats_rep_cols,
            add_prev1_multirep=B.add_prev1_multirep,
            batch_size=B.batch_size,
            prev_soft_days=B.prev_soft_days,
            is_sorted=B.is_sorted,
            cast_f32=B.cast_f32,
        )
        _write_engineered(lf_b_full, set(keys) | set(rep_cols), f"{out_dir}/stage_b.parquet")

    # ---------- C (one output file per operation) ----------
    if C is not None:
        # (file tag, fe_feat_history keyword, requested value), in write order
        requested = (
            ("lags", "lags", C.lags),
            ("ret", "ret_periods", C.ret_periods),
            ("diff", "diff_periods", C.diff_periods),
            ("rz", "rz_windows", C.rz_windows),
            ("ewm", "ewm_spans", C.ewm_spans),
            ("csrank", "cs_cols", C.cs_cols),
        )
        op_keywords = [keyword for _, keyword, _ in requested]

        for op_name, keyword, value in requested:
            if not value:
                continue
            # Only the current operation is switched on; the others stay None.
            op_kwargs = dict.fromkeys(op_keywords)
            op_kwargs[keyword] = value
            lf_c = fe_feat_history(
                lf=lf_base.select([*keys, *feature_cols]),
                keys=tuple(keys),
                feature_cols=feature_cols,
                is_sorted=C.is_sorted,
                prev_soft_days=C.prev_soft_days,
                cast_f32=C.cast_f32,
                batch_size=C.batch_size,
                keep_rmean_rstd=C.keep_rmean_rstd,
                **op_kwargs,
            ).drop(feature_cols)
            _write_engineered(lf_c, set(keys), f"{out_dir}/stage_c_{op_name}.parquet")


In [None]:


def _join_from(lf_left: pl.LazyFrame, path: str, lo: int, hi: int) -> pl.LazyFrame:
    """Left-join the columns registered for `path` in FILE2COLS onto `lf_left`.

    Only rows whose date_id lies in [lo, hi] are read from `path`. When no
    columns are registered for the file, `lf_left` is returned unchanged.
    """
    wanted = FILE2COLS.get(path, [])
    if not wanted:
        return lf_left

    in_range = pl.col("date_id").is_between(lo, hi)
    lf_right = pl.scan_parquet(path).select([*KEYS, *wanted]).filter(in_range).sort(KEYS)
    return lf_left.join(lf_right, on=KEYS, how="left")

def build_slice(lo: int, hi: int) -> pd.DataFrame:
    """Build a pandas frame of keys, target and joined columns for date_id in [lo, hi].

    The keys and target come from PARQUET_PATHS; every file listed in
    FILE2COLS is then left-joined on the keys via _join_from.
    """
    in_range = pl.col("date_id").is_between(lo, hi)
    lf = pl.scan_parquet(PARQUET_PATHS).select([*KEYS, TARGET]).filter(in_range).sort(KEYS)

    for extra_path in FILE2COLS:
        lf = _join_from(lf, extra_path, lo, hi)

    collected = lf.collect(streaming=True)
    return collected.to_pandas()

def weighted_r2_zero_mean(y_true, y_pred, weight) -> float:
    """
    Sample-weighted zero-mean R^2 used in Jane Street:
        R^2 = 1 - sum_i w_i (y_i - yhat_i)^2 / sum_i w_i y_i^2

    Args:
        y_true: Targets; any array-like, flattened to 1-D float64.
        y_pred: Predictions; must hold as many elements as `y_true`.
        weight: Per-sample weights; must hold as many elements as `y_true`.

    Returns:
        The score as a Python float, or 0.0 when the weighted sum of squared
        targets is not positive.

    Raises:
        ValueError: If the three inputs differ in length after flattening.
    """
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    weight = np.asarray(weight, dtype=np.float64).ravel()
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would let mismatched inputs reach the arithmetic below.
    if not (y_true.shape == y_pred.shape == weight.shape):
        raise ValueError(
            "y_true, y_pred and weight must have the same number of elements; "
            f"got {y_true.size}, {y_pred.size} and {weight.size}"
        )

    num = np.sum(weight * (y_true - y_pred) ** 2)
    den = np.sum(weight * (y_true ** 2))
    if den <= 0:
        return 0.0  # safe fallback (shouldn't happen on the full JS eval)
    return float(1.0 - (num / den))

def lgb_wr2_eval(preds, train_data):
    """Custom LightGBM eval: weighted zero-mean R^2 reported under the name 'wr2'.

    Labels and weights are read from `train_data`; when no weights are set,
    every sample counts as 1. Returns (name, score, is_higher_better).
    """
    labels = train_data.get_label()
    weights = train_data.get_weight()
    weights = np.ones_like(labels) if weights is None else weights
    # higher is better
    return ('wr2', weighted_r2_zero_mean(labels, preds, weights), True)


# 特征选择- 初选 (选特征，省略)

In [None]:
import os, gc, glob
import polars as pl
import numpy as np
import lightgbm as lgb
import pandas as pd
from pathlib import Path

# Cleaned base table; kept as a list because it is handed to pl.scan_parquet
BASE_PATH = ["/mnt/data/js/clean/final_clean.parquet"]
# Row-identifying key columns, in (symbol, date, time) order
KEYS = ["symbol_id","date_id","time_id"]
# Column the model is trained to predict
TARGET = "responder_6"
# Per-row sample weight column
WEIGHT = 'weight'
# feature_00 .. feature_78
FEATURE_COLS = [f"feature_{i:02d}" for i in range(79)]
# responder_0 .. responder_8
REP_COLS = [f"responder_{i}" for i in range(9)]

# Output directory of the first-round feature selection; created if missing
OUT_DIR = "/mnt/data/js/cache/first_selection"
os.makedirs(OUT_DIR, exist_ok=True)


In [None]:

# --- A: prev-day tails + daily summaries ---
A = StageA(
    tail_lags=(1,),
    tail_diffs=(1,),
    rolling_windows=(5,),
    prev_soft_days=3,          # allow fallback up to 3 calendar days
)

# --- B: same time_id cross-day ---
# NOTE: StageB declares no `strict_k` field, so it cannot be passed here
# (doing so raises TypeError in the dataclass constructor).
B = StageB(
    ndays=3,                   # prev{1..3} at same time_id
    stats_rep_cols=None,       # default: use rep_cols
    add_prev1_multirep=True,
    batch_size=5,
    prev_soft_days=3,          # TTL for gaps
)

# --- C: history features (keep it tiny) ---
# NOTE: StageC declares no `cs_by` field, so it cannot be passed here either.
C = StageC(
    lags=(1, ),
    ret_periods=(1,),
    diff_periods=(1,),
    rz_windows=(5,),
    ewm_spans=(10,),
    keep_rmean_rstd=True,
    cs_cols=None,        # must be subset of feature_cols
    prev_soft_days=3,
)

# example call
# run_staged_engineering returns None, so its result is not bound to a name.
# write_date_between is mandatory in practice: the writer raises ValueError
# without it. DATE_LO / DATE_HI are defined in the same cell as lf_base.
run_staged_engineering(
    lf_base=lf_base,                # your base LazyFrame
    keys=KEYS,
    rep_cols=REP_COLS,
    feature_cols=FEATURE_COLS,
    out_dir=OUT_DIR,
    A=A, B=B, C=C,
    write_date_between=(DATE_LO, DATE_HI),
)


0. 准备与切分天数

In [None]:

# Stage outputs that are joined onto the base table later on
STAGE_PATHS = [
    "/mnt/data/js/cache/first_selection/stage_a.parquet",
    "/mnt/data/js/cache/first_selection/stage_b.parquet",
    "/mnt/data/js/cache/first_selection/stage_c_lags.parquet",
    "/mnt/data/js/cache/first_selection/stage_c_ret.parquet",
    "/mnt/data/js/cache/first_selection/stage_c_diff.parquet",
    "/mnt/data/js/cache/first_selection/stage_c_rz.parquet",
    "/mnt/data/js/cache/first_selection/stage_c_ewm.parquet",
]

DATE_LO, DATE_HI = 1200, 1400
OUT_DIR = "/mnt/data/js/cache/first_selection/run_full"
SHARD_DIR = f"{OUT_DIR}/shards_all"
Path(SHARD_DIR).mkdir(parents=True, exist_ok=True)

# Base table restricted to the inclusive target date range
lf_base = pl.scan_parquet(BASE_PATH)
lf_range = lf_base.filter(pl.col("date_id").is_between(DATE_LO, DATE_HI))

# Distinct days in ascending order, then a chronological 80/20 split
days = (
    lf_range.select(pl.col("date_id").unique().sort())
    .collect(streaming=True)
    .get_column("date_id")
    .to_list()
)
cut = int(len(days) * 0.8)
train_days = days[:cut]
val_days = days[cut:]
print(f"[split] train {len(train_days)} days, val {len(val_days)} days, range={days[0]}..{days[-1]}")


1.收集全量特征列名（并集）

In [None]:
# Start from the base feature columns, then add every stage column that is
# not a key, the target or the weight.
non_feature = {*KEYS, TARGET, WEIGHT}
feat_cols = set(FEATURE_COLS)

for stage_path in STAGE_PATHS:
    if os.path.exists(stage_path):
        stage_names = pl.scan_parquet(stage_path).collect_schema().names()
        feat_cols.update(name for name in stage_names if name not in non_feature)
    else:
        print(f"[skip] missing: {stage_path}")

feat_cols = sorted(feat_cols)
print(f"[cols] total feature columns = {len(feat_cols)}")


2. 写“天片”—把所有列拼上并立刻落盘

In [None]:
DAYS_PER_SHARD = 16

# Left table: keys, target, weight and the base feature columns
lf_left_base = lf_range.select(
    [*KEYS, TARGET, WEIGHT, *(pl.col(c) for c in FEATURE_COLS)]
)

# Per-stage metadata (non-key columns + file size); smaller files are joined
# first to keep memory use down
stage_meta = []
for stage_path in STAGE_PATHS:
    if os.path.exists(stage_path):
        scan = pl.scan_parquet(stage_path).filter(pl.col("date_id").is_between(DATE_LO, DATE_HI))
        cols = [name for name in scan.collect_schema().names() if name not in KEYS]
        if cols:
            stage_meta.append(
                {"path": stage_path, "cols": cols, "size": os.path.getsize(stage_path)}
            )
stage_meta.sort(key=lambda meta: meta["size"])


In [None]:
def write_shards(tag, days_list):
    """Write `days_list` to parquet shards of DAYS_PER_SHARD days each.

    For every shard the base rows of those days are left-joined with each
    stage file (only the columns not present yet), the weight, target and
    feature columns are cast to Float32, and the result is sunk to
    SHARD_DIR as "<tag>_shard_<nnnn>.parquet".
    """
    ordered_days = sorted(days_list)
    shard_starts = range(0, len(ordered_days), DAYS_PER_SHARD)

    for shard_no, start in enumerate(shard_starts):
        shard_days = set(ordered_days[start:start + DAYS_PER_SHARD])
        in_shard = pl.col("date_id").is_in(shard_days)

        # Base rows for this shard
        lf_chunk = lf_left_base.filter(in_shard)
        already = set(lf_chunk.collect_schema().names())

        # Join stage by stage: only this shard's days and only new columns
        for meta in stage_meta:
            need = [col for col in meta["cols"] if col not in already]
            if need:
                lf_add = (
                    pl.scan_parquet(meta["path"])
                    .filter(in_shard)
                    .select([*KEYS, *need])
                )
                lf_chunk = lf_chunk.join(lf_add, on=KEYS, how="left")
                already.update(need)

        # Cast to Float32 and write; feature columns follow feat_cols order,
        # columns that no source provides are simply left out of this shard
        cast_feats = [
            pl.col(col).cast(pl.Float32).alias(col)
            for col in feat_cols
            if col in already
        ]
        lf_out = lf_chunk.select([
            *KEYS,
            pl.col(WEIGHT).cast(pl.Float32).alias(WEIGHT),
            pl.col(TARGET).cast(pl.Float32).alias(TARGET),
            *cast_feats,
        ])
        out_path = f"{SHARD_DIR}/{tag}_shard_{shard_no:04d}.parquet"
        lf_out.sink_parquet(out_path, compression="zstd")
        print(f"[{tag}] wrote {out_path}")
        gc.collect()


write_shards("train", train_days)
write_shards("val",   val_days)

3. 从 shards 构建 memmap 数组 （恒定内存）

In [None]:
def memmap_from_shards(glob_pat, feat_cols, prefix):
    """Stack all shards matching `glob_pat` into float32 memmaps on disk.

    Shards are read one at a time in sorted path order. A feature column
    missing from a shard is filled with nulls. Writes "<prefix>_X", "_y" and
    "_w" ".float32.mmap" files and returns the (X, y, w) memmaps.
    """
    shard_files = sorted(glob.glob(glob_pat))
    counts = [
        pl.scan_parquet(path).select(pl.len()).collect(streaming=True).item()
        for path in shard_files
    ]
    n_rows = int(sum(counts))
    n_feat = len(feat_cols)
    print(f"[memmap] {glob_pat}: {len(shard_files)} files, {n_rows} rows, {n_feat} features")

    X = np.memmap(f"{prefix}_X.float32.mmap", dtype="float32", mode="w+", shape=(n_rows, n_feat))
    y = np.memmap(f"{prefix}_y.float32.mmap", dtype="float32", mode="w+", shape=(n_rows,))
    w = np.memmap(f"{prefix}_w.float32.mmap", dtype="float32", mode="w+", shape=(n_rows,))

    start = 0
    for path, n in zip(shard_files, counts):
        lf = pl.scan_parquet(path)
        available = set(lf.collect_schema().names())

        feature_exprs = []
        for col in feat_cols:
            if col in available:
                feature_exprs.append(pl.col(col).cast(pl.Float32).alias(col))
            else:
                feature_exprs.append(pl.lit(None, dtype=pl.Float32).alias(col))

        df = lf.select([
            pl.col(TARGET).cast(pl.Float32).alias(TARGET),
            pl.col(WEIGHT).cast(pl.Float32).alias(WEIGHT),
            *feature_exprs,
        ]).collect(streaming=True)

        stop = start + n
        X[start:stop, :] = df.select(feat_cols).to_numpy()
        y[start:stop] = df.select(pl.col(TARGET)).to_numpy().ravel()
        w[start:stop] = df.select(pl.col(WEIGHT)).to_numpy().ravel()
        start = stop

        # Drop the shard frame before loading the next one
        del df
        gc.collect()

    for arr in (X, y, w):
        arr.flush()
    return X, y, w


train_X, train_y, train_w = memmap_from_shards(f"{SHARD_DIR}/train_shard_*.parquet", feat_cols, f"{OUT_DIR}/train")
val_X,   val_y,   val_w   = memmap_from_shards(f"{SHARD_DIR}/val_shard_*.parquet",   feat_cols, f"{OUT_DIR}/val")

print("train shapes:", train_X.shape, train_y.shape, train_w.shape)
print("val   shapes:", val_X.shape,   val_y.shape,   val_w.shape)


4. LightGBM 训练 + 重要性 （一次性全列）

In [None]:
# Train/validation Datasets backed by the memmaps; the validation set reuses
# the training set as its reference
dtrain = lgb.Dataset(train_X, label=train_y, weight=train_w,
                     feature_name=feat_cols, free_raw_data=True)
dval   = lgb.Dataset(val_X,   label=val_y,   weight=val_w,
                     feature_name=feat_cols, reference=dtrain, free_raw_data=True)

# metric="None": no built-in metric is requested; scoring comes from the
# custom `feval` passed to lgb.train below
params = dict(
    objective="regression", metric="None",
    num_threads=16, seed=42, deterministic=True, first_metric_only=True,
    learning_rate=0.05, num_leaves=31, max_depth=-1, min_data_in_leaf=20,
    # memory-friendly binning settings
    max_bin=63, bin_construct_sample_cnt=100_000, min_data_in_bin=3,
)

# Up to 1000 rounds with early stopping (patience 50)
model = lgb.train(
    params, dtrain,
    valid_sets=[dval, dtrain], valid_names=["val","train"],
    num_boost_round=1000, callbacks=[lgb.early_stopping(50)],
    feval=lgb_wr2_eval,   # custom weighted R^2 eval function
)

# Per-feature gain and split importance, sorted by gain (descending)
imp = pd.DataFrame({
    "feature": model.feature_name(),
    "gain": model.feature_importance("gain"),
    "split": model.feature_importance("split"),
}).sort_values("gain", ascending=False).reset_index(drop=True)

print(imp.head(30))



In [None]:
imp.to_csv(f"{OUT_DIR}/imp_1r.csv", index=False)

In [None]:
imp = pd.read_csv(f"{OUT_DIR}/imp_1r.csv")
top_feats = imp.loc[imp.gain > 0]

In [None]:
# Family = the base column a feature derives from (feature_NN or responder_N);
# names that match neither pattern get NaN.
fam = top_feats['feature'].str.extract(r'^(feature_\d{2}|responder_\d)', expand=False)
# top_feats comes from a boolean filter of `imp`; assign() builds a new frame
# instead of writing into that filtered result (avoids SettingWithCopyWarning).
top_feats = top_feats.assign(family=fam)

In [None]:
top_feats

In [None]:
# Per-family totals: number of features, summed gain and summed split count,
# ordered by gain (descending)
fam_feats = (
    top_feats.groupby('family')
    .agg(
        n=pd.NamedAgg(column='feature', aggfunc='count'),
        gain=pd.NamedAgg(column='gain', aggfunc='sum'),
        split=pd.NamedAgg(column='split', aggfunc='sum'),
    )
    .reset_index()
    .sort_values('gain', ascending=False)
)

In [None]:
print(fam_feats.shape)

In [None]:
# Split the family table into feature_* and responder_* families (NaN family
# names match neither), each ordered by gain (descending)
family_names = fam_feats['family'].str
mask_feat = family_names.startswith('feature_', na=False)
mask_resp = family_names.startswith("responder_", na=False)
features_only = fam_feats.loc[mask_feat].sort_values(by="gain", ascending=False)
responders_only = fam_feats.loc[mask_resp].sort_values(by="gain", ascending=False)

In [None]:
# Caps of 79 features / 9 responders: with these limits everything is kept
selected_features = features_only['family'][:79]
selected_resps = responders_only['family'][:9]

# Write each Series as a bare list (no index, no header)
for selection, stem in (
    (selected_features, "selected_features_1r"),
    (selected_resps, "selected_responders_1r"),
):
    selection.to_csv(f"{OUT_DIR}/{stem}.csv", index=False, header=False)

# 特征工程

In [32]:
# 训练集参数


# fs.glob returns paths without the protocol, so "az://" is added back
paths = fs.glob(f"{P('az', cfg['paths']['clean_shards'])}/*.parquet")
az_paths = sorted(f"az://{p}" for p in paths)
lc = pl.scan_parquet(az_paths, storage_options=storage_options)

# Distinct date_id values in ascending order
days = (
    lc.select(pl.col("date_id").unique().sort())
    .collect(streaming=True)
    .get_column("date_id")
    .to_list()
)

In [33]:
# ------- step 2: FE per clean shard (A+B once, C batched internally via C.batch_size) -------
# Each stage reads its own section of cfg["feature_eng"]; a stage whose
# "enabled" flag is falsy is set to None.
fea = cfg.get("feature_eng", {})
A_cfg, B_cfg, C_cfg = (fea.get(stage, {}) for stage in ("A", "B", "C"))
A_enabled, B_enabled, C_enabled = (
    stage_cfg.get("enabled", True) for stage_cfg in (A_cfg, B_cfg, C_cfg)
)

A = None
if A_enabled:
    A = StageA(
        tail_lags=A_cfg.get("tail_lags", [1]),
        tail_diffs=A_cfg.get("tail_diffs", [1]),
        rolling_windows=A_cfg.get("rolling_windows", [3]),
        prev_soft_days=A_cfg.get("prev_soft_days", 7),
        is_sorted=A_cfg.get("is_sorted", False),
        cast_f32=A_cfg.get("cast_f32", True),
    )

B = None
if B_enabled:
    B = StageB(
        ndays=B_cfg.get("ndays", 5),
        stats_rep_cols=B_cfg.get("stats_rep_cols", None),
        add_prev1_multirep=B_cfg.get("add_prev1_multirep", True),
        batch_size=B_cfg.get("batch_size", 5),
        prev_soft_days=B_cfg.get("prev_soft_days", 7),
        is_sorted=B_cfg.get("is_sorted", False),
        cast_f32=B_cfg.get("cast_f32", True),
    )

C = None
if C_enabled:
    C = StageC(
        lags=C_cfg.get("lags", [1,3]),
        ret_periods=C_cfg.get("ret_periods", [1]),
        diff_periods=C_cfg.get("diff_periods", [1]),
        rz_windows=C_cfg.get("rz_windows", [5]),
        ewm_spans=C_cfg.get("ewm_spans", [10]),
        keep_rmean_rstd=C_cfg.get("keep_rmean_rstd", True),
        cs_cols=C_cfg.get("cs_cols", None),
        prev_soft_days=C_cfg.get("prev_soft_days", 7),
        batch_size=C_cfg.get("batch_size", 10),
        is_sorted=C_cfg.get("is_sorted", False),
        cast_f32=C_cfg.get("cast_f32", True),
    )

# Create the FE output directory
fe_dir = P("az", cfg["paths"]["fe_shards"])
fs.mkdir(fe_dir, exist_ok=True)

In [34]:
# -------- Shard loop: each shard reads [pad_lo .. core_hi] but only writes [core_lo .. core_hi] --------
PAD_DAYS = 30   # look-back days read in front of each core window
CORE_DAYS = 30  # days written per shard

g_date = cfg['keys'][1]
last_day_idx = len(days) - 1

# The first PAD_DAYS days are only ever read as look-back, never written
for core_lo_idx in range(PAD_DAYS, len(days), CORE_DAYS):
    core_hi_idx = min(core_lo_idx + CORE_DAYS - 1, last_day_idx)  # inclusive
    pad_lo = days[core_lo_idx - PAD_DAYS]
    core_lo = days[core_lo_idx]
    core_hi = days[core_hi_idx]

    # Read only this shard plus its pad (lazy scan, filter pushed down)
    shard_cols = [*cfg['keys'], cfg['weight'], 'time_bucket', *RESP_COLS, *FEATURE_ALL]
    lf_shard = lc.filter(pl.col(g_date).is_between(pad_lo, core_hi)).select(shard_cols)

    out_path = f"{fe_dir}/fe_{core_lo:04d}_{core_hi:04d}"
    fs.mkdir(out_path, exist_ok=True)
    print(f"[FE] days {core_lo}..{core_hi} (pad from {pad_lo}) -> {out_path}")

    run_staged_engineering(
        lf_base=lf_shard,
        keys=cfg['keys'],
        rep_cols=RESP_COLS,
        feature_cols=FEATURE_ALL,
        out_dir=out_path,
        A=A,
        B=B,
        C=C,
        write_date_between=(core_lo, core_hi),
    )

[FE] days 710..739 (pad from 680) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0710_0739
[FE] days 740..769 (pad from 710) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0740_0769
[FE] days 770..799 (pad from 740) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0770_0799
[FE] days 800..829 (pad from 770) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0800_0829
[FE] days 830..859 (pad from 800) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0830_0859
[FE] days 860..889 (pad from 830) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0860_0889
[FE] days 890..919 (pad from 860) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0890_0919
[FE] days 920..949 (pad from 890) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0920_0949
[FE] days 950..979 (pad from 920) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0950_0979
[FE] days 980..1009 (pad from 950) -> az://jackson/js_exp/exp/v1/fe_shards/fe_0980_1009
[FE] days 1010..1019 (pad from 980) -> az://jackson/js_exp/exp/v1/fe_shards/fe_1010_1019


## 把同一分片内 A/B/C 拼成 Panel 分片

In [None]:
# Source (FE shards) and destination (panel shards) roots on blob storage
fe_root = P("az", cfg["paths"]["fe_shards"])
# NOTE(review): this cell reads cfg["paths"]["panel_data"], while the later
# setup cell reads cfg["paths"]["panel_shards"] — confirm which key is intended
panel_root = P("az", cfg["paths"]["panel_data"])
fs.mkdir(panel_root, exist_ok=True)




# 特征选择 二选（选参数）

至此，我们已经获得了全量数据在满参数下的各分片特征；接下来选取少量样本，进行特征参数选择。

样本集_特征选择

- 选择目标日期的数据集并合并
- 划分训练集，验证集
- 简单参数lgb训练
- 特征重要性排序

1. 基本配置

In [35]:
paths = fs.glob(f"{P('az', cfg['paths']['fe_shards'])}/*")

In [36]:
paths

['jackson/js_exp/exp/v1/fe_shards/fe_0710_0739',
 'jackson/js_exp/exp/v1/fe_shards/fe_0740_0769',
 'jackson/js_exp/exp/v1/fe_shards/fe_0770_0799',
 'jackson/js_exp/exp/v1/fe_shards/fe_0800_0829',
 'jackson/js_exp/exp/v1/fe_shards/fe_0830_0859',
 'jackson/js_exp/exp/v1/fe_shards/fe_0860_0889',
 'jackson/js_exp/exp/v1/fe_shards/fe_0890_0919',
 'jackson/js_exp/exp/v1/fe_shards/fe_0920_0949',
 'jackson/js_exp/exp/v1/fe_shards/fe_0950_0979',
 'jackson/js_exp/exp/v1/fe_shards/fe_0980_1009',
 'jackson/js_exp/exp/v1/fe_shards/fe_1010_1019']

In [49]:
# Column names and date range
#FEATURE_COLS = [f"feature_{i:02d}" for i in range(79)]  #pd.read_csv("/mnt/data/js/config/selected_features_1r.csv",
DATE_LO, DATE_HI = 800, 1000 # date_id range used for training/validation; to be widened to the full training set later

# Basic inputs
#FEATURE_COLS = pd.read_csv(f"{INPUT_DIR}/selected_features_1r.csv", header=None).squeeze().tolist()
#REP_COLS = pd.read_csv(f"{INPUT_DIR}/selected_responders_1r.csv", header=None).squeeze().tolist()
clean_root = P("az", cfg["paths"]["clean_shards"])      # clean shard directory
fe_root    = P("az", cfg["paths"]["fe_shards"])         # FE shard directory written by the FE step
panel_root = P("az", cfg["paths"]["panel_shards"])      # output directory for the joined panel shards
fs.mkdir(panel_root, exist_ok=True)


# fs.glob returns entries without the protocol; "az://" is added back below
paths = fs.glob(f"{fe_root }/*")
sorted_paths = [f"az://{p}" for p in sorted(paths)]

print("ready")

ready


2.枚举窗口

In [51]:
# sorted_paths holds every entry under fe_shards, already prefixed with az://

# Keep only the shard windows that overlap [DATE_LO, DATE_HI]
wins = set()
for p in sorted_paths:
    base = p.split("/")[-1]  # shard directory name, e.g. fe_0800_0829
    try:
        lo, hi = (int(part) for part in base.split("_")[-2:])
    except ValueError:
        # Not a "<prefix>_<lo>_<hi>" name (stray blob, file with an extension, ...):
        # skip it instead of aborting the whole enumeration
        print(f"[skip] not a shard window: {p}")
        continue
    if hi >= DATE_LO and lo <= DATE_HI:
        wins.add((lo, hi))
wins = sorted(wins)
print(f"windows in range: {wins[:5]} ... (total {len(wins)})")

# Every integer date_id in the range; not checked against the data, so ids
# without rows are included as well
days = list(range(DATE_LO, DATE_HI + 1))



windows in range: [(800, 829), (830, 859), (860, 889), (890, 919), (920, 949)] ... (total 7)


3. 按窗口拼接 (A + B + 所有 C_*) → 直接写数据分片（无大表）

In [None]:
import numpy as np
import polars as pl
import fsspec, gc

# NOTE(review): presumably the number of time_id ticks per day (period of the
# time-of-day phase below) — confirm against the config
T = cfg['trading']['ticket']


lc = pl.scan_parquet(f"{clean_root}/*.parquet", storage_options=storage_options)

for (lo, hi) in wins:
    # Intersect with the global range so edge windows do not spill outside it
    w_lo, w_hi = max(lo, DATE_LO), min(hi, DATE_HI)

    # FE shard directories are written zero-padded (fe_{lo:04d}_{hi:04d}); the
    # window bounds were parsed to ints, so they must be padded again here or
    # three-digit windows (e.g. fe_0800_0829) are never found.
    shard_name = f"fe_{lo:04d}_{hi:04d}"

    # Base table: filter rows first, then add the time features in one go
    lf = (
        lc.filter(pl.col("date_id").is_between(w_lo, w_hi))
          .select([*cfg['keys'], 'time_bucket', cfg['target'], cfg['weight'], *FEATURE_ALL])
        .with_columns([
            # Copy of time_id as a positional feature, kept apart from the key column
            pl.col("time_id").cast(pl.Float32).alias("time_pos"),

            # Periodic phase: phase = 2π * time_id / T
            (2*np.pi * pl.col("time_id") / pl.lit(T, dtype=pl.Float32)).alias("_phase_"),
        ])
        .with_columns([
            pl.col("_phase_").sin().cast(pl.Float32).alias("time_sin"),
            pl.col("_phase_").cos().cast(pl.Float32).alias("time_cos"),
        ])
        .drop(["_phase_"])
    )

    # Stage A / B outputs. Named lf_a / lf_b / lf_c so the module-level A, B, C
    # stage settings used by the FE step are not overwritten with LazyFrames.
    lf_a = pl.scan_parquet(f"{fe_root}/{shard_name}/stage_a.parquet", storage_options=storage_options)
    lf_b = pl.scan_parquet(f"{fe_root}/{shard_name}/stage_b.parquet", storage_options=storage_options)

    # Stage C outputs (one file per operation)
    C_paths = fs.glob(f"{fe_root}/{shard_name}/stage_c_*.parquet")
    C_paths = [f"az://{p}" for p in C_paths]
    C_scans = [pl.scan_parquet(p, storage_options=storage_options) for p in C_paths]

    # Left-join everything onto the base table, one file at a time
    panel = lf.join(lf_a, on=cfg['keys'], how="left")
    panel = panel.join(lf_b, on=cfg['keys'], how="left")
    for lf_c in C_scans:
        panel = panel.join(lf_c, on=cfg['keys'], how="left")

    panel = panel.sort(cfg['keys'])

    out_path = f"{panel_root}/panel_{w_lo:04d}_{w_hi:04d}.parquet"
    (
        panel.sink_parquet(
            out_path,
            compression="zstd",
            storage_options=storage_options,
            statistics=True,
        )
    )

    gc.collect()


NameError: name 'T' is not defined

4.生成最终特征清单

In [None]:
import glob, polars as pl

# Use any one training shard as the column template.
# P(kind, subpath) takes two arguments and already inserts cfg["exp_root"],
# so the experiment version must not be passed as an extra positional argument.
ref = f"{P('az', cfg['paths']['sample_fe'])}/sample_1300_1309.parquet"
names = pl.scan_parquet(ref, storage_options=storage_options).collect_schema().names()

# Feature columns = everything in the shard except keys, target and weight
# (the shard already holds base + engineered columns)

feat_cols = [c for c in names if c not in (*cfg['keys'], cfg['target'], cfg['weight'])]
print(f"final feature list size = {len(feat_cols)}")

df_feat = pd.DataFrame({'feature': feat_cols})


5.构建memmap

In [None]:
import re
# Local directory for the memmap files. P(kind, subpath) takes two arguments
# and already inserts cfg["exp_root"], so no version segment is passed here.
mm_dir = P("local", cfg["paths"]["sample_mm"])
os.makedirs(mm_dir, exist_ok=True)

def full_shard_key(p: str):
    """Sort key that orders sample shards by their (lo, hi) date range.

    A basename of the form "sample_<lo>_<hi>.parquet" maps to (lo, hi) as
    ints. Any other name maps to (10**12, 10**12, basename), which sorts
    after the real shards and keeps such files ordered by name.
    """
    name = os.path.basename(p)          # e.g. sample_1200_1219.parquet
    hit = re.match(r"sample_(\d+)_(\d+)\.parquet$", name)
    if hit:
        lo_txt, hi_txt = hit.groups()
        return (int(lo_txt), int(hi_txt))
    return (10**12, 10**12, name)


def shard2memmap(glob_paths: list[str], feat_cols: list[str], prefix: str):
    """Copy the given parquet shards into memmap files under `prefix`.

    Shards are processed in full_shard_key order. Four memmaps are written
    (features, target, weight, date) together with "<prefix>.meta.json"
    describing shapes, column names and source files.

    Returns:
        The (X, y, w) memmaps; the date memmap is written but not returned.
    """
    date_col = cfg["keys"][1]
    target_col = cfg["target"]
    weight_col = cfg["weight"]

    paths = sorted(glob_paths, key=full_shard_key)

    # Row count per shard, needed up front to size the memmaps
    counts = [
        int(
            pl.scan_parquet(path, storage_options=storage_options)
            .select(pl.len())
            .collect(streaming=True)
            .item()
        )
        for path in paths
    ]
    n_rows = int(sum(counts))
    n_feat = len(feat_cols)

    os.makedirs(os.path.dirname(prefix), exist_ok=True)

    X = np.memmap(f"{prefix}_X.float32.mmap", dtype="float32", mode="w+", shape=(n_rows, n_feat))
    y = np.memmap(f"{prefix}_y.float32.mmap", dtype="float32", mode="w+", shape=(n_rows,))
    w = np.memmap(f"{prefix}_w.float32.mmap", dtype="float32", mode="w+", shape=(n_rows,))
    d = np.memmap(f"{prefix}.date.i32.mmap",  dtype="int32",   mode="w+", shape=(n_rows,))

    need_cols = [date_col, target_col, weight_col, *feat_cols]
    start = 0
    for path, n in zip(paths, counts):
        scan = pl.scan_parquet(path, storage_options=storage_options)
        df = scan.select(need_cols).collect(streaming=True)

        stop = start + n
        X[start:stop, :] = df.select(feat_cols).to_numpy()
        y[start:stop] = df.select(pl.col(target_col)).to_numpy().ravel()
        w[start:stop] = df.select(pl.col(weight_col)).to_numpy().ravel()
        d[start:stop] = df.select(pl.col(date_col)).to_numpy().ravel().astype("int32")
        start = stop

        # Drop the shard frame before loading the next one
        del df
        gc.collect()

    for arr in (X, y, w, d):
        arr.flush()

    meta = {
        "n_rows": int(n_rows), "n_feat": int(n_feat), "dtype": "float32", "ts": time.time(),
        "features": list(feat_cols), "target": target_col, "weight": weight_col,
        "date_col": date_col, "files": paths
    }
    with open(f"{prefix}.meta.json", "w") as meta_file:
        json.dump(meta, meta_file)
    return X, y, w

# P(kind, subpath) takes two arguments and already inserts cfg["exp_root"];
# kind "np" yields the protocol-less form that fs.glob accepts.
np_paths = fs.glob(f"{P('np', cfg['paths']['sample_fe'])}/sample_*_*.parquet")
# fs.glob returns paths without the protocol; add it back for polars
glob_paths = [f"az://{p}" for p in np_paths]

X, y, w = shard2memmap(glob_paths=glob_paths, feat_cols=feat_cols, prefix=f"{mm_dir}/full_sample_v1")


开始训练

In [None]:
import numpy as np

def make_sliding_cv(date_ids: np.ndarray, *, n_splits: int, gap_days: int = 5, train_to_val: int = 9):
    """Build sliding-window CV folds over days, with a gap before validation.

    Args:
        date_ids: Per-row date_id (one entry per sample).
        n_splits: Number of folds requested.
        gap_days: Number of distinct days left out between the end of the
            training window and the start of the validation window.
        train_to_val: Training window length as a multiple of the base
            validation length.

    Returns:
        A list of (train_idx, val_idx) row-index arrays; empty when there are
        too few distinct days or an argument is not positive.
    """
    # Distinct days in ascending order; windows are laid out on this day axis
    uniq_days = np.unique(date_ids)
    n_days = len(uniq_days)
    ratio = int(train_to_val)
    n_folds = int(n_splits)
    gap = int(gap_days)

    usable = n_days - gap
    if usable <= 0 or n_folds <= 0 or ratio <= 0:
        return []

    val_base, extra = divmod(usable, ratio + n_folds)
    if val_base <= 0:
        return []

    train_len = ratio * val_base
    folds = []
    val_start = train_len + gap
    for fold_no in range(n_folds):
        # Leftover days go to the earliest folds, one each
        val_len = val_base + 1 if fold_no < extra else val_base
        val_stop = val_start + val_len
        train_stop = val_start - gap
        train_start = train_stop - train_len
        if train_start < 0 or val_stop > n_days:
            break

        # Map the day windows back to row indices
        tr_days = uniq_days[train_start:train_stop]
        va_days = uniq_days[val_start:val_stop]
        tr_idx = np.flatnonzero(np.isin(date_ids, tr_days))
        va_idx = np.flatnonzero(np.isin(date_ids, va_days))
        if len(tr_idx) == 0 or len(va_idx) == 0:
            break

        # Safety check: the gap between train and validation must hold
        assert va_days.min() - tr_days.max() >= gap, "gap_days 未生效，可能泄漏"

        folds.append((tr_idx, va_idx))
        val_start = val_stop

    return folds

# Usage: build the folds from the per-row date_id memmap
d = np.memmap(
    "/mnt/data/js/exp/v1/sample_mm/full_sample_v1.date.i32.mmap",
    dtype="int32",
    mode="r",
)
folds = make_sliding_cv(d, n_splits=2, gap_days=5, train_to_val=9)


In [None]:
# ---------- 2) 加载 memmap ----------
import json, numpy as np, lightgbm as lgb
prefix = "/mnt/data/js/exp/v1/sample_mm/full_sample_v1"

# The meta file stores the shapes and feature names needed to reopen the memmaps
with open(f"{prefix}.meta.json") as f:
    meta = json.load(f)
n_rows = meta["n_rows"]
n_feat = meta["n_feat"]
feat_names = meta["features"]

# Reopen read-only
X = np.memmap(f"{prefix}_X.float32.mmap", dtype="float32", mode="r", shape=(n_rows, n_feat))
y = np.memmap(f"{prefix}_y.float32.mmap", dtype="float32", mode="r", shape=(n_rows,))
w = np.memmap(f"{prefix}_w.float32.mmap", dtype="float32", mode="r", shape=(n_rows,))
# 你之前已定义：weighted_r2_zero_mean、lgb_wr2_eval


6.训练LightGBM

In [None]:
# 1) Count the rows inside the selected date window
in_window = pl.col("date_id").is_between(DATE_LO, DATE_HI)
n_rows = lc.filter(in_window).select(pl.len()).collect().item()

# 2) Rough estimate of the data moved to the GPU (rule of thumb: 0.8 bytes
#    per row per feature group)
n_feat = len(feat_names)
dense_groups = int(n_feat)
bytes_est = n_rows * 0.8 * dense_groups
gb_est = bytes_est / (1024 ** 3)

print(f"rows≈{n_rows:,}, dense_groups≈{dense_groups}, est GPU load≈{gb_est:.2f} GiB")


In [None]:
# Dataset-construction (binning) parameters, shared by the full Dataset and its subsets
ds_params = dict(
    max_bin=31,                    # fewer histogram bins to save GPU/host memory
    bin_construct_sample_cnt=100000,# rows sampled to build the bins (LightGBM default: 200000)
    min_data_in_bin=3,
    data_random_seed=42,
)

# 1) Dataset over all rows; every fold is taken as a subset of it
d_all = lgb.Dataset(
    X, label=y, weight=w,
    feature_name=feat_names,
    free_raw_data=True,
    params=ds_params,               # same binning settings are also passed to each subset below
)

# Training parameters; metric="None" leaves scoring to the custom feval
params = dict(
    objective="regression",
    metric="None",
    device_type="gpu",
    learning_rate=0.08,
    num_leaves=31,
    max_depth=8,
    feature_fraction=0.60,
    bagging_fraction=0.60,
    bagging_freq=1,
    min_data_in_leaf=200,
    seed=42,
)

# 2) Train per fold, record each fold's wr2 and collect gain shares in one table
import numpy as np, pandas as pd, os

fi = pd.DataFrame({"feature": feat_names})
scores = []

for k, (tr, va) in enumerate(folds, 1):
    dtrain = d_all.subset(tr, params=ds_params)    # subset holding only this fold's training rows
    dvalid = d_all.subset(va, params=ds_params)

    bst = lgb.train(
        params, dtrain,
        valid_sets=[dvalid, dtrain],
        valid_names=["val", "train"],
        feval=lgb_wr2_eval,
        num_boost_round=4000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(period=100),
        ],
    )

    # Validation score of this fold
    scores.append(bst.best_score["val"]["wr2"])   # wr2 on the "val" set, as recorded in best_score

    # Gain share of this fold, stored as its own column
    g = bst.feature_importance(importance_type="gain", iteration=bst.best_iteration).astype(float)
    denom = g.sum()
    # Guard against a model with no gain at all (all-zero column instead of a division by zero)
    fi[f"fold{k}_gain_share"] = (g / denom) if denom > 0 else np.zeros_like(g, dtype=float)
    bst.free_dataset()                 # release the Dataset held by the booster
    del dtrain, dvalid, bst; gc.collect()

In [None]:
# Average the per-fold gain shares and sort features by that mean (descending).
# Nothing is written to disk in this cell.
fold_cols = [c for c in fi.columns if c.startswith("fold")]
fi["mean_gain_share"] = fi[fold_cols].mean(axis=1)
fi = fi.sort_values("mean_gain_share", ascending=False, ignore_index=True)

In [None]:
# Persist the aggregated gain-share table (no index column).
fi.to_csv(f"/mnt/data/js/exp/v1/sample_mm//fe_v1_gain_share.csv", index=False)

In [None]:
# Reload the saved gain-share table so the cells below can run without retraining.
fi = pd.read_csv(f"/mnt/data/js/exp/v1/sample_mm//fe_v1_gain_share.csv")

In [None]:
# Preview the first 15 rows of the gain-share table.
fi.head(15)

In [None]:
# Names starting with "responder_" among the first 100 entries of fi["feature"].
rl = [c for c in fi['feature'][:100] if c.startswith('responder_')]

In [None]:
# Display the responder-derived features found above.
rl

In [None]:
# Features listed under cfg["white_list"] are removed from the ranking table here.
# `or []` also covers a key that is present but empty in the YAML (parsed as
# None): cfg.get(..., []) would return None in that case, which breaks both
# isin() below and any later `whitelist + [...]` concatenation.
whitelist = cfg.get("white_list") or []
fi_normal = fi[~fi["feature"].isin(whitelist)].reset_index(drop=True)

展示

In [None]:
# Ranking table: feature name, mean gain share and a 1-based rank.
dfi = fi_normal[["feature", "mean_gain_share"]].reset_index(drop=True)
dfi["rank"] = dfi.index + 1

cum_share = dfi["mean_gain_share"].cumsum()

fig, ax1 = plt.subplots(figsize=(8,4))

# Left axis: per-feature share.
ax1.plot(dfi["rank"], dfi["mean_gain_share"], color="tab:blue")
ax1.set_xlabel("Feature rank (desc)")
ax1.set_ylabel("Mean gain share", color="tab:blue")

# Right axis: running total of the shares.
ax2 = ax1.twinx()
ax2.plot(dfi["rank"], cum_share, color="tab:orange")
ax2.set_ylabel("Cumulative share", color="tab:orange")

# The twin axis is the current axes at this point, so it carries the title.
ax2.set_title("Feature importance (gain share)")
fig.tight_layout()
plt.show()


In [None]:
# For each target share, report the smallest k such that the top-k features
# reach that share of the total mean gain.
cum_share = dfi["mean_gain_share"].cumsum()
tot = cum_share.iloc[-1]
cum_values = cum_share.to_numpy()  # non-decreasing, because gain shares are >= 0
for th in [0.8, 0.9, 0.95]:
    # Counting entries with cum_share <= target gives the largest k that stays
    # below the target, i.e. one feature too few whenever the target falls
    # between two cumulative values. searchsorted finds the first position that
    # reaches the target; +1 turns that position into a feature count.
    first_hit = int(np.searchsorted(cum_values, th * tot, side="left"))
    k = min(first_hit + 1, len(cum_values))
    print(f"{th*100:.0f}% → Top-{k}")


In [None]:
# Display the whitelist read from the config.
whitelist

In [None]:
# Whitelisted features first, then the first 632 ranked features; dict.fromkeys
# removes duplicates while keeping the order.
final_feats = list(dict.fromkeys(whitelist + dfi['feature'][:632].tolist()))
final_feats = pd.Series(final_feats, name="feature")

# Write WITH the "feature" header. The cells that read this file back use
# pd.read_csv(...) with its default header handling and index ['feature']; a
# header-less file would have its first feature name consumed as the header.
final_feats.to_csv("/mnt/data/js/exp/v1/sample_mm/top_fi_0911.csv", index=False)

In [None]:
# Display the final feature list.
final_feats

In [None]:
# Write the list again, with a "feature" header, under the directory resolved from the config.
# NOTE(review): P() as defined at the top of this file accepts (kind, subpath) only;
# this three-argument call relies on a different definition of P being in scope — confirm.
pd.Series(final_feats, name="feature").to_csv(f"{P('local', 'exp/v1', cfg['paths']['sample_mm'])}/top_fi_0911.csv", index=False)

In [None]:
# Read the saved feature list back (first line is parsed as the header) and display it.
df_check = pd.read_csv(f"/mnt/data/js/exp/v1/sample_mm/top_fi_0911.csv")
df_check

# 去共线性

In [None]:
# Inputs for the de-collinearity pass.
PARQUET_PATHS = ["/mnt/data/js/final_clean.parquet"]
KEYS = ["symbol_id","date_id","time_id"]
TARGET = "responder_6"
FEATURE_COLS = pd.read_csv('/home/admin_ml/Jackson/projects/selected_features.csv')['family'].tolist()
REP_COLS = pd.read_csv('/home/admin_ml/Jackson/projects/selected_resps.csv')['family'].tolist()

# Lazy scan restricted to keys + selected features + selected responders.
lf_base = pl.scan_parquet(PARQUET_PATHS).select([*KEYS, *FEATURE_COLS, *REP_COLS])

# date_id in [1200, 1400], both ends included.
lf_slice = lf_base.filter(pl.col("date_id").is_between(1200, 1400))

# Keyword arguments forwarded to run_engineering_on_slice.
PARAMS = {
    "prev_soft_days": 7,
    "tail_lags": (2, 5, 20, 40, 100),
    "tail_diffs": (2, 5,),
    "rolling_windows": (5, 20),
    "same_time_ndays": 5,
    "strict_k": False,
    "hist_lags": (1,2,5,10,20,100),
    "ret_periods": (1,5,20),
    "diff_periods": (1,5),
    "rz_windows": (5,20),
    "ewm_spans": (5,40,100),
    "cs_cols": None,       # <- keep this small to avoid blow-up
}

lf_eng = run_engineering_on_slice(lf_slice, **PARAMS)

# Priority-ordered feature list; only its first 500 entries are materialised.
feats = pd.read_csv("/home/admin_ml/Jackson/projects/final_fi_mean.csv")["feature"].tolist()

lf_small = lf_eng.select(feats[:500])
X_ready = lf_small.collect(streaming=True)
X_ready.write_parquet("/mnt/data/js/X_ready.parquet", compression="zstd")
del X_ready


In [None]:

lf = pl.scan_parquet("/mnt/data/js/X_ready.parquet")

df = lf.collect(streaming=True).to_pandas()

# Correlation (pairwise complete obs) + guard on min observations
min_obs = max(50, int(0.3 * len(df)))  # tweak as you like
C = df.corr(method="pearson", min_periods=min_obs).abs().fillna(0.0)

# Priority order, restricted to features that actually have a correlation row.
# Using the full `feats` list here would add all-zero rows for every feature
# missing from X_ready; those would never be dropped and would end up in the
# output without ever having been checked for collinearity.
present = set(C.columns)
order = [c for c in dict.fromkeys(feats) if c in present]
n_missing = len(set(feats)) - len(order)
if n_missing:
    print(f"de-correlation: {n_missing} features from `feats` are not in X_ready and are ignored")

# Align rows & cols to the same (priority) order, fill any NaNs with 0
C = C.reindex(index=order, columns=order).fillna(0.0)

# --- Prepare NumPy array for the greedy loop ---
# Explicit writable copy: the array is modified in place below, and the array
# returned by `.values` can be a read-only view when pandas Copy-on-Write is on.
A = C.to_numpy(dtype=float, copy=True)
np.fill_diagonal(A, 0.0)  # self-correlation of 1 must not trigger a drop
m = len(order)

# --- Greedy de-correlation (keep-by-priority, drop neighbors) ---
THRESH = 0.97
keep_mask = np.ones(m, dtype=bool)

for i in range(m):
    if not keep_mask[i]:
        continue  # already removed by a higher-priority pick
    # only check j > i (upper triangle) among still-active features
    active_slice = keep_mask[i+1:]
    drop = (A[i, i+1:] >= THRESH) & active_slice
    active_slice[drop] = False  # writes through to keep_mask[i+1:] (slice is a view)
keep = [name for name, kept in zip(order, keep_mask) if kept]


pd.DataFrame({'feature': keep}).to_csv("final_selected_features.csv", index=False)

# 全数据训练

基本配置

In [None]:
# date_id range used for training/validation; to be widened to the full training set later.
DATE_LO, DATE_HI = 680, 1530

# Base columns. Alternative kept from the original: load them from
# f"{INPUT_DIR}/selected_features_1r.csv" / f"{INPUT_DIR}/selected_responders_1r.csv"
# via pd.read_csv(..., header=None).squeeze().tolist().
FEATURE_COLS = [f"feature_{i:02d}" for i in range(79)]
REP_COLS = [f"responder_{i}" for i in range(9)]

paths = fs.glob(f"{P('az', 'exp/v1', cfg['paths']['clean'])}/*.parquet")

# Pair every file with the integer in the second "_"-separated token of its
# base name, so that the files can be ordered numerically.
clean_files = [(int(os.path.basename(p).split('_')[1]), p) for p in paths]

# The glob results are prefixed with the az:// protocol before scanning.
clean_sorted_file_list = ["az://" + f for _, f in sorted(clean_files)]

lc = pl.scan_parquet(clean_sorted_file_list, storage_options=storage_options)

print("ready")

枚举窗口

In [None]:
# List every fe_shards file on Blob storage (glob returns paths without the
# protocol, so az:// is added by hand).
fe_all = ["az://" + p for p in fs.glob(f"{P('np', 'exp/v1',cfg['paths']['fe_shards'])}/*.parquet")]

# Keep the (lo, hi) windows that overlap [DATE_LO, DATE_HI].
wins = set()
for p in fe_all:
    base = p.split("/")[-1]  # e.g. C_lags_1220_1249.parquet
    tokens = base.split("_")
    lo = int(tokens[-2])
    hi = int(tokens[-1].split(".")[0])
    if lo > DATE_HI or hi < DATE_LO:
        continue
    wins.add((lo, hi))
wins = sorted(wins)
print(f"windows in range: {wins[:5]} ... (total {len(wins)})")

# Every date_id of the range, both ends included.
days = list(range(DATE_LO, DATE_HI + 1))
#cut = int(len(days)*0.8)
#cut = int(len(days)*0.8)


3. 按窗口拼接 (A + B + 所有 C_*) → 直接写数据分片（无大表）

In [None]:
import numpy as np
import polars as pl
import gc

# Intraday layout read from the config.
T = int(cfg['ticks'])                 # e.g. 968
K = int(cfg['bucket_size'])           # e.g. 6
# Number of ticks flagged as open / close auction (5 each when the key is absent).
open_n  = int(cfg.get('open_auction_ticks', 5))
close_n = int(cfg.get('close_auction_ticks', 5))

# Upper-bound clamp built from when/then/otherwise (works on Polars versions without clip_max).
def clip_upper(expr: pl.Expr, ub: int) -> pl.Expr:
    """Return an expression equal to ``expr`` wherever it is <= ``ub`` and to ``ub`` elsewhere."""
    bound = pl.lit(ub)
    exceeds = expr > bound
    return pl.when(exceeds).then(bound).otherwise(expr)

# Loop-invariant locations: resolve them (and create the output folder) once.
panel_path = P("az", "exp/v1", cfg["paths"]["panel_shards"])
fs.mkdir(panel_path, exist_ok=True)
fe_dir_az = P('az', 'exp/v1', cfg['paths']['fe_shards'])
fe_dir_np = P('np', 'exp/v1', cfg['paths']['fe_shards'])

# Columns that are never treated as features.
non_feature_cols = {*cfg['keys'], cfg['target'], cfg['weight']}

for (lo, hi) in wins:
    # Intersect with the global range so that edge windows do not spill over.
    w_lo, w_hi = max(lo, DATE_LO), min(hi, DATE_HI)

    # Base table: filter rows first, then add the time features in one go.
    base = (
        lc.filter(pl.col("date_id").is_between(w_lo, w_hi))
          .select([*cfg['keys'], cfg['target'], cfg['weight'], *FEATURE_COLS])
        .with_columns([
            # Copy of time_id as a positional feature, so it does not clash with the key column.
            pl.col("time_id").cast(pl.Float32).alias("time_pos"),

            # Cyclic phase: phase = 2*pi * time_id / T
            (2*np.pi * pl.col("time_id") / pl.lit(T, dtype=pl.Float32)).alias("_phase_"),
        ])
        .with_columns([
            pl.col("_phase_").sin().cast(pl.Float32).alias("time_sin"),
            pl.col("_phase_").cos().cast(pl.Float32).alias("time_cos"),
        ])
        .drop(["_phase_"])
        .with_columns([
            # Open / close indicators: the first open_n and the last close_n ticks.
            (pl.col("time_id") <  pl.lit(open_n)).cast(pl.Int8).alias("is_open_auction"),
            (pl.col("time_id") >= pl.lit(T - close_n)).cast(pl.Int8).alias("is_close_auction"),
        ])
    )

    # Bucket: floor(time_id * K / T), defensively capped at K-1.
    bucket_raw = ( (pl.col('time_id') * pl.lit(K)) // pl.lit(T) )
    bucket_capped = clip_upper(bucket_raw, K - 1)
    base = base.with_columns([
        bucket_capped.cast(pl.Int8).alias(f"time_bucket_{K}")
    ])

    lf = base

    # A and B shards of this window, then every C_* shard of the same window.
    fe_files = [f"{fe_dir_az}/{name}" for name in (f"A_{lo}_{hi}.parquet", f"B_{lo}_{hi}.parquet")]
    c_files = sorted(fs.glob(f"{fe_dir_np}/C_*_{lo}_{hi}.parquet"))
    fe_files.extend(f"az://{p}" for p in c_files)

    # Left-join the shards one by one, adding only columns not seen yet.
    # `ordered_cols` records the columns in the order they were added; `already`
    # is only the fast membership test. Deriving the feature list from the set
    # (as before) made the column order of the written shard depend on set
    # iteration order, so it could differ between windows and between runs.
    ordered_cols = list(lf.collect_schema().names())
    already = set(ordered_cols)
    for fp in fe_files:
        ds = pl.scan_parquet(fp, storage_options=storage_options)
        add_cols = [c for c in ds.collect_schema().names() if c not in already]
        if add_cols:
            lf = lf.join(ds.select([*cfg['keys'], *add_cols]), on=cfg['keys'], how="left")
            ordered_cols.extend(add_cols)
            already.update(add_cols)

    # Select keys, target, weight and features; everything but the keys is cast to Float32.
    feat_present = [c for c in ordered_cols if c not in non_feature_cols]
    select_exprs = [
        *cfg['keys'],
        pl.col(cfg['target']).cast(pl.Float32).alias(cfg['target']),
        pl.col(cfg['weight']).cast(pl.Float32).alias(cfg['weight']),
        *[pl.col(c).cast(pl.Float32).alias(c) for c in feat_present],
    ]
    lf_win = lf.select(select_exprs)

    # Stream the window straight into its own parquet shard.
    out_path = f"{panel_path}/panel_{w_lo}_{w_hi}.parquet"
    (
        lf_win.sink_parquet(
            out_path,
            compression="zstd",
            storage_options=storage_options,
            statistics=True,
            maintain_order=True,
        )
    )

    gc.collect()


In [None]:
# Check whether each shard is sorted by (symbol_id, date_id, time_id).

key_cols = ["symbol_id", "date_id", "time_id"]
paths = sorted(fs.glob(f"{P('az', 'exp/v1', cfg['paths']['clean_shards'])}/*.parquet"))
for p in paths:
    # Project the three key columns at read time instead of loading every
    # column of the shard and discarding all but three afterwards.
    df = pl.read_parquet(f"az://{p}", columns=key_cols, storage_options=storage_options)
    n  = df.height
    sid = df["symbol_id"].to_numpy()
    did = df["date_id"].to_numpy()
    tid = df["time_id"].to_numpy()
    # lexsort uses the LAST key as the primary one: symbol -> date -> time, ascending.
    # The sort is stable, so the permutation is the identity exactly when the
    # rows are already in non-decreasing key order.
    ord_keys = np.lexsort((tid, did, sid))
    is_sorted = np.all(ord_keys == np.arange(n))
    print(os.path.basename(p), "sorted_by_keys:", is_sorted, "rows:", n)

4.导入最终特征清单

In [None]:

# Final feature list: the "feature" column of the saved CSV, as a plain Python list.
feat_cols = pd.read_csv("/mnt/data/js/exp/v1/sample_mm/top_fi_0911.csv")["feature"].tolist()
    

In [None]:
# Number of features in the final list.
len(feat_cols)

5.构建memmap

In [None]:
import re
# Local directory that will hold the panel memmaps; created if missing.
mm_dir = P("local", "exp/v1", cfg["paths"]["panel_mm"])
os.makedirs(mm_dir, exist_ok=True)

def full_shard_key(p: str):
    """Sort key for panel shard paths.

    A base name of the form ``panel_<lo>_<hi>.parquet`` yields ``(lo, hi)`` as
    integers. Any other name yields ``(10**12, 10**12, <base name>)``, which
    places it after all regular shards.
    """
    name = os.path.basename(p)          # e.g. panel_1200_1219.parquet
    hit = re.match(r"panel_(\d+)_(\d+)\.parquet$", name)
    if hit is not None:
        return tuple(int(part) for part in hit.groups())
    return (10**12, 10**12, name)


def shard2memmap(glob_paths: list[str], feat_cols: list[str], prefix: str):
    """Copy parquet shards into flat on-disk arrays next to ``prefix``.

    Files written:
      ``{prefix}_X.float32.mmap``  features, shape (n_rows, len(feat_cols))
      ``{prefix}_y.float32.mmap``  target,   shape (n_rows,)
      ``{prefix}_w.float32.mmap``  weight,   shape (n_rows,)
      ``{prefix}.date.i32.mmap``   date ids, shape (n_rows,)
      ``{prefix}.meta.json``       shape, feature names, column names, input files

    Args:
        glob_paths: shard paths; they are processed in ``full_shard_key`` order.
        feat_cols: feature columns to extract, in output column order.
        prefix: path prefix of the output files.

    Returns:
        The ``(X, y, w)`` memmaps, still open in ``w+`` mode.

    Raises:
        ValueError: if there are no shards, no rows or no feature columns.
    """
    date_col   = cfg["keys"][1]   # assumes the second key column is the date — TODO confirm
    target_col = cfg["target"]
    weight_col = cfg["weight"]

    paths = sorted(glob_paths, key=full_shard_key)
    if not paths:
        raise ValueError("shard2memmap: no input shards were given")
    if not feat_cols:
        raise ValueError("shard2memmap: feat_cols is empty")

    # First pass: row count per shard, needed to size the arrays up front.
    counts = [
        int(
            pl.scan_parquet(p, storage_options=storage_options)
              .select(pl.len()).collect(streaming=True).item()
        )
        for p in paths
    ]
    n_rows, n_feat = int(sum(counts)), len(feat_cols)
    if n_rows == 0:
        # Fail early with a clear message rather than deep inside np.memmap.
        raise ValueError("shard2memmap: the input shards contain no rows")

    # A prefix without a directory part has nothing to create;
    # os.makedirs("") would raise.
    out_dir = os.path.dirname(prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    X = np.memmap(f"{prefix}_X.float32.mmap", dtype="float32", mode="w+", shape=(n_rows, n_feat))
    y = np.memmap(f"{prefix}_y.float32.mmap", dtype="float32", mode="w+", shape=(n_rows,))
    w = np.memmap(f"{prefix}_w.float32.mmap", dtype="float32", mode="w+", shape=(n_rows,))
    d = np.memmap(f"{prefix}.date.i32.mmap",  dtype="int32",   mode="w+", shape=(n_rows,))

    # Second pass: copy each shard into its row range [i, i+k).
    i = 0
    need_cols = [date_col, target_col, weight_col, *feat_cols]
    for p, k in zip(paths, counts):
        df = (pl.scan_parquet(p, storage_options=storage_options)
                .select(need_cols).collect(streaming=True))

        X[i:i+k, :] = df.select(feat_cols).to_numpy()
        y[i:i+k]    = df.select(pl.col(target_col)).to_numpy().ravel()
        w[i:i+k]    = df.select(pl.col(weight_col)).to_numpy().ravel()
        d[i:i+k]    = df.select(pl.col(date_col)).to_numpy().ravel().astype("int32")
        i += k
        del df; gc.collect()   # keep peak memory at roughly one shard

    X.flush(); y.flush(); w.flush(); d.flush()

    meta = {
        "n_rows": int(n_rows), "n_feat": int(n_feat), "dtype": "float32", "ts": time.time(),
        "features": list(feat_cols), "target": target_col, "weight": weight_col,
        "date_col": date_col, "files": paths
    }
    with open(f"{prefix}.meta.json", "w") as f:
        json.dump(meta, f)
    return X, y, w

In [None]:
# Glob results come back without the protocol; add az:// before handing them over.
np_paths = fs.glob(f"{P('np', 'exp/v1', cfg['paths']['panel_shards'])}/panel_*_*.parquet")
glob_paths = [f"az://{p}" for p in np_paths]

X, y, w = shard2memmap(
    glob_paths=glob_paths,
    feat_cols=feat_cols,
    prefix=f"{mm_dir}/full_panel_v1",
)


In [None]:
# Sliding train/validation split: open the per-row date_id memmap read-only.

d = np.memmap(f"/mnt/data/js/exp/v1/panel_mm/full_panel_v1.date.i32.mmap", dtype="int32", mode="r")

In [None]:
import numpy as np

d = np.memmap("/mnt/data/js/exp/v1/panel_mm/full_panel_v1.date.i32.mmap", dtype="int32", mode="r")

# Positions j where row j+1 carries a smaller date_id than row j.
viol = np.flatnonzero(d[:-1] > d[1:])
mono = viol.size == 0
print("monotonic_non_decreasing:", bool(mono), "| violations:", viol.size)

# Show the first few out-of-order positions, if any.
for j in viol[:10]:
    print(f"idx {j}->{j+1}: {d[j]} -> {d[j+1]}")


In [None]:
import numpy as np

d = np.memmap("/mnt/data/js/exp/v1/panel_mm/full_panel_v1.date.i32.mmap", dtype="int32", mode="r")

days_all = np.unique(d)
print("rows:", d.size, "unique_days:", days_all.size, "min/max:", days_all.min(), days_all.max())

# Difference between consecutive distinct days; anything above 1 is a hole.
gaps = np.diff(days_all)
(gap_pos,) = np.nonzero(gaps > 1)
print("global missing-day blocks:", gap_pos.size)

# Preview of the first holes as rows of [day before, day after, difference].
if gap_pos.size > 0:
    preview = np.column_stack((days_all[gap_pos], days_all[gap_pos + 1], gaps[gap_pos]))
    print(preview[:10])


In [None]:
import numpy as np

def make_sliding_cv(date_ids: np.ndarray, *, n_splits: int, gap_days: int = 5, train_to_val: int = 9):
    # ---- 构造唯一天轴 ----
    days = np.unique(date_ids)                 # 关键：按天计算窗口
    N, R, K, G = len(days), int(train_to_val), int(n_splits), int(gap_days)

    usable = N - G
    if usable <= 0 or K <= 0 or R <= 0:
        return []

    V_base, rem = divmod(usable, R + K)
    if V_base <= 0:
        return []

    T = R * V_base
    v_lens = [V_base + 1 if i < rem else V_base for i in range(K)]
    v_lo = T + G

    folds = []
    for V_i in v_lens:
        v_hi  = v_lo + V_i
        tr_hi = v_lo - G
        tr_lo = tr_hi - T
        if tr_lo < 0 or v_hi > N:
            break

        tr_days = days[tr_lo:tr_hi]
        va_days = days[v_lo:v_hi]

        tr_idx = np.flatnonzero(np.isin(date_ids, tr_days))
        va_idx = np.flatnonzero(np.isin(date_ids, va_days))

        # 保险丝（行不重叠、天不重叠、gap 按“天位置”生效）
        assert np.intersect1d(tr_idx, va_idx).size == 0, "row overlap"
        assert np.intersect1d(tr_days, va_days).size == 0, "day overlap"
        gap_pos = (np.searchsorted(days, va_days.min())
                   - np.searchsorted(days, tr_days.max()) - 1)
        assert gap_pos >= G, f"gap_days not enforced: {gap_pos} < {G}"

        folds.append((tr_idx, va_idx))
        v_lo = v_hi

    return folds

# Usage: open the per-row date_id memmap read-only and build the folds
# (n_splits=3, gap_days=5, train_to_val=9).
d = np.memmap("/mnt/data/js/exp/v1/panel_mm/full_panel_v1.date.i32.mmap", dtype="int32", mode="r")
folds = make_sliding_cv(d, n_splits=3, gap_days=5, train_to_val=9)


In [None]:
# Display the (train_idx, val_idx) pairs.
folds

In [None]:
# Per-fold sanity report: shared rows, shared days, and the train/validation gap
# counted in positions on the axis of distinct days.
days_all = np.unique(d)
for i,(tr,va) in enumerate(folds,1):
    tr_days = np.unique(d[tr]); va_days = np.unique(d[va])
    row_ovl = np.intersect1d(tr,va).size
    day_ovl = np.intersect1d(tr_days,va_days).size
    # distinct days strictly between the last training day and the first validation day
    gap_pos = (np.searchsorted(days_all, va_days.min())
               - np.searchsorted(days_all, tr_days.max()) - 1)
    print(f"fold{i}: row_ovl={row_ovl}, day_ovl={day_ovl}, gap_days={gap_pos}")


In [None]:
# Display the date_id memmap.
d

In [None]:
# Display the folds again.
folds

In [None]:
# ---------- 2) Load the memmaps ----------
import json, numpy as np, lightgbm as lgb
prefix = "/mnt/data/js/exp/v1/panel_mm/full_panel_v1"

# The sidecar JSON carries the array shape and the feature names.
with open(prefix + ".meta.json") as f:
    meta = json.load(f)
n_rows = meta["n_rows"]
n_feat = meta["n_feat"]
feat_names = meta["features"]

# Read-only views over the on-disk arrays: features, target, sample weights.
X = np.memmap(prefix + "_X.float32.mmap", mode="r", dtype="float32", shape=(n_rows, n_feat))
y = np.memmap(prefix + "_y.float32.mmap", mode="r", dtype="float32", shape=(n_rows,))
w = np.memmap(prefix + "_w.float32.mmap", mode="r", dtype="float32", shape=(n_rows,))
# Defined earlier in the notebook: weighted_r2_zero_mean, lgb_wr2_eval


训练模型

In [None]:
# Rough estimate of the dominant "transfer to GPU" cost (rule of thumb).
# NOTE(review): this rebinds n_rows, which the memmap-loading cell also sets from
# meta.json — confirm the memmaps are not re-opened with this value afterwards.

n_rows = (
    lc.filter(pl.col("date_id").is_between(DATE_LO, DATE_HI)) 
      .select(pl.len())
      .collect()
      .item()  # -> int
)

n_feat = len(feat_names)
dense_groups = int(n_feat)  # estimated with the previously used ratio
bytes_est = n_rows * 0.8* dense_groups         
gb_est = bytes_est / (1024**3)

print(f"rows≈{n_rows:,}, dense_groups≈{dense_groups}, est GPU load≈{gb_est:.2f} GiB")


In [None]:
# Dataset-construction settings, shared by the full Dataset and every fold subset.
ds_params = {
    "max_bin": 31,
    "bin_construct_sample_cnt": 100000,
    "min_data_in_bin": 3,
    "data_random_seed": 42,
}

# 1) One Dataset over all rows; each fold is carved out of it with .subset().
d_all = lgb.Dataset(
    X,
    label=y,
    weight=w,
    feature_name=feat_names,
    free_raw_data=True,
    params=ds_params,  # so that the subsets inherit these settings
)

params = {
    "objective": "regression",
    "metric": "None",
    "device_type": "gpu",
    "num_threads": 16,
    "learning_rate": 0.08,
    "num_leaves": 31,
    "max_depth": 8,
    "feature_fraction": 0.60,
    "bagging_fraction": 0.60,
    "bagging_freq": 1,
    "min_data_in_leaf": 200,
    "seed": 42,
}

# 2) Train fold by fold, record each fold's wr2 and collect gain shares in one table.
import numpy as np, pandas as pd, os

fi = pd.DataFrame({"feature": feat_names})
scores = []

for k, (tr, va) in enumerate(folds, 1):
    # Build only this fold's subsets.
    dtrain = d_all.subset(tr, params=ds_params)
    dvalid = d_all.subset(va, params=ds_params)

    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=4000,
        valid_sets=[dvalid, dtrain],
        valid_names=["val", "train"],
        feval=lgb_wr2_eval,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(period=100),
        ],
    )

    # Validation score of this fold.
    scores.append(bst.best_score["val"]["wr2"])

    # Gain importance at the best iteration, normalised to shares; one column per fold.
    g = bst.feature_importance(importance_type="gain", iteration=bst.best_iteration).astype(float)
    denom = g.sum()
    if denom > 0:
        fi[f"fold{k}_gain_share"] = g / denom
    else:
        fi[f"fold{k}_gain_share"] = np.zeros_like(g, dtype=float)

    # Release the Dataset held by the booster before the next fold.
    bst.free_dataset()
    del dtrain, dvalid, bst; gc.collect()

In [None]:
# Average the per-fold gain shares and sort features by that mean (descending).
# Nothing is written to disk in this cell.
fold_cols = [c for c in fi.columns if c.startswith("fold")]
fi["mean_gain_share"] = fi[fold_cols].mean(axis=1)
fi = fi.sort_values("mean_gain_share", ascending=False, ignore_index=True)

In [None]:
# Display the sorted gain-share table.
fi

# 模型评估

## 1.数据清洗,预处理

数据集：test + pad

## 2.特征工程