In [1]:
# -*- coding: utf-8 -*-
from __future__ import annotations

# ── 标准库 ──────────────────────────────────────────────────────────────────
import os
import time
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import re

# ── 第三方 ──────────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
import polars as pl

import gc

import torch
import torch.backends.cudnn as cudnn
import lightning as L
import lightning.pytorch as lp
from torch.utils.data import DataLoader

from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import DeviceStatsMonitor
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, Baseline
from pytorch_forecasting.metrics import MAE, RMSE
from pytorch_forecasting.data.encoders import NaNLabelEncoder
from pytorch_forecasting.data import TorchNormalizer, GroupNormalizer


# 你的工程工具
from pipeline.io import cfg, P, fs, storage_options, ensure_dir_local, ensure_dir_az
from pipeline.stream_input_local import ShardedBatchStream  
from pipeline.wr2 import WR2

import time as _t


import warnings
# Install a blanket "ignore" filter: every warning category from every module
# is silenced for the rest of the kernel session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries imported above — confirm that is intended beyond hiding paths.
warnings.filterwarnings("ignore")  # avoid printing out absolute paths

def _now() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    return _t.strftime(timestamp_format, _t.localtime())


# Timestamped marker so the cell output shows when the imports finished.
print(f"[{_now()}] imports ok")

[2025-10-16 16:39:02] imports ok


In [2]:
# Load the candidate feature lists produced by earlier experiments:
# a feature-importance ranking and a de-correlated feature selection.
_RANKING_CSV = "/mnt/data/js/exp/v1/models/tune/feature_importance__fixed__fixed__mm_full_train__features__fs__1300-1500__cv3-g7-r4__seed42__top1000__1760299442__range1000-1600__range1000-1600__cv2-g7-r4__1760347190.csv"
_DECORR_CSV = "/mnt/data/js/exp/v1/tft/selected_features/selected_features__decorr__tau0.95__1400-1600.csv"

df_ranking_features = pd.read_csv(_RANKING_CSV)
de_corr_features = pd.read_csv(_DECORR_CSV)

# Keep only the ranked rows whose feature also appears in the de-correlated
# list, then renumber the rows from zero.
_in_decorr = df_ranking_features["feature"].isin(de_corr_features["feature"])
df_ranking_decorr_features = df_ranking_features[_in_decorr].copy()
df_e_features = df_ranking_decorr_features.reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/js/exp/v1/tft/selected_features/selected_features__decorr__tau0.95__1400-1600.csv'

In [None]:
# Column roles, all taken from the project config.
G_SYM, G_DATE, G_TIME = cfg["keys"]  # e.g. ("symbol_id","date_id","time_id")
TARGET_COL = cfg["target"]  # e.g. "responder_6"
WEIGHT_COL = cfg["weight"]  # may be None

# Time-derived feature columns.
TIME_FEATURES = [
    "time_bucket",
    "time_pos",
    "time_sin",
    "time_cos",
]
# Covariates: the ranked features that survived the de-correlation filter.
COV_FEATURES = df_e_features["feature"].tolist()


# Full date range used in this notebook (both bounds are applied inclusively
# by the filter that consumes them).
start_date = 1200
end_date = 1600


In [None]:
# Build a lazy frame over every panel shard, restricted to the key / weight /
# target / feature columns and to dates in [start_date, end_date] (inclusive),
# ordered by symbol, date and time.
_SHARD_GLOB = "az://jackson/js_exp/exp/v1/panel_shards/*.parquet"
_shard_paths = fs.glob(_SHARD_GLOB)
if not _shard_paths:
    # Fail fast with a clear message instead of an opaque scan_parquet error.
    raise FileNotFoundError(f"no parquet shards matched {_SHARD_GLOB}")
# Presumably fs.glob returns paths without the scheme, so it is re-attached
# for polars — TODO confirm against the filesystem implementation.
data_paths = [f"az://{p}" for p in _shard_paths]

# WEIGHT_COL is allowed to be None, and the ranked covariates may repeat a
# key / target / time column; polars rejects duplicate output names in
# select(). Drop the missing weight and de-duplicate while preserving order.
_select_cols = list(
    dict.fromkeys(
        c
        for c in [*cfg['keys'], WEIGHT_COL, TARGET_COL, *TIME_FEATURES, *COV_FEATURES]
        if c is not None
    )
)

lf_data = (
    pl.scan_parquet(data_paths, storage_options=storage_options)
    .select(_select_cols)
    .filter(pl.col(G_DATE).is_between(start_date, end_date, closed="both"))
    .sort([G_SYM, G_DATE, G_TIME])
)


In [None]:
# Count nulls per covariate column. The columns are processed `chunk` at a
# time, each batch collected separately, and the one-row results are glued
# side by side into a single frame (one column per feature).
# NOTE(review): each collect() re-executes the lazy plan behind lf_data,
# including its sort — confirm the repeated cost is acceptable.
cols = COV_FEATURES
chunk = 20
parts = [
    lf_data.select(
        [pl.col(name).is_null().sum().alias(name) for name in cols[lo:lo + chunk]]
    ).collect(streaming=True)
    for lo in range(0, len(cols), chunk)
]
df_null = pl.concat(parts, how="horizontal")


In [None]:


# Reshape the one-row polars frame of per-feature null counts into a long
# pandas table with columns `feature` and `null_count`, largest counts first.
#
# The original cell read from `df_null_case`, a name that is not defined
# anywhere in the notebook as shown, so it raised NameError on a fresh kernel.
# The counts are produced by the null-counting cell as the polars frame
# `df_null`, so that is used as the input here. The isinstance guard makes
# the cell safe to re-run after `df_null` has already been converted.
if isinstance(df_null, pl.DataFrame):
    df_null = (
        df_null.to_pandas()
        .T  # features become the index, the single row becomes column 0
        .rename(columns={0: 'null_count'})
        .rename_axis('feature')
        .reset_index()
        .sort_values(by='null_count', ascending=False)
    )
df_null.head(10)

In [None]:
# Collect the names of columns whose share of missing values exceeds 40%.
# (This cell only builds the list; nothing is dropped here.)

# Total number of rows in the filtered lazy frame.
_row_count_frame = lf_data.select(pl.len()).collect()
total_nrow = _row_count_frame[0, 0]

_null_ratio = df_null["null_count"] / total_nrow
drop_cols = df_null.loc[_null_ratio > 0.4, "feature"].tolist()
