In [1]:
import polars as pl
import numpy as np
import pandas as pd

# 你的工程工具
from pipeline.io import cfg, P, fs, storage_options, ensure_dir_local, ensure_dir_az

In [2]:
# ========== 1) 统一配置 ==========
pl.set_random_seed(42)

G_SYM, G_DATE, G_TIME = cfg["keys"]          # e.g. ("symbol_id","date_id","time_id")
TARGET_COL = cfg["target"]                   # e.g. "responder_6"
WEIGHT_COL = cfg["weight"]                   # 允许为 None

# 读入筛选的特征列
df_ranking_features = pd.read_csv("/mnt/data/js/exp/v1/models/tune/feature_importance__fixed__fixed__mm_full_train__features__fs__1400-1600__cv2-g7-r4__seed42__top1000__1760734398__range1000-1600__range1000-1600__cv2-g7-r4__1760740035.csv")
ls_topk_features = df_ranking_features['feature'].tolist()

TIME_FEATURES = [ c for c in ["time_bucket", "time_pos", "time_sin", "time_cos"] if c in ls_topk_features]
TO_DEC_INPUTS = [c for c in ls_topk_features if c not in TIME_FEATURES]

In [34]:
# 读入数据
data_start_date = 1000
data_end_date = 1300 

data_paths = fs.glob("az://jackson/js_exp/exp/v1/panel_shards/*.parquet")
data_paths =[f"az://{p}" for p in data_paths]

lf_data = (
    pl.scan_parquet(data_paths, storage_options=storage_options)
    .select([*cfg['keys'], *TO_DEC_INPUTS])
    .filter(pl.col(G_DATE).is_between(data_start_date, data_end_date, closed="both"))
)

In [35]:
lf_data.select(pl.len()).collect()

len
u32
11006160


In [36]:
null_case = lf_data.select(pl.all().is_null().sum()).collect()

In [37]:
df_null = null_case.to_pandas().T

In [38]:
# 修改列名
df_null.index.name = 'feature'
df_null = df_null.rename(columns={df_null.columns[0]: "null_count"}).reset_index()

In [42]:
# dat_id 范围 1000-1300 共 11006160
df_null[df_null['null_count'] / 11006160 < 0.1]['feature'].tolist()

['symbol_id',
 'date_id',
 'time_id',
 'feature_52__rmean30',
 'feature_60__cs_z',
 'feature_36__ewm50',
 'feature_06__ewm50',
 'feature_06__ewm10',
 'feature_59__cs_z',
 'responder_6_prevday_std',
 'feature_58',
 'responder_0_prevday_std',
 'responder_8_prevday_std',
 'responder_1_prev2day_close',
 'feature_33',
 'feature_38',
 'feature_30__ewm50',
 'feature_36__ewm10',
 'responder_7_prev_tail_lag100',
 'feature_52__ewm10',
 'feature_15__rmean7',
 'responder_4_prevday_std',
 'feature_07',
 'feature_05',
 'feature_08',
 'responder_5_prevday_mean',
 'responder_0_prev_tail_lag10',
 'feature_23',
 'feature_39__ewm5',
 'responder_3_prevday_std',
 'feature_04',
 'feature_26__csrank',
 'feature_61__ewm5',
 'responder_5_prevday_std',
 'feature_56',
 'responder_6_prev_tail_lag967',
 'responder_4_close_roll14_std',
 'responder_0_prev_tail_lag500',
 'responder_8_prev_tail_lag500',
 'feature_60__csrank',
 'feature_42__rmean3',
 'responder_2_prev_tail_lag100',
 'feature_09__rmean3',
 'feature_04__

In [44]:
# dat_id 范围 1000-1300 共 11006160
df_null[df_null['null_count'] / 11006160 > 0.9]['feature'].tolist()

['feature_01__rstd30',
 'feature_02__rstd3',
 'feature_77__ret10',
 'feature_77__ret3',
 'feature_76__ret50',
 'feature_76__ret10',
 'feature_76__ret3',
 'feature_75__ret50',
 'feature_75__ret10',
 'feature_10__diff50',
 'feature_31__diff50',
 'feature_08__rstd30',
 'feature_11__diff3',
 'feature_00__diff3',
 'feature_73__ret50',
 'feature_09__rstd7',
 'feature_06__rstd30',
 'feature_03__diff3',
 'feature_33__diff3',
 'feature_05__diff50',
 'feature_05__diff10',
 'feature_05__diff3',
 'feature_04__diff50',
 'feature_04__diff10',
 'feature_11__diff10',
 'feature_07__rstd7',
 'feature_10__diff10',
 'feature_78__ret3',
 'feature_77__ret50',
 'feature_08__rstd7',
 'feature_32__diff50',
 'feature_32__diff10',
 'feature_32__diff3',
 'feature_09__rstd3',
 'feature_31__diff10',
 'feature_31__diff3',
 'feature_11__rstd30',
 'feature_01__diff10',
 'feature_78__ret50',
 'feature_08__rstd3',
 'feature_07__rstd30',
 'feature_07__rstd14',
 'feature_07__rstd3',
 'feature_03__diff10',
 'feature_00__di

In [None]:
# date_id 范围 1300-1600 共 11024552 行
df_null[df_null['null_count'] / 11024552 > 0.9]

Unnamed: 0,feature,null_count
745,feature_01__rstd30,11013408
768,feature_02__rstd3,11013408
787,feature_77__ret10,11024552
788,feature_77__ret3,11024468
789,feature_76__ret50,11024552
...,...,...
936,feature_01__diff50,11024552
937,feature_09__rstd30,11013408
939,feature_00__rstd7,11013408
940,feature_02__rstd30,11013408


In [None]:
# 按组随机抽样

out = (
    df
    .groupby([G_SYM, G_DATE], group_keys=False)
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
del df

In [None]:
out.shape

In [None]:
X = out[TO_Z_INPUTS]  # 保留 NaN
MIN_OVERLAP = max(70000, int(0.1*len(X)))  # 两列至少要这么多共同样本
corr_abs = X.corr(method="pearson", min_periods=MIN_OVERLAP).abs()
corr_abs = corr_abs.fillna(0.0)  # 样本不足导致的 NaN，当 0 处理

In [None]:

cols = corr_abs.columns.tolist()

C = corr_abs.values

In [None]:
# ——(可选)一致性校验：当前列顺序是否等于重要度顺序的交集——
imp_order = [c for c in ls_topk_features if c in cols]  # 重要度序列 ∩ 现有列
if imp_order != cols:
    print("[warn] corr_abs 的列顺序与重要度顺序不一致；为安全起见将按重要度顺序去重。")
    idx = {c:i for i,c in enumerate(cols)}
    ord_idx = np.array([idx[c] for c in imp_order], dtype=int)
else:
    ord_idx = np.arange(len(cols), dtype=int)  # 已按重要度排好 → 直接使用当前顺序
    
tau = 0.95
selected_idx = []
sel_mask = np.ones(len(cols), dtype=bool)

for j in ord_idx:
    if not selected_idx:
        selected_idx.append(j); sel_mask[j] = True
        continue
    # 与已选集合的最大相关
    if np.max(C[j, selected_idx]) < tau:
        selected_idx.append(j); sel_mask[j] = True
        


In [None]:
keep_cols = [cols[i] for i in selected_idx]
drop_cols = [c for c in cols if c not in keep_cols]

print(f"[decorrelation] tau={tau}, keep={len(keep_cols)}, drop={len(drop_cols)}")

In [None]:
keep_cols

In [None]:
# 保存 去重后的列
keep_cols_root = "/mnt/data/js/exp/v1/tft/selected_features"; ensure_dir_local(keep_cols_root)


In [None]:
# 保存结果
pd.DataFrame({"feature": keep_cols}).to_csv(f"{keep_cols_root}/selected_features__decorr__tau{tau}__{data_start_date}-{data_end_date}.csv", index=False)