In [1]:
import polars as pl
import numpy as np
import pandas as pd

# 你的工程工具
from pipeline.io import cfg, P, fs, storage_options, ensure_dir_local, ensure_dir_az

In [2]:
# ========== 1) Shared configuration ==========
pl.set_random_seed(42)

# Key columns, e.g. ("symbol_id", "date_id", "time_id")
G_SYM, G_DATE, G_TIME = cfg["keys"]
# Target column, e.g. "responder_6"
TARGET_COL = cfg["target"]
# Weight column; allowed to be None
WEIGHT_COL = cfg["weight"]

# Read the feature-importance ranking CSV and take its `feature` column as the candidate list.
df_ranking_features = pd.read_csv(
    "/mnt/data/js/exp/v1/models/tune/feature_importance__fixed__fixed__mm_full_train__features__fs__1300-1500__cv3-g7-r4__seed42__top1000__1760299442__range1000-1600__range1000-1600__cv2-g7-r4__1760347190.csv"
)
ls_topk_features = df_ranking_features["feature"].tolist()

# Time-encoding columns present in the ranking are set aside; all remaining ranked
# features (in ranking order) form TO_Z_INPUTS.
_TIME_CANDIDATES = ("time_bucket", "time_pos", "time_sin", "time_cos")
TIME_FEATURES = [name for name in _TIME_CANDIDATES if name in ls_topk_features]
TO_Z_INPUTS = [name for name in ls_topk_features if name not in TIME_FEATURES]

In [5]:
# Print the whole `feature` column as text (to_string renders every row, no truncation).
print(df_ranking_features['feature'].to_string())

0                                 feature_06
1                                 feature_36
2                                   time_pos
3                                 feature_04
4                         feature_75__rstd14
5                           feature_60__cs_z
6                                 feature_59
7                                   time_cos
8               responder_0_close_roll30_std
9                         feature_59__rstd30
10                                feature_07
11                        feature_61__lag900
12                                feature_60
13                       feature_61__lag1936
14                   responder_6_prevday_std
15               responder_8_prev_tail_lag10
16                         feature_61__ret50
17                       feature_61__lag6776
18                        feature_25__diff50
19                         feature_76__rstd7
20                                feature_48
21                   responder_5_prevday_std
22        

In [6]:
# Load the data: lazily scan the panel shards, restricted to the date window below.
data_start_date = 1400
data_end_date = 1600

# Each path returned by fs.glob is re-prefixed with the az:// scheme before scanning.
data_paths = [
    f"az://{p}"
    for p in fs.glob("az://jackson/js_exp/exp/v1/panel_shards/*.parquet")
]

# Key columns plus the non-time ranked features.
_wanted_cols = [*cfg["keys"], *TO_Z_INPUTS]
# Inclusive on both ends of the date range.
_in_window = pl.col(G_DATE).is_between(data_start_date, data_end_date, closed="both")

lf_data = (
    pl.scan_parquet(data_paths, storage_options=storage_options)
    .select(_wanted_cols)
    .filter(_in_window)
)

In [7]:
# Count null values in every selected column of the filtered lazy frame, then materialize the result.
null_case = lf_data.select(pl.all().is_null().sum()).collect()

In [8]:
# Convert to pandas and transpose so that each original column name becomes an index entry.
df_null = null_case.to_pandas().T

In [9]:
# Rename columns: build a tidy (feature, null_count) table.
# The table is rebuilt from `null_case` rather than mutating `df_null` in place, so this
# cell is idempotent. Previously a second run saw `feature` as columns[0] (after
# reset_index), renamed it to "null_count", and added a bogus integer `feature` column.
_null_by_feature = null_case.to_pandas().T.rename_axis("feature")
df_null = (
    _null_by_feature
    .rename(columns={_null_by_feature.columns[0]: "null_count"})
    .reset_index()
)

In [10]:
# Sort rows so the entries with the most nulls come first.
df_null = df_null.sort_values(by='null_count', ascending=False)

In [22]:
# Execute the lazy scan/select/filter query and materialize it as a Polars DataFrame.
df_data = lf_data.collect()

In [23]:
# Convert to pandas; the grouped sampling and correlation steps below use the pandas API.
df = df_data.to_pandas()

In [None]:
# Drop the reference to the Polars frame now that the pandas copy `df` exists.
del df_data

In [25]:
# Random sampling per group: draw 10% of the rows inside every (G_SYM, G_DATE) group,
# with a fixed random_state so the draw is repeatable, then renumber the rows.
_by_symbol_date = df.groupby([G_SYM, G_DATE], group_keys=False)
_sampled = _by_symbol_date.sample(frac=0.1, random_state=42)
out = _sampled.reset_index(drop=True)

In [26]:
# Drop the reference to the full pandas frame; only the sampled `out` is used from here on.
del df

In [28]:
# Display (rows, columns) of the sampled frame.
out.shape

(736327, 1003)

In [29]:
# Feature matrix for the correlation step; NaNs are kept as-is.
X = out[TO_Z_INPUTS]

# A pair of columns needs at least this many rows where both are non-null.
MIN_OVERLAP = max(70000, int(0.1 * len(X)))

# Absolute Pearson correlation. Pairs with too little overlap come back as NaN
# and are treated as 0 (uncorrelated).
corr_abs = (
    X.corr(method="pearson", min_periods=MIN_OVERLAP)
    .abs()
    .fillna(0.0)
)

In [30]:

# Column labels of the correlation table and its values as a plain ndarray,
# both in the same column order.
cols = list(corr_abs.columns)
C = corr_abs.to_numpy()

In [32]:
# (Optional) consistency check: is the current column order equal to the importance
# ranking restricted to the columns present in the correlation matrix?
idx = {c: i for i, c in enumerate(cols)}                 # column name -> position in C
imp_order = [c for c in ls_topk_features if c in idx]    # importance order ∩ existing columns
if imp_order != cols:
    print("[warn] corr_abs 的列顺序与重要度顺序不一致；为安全起见将按重要度顺序去重。")
    ord_idx = np.array([idx[c] for c in imp_order], dtype=int)
else:
    # Already in importance order, so visit the columns as they are.
    ord_idx = np.arange(len(cols), dtype=int)

# Greedy de-correlation: walk the features from most to least important and keep a
# feature only if its absolute correlation with every already-kept feature is below tau.
tau = 0.95
selected_idx = []
# Boolean view of the selection. It must start all-False: it was previously created with
# np.ones, so every entry was already True and the mask never reflected the selection.
sel_mask = np.zeros(len(cols), dtype=bool)

for j in ord_idx:
    # The first feature is always kept; later ones are compared against the kept set.
    if not selected_idx or np.max(C[j, selected_idx]) < tau:
        selected_idx.append(j)
        sel_mask[j] = True
        


In [33]:
# Translate the selected positions back to column names (selection order preserved).
keep_cols = [cols[pos] for pos in selected_idx]

# Everything not kept is dropped; listed in the original column order.
_kept_names = set(keep_cols)
drop_cols = [name for name in cols if name not in _kept_names]

print(f"[decorrelation] tau={tau}, keep={len(keep_cols)}, drop={len(drop_cols)}")

[decorrelation] tau=0.95, keep=801, drop=199


In [35]:
# Display the kept feature names, in selection order.
keep_cols

['feature_06',
 'feature_36',
 'feature_04',
 'feature_75__rstd14',
 'feature_60__cs_z',
 'feature_59',
 'responder_0_close_roll30_std',
 'feature_59__rstd30',
 'feature_07',
 'feature_61__lag900',
 'feature_60',
 'feature_61__lag1936',
 'responder_6_prevday_std',
 'responder_8_prev_tail_lag10',
 'feature_61__ret50',
 'feature_61__lag6776',
 'feature_25__diff50',
 'feature_76__rstd7',
 'feature_48',
 'responder_5_prevday_std',
 'feature_60__rstd30',
 'responder_5_prevday_mean',
 'feature_51__rmean14',
 'responder_1_close_roll14_std',
 'feature_37__rstd30',
 'responder_2_close_roll30_std',
 'feature_31__diff50',
 'feature_58',
 'feature_59__ewm50',
 'feature_24__lag6776',
 'responder_0_prevday_std',
 'feature_61__lag5808',
 'responder_3_prevday_std',
 'responder_8_prevday_mean',
 'feature_26__diff50',
 'feature_22__diff50',
 'feature_01__ewm50',
 'feature_60__csrank',
 'responder_7_close_roll14_std',
 'feature_20__diff50',
 'feature_08__ewm50',
 'feature_25__ret50',
 'feature_61__diff50

In [36]:
# Output directory for the de-duplicated (de-correlated) column list.
keep_cols_root = "/mnt/data/js/exp/v1/tft/selected_features"
# NOTE(review): ensure_dir_local presumably creates the directory if missing — confirm in pipeline.io.
ensure_dir_local(keep_cols_root)


In [37]:
# Save the result: one-column CSV (`feature`) whose file name records tau and the date window.
_selected_csv = f"{keep_cols_root}/selected_features__decorr__tau{tau}__{data_start_date}-{data_end_date}.csv"
_selected_frame = pd.DataFrame({"feature": keep_cols})
_selected_frame.to_csv(_selected_csv, index=False)