In [None]:
# -*- coding: utf-8 -*-
from __future__ import annotations

# ── 标准库 ──────────────────────────────────────────────────────────────────
import os
import time
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import re

# ── 第三方 ──────────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
import polars as pl

from pipeline.io import cfg, fs, storage_options

import time as _t

def _now() -> str:
    return _t.strftime("%Y-%m-%d %H:%M:%S")

import warnings
# Install a catch-all "ignore" filter so warning messages (which can embed
# absolute file paths) are not printed into the notebook output.
warnings.simplefilter("ignore")
print(f"[{_now()}] imports ok")

In [None]:
# Load the feature-importance ranking and keep only the features whose
# mean_gain is strictly positive.
df_ranking_features = (
    pd.read_csv("/mnt/data/js/exp/v1/models/tune/feature_importance__fixed__fixed__mm_full_train__features__fs__1400-1698__cv2-g7-r4__seed42__top1000__1760906660__range830-1698__range830-1698__cv2-g7-r4__1760912739.csv")
    .loc[lambda ranked: ranked["mean_gain"] > 0]
    .copy()
)

# Column roles, all taken from the shared pipeline config.
G_SYM, G_DATE, G_TIME = cfg["keys"]  # e.g. ("symbol_id", "date_id", "time_id")
TARGET_COL = cfg["target"]  # e.g. "responder_6"
WEIGHT_COL = cfg["weight"]  # may be None

# Time-derived features are listed separately; every other ranked feature is
# treated as a covariate.
TIME_FEATURES = ["time_bucket", "time_pos", "time_sin", "time_cos"]
COV_FEATURES = [
    feature_name
    for feature_name in df_ranking_features["feature"].tolist()
    if feature_name not in TIME_FEATURES
]

# Date window used downstream. The original note calls this the "full" period.
# NOTE(review): the ranking file name mentions range 830-1698 - confirm that
# 1200-1600 is the intended window.
start_date = 1200
end_date = 1600


In [None]:
# Discover the panel shards. Each globbed path is prefixed with "az://" before
# being handed to polars (presumably fs.glob returns paths without the scheme -
# verify against the filesystem implementation).
data_paths = [
    f"az://{shard}"
    for shard in fs.glob("az://jackson/js_exp/exp/v1/panel_shards/*.parquet")
]
if not data_paths:
    # Fail early with a clear message instead of a less obvious scan error.
    raise FileNotFoundError(
        "no parquet shards matched az://jackson/js_exp/exp/v1/panel_shards/*.parquet"
    )

# WEIGHT_COL is allowed to be None; a None entry must not reach .select(),
# where polars would treat it as a literal rather than skip it. Duplicates are
# removed (order preserved) so an overlapping name cannot break the selection.
selected_cols = list(dict.fromkeys(
    col
    for col in [*cfg["keys"], WEIGHT_COL, TARGET_COL, *TIME_FEATURES, *COV_FEATURES]
    if col is not None
))

# Lazily scan the shards, keep the needed columns, and restrict to the
# inclusive [start_date, end_date] window on the date key.
lf_filtered = (
    pl.scan_parquet(data_paths, storage_options=storage_options)
    .select(selected_cols)
    .filter(pl.col(G_DATE).is_between(start_date, end_date, closed="both"))
)
lf_data = lf_filtered.sort([G_SYM, G_DATE, G_TIME])

# Row count of the filtered panel. Counted on the unsorted frame because the
# ordering cannot change the number of rows and the sort is expensive.
n = int(lf_filtered.select(pl.len()).collect(streaming=True)[0, 0])


In [None]:
# ---- Inspect missing values per covariate feature, computed with polars ----
# The aggregation yields a single row with one null count per feature;
# transposing turns that into one row per feature.
df = (
    lf_data
    .select([pl.col(feature).null_count().alias(feature) for feature in COV_FEATURES])
    .collect(streaming=True)
    .to_pandas()
    .T
    .rename_axis("feature")
)
# After the transpose the only column holds the counts; give it a real name and
# move the feature names from the index into a regular column.
df = df.rename(columns={df.columns[0]: "null_count"}).reset_index()
df.head()


In [None]:
# Rank features from most to fewest nulls. The previous row labels are kept in
# a "sorted_index" column and the frame gets a fresh 0..n-1 index.
df_sort_null = (
    df.sort_values(by="null_count", ascending=False)
    .rename_axis("sorted_index")
    .reset_index()
)
df_sort_null.head(50)

In [None]:
# Rows labelled 0..37 of the null-sorted table are dropped; everything else in
# COV_FEATURES is kept, in its original order.
# NOTE(review): 37 is a hand-picked cut-off - confirm it still matches the
# null-count table if the data or feature list changes.
drop_cols = df_sort_null.loc[df_sort_null.index <= 37, "feature"].tolist()
dropped_lookup = set(drop_cols)
keep_cov_cols = [name for name in COV_FEATURES if name not in dropped_lookup]

path = Path("/mnt/data/js/exp/v1/models/tune/selected_covariant_features.txt")
path.parent.mkdir(parents=True, exist_ok=True)

# Save the kept feature names, one per line (no trailing newline).
path.write_text("\n".join(keep_cov_cols), encoding="utf-8")


In [None]:
# Read the saved covariate feature names back, one name per line.
path = Path("/mnt/data/js/exp/v1/models/tune/selected_covariant_features.txt")
with path.open(mode="r", encoding="utf-8") as handle:
    cov_cols = handle.read().splitlines()