In [1]:
import os
import pandas as pd

def merge_one_factor_ew_vw_base_std(
    sort_factor: str,
    base_dir: str,
    std_dir: str,
    out_dir: str,
    out_suffix: str = "_EWVW_base_vs_std",
    save: bool = True,
):
    """
    Merge the four result tables of one factor into a single wide table.

    The base and standardized tables may live in different directories:
      base_dir: {factor}_ew_result.csv,   {factor}_vw_result.csv
      std_dir : {factor}_ew_result_s.csv, {factor}_vw_result_s.csv
    The merged table is written to out_dir as {factor}{out_suffix}.csv.

    Parameters
    ----------
    sort_factor : str
        Factor name used as the file-name prefix.
    base_dir : str
        Directory holding the base EW/VW result files.
    std_dir : str
        Directory holding the standardized ("_s") EW/VW result files.
    out_dir : str
        Output directory; created if it does not exist.
    out_suffix : str
        Suffix appended to the factor name in the output file name.
    save : bool
        When True, write the merged table to out_dir.

    Returns
    -------
    (merged, missing) : tuple
        merged  : the merged DataFrame, or None when any input file is absent.
        missing : labels of the absent inputs, drawn from
                  "ew_base", "vw_base", "ew_std", "vw_std"; empty on success.
    """

    os.makedirs(out_dir, exist_ok=True)

    paths = {
        "ew_base": os.path.join(base_dir, f"{sort_factor}_ew_result.csv"),
        "vw_base": os.path.join(base_dir, f"{sort_factor}_vw_result.csv"),
        "ew_std":  os.path.join(std_dir,  f"{sort_factor}_ew_result_s.csv"),
        "vw_std":  os.path.join(std_dir,  f"{sort_factor}_vw_result_s.csv"),
    }

    missing = [label for label, path in paths.items() if not os.path.exists(path)]
    if missing:
        return None, missing

    # ---------- Load and tag ----------
    # Each input's columns get a suffix recording the weighting scheme (EW/VW)
    # and whether the table is the standardized variant ("_s"). The dict order
    # is the order in which the frames are concatenated.
    tags = {"ew_base": "EW", "ew_std": "EW_s", "vw_base": "VW", "vw_std": "VW_s"}
    frames = [
        pd.read_csv(paths[label], index_col=0).add_suffix(f"_{tag}")
        for label, tag in tags.items()
    ]

    merged = pd.concat(frames, axis=1)

    # ---------- Column ordering: keep the variants of one portfolio adjacent ----------
    # Target: col_1_EW, col_1_EW_s, col_1_VW, col_1_VW_s, col_2_EW, ...
    weight_suffixes = ("_EW", "_EW_s", "_VW", "_VW_s")

    def parse_col(c):
        # Sort key (combo_order, weight_order, name), e.g. for
        # "col_1_EW_s" / "high_low_VW": numbered col_N portfolios come first
        # in numeric order, then high_low, then everything else.
        combo_order = 1000
        if c.startswith("col_"):
            try:
                combo_order = int(c.split("_")[1])
            except ValueError:
                # A "col_" column without a numeric index (e.g. "col_x") is
                # grouped with the "other" columns instead of aborting the merge.
                combo_order = 1000
        elif c.startswith("high_low"):
            combo_order = 999

        # 9 marks a column carrying none of the known weighting suffixes.
        weight_order = 9
        for rank, suffix in enumerate(weight_suffixes):
            if c.endswith(suffix):
                weight_order = rank
                break

        return (combo_order, weight_order, c)

    merged = merged[sorted(merged.columns, key=parse_col)]

    if save:
        out_path = os.path.join(out_dir, f"{sort_factor}{out_suffix}.csv")
        merged.to_csv(out_path)

    return merged, []


In [2]:
import os
import pandas as pd

def merge_all_factors_ew_vw_base_std(
    base_dir: str,
    std_dir: str,
    out_dir: str,
    out_suffix: str = "_EWVW_base_vs_std",
    save_merged: bool = True,
    save_reports: bool = True,
):
    """
    Merge the EW/VW base and standardized result tables of every factor found.

    Factor names are discovered from the file names in base_dir
    (*_ew_result.csv, *_vw_result.csv) and std_dir (*_ew_result_s.csv,
    *_vw_result_s.csv); each one is passed to merge_one_factor_ew_vw_base_std.

    Parameters
    ----------
    base_dir : str
        Directory holding the base result files.
    std_dir : str
        Directory holding the standardized result files.
    out_dir : str
        Output directory for merged tables and reports; created if needed.
    out_suffix : str
        Suffix of each merged file name, forwarded to the per-factor merge.
    save_merged : bool
        When True, each merged table is written to out_dir.
    save_reports : bool
        When True, write merge_missing_report.csv and
        factors_only_in_one_folder.csv to out_dir.

    Returns
    -------
    dict
        "merged_ok_factors"  : list of factor names merged successfully.
        "missing_report"     : DataFrame with one row per factor that lacked
                               at least one input file.
        "only_in_one_folder" : DataFrame listing the factors found in only one
                               of the two directories.
    """
    os.makedirs(out_dir, exist_ok=True)

    def list_factors(folder, ew_pat, vw_pat):
        # Collect the factor names of all files ending in either pattern.
        factors = set()
        for fn in os.listdir(folder):
            for pat in (ew_pat, vw_pat):
                if fn.endswith(pat):
                    # Strip only the trailing pattern; str.replace would also
                    # delete an earlier occurrence inside the factor name.
                    factors.add(fn[: -len(pat)])
        return factors

    base_factors = list_factors(base_dir, "_ew_result.csv", "_vw_result.csv")
    std_factors  = list_factors(std_dir,  "_ew_result_s.csv", "_vw_result_s.csv")

    all_factors = sorted(base_factors | std_factors)

    missing_rows = []
    merged_ok = []

    for f in all_factors:
        merged, missing = merge_one_factor_ew_vw_base_std(
            sort_factor=f,
            base_dir=base_dir,
            std_dir=std_dir,
            out_dir=out_dir,
            out_suffix=out_suffix,
            save=save_merged,
        )
        if missing:
            missing_rows.append({
                "sort_factor": f,
                "exists_in_base_dir": f in base_factors,
                "exists_in_std_dir":  f in std_factors,
                "missing_parts": ",".join(missing),
            })
        else:
            merged_ok.append(f)

    # Explicit columns keep the report schema (and the CSV header) intact even
    # when no factor is missing anything.
    report_columns = [
        "sort_factor",
        "exists_in_base_dir",
        "exists_in_std_dir",
        "missing_parts",
    ]
    missing_df = pd.DataFrame(missing_rows, columns=report_columns)

    # Factors that show up in only one of the two directories.
    only_in_base = sorted(base_factors - std_factors)
    only_in_std  = sorted(std_factors - base_factors)
    one_folder_df = pd.DataFrame({
        "only_in_base_dir": pd.Series(only_in_base, dtype="object"),
        "only_in_std_dir":  pd.Series(only_in_std, dtype="object"),
    })

    if save_reports:
        missing_df.to_csv(os.path.join(out_dir, "merge_missing_report.csv"), index=False)
        one_folder_df.to_csv(os.path.join(out_dir, "factors_only_in_one_folder.csv"), index=False)

    return {
        "merged_ok_factors": merged_ok,
        "missing_report": missing_df,
        "only_in_one_folder": one_folder_df,
    }


In [3]:
# Base and standardized results share one folder in this run; the "_s" file-name
# suffix is what separates the standardized files from the base ones.
result = merge_all_factors_ew_vw_base_std(
    "test4output",   # base_dir: folder holding the base results
    "test4output",   # std_dir: folder holding the standardized results
    "test4merge",    # out_dir: where merged tables and reports are written
    save_reports=True,
    save_merged=True,
)

# Summary: number of factors merged successfully, then number with missing files.
print("合并成功因子数：", len(result["merged_ok_factors"]))
print("缺文件的因子数：", len(result["missing_report"]))


合并成功因子数： 87
缺文件的因子数： 22
