选取一个因子新建小表，表中只保留股票代码、月份、排序因子和收益率这几列（后文实际运行的代码还额外保留了 size 列，用作加权的权重）

sub_table = predictor[['STKCD','TRDMNT',sort_factor,'ret']]

In [2]:
import os
import warnings

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy import stats

import func

warnings.filterwarnings("ignore")

# Notebook-wide settings.
chunk_size = 100000  # same value as read_csv's default chunk_size; not referenced elsewhere in the visible cells
op_path = 'output-standard'  # directory that the result CSVs are written into
ngroup = 10  # number of quantile portfolios formed on the sorting factor

def get_stat(ret_df, max_lag: int = None):
    """Summarize each return column: mean (in percent), t-statistic and p-value.

    Parameters
    ----------
    ret_df : pd.DataFrame
        One numeric column per portfolio, one row per period.
    max_lag : int, optional
        ``None`` (default) runs a plain one-sample t-test of each column
        against zero.  An integer switches to an intercept-only OLS per column
        with HAC standard errors using ``maxlags=max_lag``.

    Returns
    -------
    pd.DataFrame
        Three rows labelled ``'mean'``, ``'t'`` and ``'p'``; the columns are
        those of ``ret_df``.

    Raises
    ------
    AssertionError
        If ``max_lag`` is given but is not a plain ``int``.
    """
    inner_df = ret_df.copy()

    # Means are reported in percent; t and p are scale-free.
    ret_mean = inner_df.mean() * 100

    if max_lag is None:
        # Run the test once and read both outputs from the same result.
        test_result = stats.ttest_1samp(inner_df.to_numpy(dtype=float), 0)
        ret_t = pd.Series(test_result[0], index=ret_mean.index)
        ret_p = pd.Series(test_result[1], index=ret_mean.index)
    else:
        assert type(max_lag) is int, "input an integer max_lag"
        t_list = []
        p_list = []
        for _, column in inner_df.items():
            # Regress under a fixed name so that column labels which are not
            # valid formula identifiers cannot break the formula.
            reg = smf.ols("y ~ 1", data=pd.DataFrame({"y": column})).fit(
                cov_type='HAC', cov_kwds={'maxlags': max_lag})
            t_list.append(reg.tvalues['Intercept'])
            p_list.append(reg.pvalues['Intercept'])
        ret_t = pd.Series(t_list, index=ret_mean.index)
        ret_p = pd.Series(p_list, index=ret_mean.index)

    ret_mean.name = 'mean'
    ret_t.name = 't'
    ret_p.name = 'p'

    # Each named Series becomes one row of the summary table.
    stats_data = pd.DataFrame([ret_mean, ret_t, ret_p])
    return stats_data


def read_csv(csv_path, chunk_size=100000):
    """Read a CSV into a DataFrame, keeping ``STKCD`` as text, and print a preview.

    The file is first read with pandas' default encoding.  If that raises
    ``UnicodeDecodeError`` the same file is read again as GBK.  Both attempts
    force ``STKCD`` to ``str`` so leading zeros such as ``000002`` survive.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file.
    chunk_size : int, optional
        Accepted for backward compatibility; the file is read in one pass and
        this value is currently unused.

    Returns
    -------
    pd.DataFrame
        The parsed table.
    """
    read_kwargs = {"dtype": {"STKCD": str}}
    try:
        df_csv = pd.read_csv(csv_path, **read_kwargs)
    except UnicodeDecodeError:
        # The fallback re-reads the *requested* file with the same dtype
        # override, not a hard-coded path.
        print("默认编码失败，尝试 GBK...")
        df_csv = pd.read_csv(csv_path, encoding='gbk', **read_kwargs)

    # Quick visual check that the stock codes kept their leading zeros.
    if 'STKCD' in df_csv.columns:
        print(df_csv['STKCD'].head())

    print("\nCSV 数据预览：")
    print(df_csv.head())
    return df_csv

def match_df_flex(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    *,
    left_on: list[str],
    right_on: list[str],
    how: str = "left",
    validate: str = "many_to_one"
):
    """Merge ``df1`` with ``df2`` on differently named key columns, with checks.

    Before merging, the function verifies that the two key lists have the same
    length, that every key exists in its frame, and that ``df2`` has no
    duplicated rows on ``right_on``.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Left and right tables.
    left_on, right_on : list[str]
        Key columns in ``df1`` and ``df2``; matched position by position.
    how : str
        Join type forwarded to the merge.
    validate : str
        Cardinality check forwarded to the merge.

    Returns
    -------
    pd.DataFrame
        The merged table.

    Raises
    ------
    ValueError
        On mismatched key-list lengths, a missing key column, or duplicated
        keys in ``df2``.
    """
    if len(left_on) != len(right_on):
        raise ValueError("left_on 和 right_on 长度必须一致")

    # Report the first absent key of each side, left side first.
    absent_left = [name for name in left_on if name not in df1.columns]
    if absent_left:
        col = absent_left[0]
        raise ValueError(f"[df1] 缺少列: {col}")

    absent_right = [name for name in right_on if name not in df2.columns]
    if absent_right:
        col = absent_right[0]
        raise ValueError(f"[df2] 缺少列: {col}")

    # Guard the right table against duplicated keys.
    right_has_duplicates = df2.duplicated(subset=right_on).any()
    if right_has_duplicates:
        raise ValueError("df2 在 right_on 上存在重复键")

    return df1.merge(
        df2,
        how=how,
        left_on=left_on,
        right_on=right_on,
        validate=validate,
    )


def cleanBlank(df, sort1, sort2):
    """Return a sorted copy of ``df`` with every NaN-containing row removed.

    The copy is ordered by ``sort1`` then ``sort2``; any row holding at least
    one NaN is dropped, and the number of dropped rows is printed.  The input
    frame is not modified and the original index labels are kept.
    """
    ordered = df.copy().sort_values(by=[sort1, sort2])
    n_before = len(ordered)

    # A single missing value is enough to discard the row.
    complete_rows = ordered.dropna(axis=0)
    n_after = len(complete_rows)

    print(f"firstSort: 删除了 {n_before - n_after} 行（{n_before} → {n_after}）")

    return complete_rows


# ---------- 工具：任意格式 -> YYYYMM(Int64) ----------
def to_yyyymm(series: pd.Series) -> pd.Series:
    """Convert a column of month/date values to nullable integer ``YYYYMM``.

    Every value is stringified and stripped, then handled as follows:

    * exactly six digits (``"200001"``) is taken as ``YYYYMM`` as-is;
    * six digits followed by ``.0`` (``"200001.0"``, which is how a
      float-typed ``YYYYMM`` stringifies) is treated the same way instead of
      being handed to the date parser;
    * anything else is parsed with ``pd.to_datetime(errors="coerce")`` and
      turned into ``year * 100 + month``; unparseable values become ``<NA>``.

    Parameters
    ----------
    series : pd.Series
        Values in any of the formats above.

    Returns
    -------
    pd.Series
        ``Int64`` series on the same index as ``series``.
    """
    s = series.astype(str).str.strip()

    # Float-typed YYYYMM: keep only the six leading digits.
    float_like = s.str.fullmatch(r"\d{6}\.0+", na=False)
    s = s.where(~float_like, s.str.slice(0, 6))

    # Case A: already a 6-digit YYYYMM.
    mask6 = s.str.fullmatch(r"\d{6}", na=False)

    out = pd.Series([pd.NA] * len(s), index=s.index, dtype="Int64")

    if mask6.any():
        # to_numeric parses the digit strings explicitly before the nullable cast.
        out.loc[mask6] = pd.to_numeric(s.loc[mask6]).astype("Int64")

    # Case B: YYYY-MM / YYYY-MM-DD / datetime-like strings.
    rest = ~mask6
    if rest.any():
        dt = pd.to_datetime(s.loc[rest], errors="coerce")
        out.loc[rest] = (dt.dt.year * 100 + dt.dt.month).astype("Int64")

    return out

# ---------- 读取数据 ----------
# Load 'standard/df_standard.csv' and discard a stray 'Unnamed: 0' index column if one is present.
df_csv = read_csv('standard/df_standard.csv').drop(columns=['Unnamed: 0'], errors='ignore')
print(df_csv.head())

0    000002
1    000002
2    000002
3    000002
4    000002
Name: STKCD, dtype: str

CSV 数据预览：
    STKCD        date     mom6m    mom12m    mom36m     mom1m     chmom  \
0  000002  2000-01-31 -1.042180  0.165599 -0.470788  2.097052 -1.039057   
1  000002  2000-02-29 -0.210615  0.776803 -0.238338 -0.727790 -0.905743   
2  000002  2000-03-31 -0.269587  0.525622 -0.321605  0.146812 -0.712768   
3  000002  2000-04-30 -0.003622  0.349919 -0.172868  1.510853  0.146093   
4  000002  2000-05-31  0.913752  1.277721 -0.449543 -1.293463 -0.487082   

       turn       IPO    indmom  ...        Ol  AnA  ReA  Tan       INA  \
0  0.280377 -0.361158 -0.113961  ...  0.558387  NaN  NaN  NaN -0.693045   
1  0.708814 -0.380693 -0.113228  ...  0.558387  NaN  NaN  NaN -0.693045   
2  0.544172 -0.358569  0.113228  ...  0.558387  NaN  NaN  NaN -0.693045   
3  0.662261 -0.335673 -0.113228  ...  0.558387  NaN  NaN  NaN -0.693045   
4  0.528957 -0.309662 -0.112509  ...  0.558387  NaN  NaN  NaN -0.693045   

   

导入已经合并后的表格

## 因子列表

In [8]:
# Columns that are identifiers, dates, returns or classification labels
# rather than predictors.
non_factor_cols = [
    'Unnamed: 0',   # leftover index column
    'STKCD',        # stock code (entity identifier)
    'date',         # original date
    'TRDMNT',       # month (panel time index)
    'MRETWD',       # return including dividends (dependent variable)
    'MRETND',       # return excluding dividends (dependent variable)
    'MARKETTYPE',   # market classification label
    'INDCDZX',
]

# Everything else, in the file's column order, is treated as a factor.
factor_cols = df_csv.columns[~df_csv.columns.isin(non_factor_cols)].tolist()

print("因子列数量:", len(factor_cols))
print("前10个因子列:", factor_cols[:10])


因子列数量: 112
前10个因子列: ['mom6m', 'mom12m', 'mom36m', 'mom1m', 'chmom', 'turn', 'IPO', 'indmom', 'maxret', 'retvol']


In [5]:
# Rename 'MRETWD' to 'RET' on a new frame, then list the resulting columns.
df_csv_all = df_csv.rename(columns={'MRETWD': 'RET'})

print(list(df_csv_all.columns))


['STKCD', 'date', 'mom6m', 'mom12m', 'mom36m', 'mom1m', 'chmom', 'turn', 'IPO', 'indmom', 'maxret', 'retvol', 'std_dolvol', 'std_turn', 'ill', 'zerotrade', 'beta', 'betasq', 'pricedelay', 'idiovol', 'prc', 'size', 'volt', 'B_Mkt', 'B_Mktsq', 'B_Dim', 'B_Dn', 'B_FF', 'B_HS', 'B_LSY', 'ACC', 'PACC', 'age', 'ATO', 'BM', 'CAPXG', 'CFD', 'CFOA', 'CFP', 'CP', 'CR', 'CRG', 'CTA', 'CTO', 'dBe', 'DP', 'EBIT', 'EP', 'EY', 'GM', 'GP', 'IVC', 'IVG', 'IA', 'am', 'LG', 'DER', 'DLME', 'NOA', 'NPOP', 'dPIA', 'PY', 'QR', 'QRG', 'RNA', 'ROA', 'ROE', 'ROIC', 'SC', 'SI', 'SMI', 'SP', 'SG', 'TG', 'TBI', 'Z', 'CHTX', 'CINVEST', 'realestate', 'salerev', 'IC', 'DA', 'DPR', 'TOR', 'EPS', 'CEPS', 'NCF', 'GS', 'ROO', 'RFI', 'TA', 'AD', 'FM', 'FOE', 'IP', 'FVAD', 'NAPS', 'RSGL', 'TMT', 'CAC', 'IBV', 'RDS', 'RDM', 'RCA', 'VAHU', 'Hn', 'LFE', 'Adm', 'gAd', 'Ol', 'AnA', 'ReA', 'Tan', 'INA', 'TRDMNT', 'RET', 'MRETND', 'MARKETTYPE', 'INDCDZX']


In [6]:
# No filtering is applied here: df_csv_filtered is another name for the same object as df_csv_all (not a copy).
df_csv_filtered=df_csv_all

# 选取小表
任意选取一个指标作为排序因子；下面的代码取 `factor_cols[0]`（本次运行为 mom6m），也可以换成其他因子，如 “RDM”

In [7]:
# Sorting factor for the portfolio sort: the first entry of factor_cols.
sort_factor = factor_cols[0]

In [9]:
# Working table: identifiers, the sorting factor, the return and 'size'.
# dict.fromkeys drops a repeated name while keeping order, so choosing 'size' itself as
# sort_factor does not select the 'size' column twice; .copy() makes the table an
# independent frame rather than a slice of df_csv_filtered.
sub_table = df_csv_filtered[list(dict.fromkeys(['STKCD', 'TRDMNT', sort_factor, 'RET', 'size']))].copy()

In [10]:
# Preview the first rows of the working table.
print(sub_table.head())

    STKCD  TRDMNT     mom6m       RET      size
0  000002  200001 -1.042180  0.138941  4.658447
1  000002  200002 -0.210615  1.510853  5.256671
2  000002  200003 -0.269587 -1.293463  6.162150
3  000002  200004 -0.003622 -0.317116  5.608948
4  000002  200005  0.913752 -0.822494  5.151797


## 清理缺失值（删除含 NaN 的行）

In [11]:
# Sort by month then stock code and drop every row that contains a NaN.
df_clean=cleanBlank(sub_table, 'TRDMNT','STKCD')
print(df_clean.head())

firstSort: 删除了 26331 行（698913 → 672582）
      STKCD  TRDMNT     mom6m       RET      size
0    000002  200001 -1.042180  0.138941  4.658447
294  000003  200001 -0.138370 -0.535968  2.025178
318  000004  200001 -0.129089  1.193884 -0.623744
609  000005  200001 -1.519921  0.478978  1.900671
885  000006  200001 -1.384669  0.142376  1.482002


# 运行代码

In [12]:
#========================================================
#                   第一步、分组
#========================================================

def GroupN(in_df, sort_var, vars, n_group=10):
    """Assign each row to one of ``n_group`` quantile groups within ``sort_var``.

    Within every ``sort_var`` group (e.g. each month) the values of column
    ``vars`` are cut into ``n_group`` quantile bins labelled ``1 .. n_group``
    (1 = lowest values).  The labels are stored in a new integer column named
    ``f"{vars}_g{n_group}"``.

    When ``pd.qcut`` cannot form distinct bin edges (many tied values, or very
    few rows), the group is binned by rank instead, ties broken by order of
    appearance, so the call still succeeds.

    Parameters
    ----------
    in_df : pd.DataFrame
        Input table; it is not modified.
    sort_var : str or list of str
        Column(s) defining the cross-sections.
    vars : str
        Column to sort on.
    n_group : int
        Number of quantile groups.

    Returns
    -------
    pd.DataFrame
        Copy of ``in_df`` with the extra group-label column.

    Raises
    ------
    ValueError
        If column ``vars`` contains missing values.
    """
    out_df = in_df.copy()
    group_col = f"{vars}_g{n_group}"

    if out_df[vars].isna().any():
        raise ValueError(
            f"column {vars!r} contains NaN; drop or fill missing values before grouping")

    def _label(x):
        try:
            # Zero-based bin codes, shifted to 1..n_group.
            return pd.qcut(x, q=n_group, labels=False) + 1
        except ValueError:
            # Non-unique bin edges: fall back to near-equal-sized rank buckets.
            ranks = x.rank(method="first")
            return np.ceil(ranks * n_group / len(x)).clip(lower=1).astype(int)

    out_df[group_col] = out_df.groupby(sort_var)[vars].transform(_label)
    out_df[group_col] = out_df[group_col].astype(int)
    return out_df

# Within each month (TRDMNT), split stocks into `ngroup` groups on the sorting factor.
sub_table_groupped = GroupN(df_clean, 'TRDMNT', sort_factor , n_group = ngroup)

In [13]:
#========================================================
#                   第二步、缩尾处理
#          对除了keep_cols以外的所有因子进行缩尾处理
#========================================================

class Winsorize:
    """Winsorize or trim one column within groups.

    Parameters
    ----------
    in_df : pd.DataFrame
        Input table; it is never modified.
    sort_var : str or list of str
        Column(s) defining the groups inside which percentiles are computed.
    vars : str
        Name of the column to process.
    perc : float
        Tail size in percent: values below the ``perc``-th or above the
        ``(100 - perc)``-th percentile of their group are treated.
    trim : int
        ``0`` clips outliers to the percentile bounds (winsorize); ``1``
        replaces them with NaN (trim).
    """

    def __init__(self, in_df, sort_var, vars, perc=1, trim=0) -> None:
        self.in_df = in_df
        self.sort_var = sort_var
        self.vars = vars
        self.perc = perc
        self.trim = trim

    def func_trim(self, in_ser, perc):
        """Return a copy of ``in_ser`` with values outside the bounds set to NaN."""
        perc_upper = (100 - perc) / 100
        perc_lower = perc / 100

        qt_lower, qt_upper = in_ser.quantile([perc_lower, perc_upper])
        # mask() builds a new Series, so the caller's data is left untouched.
        return in_ser.mask((in_ser > qt_upper) | (in_ser < qt_lower))

    def func_winsor(self, in_ser, perc):
        """Return a copy of ``in_ser`` clipped to its percentile bounds."""
        perc_upper = (100 - perc) / 100
        perc_lower = perc / 100
        qt_lower, qt_upper = in_ser.quantile([perc_lower, perc_upper])

        return in_ser.clip(lower=qt_lower, upper=qt_upper)

    def get(self, ):
        """Apply the chosen treatment group by group and return a new frame.

        Raises
        ------
        ValueError
            If ``trim`` is neither 0 nor 1, or ``perc`` is outside ``[0, 50]``.
        """
        if self.trim == 1:
            proc_method = self.func_trim
        elif self.trim == 0:
            proc_method = self.func_winsor
        else:
            raise ValueError(f"trim must be 0 (winsorize) or 1 (trim), got {self.trim!r}")

        if not 0 <= self.perc <= 50:
            raise ValueError(f"perc must be between 0 and 50, got {self.perc!r}")

        out_df = self.in_df.copy()
        # Use the configured tail size rather than a fixed 1%.
        perc = self.perc
        out_df[f"{self.vars}"] = out_df.groupby(
            self.sort_var)[self.vars].transform(lambda x: proc_method(x, perc))

        return out_df

In [14]:
# Winsorize 'size' twice with the class defaults: first within each month,
# then within each (month, factor group) cell.
for winsor_keys in ("TRDMNT", ["TRDMNT", f"{sort_factor}_g{ngroup}"]):
    winsor = Winsorize(sub_table_groupped, winsor_keys, 'size')
    sub_table_groupped = winsor.get()

In [15]:

#========================================================
#                   第三步、计算EW VW
#           EW=同一个月 (TRDMNT)
#           同一个因子分组（比如 AM_g10 的第 3 组）
#           把这一组里所有股票的收益率 ret 简单平均。
#           VW=同一个月
#           同一个因子组
#           用上一期市值 size 当权重，对收益率加权平均
#
#       在这里的收益率指的是RET [考虑现金红利再投资的月个股回报率]
#       从这里开始就只有一个了，上面都是对全部的因子进行循环处理
#=======================================================

In [16]:
# Preview the grouped table after the winsorization of 'size'.
print(sub_table_groupped.head(5))

      STKCD  TRDMNT     mom6m       RET      size  mom6m_g10
0    000002  200001 -1.042180  0.138941  3.223238          2
294  000003  200001 -0.138370 -0.535968  2.025178          6
318  000004  200001 -0.129089  1.193884 -0.623744          6
609  000005  200001 -1.519921  0.478978  1.900671          1
885  000006  200001 -1.384669  0.142376  1.482002          1


In [17]:
import statsmodels.formula.api as smf

def get_stat(ret_df, max_lag: int = None):
    """Summarize each return column: mean (in percent), t-statistic and p-value.

    Parameters
    ----------
    ret_df : pd.DataFrame
        One numeric column per portfolio, one row per period.
    max_lag : int, optional
        ``None`` (default) runs a plain one-sample t-test of each column
        against zero.  An integer switches to an intercept-only OLS per column
        with HAC standard errors using ``maxlags=max_lag``.

    Returns
    -------
    pd.DataFrame
        Three rows labelled ``'mean'``, ``'t'`` and ``'p'``; the columns are
        those of ``ret_df``.

    Raises
    ------
    AssertionError
        If ``max_lag`` is given but is not a plain ``int``.
    """
    inner_df = ret_df.copy()

    # Means are reported in percent; t and p are scale-free.
    ret_mean = inner_df.mean() * 100

    if max_lag is None:
        # Run the test once and read both outputs from the same result.
        test_result = stats.ttest_1samp(inner_df.to_numpy(dtype=float), 0)
        ret_t = pd.Series(test_result[0], index=ret_mean.index)
        ret_p = pd.Series(test_result[1], index=ret_mean.index)
    else:
        assert type(max_lag) is int, "input an integer max_lag"
        t_list = []
        p_list = []
        for _, column in inner_df.items():
            # Regress under a fixed name so that column labels which are not
            # valid formula identifiers cannot break the formula.
            reg = smf.ols("y ~ 1", data=pd.DataFrame({"y": column})).fit(
                cov_type='HAC', cov_kwds={'maxlags': max_lag})
            t_list.append(reg.tvalues['Intercept'])
            p_list.append(reg.pvalues['Intercept'])
        ret_t = pd.Series(t_list, index=ret_mean.index)
        ret_p = pd.Series(p_list, index=ret_mean.index)

    ret_mean.name = 'mean'
    ret_t.name = 't'
    ret_p.name = 'p'

    # Each named Series becomes one row of the summary table.
    stats_data = pd.DataFrame([ret_mean, ret_t, ret_p])
    return stats_data

In [18]:
in_ret = sub_table_groupped.copy(deep=True)
print(in_ret.head(5))

# Every aggregation below is per (month, factor group).
group_col = f"{sort_factor}_g{ngroup}"
group_keys = ['TRDMNT', group_col]

# NOTE(review): 'size' is used as the value weight. The rows printed in this notebook
# contain negative 'size' values, so the column looks standardized rather than a raw
# market cap; non-positive weights make the weighted average hard to interpret.
# Confirm which 'size' should be used for weighting.
n_nonpositive_weights = int((in_ret['size'] <= 0).sum())
if n_nonpositive_weights:
    print(f"[warning] {n_nonpositive_weights} rows have non-positive 'size' weights; "
          "value-weighted returns may not be meaningful")

# Make sure the output directory exists before any CSV is written.
os.makedirs(op_path, exist_ok=True)

# Equal-weighted and 'size'-weighted mean return of every portfolio.
ew_ret = in_ret.groupby(group_keys)['RET'].mean()
vw_ret = (
    in_ret.groupby(group_keys)
          .apply(lambda g: np.average(g['RET'], weights=g['size']))
)
vw_ret.name = "Vw_ret"

ew_mean = ew_ret.copy(deep=True)
ew_mean.name = 'Ew_ret'

vw_mean = vw_ret.copy(deep=True)
vw_mean.name = 'Vw_ret'

# Number of stocks and average factor value in every portfolio.
month_count = in_ret.groupby(group_keys)['RET'].count()
month_count.name = 'Count'

sort_factor_mean = in_ret.groupby(group_keys)[sort_factor].mean()

month_result = pd.concat(
    [month_count, sort_factor_mean, ew_mean, vw_mean],
    axis=1,
    ignore_index=False
)

# Wide layout: one row per month, one column per group. Reindexing on the expected
# labels 1..ngroup keeps the col_<k> names attached to the right group even if some
# group never occurs (that column is then all-NaN).
ew_ret = ew_ret.unstack().reindex(columns=range(1, ngroup + 1))
vw_ret = vw_ret.unstack().reindex(columns=range(1, ngroup + 1))

ew_ret.columns = [f"col_{i+1}" for i in range(ngroup)]
vw_ret.columns = [f"col_{i+1}" for i in range(ngroup)]

# Long-short spread: highest group minus lowest group.
ew_ret['high_low'] = ew_ret[f"col_{ngroup}"] - ew_ret["col_1"]
vw_ret['high_low'] = vw_ret[f"col_{ngroup}"] - vw_ret["col_1"]

# Reshape the spread back to long form so it can be appended to month_result.
ew_other = ew_ret.loc[:, ['high_low']]
ew_other = ew_other.stack()
ew_other.name = 'Ew_ret'

vw_other = vw_ret.loc[:, ['high_low']]
vw_other = vw_other.stack()
vw_other.name = 'Vw_ret'

other = pd.concat([ew_other, vw_other], axis=1, ignore_index=False)
other = other.reset_index()
# The stacked column level has no name, so reset_index() calls it 'level_1'.
other = other.rename(columns={'level_1': group_col})
other = other.set_index(group_keys)

month_result = pd.concat([month_result, other], axis=0, ignore_index=False)
month_result = month_result.sort_index()

month_result.to_csv(os.path.join(op_path, f"{sort_factor}_month_result.csv"))

# If the index is a PeriodIndex, convert it to end-of-period timestamps.
if isinstance(ew_ret.index, pd.PeriodIndex):
    ew_ret.index = ew_ret.index.to_timestamp(how="end")

if isinstance(vw_ret.index, pd.PeriodIndex):
    vw_ret.index = vw_ret.index.to_timestamp(how="end")

# Time-series mean, t and p of every portfolio and of the spread (HAC, 3 lags).
ew_stat = get_stat(ew_ret, max_lag = 3)
vw_stat = get_stat(vw_ret, max_lag = 3)

ew_stat.to_csv(os.path.join(op_path, f"{sort_factor}_ew_result.csv"))
vw_stat.to_csv(os.path.join(op_path, f"{sort_factor}_vw_result.csv"))


      STKCD  TRDMNT     mom6m       RET      size  mom6m_g10
0    000002  200001 -1.042180  0.138941  3.223238          2
294  000003  200001 -0.138370 -0.535968  2.025178          6
318  000004  200001 -0.129089  1.193884 -0.623744          6
609  000005  200001 -1.519921  0.478978  1.900671          1
885  000006  200001 -1.384669  0.142376  1.482002          1
