In [9]:
import rqdatac
import pandas as pd
import numpy as np
import re # 导入正则表达式库

def _parse_window_days(factor_name: str) -> int:
    """Return the look-back window encoded as a trailing ``<N>d`` in ``factor_name``.

    Raises:
        ValueError: if the name does not end in ``<digits>d`` or the window is
            smaller than one day.
    """
    match = re.search(r'(\d+)d$', factor_name)
    if not match:
        raise ValueError(f"无法从因子名称解析窗口期: {factor_name}")
    n = int(match.group(1))
    if n < 1:
        # A zero-length window is meaningless for pct_change / rolling.
        raise ValueError(f"窗口期必须为正整数: {factor_name}")
    return n


def generate_micro_factors(all_instruments: pd.DataFrame, start_date: str, end_date: str, factor_name: str) -> pd.DataFrame:
    """
    Compute one price/volume ("micro") factor for every instrument in a pool.

    Only the requested factor is computed. Supported names:

    * ``momentum_<N>d``   - N-day percentage change of ``close``.
    * ``volatility_<N>d`` - rolling N-day std of daily returns, scaled by sqrt(252).
    * ``avg_volume_<N>d`` - rolling N-day mean of ``volume``.
    * ``volume_shock``    - 20-day mean volume / 60-day mean volume - 1.
    * ``rsi_14d``         - RSI built from EWM-smoothed (com=13) gains and losses.
    * ``macd``            - (EMA12 - EMA26) minus its 9-span EMA.

    Every rolling / exponentially weighted statistic is computed separately
    per ``order_book_id`` so that values never leak from one instrument into
    the next.

    Args:
        all_instruments: frame with an ``order_book_id`` column listing the pool.
        start_date: start of the sample, passed unchanged to ``rqdatac.get_price``.
        end_date: end of the sample, passed unchanged to ``rqdatac.get_price``.
        factor_name: one of the names listed above.

    Returns:
        DataFrame with columns ``order_book_id``, ``date`` and ``beta`` (the
        factor value, renamed for the downstream Fama-MacBeth step). Rows whose
        factor value is NaN or +/-inf are dropped.

    Raises:
        ValueError: if ``factor_name`` is unknown or its window cannot be
            parsed. This is checked before any data is downloaded.
    """
    # --- Validate the request first so a typo does not cost a full download ---
    windowed_prefixes = ('momentum_', 'volatility_', 'avg_volume_')
    fixed_names = ('volume_shock', 'rsi_14d', 'macd')
    if factor_name.startswith(windowed_prefixes):
        n = _parse_window_days(factor_name)
    elif factor_name in fixed_names:
        n = None
    else:
        raise ValueError(f"未知的因子名称: {factor_name}")

    print(f"开始为因子 '{factor_name}' 获取基础量价数据...")
    df = rqdatac.get_price(
        all_instruments['order_book_id'].tolist(),
        start_date, end_date, frequency='1d', fields=['open', 'high', 'low', 'close', 'volume']
    ).reset_index().set_index(['order_book_id', 'date'])

    print("数据获取完毕，开始计算因子...")

    def _ungroup(grouped_result: pd.Series) -> pd.Series:
        # groupby(...).rolling/ewm prepends the group key as an extra index
        # level; drop it to get back to the (order_book_id, date) index of df.
        return grouped_result.reset_index(level=0, drop=True)

    # Momentum
    if factor_name.startswith('momentum_'):
        print(f"正在计算动量因子: {factor_name}...")
        factor = df.groupby(level=0)['close'].pct_change(n)

    # Volatility
    elif factor_name.startswith('volatility_'):
        print(f"正在计算波动率因子: {factor_name}...")
        # Daily returns are only needed here, so they are not computed for other factors.
        daily_return = df.groupby(level=0)['close'].pct_change()
        # min(2, n) keeps the original "at least 2 observations" rule while
        # staying valid for a 1-day window (min_periods must be <= window).
        rolling_std = daily_return.groupby(level=0).rolling(window=n, min_periods=min(2, n)).std()
        factor = _ungroup(rolling_std * np.sqrt(252))

    # Average volume
    elif factor_name.startswith('avg_volume_'):
        print(f"正在计算流动性因子: {factor_name}...")
        rolling_mean = df.groupby(level=0)['volume'].rolling(window=n, min_periods=min(2, n)).mean()
        factor = _ungroup(rolling_mean)

    # Volume shock
    elif factor_name == 'volume_shock':
        print("正在计算流动性因子: volume_shock...")
        volume_by_bond = df.groupby(level=0)['volume']
        avg_vol_20d = _ungroup(volume_by_bond.rolling(window=20, min_periods=2).mean())
        avg_vol_60d = _ungroup(volume_by_bond.rolling(window=60, min_periods=2).mean())
        factor = avg_vol_20d / avg_vol_60d - 1

    # RSI
    elif factor_name == 'rsi_14d':
        print("正在计算技术指标: RSI...")
        delta = df.groupby(level=0)['close'].diff()
        # The smoothing must be grouped per instrument: an ungrouped ewm runs
        # straight across instrument boundaries and contaminates each bond's
        # RSI with the previous bond's history.
        gain = _ungroup(delta.clip(lower=0).groupby(level=0).ewm(com=13, adjust=False).mean())
        loss = _ungroup((-delta).clip(lower=0).groupby(level=0).ewm(com=13, adjust=False).mean())
        rs = gain / loss
        # loss == 0 gives rs == inf and therefore RSI == 100, which is the intended limit.
        factor = 100 - (100 / (1 + rs))

    # MACD
    else:
        print("正在计算技术指标: MACD...")
        close_by_bond = df.groupby(level=0)['close']
        ewm_12d = _ungroup(close_by_bond.ewm(span=12, adjust=False).mean())
        ewm_26d = _ungroup(close_by_bond.ewm(span=26, adjust=False).mean())
        macd_line = ewm_12d - ewm_26d
        signal_line = _ungroup(macd_line.groupby(level=0).ewm(span=9, adjust=False).mean())
        factor = macd_line - signal_line

    # Assigning through df aligns the factor on the (order_book_id, date) index
    # and keeps the row order of the downloaded data.
    df[factor_name] = factor

    # --- Clean up and return ---
    print(f"因子 {factor_name} 计算完成。")

    # 1. Keep only the factor column; treat +/-inf (e.g. division by a zero
    #    average volume) like NaN so it cannot reach the regression step.
    final_df = df[[factor_name]].replace([np.inf, -np.inf], np.nan).dropna()
    # 2. Rename the column to 'beta' for the downstream Fama-MacBeth regression.
    final_df = final_df.rename(columns={factor_name: 'beta'})
    # 3. Turn the ('order_book_id', 'date') index back into ordinary columns.
    final_df = final_df.reset_index()

    return final_df


In [10]:
if __name__ == '__main__':
    # Driver cell: build the bond pool and compute a single micro factor for it.
    # Imports stay in their original order; presumably a_general_factor_test is
    # a sibling notebook made importable by import_ipynb - TODO confirm.
    import rqdatac
    import pandas as pd
    import statsmodels.api as sm
    from linearmodels.panel import FamaMacBeth
    import csv
    import numpy as np
    import import_ipynb
    import a_general_factor_test

    rqdatac.init()

    # Sample period.
    start_date, end_date = pd.to_datetime('2018-01-01'), pd.to_datetime('2020-12-31')

    # Universe / experiment settings.
    instrument_type = 'Convertible'
    # NOTE(review): this value is overwritten by the factor list further down
    # before it is ever read.
    factor_name = '第二产业增加值占GDP比重(现价)'
    remaining_time_to_mature = '短期 (1-3年)'
    rolling_window = 90

    # Every factor name accepted by generate_micro_factors, grouped by family.
    factor_name = [
        'momentum_5d', 'momentum_10d', 'momentum_20d', 'momentum_60d', 'momentum_90d',
        'volatility_20d', 'volatility_60d', 'volatility_90d',
        'avg_volume_20d', 'avg_volume_60d', 'avg_volume_90d',
        'volume_shock',
        'rsi_14d',
        'macd',
    ]

    all_instruments = a_general_factor_test.get_bonds_poll(start_date, end_date, instrument_type)

    # Index 1 selects 'momentum_10d'.
    rolling_micro_factors = generate_micro_factors(
        all_instruments,
        start_date,
        end_date,
        factor_name[1],
    )
    print(rolling_micro_factors)



testmodel2
可转债数量: 449
开始为因子 'momentum_10d' 获取基础量价数据...
数据获取完毕，开始计算因子...
正在计算动量因子: momentum_10d...
因子 momentum_10d 计算完成。
       order_book_id       date      beta
0        110030.XSHG 2018-01-16  0.053131
1        110030.XSHG 2018-01-17  0.046179
2        110030.XSHG 2018-01-18  0.045455
3        110030.XSHG 2018-01-19  0.005945
4        110030.XSHG 2018-01-22  0.001458
...              ...        ...       ...
130029   132022.XSHG 2020-12-25 -0.007113
130030   132022.XSHG 2020-12-28 -0.009901
130031   132022.XSHG 2020-12-29  0.011765
130032   132022.XSHG 2020-12-30  0.008317
130033   132022.XSHG 2020-12-31  0.000000

[130034 rows x 3 columns]
