In [23]:
import sys
sys.path.append('/public/src')
from factor_evaluation_server import FactorEvaluation, DataService  # type: ignore
import numpy as np
import pandas as pd
from scipy.ndimage import uniform_filter1d
import warnings
warnings.filterwarnings('ignore')

In [24]:
ds=DataService()
df=ds['ETHUSDT_15m_2020_2025']['2021-10-01':]

In [25]:
evaluator=FactorEvaluation(df=df,future_return_periods=10)

# 定义因子！

In [26]:
def factor(df):
    # 计算主动买入量占比
    df['buy_ratio'] = df['taker_buy_volume'] / (df['volume'] + 1e-7)
    
    # === 核心改进1: 尾部市场压力指数 ===
    # 1. 价格效率指标 (收盘价相对于高低点位置)
    df['price_efficiency'] = (df['close'] - df['low']) / (df['high'] - df['low'] + 1e-7)
    
    # 2. 成交量压力指标
    volume_ma = df['volume'].ewm(span=96).mean()
    df['volume_pressure'] = (df['volume'] - volume_ma) / (volume_ma + 1e-7)
    
    # 3. 买卖失衡指标
    df['order_imbalance'] = (df['taker_buy_volume'] - (df['volume'] - df['taker_buy_volume'])) / df['volume']
    
    # 综合压力指数
    df['pressure_index'] = (
        (1 - df['price_efficiency']) *  # 收盘接近低点增加压力
        np.tanh(np.abs(df['volume_pressure'])) *  # 成交量异常增加压力
        np.where(df['order_imbalance'] < 0, 1.5, 1.0)  # 卖出主导增加压力
    )
    
    # 计算价格波动率 (压力调整)
    price_change = df['close'].pct_change()
    volatility = price_change.ewm(span=96, min_periods=24).std() * (1 + df['pressure_index'])
    
    # 计算量价比率 (压力调整)
    vp_ratio = df['volume'] / (volatility + 1e-7)
    
    # 关键周期定义
    periods = [96, 288, 672]
    
    # 计算各周期量价比率
    vp_matrix = np.column_stack([
        vp_ratio.ewm(span=p, min_periods=int(p/4)).mean()
        for p in periods
    ])
    
    # === 尾部反转增强 ===
    # 1. 短期反转信号
    short_term_return = df['close'].pct_change(4)  # 1小时回报
    reversal_signal = -np.sign(short_term_return) * np.log1p(df['volume'])
    
    # 2. 压力区域增强反转
    high_pressure = (df['pressure_index'] > df['pressure_index'].rolling(1440).quantile(0.85))
    reversal_signal = np.where(high_pressure, reversal_signal * 1.8, reversal_signal)
    
    # 组合特征矩阵 - 修复点：移除 .values
    combined_matrix = np.concatenate((vp_matrix, reversal_signal.reshape(-1, 1)), axis=1)
    
    # 滚动分位数标准化 (压力感知)
    scaled_std = np.zeros_like(combined_matrix)
    for i in range(combined_matrix.shape[1]):
        col = pd.Series(combined_matrix[:, i])
        
        # 高压区域使用短窗口标准化
        high_pressure_mask = high_pressure
        normal_pressure_mask = ~high_pressure
        
        # 高压区域标准化
        col_high = col[high_pressure_mask]
        if len(col_high) > 100:
            q20_high = col_high.rolling(720, min_periods=180).quantile(0.20)
            q80_high = col_high.rolling(720, min_periods=180).quantile(0.80)
            scaled_std[high_pressure_mask, i] = (col_high - q20_high) / (q80_high - q20_high + 1e-7)
        
        # 正常区域标准化
        col_normal = col[normal_pressure_mask]
        if len(col_normal) > 100:
            q20_normal = col_normal.rolling(2880, min_periods=720).quantile(0.20)
            q80_normal = col_normal.rolling(2880, min_periods=720).quantile(0.80)
            scaled_std[normal_pressure_mask, i] = (col_normal - q20_normal) / (q80_normal - q20_normal + 1e-7)
    
    # 最终因子计算 (压力加权)
    factor = np.mean(scaled_std, axis=1) * (1 + df['pressure_index']) * df['buy_ratio']
    return factor

# 测试因子表现

In [27]:
evaluator.set_factor(
    factor_func=lambda df: factor(df),
    factor_name='factor'
)

result=evaluator.run_full_evaluation(run_stationarity_test=True)

ValueError: 因子函数执行失败: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [None]:
from tabulate import tabulate

def display_factor_evaluation(result):
    # 1. 基础统计指标
    basic_metrics = [
        ["信息系数(IC)", result['correlation_analysis']['IC']],
        ["秩相关系数(Rank IC)", result['correlation_analysis']['Rank_IC']],
        ["信息比率(IR)", result['information_ratio']['IR']],
        ["分组数量", result['information_ratio']['n_groups']],
        ["数据点数", f"{result['factor_return_scatter']['data_points']:,}"],
        ["异常值(>3σ)", f"{result['factor_distribution']['outlier_analysis']['>3std']*100:.2f}%"],
        ["异常值(>5σ)", f"{result['factor_distribution']['outlier_analysis']['>5std']*100:.2f}%"]
    ]
    
    # 2. 分组分析
    group_stats = result['group_analysis']['group_stats']
    group_table = []
    for idx, row in group_stats.iterrows():
        group_table.append([
            idx,
            f"{row['val_min']:.4f}",
            f"{row['val_max']:.4f}",
            f"{row['val_mean']:.4f}",
            f"{row['return_mean']:.6f}",
            f"{row['return_std']:.4f}",
            row['count']
        ])
    
    # 3. 因子分布描述
    dist_stats = result['factor_distribution']['descriptive_stats']
    distribution_metrics = [
        ["均值", f"{dist_stats['mean']:.4f}"],
        ["标准差", f"{dist_stats['std']:.4f}"],
        ["最小值", f"{dist_stats['min']:.4f}"],
        ["1%分位数", f"{dist_stats['1%']:.4f}"],
        ["5%分位数", f"{dist_stats['5%']:.4f}"],
        ["25%分位数", f"{dist_stats['25%']:.4f}"],
        ["中位数", f"{dist_stats['50%']:.4f}"],
        ["75%分位数", f"{dist_stats['75%']:.4f}"],
        ["95%分位数", f"{dist_stats['95%']:.4f}"],
        ["99%分位数", f"{dist_stats['99%']:.4f}"],
        ["最大值", f"{dist_stats['max']:.4f}"]
    ]
    
    # 4. 分组相关性
    group_correlations = []
    for i, corr in enumerate(result['information_ratio']['group_correlations']):
        group_correlations.append([f"组 {i+1}", f"{corr:.6f}"])
    
    # 5. 滚动IC分析 - 完整展示
    rolling_ic = result['rolling_ic_analysis']['rolling_ic']
    rolling_rank_ic = result['rolling_ic_analysis']['rolling_rank_ic']
    window_centers = result['rolling_ic_analysis']['window_centers']
    cumsum_ic = result['rolling_ic_analysis']['cumsum_ic']
    cumsum_rank_ic = result['rolling_ic_analysis']['cumsum_rank_ic']
    
    # 创建滚动IC数据表格
    rolling_table = []
    for i in range(len(rolling_ic)):
        rolling_table.append([
            window_centers[i].strftime('%Y-%m-%d %H:%M:%S'),
            f"{rolling_ic[i]:.6f}",
            f"{rolling_rank_ic[i]:.6f}",
            f"{cumsum_ic[i]:.6f}",
            f"{cumsum_rank_ic[i]:.6f}"
        ])
    
    # 滚动IC汇总统计
    rolling_metrics = [
        ["滚动IC均值", f"{pd.Series(rolling_ic).mean():.6f}"],
        ["滚动IC标准差", f"{pd.Series(rolling_ic).std():.6f}"],
        ["滚动Rank IC均值", f"{pd.Series(rolling_rank_ic).mean():.6f}"],
        ["滚动Rank IC标准差", f"{pd.Series(rolling_rank_ic).std():.6f}"],
        ["累积IC终值", f"{cumsum_ic[-1]:.6f}"],
        ["累积Rank IC终值", f"{cumsum_rank_ic[-1]:.6f}"],
        ["初始窗口大小", result['rolling_ic_analysis']['initial_window']],
        ["滚动步长", result['rolling_ic_analysis']['rolling_step']]
    ]
    
    # 6. 时间窗口统计
    window_stats = []
    window_data = result['factor_distribution']['window_stats'][10]
    
    # 正确处理时间窗口统计数据
    if isinstance(window_data['stats'], pd.DataFrame):
        # 如果是DataFrame结构
        for idx, row in window_data['stats'].iterrows():
            window_stats.append([
                idx,
                f"{row['mean']:.4f}",
                f"{row['std']:.4f}",
                f"{row['skew']:.4f}",
                f"{row['lambda']:.4f}",
                row['count']
            ])
    else:
        # 如果是字典结构
        for time_range, stats in window_data['stats'].items():
            window_stats.append([
                time_range,
                f"{stats['mean']:.4f}",
                f"{stats['std']:.4f}",
                f"{stats['skew']:.4f}",
                f"{stats['lambda']:.4f}",
                stats['count']
            ])
    
    # # 7. 平稳性测试结果
    # stationarity = result['stationarity_test']
    # stationarity_metrics = [
    #     ["ADF统计量", stationarity['adf_statistic']],
    #     ["P值", stationarity['p_value']],
    #     ["1%临界值", stationarity['critical_values']['1%']],
    #     ["5%临界值", stationarity['critical_values']['5%']],
    #     ["10%临界值", stationarity['critical_values']['10%']],
    #     ["是否平稳", stationarity['is_stationary']],
    #     #["显著性水平", stationarity['alpha']]
    # ]
    
    # 打印所有结果
    print("="*80)
    print("因子评估核心指标".center(80))
    print("="*80)
    print(tabulate(basic_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    # print("\n" + "="*80)
    # print("平稳性测试结果".center(80))
    # print("="*80)
    # print(tabulate(stationarity_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("分组分析结果".center(80))
    print("="*80)
    print(tabulate(group_table, headers=["分组", "最小值", "最大值", "平均值", "收益均值", "收益标准差", "样本数"], tablefmt="grid"))
    
    print("\n" + "="*80)
    print("因子分布统计".center(80))
    print("="*80)
    print(tabulate(distribution_metrics, headers=["统计量", "值"], tablefmt="grid", floatfmt=".4f"))
    
    print("\n" + "="*80)
    print("分组相关性".center(80))
    print("="*80)
    print(tabulate(group_correlations, headers=["分组", "相关性"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("滚动IC分析汇总".center(80))
    print("="*80)
    print(tabulate(rolling_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("滚动IC详细数据".center(80))
    print("="*80)
    print(tabulate(rolling_table, headers=["窗口中心", "滚动IC", "滚动Rank IC", "累积IC", "累积Rank IC"], tablefmt="grid"))
    
    print("\n" + "="*80)
    print("时间窗口统计".center(80))
    print("="*80)
    print(tabulate(window_stats, headers=["时间窗口", "均值", "标准差", "偏度", "Lambda", "样本数"], tablefmt="grid"))
    print(f"时间范围: {window_data['time_ranges']}")

# 调用展示函数
display_factor_evaluation(result)

                                    因子评估核心指标                                    
+---------------------+-----------------------+
| 指标                | 值                    |
| 信息系数(IC)        | 0.0023722691166723165 |
+---------------------+-----------------------+
| 秩相关系数(Rank IC) | 0.011412783650012148  |
+---------------------+-----------------------+
| 信息比率(IR)        | 0.4812263110721214    |
+---------------------+-----------------------+
| 分组数量            | 10                    |
+---------------------+-----------------------+
| 数据点数            | 132,111               |
+---------------------+-----------------------+
| 异常值(>3σ)         | 4.65%                 |
+---------------------+-----------------------+
| 异常值(>5σ)         | 0.14%                 |
+---------------------+-----------------------+

                                     分组分析结果                                     
+--------+--------------+--------------+--------------+------------+--------------+----------+
|   