In [1]:
import sys
sys.path.append('/public/src')
from factor_evaluation_server import FactorEvaluation, DataService  # type: ignore
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
import warnings
warnings.filterwarnings('ignore')

In [2]:
ds=DataService()
df=ds['ETHUSDT_15m_2020_2025']['2021-10-01':]

In [3]:
evaluator=FactorEvaluation(df=df,future_return_periods=10)

# 定义因子！

In [4]:
def factor(df):
    """
    优化点：
    1. 时间衰减加权机制
    2. 多尺度因子分解（高频/低频）
    3. 趋势-均值回复状态识别
    """
    # 计算主动买入量占比（时间衰减加权）
    decay_factor = 0.99
    weights = np.array([decay_factor**i for i in range(48)])[::-1]
    weights /= weights.sum()
    
    df['buy_ratio'] = (
        df['taker_buy_volume'].rolling(48).apply(lambda x: np.dot(x, weights), raw=True) 
        / (df['volume'].rolling(48).apply(lambda x: np.dot(x, weights), raw=True) + 1e-7))
    
    # 波动率计算（多尺度）
    ret = df['close'].pct_change()
    vol_short = ret.ewm(span=48).std()
    vol_medium = ret.ewm(span=192).std()
    vol_long = ret.ewm(span=768).std()
    
    # 波动率比率（识别市场状态）
    vol_ratio = vol_short / vol_long
    df['volatility'] = np.where(
        vol_ratio > 1.2, vol_short,  # 高波动状态
        np.where(
            vol_ratio < 0.8, vol_long,  # 低波动状态
            vol_medium  # 正常状态
        )
    )
    
    # 计算量价比率（状态自适应）
    vp_ratio = np.log1p(df['volume']) / (df['volatility'] + 1e-7)
    
    # 关键周期定义（多尺度）
    short_periods = [24, 48, 96]
    long_periods = [288, 672, 1440, 2880]
    
    # 计算短期量价比率
    vp_short = np.column_stack([
        vp_ratio.ewm(span=p, min_periods=int(p/4)).mean()
        for p in short_periods
    ])
    
    # 计算长期量价比率（使用HP滤波分解）
    vp_long = []
    for p in long_periods:
        # 使用HP滤波分离趋势
        cycle, trend = hp_filter(vp_ratio.values, lamb=1600*(p/1440)**2)
        vp_long.append(trend)
    vp_long = np.column_stack(vp_long)
    
    # 改进的协同指标（多尺度）
    synergy_short = np.zeros((len(df), len(short_periods)))
    synergy_long = np.zeros((len(df), len(long_periods)))
    
    # 短期协同
    for i, p in enumerate(short_periods):
        price_trend = df['close'].pct_change(p)
        flow_trend = df['buy_ratio'].diff(p)
        synergy_short[:, i] = np.sign(price_trend) * flow_trend
    
    # 长期协同
    for i, p in enumerate(long_periods):
        price_trend = np.log(df['close'] / df['close'].shift(p))
        flow_trend = df['buy_ratio'].rolling(p).mean() - df['buy_ratio'].shift(p).rolling(p).mean()
        synergy_long[:, i] = np.sign(price_trend) * flow_trend
    
    # 组合特征矩阵（多尺度）
    combined_matrix = np.concatenate((vp_short, vp_long, synergy_short, synergy_long), axis=1)
    
    # 多尺度标准化
    scaled_std = np.zeros_like(combined_matrix)
    short_window = 672
    long_window = 2880
    
    for i in range(combined_matrix.shape[1]):
        col = pd.Series(combined_matrix[:, i])
        
        # 前7列为短期特征
        if i < (len(short_periods) + len(long_periods)):
            rolling_mean = col.ewm(span=short_window).mean()
            rolling_std = col.ewm(span=short_window).std()
        # 后7列为长期特征
        else:
            rolling_mean = col.ewm(span=long_window).mean()
            rolling_std = col.ewm(span=long_window).std()
        
        scaled_std[:, i] = (col - rolling_mean) / (rolling_std + 1e-7)
    
    # 时间衰减加权（近期特征权重更高）
    decay_weights = np.array([0.95**i for i in range(combined_matrix.shape[1])])[::-1]
    decay_weights /= decay_weights.sum()
    
    # 最终因子计算（多尺度组合）
    flow_weight = df['buy_ratio'].clip(0.3, 0.7)
    volume_weight = np.log1p(df['volume']) / (np.log1p(df['volume']).rolling(2880).std() + 1)
    
    factor = np.average(scaled_std, axis=1, weights=decay_weights) * flow_weight * volume_weight
    
    # 高频噪声过滤
    factor = pd.Series(savgol_filter(factor, 21, 3))
    return factor

# HP滤波函数
def hp_filter(series, lamb=1600):
    n = len(series)
    I = np.eye(n)
    D = np.zeros((n-2, n))
    for i in range(n-2):
        D[i, i] = 1
        D[i, i+1] = -2
        D[i, i+2] = 1
    
    trend = np.linalg.solve(I + lamb * D.T @ D, series)
    cycle = series - trend
    return cycle, trend

# 测试因子表现

In [None]:
evaluator.set_factor(
    factor_func=lambda df: factor(df),
    factor_name='factor'
)

result=evaluator.run_full_evaluation(run_stationarity_test=True)

In [None]:
from tabulate import tabulate

def display_factor_evaluation(result):
    # 1. 基础统计指标
    basic_metrics = [
        ["信息系数(IC)", result['correlation_analysis']['IC']],
        ["秩相关系数(Rank IC)", result['correlation_analysis']['Rank_IC']],
        ["信息比率(IR)", result['information_ratio']['IR']],
        ["分组数量", result['information_ratio']['n_groups']],
        ["数据点数", f"{result['factor_return_scatter']['data_points']:,}"],
        ["异常值(>3σ)", f"{result['factor_distribution']['outlier_analysis']['>3std']*100:.2f}%"],
        ["异常值(>5σ)", f"{result['factor_distribution']['outlier_analysis']['>5std']*100:.2f}%"]
    ]
    
    # 2. 分组分析
    group_stats = result['group_analysis']['group_stats']
    group_table = []
    for idx, row in group_stats.iterrows():
        group_table.append([
            idx,
            f"{row['val_min']:.4f}",
            f"{row['val_max']:.4f}",
            f"{row['val_mean']:.4f}",
            f"{row['return_mean']:.6f}",
            f"{row['return_std']:.4f}",
            row['count']
        ])
    
    # 3. 因子分布描述
    dist_stats = result['factor_distribution']['descriptive_stats']
    distribution_metrics = [
        ["均值", f"{dist_stats['mean']:.4f}"],
        ["标准差", f"{dist_stats['std']:.4f}"],
        ["最小值", f"{dist_stats['min']:.4f}"],
        ["1%分位数", f"{dist_stats['1%']:.4f}"],
        ["5%分位数", f"{dist_stats['5%']:.4f}"],
        ["25%分位数", f"{dist_stats['25%']:.4f}"],
        ["中位数", f"{dist_stats['50%']:.4f}"],
        ["75%分位数", f"{dist_stats['75%']:.4f}"],
        ["95%分位数", f"{dist_stats['95%']:.4f}"],
        ["99%分位数", f"{dist_stats['99%']:.4f}"],
        ["最大值", f"{dist_stats['max']:.4f}"]
    ]
    
    # 4. 分组相关性
    group_correlations = []
    for i, corr in enumerate(result['information_ratio']['group_correlations']):
        group_correlations.append([f"组 {i+1}", f"{corr:.6f}"])
    
    # 5. 滚动IC分析 - 完整展示
    rolling_ic = result['rolling_ic_analysis']['rolling_ic']
    rolling_rank_ic = result['rolling_ic_analysis']['rolling_rank_ic']
    window_centers = result['rolling_ic_analysis']['window_centers']
    cumsum_ic = result['rolling_ic_analysis']['cumsum_ic']
    cumsum_rank_ic = result['rolling_ic_analysis']['cumsum_rank_ic']
    
    # 创建滚动IC数据表格
    rolling_table = []
    for i in range(len(rolling_ic)):
        rolling_table.append([
            window_centers[i].strftime('%Y-%m-%d %H:%M:%S'),
            f"{rolling_ic[i]:.6f}",
            f"{rolling_rank_ic[i]:.6f}",
            f"{cumsum_ic[i]:.6f}",
            f"{cumsum_rank_ic[i]:.6f}"
        ])
    
    # 滚动IC汇总统计
    rolling_metrics = [
        ["滚动IC均值", f"{pd.Series(rolling_ic).mean():.6f}"],
        ["滚动IC标准差", f"{pd.Series(rolling_ic).std():.6f}"],
        ["滚动Rank IC均值", f"{pd.Series(rolling_rank_ic).mean():.6f}"],
        ["滚动Rank IC标准差", f"{pd.Series(rolling_rank_ic).std():.6f}"],
        ["累积IC终值", f"{cumsum_ic[-1]:.6f}"],
        ["累积Rank IC终值", f"{cumsum_rank_ic[-1]:.6f}"],
        ["初始窗口大小", result['rolling_ic_analysis']['initial_window']],
        ["滚动步长", result['rolling_ic_analysis']['rolling_step']]
    ]
    
    # 6. 时间窗口统计
    window_stats = []
    window_data = result['factor_distribution']['window_stats'][10]
    
    # 正确处理时间窗口统计数据
    if isinstance(window_data['stats'], pd.DataFrame):
        # 如果是DataFrame结构
        for idx, row in window_data['stats'].iterrows():
            window_stats.append([
                idx,
                f"{row['mean']:.4f}",
                f"{row['std']:.4f}",
                f"{row['skew']:.4f}",
                f"{row['lambda']:.4f}",
                row['count']
            ])
    else:
        # 如果是字典结构
        for time_range, stats in window_data['stats'].items():
            window_stats.append([
                time_range,
                f"{stats['mean']:.4f}",
                f"{stats['std']:.4f}",
                f"{stats['skew']:.4f}",
                f"{stats['lambda']:.4f}",
                stats['count']
            ])
    
    # # 7. 平稳性测试结果
    # stationarity = result['stationarity_test']
    # stationarity_metrics = [
    #     ["ADF统计量", stationarity['adf_statistic']],
    #     ["P值", stationarity['p_value']],
    #     ["1%临界值", stationarity['critical_values']['1%']],
    #     ["5%临界值", stationarity['critical_values']['5%']],
    #     ["10%临界值", stationarity['critical_values']['10%']],
    #     ["是否平稳", stationarity['is_stationary']],
    #     #["显著性水平", stationarity['alpha']]
    # ]
    
    # 打印所有结果
    print("="*80)
    print("因子评估核心指标".center(80))
    print("="*80)
    print(tabulate(basic_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    # print("\n" + "="*80)
    # print("平稳性测试结果".center(80))
    # print("="*80)
    # print(tabulate(stationarity_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("分组分析结果".center(80))
    print("="*80)
    print(tabulate(group_table, headers=["分组", "最小值", "最大值", "平均值", "收益均值", "收益标准差", "样本数"], tablefmt="grid"))
    
    print("\n" + "="*80)
    print("因子分布统计".center(80))
    print("="*80)
    print(tabulate(distribution_metrics, headers=["统计量", "值"], tablefmt="grid", floatfmt=".4f"))
    
    print("\n" + "="*80)
    print("分组相关性".center(80))
    print("="*80)
    print(tabulate(group_correlations, headers=["分组", "相关性"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("滚动IC分析汇总".center(80))
    print("="*80)
    print(tabulate(rolling_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("滚动IC详细数据".center(80))
    print("="*80)
    print(tabulate(rolling_table, headers=["窗口中心", "滚动IC", "滚动Rank IC", "累积IC", "累积Rank IC"], tablefmt="grid"))
    
    print("\n" + "="*80)
    print("时间窗口统计".center(80))
    print("="*80)
    print(tabulate(window_stats, headers=["时间窗口", "均值", "标准差", "偏度", "Lambda", "样本数"], tablefmt="grid"))
    print(f"时间范围: {window_data['time_ranges']}")

# 调用展示函数
display_factor_evaluation(result)

                                    因子评估核心指标                                    
+---------------------+-----------------------+
| 指标                | 值                    |
| 信息系数(IC)        | 0.014574277839937382  |
+---------------------+-----------------------+
| 秩相关系数(Rank IC) | 0.010976959708802995  |
+---------------------+-----------------------+
| 信息比率(IR)        | -0.008057134574379052 |
+---------------------+-----------------------+
| 分组数量            | 10                    |
+---------------------+-----------------------+
| 数据点数            | 128,607               |
+---------------------+-----------------------+
| 异常值(>3σ)         | 4.52%                 |
+---------------------+-----------------------+
| 异常值(>5σ)         | 0.12%                 |
+---------------------+-----------------------+

                                     分组分析结果                                     
+--------+----------+----------+----------+------------+--------------+----------+
|   分组 |   最小值 |