In [6]:
import sys
sys.path.append('/public/src')
from factor_evaluation_server import FactorEvaluation,DataService # type: ignore
import numpy as np
import pandas as pd

In [7]:
ds=DataService()
df=ds['SOLUSDT_15m_2020_2025']['2021-10-01':]

In [8]:
evaluator=FactorEvaluation(df=df,future_return_periods=10)

# 定义因子！

In [9]:
def factor(df):
    """
    计算趋势资金交易行为综合因子（加密货币版）
    步骤：
    1. 计算趋势资金相对均价因子（因子1）
    2. 计算趋势资金净支撑量因子（因子3）
    3. 对两个因子分别进行标准化（z-score）
    4. 综合因子 = -z(因子1) + z(因子3)
    5. 将t日综合因子值赋给t+1日所有分钟
    """
    # 计算因子1：趋势资金相对均价
    def calc_factor1(df):
        # ... 与因子1相同的计算逻辑 ...
        # 复制数据避免警告
        df = df.copy()
        
        # 创建日期列（UTC自然日）
        df['date'] = df.index.floor('D')
        
        # 计算每日总成交额和成交量
        daily_data = df.groupby('date').agg(
            total_turnover=('turnover', 'sum'),
            total_volume=('volume', 'sum')
        )
        daily_data['VWAP_all'] = daily_data['total_turnover'] / daily_data['total_volume']
        
        # 存储每日因子值
        daily_factor = pd.Series(index=daily_data.index, dtype=float)
        
        # 按日期排序
        dates = daily_data.index.sort_values()
        
        # 遍历每个交易日（从第6天开始，保证有5天历史数据）
        for i in range(5, len(dates)):
            current_date = dates[i]
            
            # 获取前5天数据（t-5到t-1日）
            start_date = dates[i-5]
            end_date = dates[i-1]
            past_mask = (df['date'] >= start_date) & (df['date'] <= end_date)
            past_volumes = df.loc[past_mask, 'volume']
            
            # 计算成交量90%分位数阈值
            if len(past_volumes) > 0:
                vol_threshold = np.percentile(past_volumes, 90)
            else:
                vol_threshold = 0
            
            # 获取当前日分钟数据
            current_day_mask = (df['date'] == current_date)
            current_day_data = df.loc[current_day_mask]
            
            # 识别趋势资金分钟（成交量>阈值）
            trend_mask = current_day_data['volume'] > vol_threshold
            trend_data = current_day_data[trend_mask]
            
            # 计算趋势资金VWAP
            if len(trend_data) > 0:
                trend_turnover = trend_data['turnover'].sum()
                trend_volume = trend_data['volume'].sum()
                VWAP_trend = trend_turnover / trend_volume
            else:
                VWAP_trend = daily_data.loc[current_date, 'VWAP_all']
            
            # 计算因子值
            VWAP_all = daily_data.loc[current_date, 'VWAP_all']
            factor_value = (VWAP_trend / VWAP_all) - 1
            
            # 存储当前日因子值
            daily_factor.loc[current_date] = factor_value
        
        # 创建与原始df等长的因子序列
        factor_series = pd.Series(index=df.index, dtype=float)
        
        # 将t日因子值赋给t+1日所有分钟
        for date_val in daily_factor.index:
            next_date = date_val + pd.Timedelta(days=1)
            next_date_mask = (df['date'] == next_date)
            factor_series.loc[next_date_mask] = daily_factor[date_val]
        
        return factor_series
        # 返回factor1_series（日频因子值）
    
    # 计算因子3：趋势资金净支撑量
    def calc_factor3(df):
        # ... 与因子3相同的计算逻辑 ...
        # 复制数据避免警告
        df = df.copy()
        
        # 创建日期列（UTC自然日）
        df['date'] = df.index.floor('D')
        
        # 存储每日因子值
        daily_factor = pd.Series(dtype=float)
        
        # 按日期排序
        dates = df['date'].unique()
        dates.sort()
        
        # 遍历每个交易日（从第6天开始）
        for i in range(5, len(dates)):
            current_date = dates[i]
            
            # 获取前5天数据（t-5到t-1日）
            past_dates = dates[i-5:i]
            past_mask = df['date'].isin(past_dates)
            past_volumes = df.loc[past_mask, 'volume']
            
            # 计算成交量阈值
            vol_threshold = np.percentile(past_volumes, 90) if len(past_volumes) > 0 else 0
            
            # 获取当前日分钟数据
            current_mask = (df['date'] == current_date)
            current_data = df.loc[current_mask]
            
            # 识别趋势资金分钟
            trend_mask = current_data['volume'] > vol_threshold
            trend_data = current_data[trend_mask]
            
            if len(trend_data) > 0:
                # 计算趋势资金收盘价均值
                trend_mean_close = trend_data['close'].mean()
                
                # 计算趋势支撑成交量
                trend_support = trend_data[trend_data['close'] < trend_mean_close]['volume'].sum()
                
                # 计算趋势阻力成交量
                trend_resist = trend_data[trend_data['close'] > trend_mean_close]['volume'].sum()
                
                factor_value = trend_support - trend_resist
            else:
                factor_value = 0  # 无趋势资金时设为中性值
            
            daily_factor.loc[current_date] = factor_value
        
        # 创建与原始df等长的因子序列
        factor_series = pd.Series(index=df.index, dtype=float)
        
        # 将t日因子值赋给t+1日所有分钟
        for date_val in daily_factor.index:
            next_date = date_val + pd.Timedelta(days=1)
            next_mask = (df['date'] == next_date)
            factor_series.loc[next_mask] = daily_factor[date_val]
        
        return factor_series
        # 返回factor3_series（日频因子值）
    
    # 获取日频因子值
    factor1 = calc_factor1()
    factor3 = calc_factor3()
    
    # 对齐日期索引
    common_dates = factor1.index.intersection(factor3.index)
    factor1 = factor1.loc[common_dates]
    factor3 = factor3.loc[common_dates]
    
    # 创建综合因子容器
    composite = pd.Series(index=common_dates, dtype=float)
    
    # 滚动计算标准化值（窗口20个交易日）
    for i in range(20, len(common_dates)):
        current_date = common_dates[i]
        
        # 获取过去20日因子值
        past_dates = common_dates[i-20:i]
        f1_past = factor1.loc[past_dates]
        f3_past = factor3.loc[past_dates]
        
        # 计算z-score
        z_f1 = (factor1.loc[current_date] - f1_past.mean()) / f1_past.std()
        z_f3 = (factor3.loc[current_date] - f3_past.mean()) / f3_past.std()
        
        # 合成因子
        composite.loc[current_date] = -z_f1 + z_f3
    
    # 创建与原始df等长的因子序列
    factor_series = pd.Series(index=df.index, dtype=float)
    
    # 将t日综合因子值赋给t+1日所有分钟
    for date_val in composite.index:
        next_date = date_val + pd.Timedelta(days=1)
        next_mask = (df.index.floor('D') == next_date)
        factor_series.loc[next_mask] = composite[date_val]
    
    return factor_series

# 测试因子表现

In [10]:
evaluator.set_factor(
    factor_func=lambda df: factor(df),
    factor_name='factor'
)

result=evaluator.run_full_evaluation(run_stationarity_test=True)

ValueError: 因子函数执行失败: factor.<locals>.calc_factor1() missing 1 required positional argument: 'df'

In [None]:
from tabulate import tabulate

def display_factor_evaluation(result):
    # 1. 基础统计指标
    basic_metrics = [
        ["信息系数(IC)", result['correlation_analysis']['IC']],
        ["秩相关系数(Rank IC)", result['correlation_analysis']['Rank_IC']],
        ["信息比率(IR)", result['information_ratio']['IR']],
        ["分组数量", result['information_ratio']['n_groups']],
        ["数据点数", f"{result['factor_return_scatter']['data_points']:,}"],
        ["异常值(>3σ)", f"{result['factor_distribution']['outlier_analysis']['>3std']*100:.2f}%"],
        ["异常值(>5σ)", f"{result['factor_distribution']['outlier_analysis']['>5std']*100:.2f}%"]
    ]
    
    # 2. 分组分析
    group_stats = result['group_analysis']['group_stats']
    group_table = []
    for idx, row in group_stats.iterrows():
        group_table.append([
            idx,
            f"{row['val_min']:.4f}",
            f"{row['val_max']:.4f}",
            f"{row['val_mean']:.4f}",
            f"{row['return_mean']:.6f}",
            f"{row['return_std']:.4f}",
            row['count']
        ])
    
    # 3. 因子分布描述
    dist_stats = result['factor_distribution']['descriptive_stats']
    distribution_metrics = [
        ["均值", f"{dist_stats['mean']:.4f}"],
        ["标准差", f"{dist_stats['std']:.4f}"],
        ["最小值", f"{dist_stats['min']:.4f}"],
        ["1%分位数", f"{dist_stats['1%']:.4f}"],
        ["5%分位数", f"{dist_stats['5%']:.4f}"],
        ["25%分位数", f"{dist_stats['25%']:.4f}"],
        ["中位数", f"{dist_stats['50%']:.4f}"],
        ["75%分位数", f"{dist_stats['75%']:.4f}"],
        ["95%分位数", f"{dist_stats['95%']:.4f}"],
        ["99%分位数", f"{dist_stats['99%']:.4f}"],
        ["最大值", f"{dist_stats['max']:.4f}"]
    ]
    
    # 4. 分组相关性
    group_correlations = []
    for i, corr in enumerate(result['information_ratio']['group_correlations']):
        group_correlations.append([f"组 {i+1}", f"{corr:.6f}"])
    
    # 5. 滚动IC分析 - 完整展示
    rolling_ic = result['rolling_ic_analysis']['rolling_ic']
    rolling_rank_ic = result['rolling_ic_analysis']['rolling_rank_ic']
    window_centers = result['rolling_ic_analysis']['window_centers']
    cumsum_ic = result['rolling_ic_analysis']['cumsum_ic']
    cumsum_rank_ic = result['rolling_ic_analysis']['cumsum_rank_ic']
    
    # 创建滚动IC数据表格
    rolling_table = []
    for i in range(len(rolling_ic)):
        rolling_table.append([
            window_centers[i].strftime('%Y-%m-%d %H:%M:%S'),
            f"{rolling_ic[i]:.6f}",
            f"{rolling_rank_ic[i]:.6f}",
            f"{cumsum_ic[i]:.6f}",
            f"{cumsum_rank_ic[i]:.6f}"
        ])
    
    # 滚动IC汇总统计
    rolling_metrics = [
        ["滚动IC均值", f"{pd.Series(rolling_ic).mean():.6f}"],
        ["滚动IC标准差", f"{pd.Series(rolling_ic).std():.6f}"],
        ["滚动Rank IC均值", f"{pd.Series(rolling_rank_ic).mean():.6f}"],
        ["滚动Rank IC标准差", f"{pd.Series(rolling_rank_ic).std():.6f}"],
        ["累积IC终值", f"{cumsum_ic[-1]:.6f}"],
        ["累积Rank IC终值", f"{cumsum_rank_ic[-1]:.6f}"],
        ["初始窗口大小", result['rolling_ic_analysis']['initial_window']],
        ["滚动步长", result['rolling_ic_analysis']['rolling_step']]
    ]
    
    # 6. 时间窗口统计
    window_stats = []
    window_data = result['factor_distribution']['window_stats'][10]
    
    # 正确处理时间窗口统计数据
    if isinstance(window_data['stats'], pd.DataFrame):
        # 如果是DataFrame结构
        for idx, row in window_data['stats'].iterrows():
            window_stats.append([
                idx,
                f"{row['mean']:.4f}",
                f"{row['std']:.4f}",
                f"{row['skew']:.4f}",
                f"{row['lambda']:.4f}",
                row['count']
            ])
    else:
        # 如果是字典结构
        for time_range, stats in window_data['stats'].items():
            window_stats.append([
                time_range,
                f"{stats['mean']:.4f}",
                f"{stats['std']:.4f}",
                f"{stats['skew']:.4f}",
                f"{stats['lambda']:.4f}",
                stats['count']
            ])
    
    # # 7. 平稳性测试结果
    # stationarity = result['stationarity_test']
    # stationarity_metrics = [
    #     ["ADF统计量", stationarity['adf_statistic']],
    #     ["P值", stationarity['p_value']],
    #     ["1%临界值", stationarity['critical_values']['1%']],
    #     ["5%临界值", stationarity['critical_values']['5%']],
    #     ["10%临界值", stationarity['critical_values']['10%']],
    #     ["是否平稳", stationarity['is_stationary']],
    #     #["显著性水平", stationarity['alpha']]
    # ]
    
    # 打印所有结果
    print("="*80)
    print("因子评估核心指标".center(80))
    print("="*80)
    print(tabulate(basic_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    # print("\n" + "="*80)
    # print("平稳性测试结果".center(80))
    # print("="*80)
    # print(tabulate(stationarity_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("分组分析结果".center(80))
    print("="*80)
    print(tabulate(group_table, headers=["分组", "最小值", "最大值", "平均值", "收益均值", "收益标准差", "样本数"], tablefmt="grid"))
    
    print("\n" + "="*80)
    print("因子分布统计".center(80))
    print("="*80)
    print(tabulate(distribution_metrics, headers=["统计量", "值"], tablefmt="grid", floatfmt=".4f"))
    
    print("\n" + "="*80)
    print("分组相关性".center(80))
    print("="*80)
    print(tabulate(group_correlations, headers=["分组", "相关性"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("滚动IC分析汇总".center(80))
    print("="*80)
    print(tabulate(rolling_metrics, headers=["指标", "值"], tablefmt="grid", floatfmt=".6f"))
    
    print("\n" + "="*80)
    print("滚动IC详细数据".center(80))
    print("="*80)
    print(tabulate(rolling_table, headers=["窗口中心", "滚动IC", "滚动Rank IC", "累积IC", "累积Rank IC"], tablefmt="grid"))
    
    print("\n" + "="*80)
    print("时间窗口统计".center(80))
    print("="*80)
    print(tabulate(window_stats, headers=["时间窗口", "均值", "标准差", "偏度", "Lambda", "样本数"], tablefmt="grid"))
    print(f"时间范围: {window_data['time_ranges']}")

# 调用展示函数
display_factor_evaluation(result)

                                    因子评估核心指标                                    
+---------------------+-----------------------+
| 指标                | 值                    |
| 信息系数(IC)        | 0.00588752512916517   |
+---------------------+-----------------------+
| 秩相关系数(Rank IC) | -0.005036669243417518 |
+---------------------+-----------------------+
| 信息比率(IR)        | 0.33025350874392256   |
+---------------------+-----------------------+
| 分组数量            | 10                    |
+---------------------+-----------------------+
| 数据点数            | 130,462               |
+---------------------+-----------------------+
| 异常值(>3σ)         | 1.32%                 |
+---------------------+-----------------------+
| 异常值(>5σ)         | 0.00%                 |
+---------------------+-----------------------+

                                     分组分析结果                                     
+--------+----------+----------+----------+------------+--------------+----------+
|   分组 |   最小值 |