In [1]:
import sys
sys.path.append('/public/src')
from factor_evaluation_server import FactorEvaluation,DataService # type: ignore
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
ds=DataService()
df=ds['ETHUSDT_15m_2020_2025']['2021-10-01':]

In [3]:
evaluator=FactorEvaluation(df=df,future_return_periods=10)

# 定义因子！

In [4]:
def factor(df, window=20, update_freq=4):
    df = df.copy()
    df['return'] = (df['close'] - df['open']) / df['open']
    df['factor_value'] = np.nan  # 初始化因子值列
    
    # 每天96根15分钟K线 (24小时 * 4)
    daily_kline_count = 96
    
    # 滚动计算因子值
    for end_idx in range(window * daily_kline_count, len(df), update_freq):
        start_idx = end_idx - window * daily_kline_count
        window_df = df.iloc[start_idx:end_idx]
        
        daily_results = []
        for date, group in window_df.groupby(pd.Grouper(freq='D')):
            if len(group) < 4:
                continue
                
            # 收益率缩尾处理
            returns = group['return'].copy()
            q_low = returns.quantile(0.05)
            q_hi = returns.quantile(0.95)
            returns = returns.clip(lower=q_low, upper=q_hi)
            
            time_idx = np.arange(len(group))
            up_mask = returns > 0
            down_mask = returns < 0
            flat_mask = returns == 0
            
            # 成交量加权
            if up_mask.any():
                up_weights = returns[up_mask].abs() * group.loc[up_mask, 'volume']
                G_u = np.sum(time_idx[up_mask] * up_weights) / up_weights.sum()
            else:
                G_u = np.nan
                
            if down_mask.any():
                down_weights = returns[down_mask].abs() * group.loc[down_mask, 'volume']
                G_d = np.sum(time_idx[down_mask] * down_weights) / down_weights.sum()
            else:
                G_d = np.nan
                
            if flat_mask.any():
                flat_weights = group.loc[flat_mask, 'volume'] * (
                    1 - (group.loc[flat_mask, 'high'] - group.loc[flat_mask, 'low']).abs() / group.loc[flat_mask, 'open']
                )
                G_f = np.sum(time_idx[flat_mask] * flat_weights) / flat_weights.sum()
            else:
                G_f = np.nan
                
            daily_results.append({
                'date': date,
                'G_u': G_u,
                'G_d': G_d,
                'G_f': G_f
            })
        
        if not daily_results:
            current_factor = np.nan
        else:
            daily_df = pd.DataFrame(daily_results).set_index('date')
            
            # 回归计算残差
            residuals = []
            for i in range(window, len(daily_df)):
                train_data = daily_df.iloc[i-window:i].dropna(subset=['G_u', 'G_d'])
                if len(train_data) < 10:
                    residuals.append(np.nan)
                    continue
                    
                X = train_data[['G_u']].values
                y = train_data['G_d'].values
                model = LinearRegression().fit(X, y)
                
                current = daily_df.iloc[i]
                if pd.isna(current['G_u']) or pd.isna(current['G_d']):
                    residuals.append(np.nan)
                else:
                    pred_G_d = model.predict([[current['G_u']]])[0]
                    residuals.append(current['G_d'] - pred_G_d)
            
            if residuals:
                daily_df = daily_df.iloc[window:]
                daily_df['residual'] = residuals
                current_factor = daily_df['residual'].rolling(window).mean().iloc[-1]
            else:
                current_factor = np.nan
        
        # 更新当前窗口的因子值
        df.iloc[end_idx:min(end_idx+update_freq, len(df)), df.columns.get_loc('factor_value')] = current_factor
    
    return -df['factor_value']

# 测试因子表现

In [5]:
evaluator.set_factor(
    factor_func=lambda df: factor(df),
    factor_name='factor'
)

result=evaluator.run_full_evaluation(run_stationarity_test=False)

IndexError: cannot do a non-empty take from an empty axes.

In [None]:
result['information_ratio']['group_correlations']
# 这表示不同分组之间的相关性，通常用于评估因子在不同市场状态下的表现一致性。
# 如果相关性较高，说明因子在不同市场状态下表现一致；如果相关性较低，说明因子在不同市场状态下表现差异较大。

[-0.008476654572579111,
 0.03362084872988154,
 0.01583233018424272,
 -0.032675446495493365,
 -0.028712004481536865,
 -0.005930556451302199,
 -0.01253614420237807,
 -0.035266773452911876,
 -0.03456603738673479,
 0.004062187658820262]

In [None]:
print(result)

{'correlation_analysis': {'IC': -0.02419501066602169, 'Rank_IC': -0.024764644729254544}, 'information_ratio': {'IR': -0.4728877259093413, 'group_correlations': [-0.008476654572579111, 0.03362084872988154, 0.01583233018424272, -0.032675446495493365, -0.028712004481536865, -0.005930556451302199, -0.01253614420237807, -0.035266773452911876, -0.03456603738673479, 0.004062187658820262], 'n_groups': 10}, 'group_analysis': {'group_stats':        val_min  val_max  val_mean  return_mean  return_std  count
group                                                            
0      -5.7189  -2.6965   -3.5827      -0.0001      0.0154   6432
1      -2.6917  -2.1801   -2.4097       0.0003      0.0109   6336
2      -2.1733  -1.7637   -1.9545       0.0007      0.0122   6336
3      -1.7551  -1.3514   -1.5415       0.0010      0.0113   6432
4      -1.3448  -1.0844   -1.1991       0.0000      0.0110   6336
5      -1.0805  -0.8749   -0.9827      -0.0001      0.0103   6336
6      -0.8723  -0.5810   -0.7319   