In [1]:
import sys
sys.path.append('/public/src')
from factor_evaluation_server import FactorEvaluation,DataService # type: ignore
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import QuantileRegressor

In [2]:
ds=DataService()
df=ds['ETHUSDT_15m_2020_2025']['2021-10-01':]

In [3]:
evaluator=FactorEvaluation(df=df,future_return_periods=10)

# 定义因子！

In [4]:
def factor(df, window=20, update_freq=4):
    """
    集成所有改进的终极版本：
    1. 多空力量分析
    2. 流动性调整
    3. 高频冲击检测
    4. 熵加权
    5. 分位数回归
    6. 动态频率更新
    """
    df = df.copy()
    # 计算衍生指标
    df['return'] = (df['close'] - df['open']) / df['open']
    df['taker_sell_volume'] = df['volume'] - df['taker_buy_volume']
    df['buy_power'] = df['taker_buy_volume'] / df['volume']
    df['sell_power'] = df['taker_sell_volume'] / df['volume']
    df['liquidity'] = df['turnover'] / df['trade_count']
    df['price_change'] = df['close'].pct_change()
    df['abs_change'] = df['price_change'].abs()
    df['entropy'] = -df['abs_change'] * np.log(df['abs_change'] + 1e-10)
    df['mid_price'] = (df['high'] + df['low']) / 2
    df['price_slope'] = df['mid_price'].diff() / df['volume'].replace(0, 1)
    df['impact'] = np.where(df['price_slope'].abs() > df['price_slope'].rolling(50).mean() * 2, 
                           np.sign(df['price_slope']), 0)
    
    df['factor_value'] = np.nan
    
    # 每天96根15分钟K线
    daily_kline_count = 96
    
    # 滚动计算因子值
    for end_idx in range(window * daily_kline_count, len(df), update_freq):
        start_idx = end_idx - window * daily_kline_count
        window_df = df.iloc[start_idx:end_idx]
        
        daily_results = []
        for date, group in window_df.groupby(pd.Grouper(freq='D')):
            if len(group) < 4:
                continue
                
            time_idx = np.arange(len(group))
            
            # 1. 多空力量分析
            up_mask = group['close'] > group['open']
            down_mask = group['close'] < group['open']
            
            if up_mask.any():
                up_weights = group.loc[up_mask, 'buy_power'] * group.loc[up_mask, 'volume']
                G_u = np.sum(time_idx[up_mask] * up_weights) / up_weights.sum()
            else:
                G_u = np.nan
                
            if down_mask.any():
                down_weights = group.loc[down_mask, 'sell_power'] * group.loc[down_mask, 'volume']
                G_d = np.sum(time_idx[down_mask] * down_weights) / down_weights.sum()
            else:
                G_d = np.nan
                
            # 2. 高频冲击检测
            buy_impact_mask = group['impact'] > 0
            sell_impact_mask = group['impact'] < 0
            
            if buy_impact_mask.any():
                buy_impact_weights = group.loc[buy_impact_mask, 'volume']
                G_buy = np.sum(time_idx[buy_impact_mask] * buy_impact_weights) / buy_impact_weights.sum()
            else:
                G_buy = np.nan
                
            if sell_impact_mask.any():
                sell_impact_weights = group.loc[sell_impact_mask, 'volume']
                G_sell = np.sum(time_idx[sell_impact_mask] * sell_impact_weights) / sell_impact_weights.sum()
            else:
                G_sell = np.nan
                
            # 3. 熵加权
            entropy = group['entropy'].fillna(0).values
            entropy = (entropy - entropy.min()) / (entropy.max() - entropy.min() + 1e-10)
            entropy_factor = entropy.mean()
            
            # 4. 流动性指标
            liquidity = group['liquidity'].mean()
            
            daily_results.append({
                'date': date,
                'G_u': G_u,
                'G_d': G_d,
                'G_buy': G_buy,
                'G_sell': G_sell,
                'entropy': entropy_factor,
                'liquidity': liquidity,
                'volatility': group['return'].std()
            })
        
        if not daily_results:
            current_factor = np.nan
        else:
            daily_df = pd.DataFrame(daily_results).set_index('date')
            
            # 分位数回归
            residuals = []
            for i in range(window, len(daily_df)):
                train_data = daily_df.iloc[i-window:i].dropna()
                if len(train_data) < 10:
                    residuals.append(np.nan)
                    continue
                    
                # 根据波动率选择分位数
                current_vol = daily_df.iloc[i]['volatility']
                quantile = 0.3 if current_vol > daily_df['volatility'].quantile(0.75) else 0.5
                
                # 多特征分位数回归
                X = train_data[['G_u', 'G_buy', 'entropy', 'liquidity']].values
                y = train_data['G_d'].values
                model = QuantileRegressor(quantile=quantile, alpha=0).fit(X, y)
                
                current = daily_df.iloc[i]
                features = [current['G_u'], current['G_buy'], 
                           current['entropy'], current['liquidity']]
                if any(pd.isna(features)) or pd.isna(current['G_d']):
                    residuals.append(np.nan)
                else:
                    pred_G_d = model.predict([features])[0]
                    residuals.append(current['G_d'] - pred_G_d)
            
            if residuals:
                daily_df = daily_df.iloc[window:]
                daily_df['residual'] = residuals
                
                # 综合调整因子值
                vol_adjust = daily_df['volatility'] / daily_df['volatility'].rolling(10).mean()
                liq_adjust = 1 + (0.5 - daily_df['liquidity'].rank(pct=True))
                entropy_adjust = 2 - daily_df['entropy']
                
                raw_factor = daily_df['residual'].rolling(window).mean()
                current_factor = raw_factor.iloc[-1] * vol_adjust.iloc[-1] * liq_adjust.iloc[-1] * entropy_adjust.iloc[-1]
            else:
                current_factor = np.nan
        
        # 更新因子值
        df.iloc[end_idx:min(end_idx+update_freq, len(df)), df.columns.get_loc('factor_value')] = current_factor
    
    return -df['factor_value']

# 测试因子表现

In [5]:
evaluator.set_factor(
    factor_func=lambda df: factor(df),
    factor_name='factor'
)

result=evaluator.run_full_evaluation(run_stationarity_test=False)

IndexError: cannot do a non-empty take from an empty axes.

In [None]:
result['information_ratio']['group_correlations']
# 这表示不同分组之间的相关性，通常用于评估因子在不同市场状态下的表现一致性。
# 如果相关性较高，说明因子在不同市场状态下表现一致；如果相关性较低，说明因子在不同市场状态下表现差异较大。

[0.028596878554364453,
 -0.0059734265489050095,
 -0.007201828602226062,
 -0.00420796110399755,
 0.03306485157386855,
 -0.05614932122415626,
 -0.0280769844982311,
 -0.00584865907246881,
 -0.023491661734871605,
 0.011515315382877322]

In [None]:
print(result)

{'correlation_analysis': {'IC': 0.0452541355416024, 'Rank_IC': 0.033699142506178684}, 'information_ratio': {'IR': -0.22940380453776674, 'group_correlations': [0.028596878554364453, -0.0059734265489050095, -0.007201828602226062, -0.00420796110399755, 0.03306485157386855, -0.05614932122415626, -0.0280769844982311, -0.00584865907246881, -0.023491661734871605, 0.011515315382877322], 'n_groups': 10}, 'group_analysis': {'group_stats':        val_min  val_max  val_mean  return_mean  return_std  count
group                                                            
0     -19.4450  -9.7580  -12.7462      -0.0021      0.0142   6523
1      -9.6614  -7.5818   -8.5927      -0.0009      0.0121   6432
2      -7.5789  -6.2488   -6.8273      -0.0004      0.0115   6432
3      -6.2385  -4.9149   -5.6206      -0.0009      0.0118   6432
4      -4.9123  -3.9213   -4.4102      -0.0003      0.0110   6432
5      -3.9016  -3.0855   -3.4307      -0.0001      0.0118   6432
6      -3.0789  -2.2726   -2.6324      