In [11]:
import sys
sys.path.append('/public/src')
from factor_evaluation_server import FactorEvaluation,DataService # type: ignore
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [12]:
# Open the project data service and pull the dataset stored under the key
# 'ETHUSDT_15m_2020_2025' (presumably 15-minute ETH/USDT bars - confirm), then
# keep the slice starting at '2021-10-01'.
# NOTE(review): the slice is assumed to be label-based on a datetime index; verify
# against what DataService returns.
ds=DataService()
df=ds['ETHUSDT_15m_2020_2025']['2021-10-01':]

In [13]:
# Build the evaluator on the frame loaded above. future_return_periods=10 is
# passed straight through (presumably the forward-return horizon in bars -
# confirm against FactorEvaluation).
evaluator=FactorEvaluation(df=df,future_return_periods=10)

# 定义因子！

In [14]:
def factor(df, base_window=94, vol_window=20):
    df = df.copy()
    
    # 1. 计算波动率调整系数（修复NaN问题）
    df['log_ret'] = np.log(df['close']).diff()
    
    # 使用足够大的min_periods防止早期NaN
    min_periods_vol = max(1, int(0.1 * vol_window * 96))  # 至少10%的窗口大小
    df['volatility'] = df['log_ret'].rolling(
        window=vol_window * 96, 
        min_periods=min_periods_vol
    ).std()
    
    # 前向填充波动率并计算长期波动率
    df['volatility'].fillna(method='ffill', inplace=True)
    df['volatility'].fillna(0.01, inplace=True)  # 初始值填充
    
    long_term_vol = df['volatility'].expanding(min_periods=1).mean()
    df['window_adj'] = np.sqrt(long_term_vol / (df['volatility'] + 1e-8))
    
    # 2. 连续重心计算（去除日分组）
    df['range'] = (df['high'] - df['low']) / df['open']
    
    # 修复rolling计算中的除零问题
    min_periods_fractal = max(1, int(0.5 * 20))  # 至少10个周期
    rolling_20std = df['range'].rolling(20, min_periods=min_periods_fractal).std()
    rolling_20std.replace(0, 1e-5, inplace=True)
    
    df['fractal'] = df['range'].rolling(5).std() / rolling_20std
    df['is_up'] = (df['close'] > df['open']).astype(int)
    df['is_down'] = (df['close'] < df['open']).astype(int)
    
    # 使用cumsum优化性能（修复累积计算问题）
    weights_up = df['fractal'] * df['volume'] * df['is_up']
    weights_down = df['fractal'] * df['volume'] * df['is_down']
    
    # 3. 自适应窗口滚动回归（修复NaN和索引问题）
    df['avg_fractal'] = df['fractal'].rolling(24, min_periods=12).mean()
    regression_data = np.full(len(df), np.nan)
    
    # 确保有足够数据点
    valid_mask = (
        ~df[['G_u', 'avg_fractal', 'G_d']].isnull().any(axis=1) &
        (df.index >= df.index[200])  # 确保有足够历史数据
    )
    
    # 动态窗口回归
    for i in range(200, len(df)):
        if not valid_mask[i]:
            continue
            
        # 获取调整因子并确保有效
        adj_factor = df['window_adj'].iloc[i]
        if np.isnan(adj_factor) or adj_factor <= 0:
            adj_factor = 1.0
            
        dynamic_window = max(50, min(200, int(base_window * adj_factor)))
        start_idx = max(0, i - dynamic_window)
        
        # 提取有效回归数据
        window_data = df.iloc[start_idx:i]
        valid_data = window_data.dropna(subset=['G_u', 'G_d', 'avg_fractal'])
        
        if len(valid_data) < 10:
            continue
            
        # 执行线性回归
        X = valid_data[['G_u', 'avg_fractal']]
        y = valid_data['G_d']
        
        model = LinearRegression()
        model.fit(X, y)
        pred = model.predict([df.iloc[i][['G_u', 'avg_fractal']]])[0]
        
        residual = df['G_d'].iloc[i] - pred
        vol_adj = df['volatility'].iloc[i] / long_term_vol.iloc[i]
        
        # 波动率调整残差
        regression_data[i] = residual * df['avg_fractal'].iloc[i] * vol_adj
    
    # 4. 因子后处理
    factor_series = pd.Series(-regression_data, index=df.index)
    
    # 滚动标准化（修复窗口不足问题）
    min_periods_std = max(1, int(0.1 * 5000))  # 至少500个点
    rolling_mean = factor_series.rolling(5000, min_periods=min_periods_std).mean()
    rolling_std = factor_series.rolling(5000, min_periods=min_periods_std).std()
    rolling_std.replace(0, 1, inplace=True)  # 防止除零
    
    factor_series = (factor_series - rolling_mean) / rolling_std
    return factor_series.fillna(0)

# 测试因子表现

In [15]:
# Register the factor with the evaluator. The lambda forwards whatever frame
# the evaluator passes in to factor(), which then runs with its default
# parameters.
evaluator.set_factor(
    factor_func=lambda df: factor(df),
    factor_name='factor'
)

# Run the evaluation with the stationarity test switched off and keep the
# returned result for the cells below.
result=evaluator.run_full_evaluation(run_stationarity_test=False)

ValueError: 因子函数执行失败: "['G_u', 'G_d'] not in index"

In [None]:
result['information_ratio']['group_correlations']
# Described by the author as the correlation between different groups, typically
# used to judge how consistently the factor behaves across market regimes:
# high values mean consistent behaviour, low values mean it varies by regime.
# NOTE(review): the evaluator's exact definition is not visible here - confirm
# how 'group_correlations' is computed before relying on this reading.

[-0.01690796479513809,
 0.017082174178146198,
 0.0015363590771937847,
 0.0048825823401946414,
 -0.0102667659819026,
 0.009478070331710262,
 0.0028196744056216494,
 0.04323663360510031]

In [None]:
# Dump the complete evaluation result as plain text.
print(result)

{'correlation_analysis': {'IC': 0.02731286940144614, 'Rank_IC': 0.009464318952115096}, 'information_ratio': {'IR': 0.3790177409362612, 'group_correlations': [-0.01690796479513809, 0.017082174178146198, 0.0015363590771937847, 0.0048825823401946414, -0.0102667659819026, 0.009478070331710262, 0.0028196744056216494, 0.04323663360510031], 'n_groups': 8}, 'group_analysis': {'group_stats':        val_min  val_max  val_mean  return_mean  return_std  count
group                                                            
0     -29.6643  -7.8114  -10.8188      -0.0003      0.0134   6552
1      -7.8113  -5.4966   -6.5002      -0.0005      0.0127   6552
2      -5.4966  -4.2037   -4.8063      -0.0003      0.0123   6552
3      -4.2037  -3.2444   -3.7009      -0.0000      0.0112   6551
4      -3.2443  -2.4907   -2.8543       0.0001      0.0113   6552
5      -2.4904  -1.8699   -2.1712       0.0001      0.0114   6552
6      -1.8697   0.0000   -0.1080      -0.0002      0.0111  49503
7       1.6002   1.8