In [21]:
import numpy as np
import pandas as pd

import sys
sys.path.append('/public/src')

from factor_evaluation_server import FactorEvaluation, DataService
# Fetch the bar data from the project data service and keep the rows from
# the '2021-10-01' label onward.
bars_key, start_label = 'ETHUSDT_15m_2020_2025', '2021-10-01'
ds = DataService()
df = ds[bars_key][start_label:]

# Load the existing factor library that the new factor is compared against.
# NOTE(review): unpickling can execute arbitrary code; only read this file
# if its source is trusted.
factor_path = "/public/data/factor_data/ETHUSDT_15m_2020_2025_factor_data.pkl"
factors = pd.read_pickle(factor_path)

# Preview the first rows of the factor library.
factors.head()

Unnamed: 0_level_0,open_time,open,high,low,close,volume,close_time,turnover,trade_count,taker_buy_volume,...,c_hide_023,c_hide_024,c_hide_025,c_hide_026,c_hide_027,c_hide_028,c_hide_029,c_hide_030,c_hide_031,c_hide_032
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 00:00:00,1633046400000,2999.45,3026.26,2995.88,3011.48,37953.687,1633047299999,114253757.6028,42570,20397.659,...,0.0073,0.0105,0.0051,0.0008,0.0026,0.0001,0.0029,0.0074,0.0104,0.005
2021-10-01 00:15:00,1633047300000,3011.48,3023.91,3005.6,3022.95,13523.609,1633048199999,40776341.6975,21048,7160.391,...,0.008,0.003,0.0029,0.0178,0.0162,0.0111,0.015,0.019,0.0081,0.0082
2021-10-01 00:30:00,1633048200000,3022.95,3040.54,3020.55,3029.37,28750.477,1633049099999,87119698.8131,39255,15659.277,...,0.0033,0.0011,0.0055,0.0173,0.0096,0.0078,0.0096,0.0111,0.0089,0.0024
2021-10-01 00:45:00,1633049100000,3029.11,3029.68,3014.8,3017.92,15488.049,1633049999999,46764677.9905,23806,5829.698,...,0.0053,0.009,0.0088,0.018,0.0122,0.0106,0.0151,0.0054,0.0017,0.0019
2021-10-01 01:00:00,1633050000000,3017.91,3023.39,2996.0,3000.99,25731.294,1633050899999,77375000.6151,34185,10056.814,...,0.0037,0.0039,0.0003,0.013,0.0146,0.0069,0.0098,0.0106,0.003,0.0066


In [22]:
def factor(df):
    """Compute the volume/volatility + divergence factor for every row of ``df``.

    Steps:
      1. Taker-buy ratio, EWM volatility of close-to-close returns, and the
         volume/volatility ratio smoothed with EWM means over several spans.
      2. For each horizon, a divergence strength: the number of sign
         disagreements between price momentum and (volume momentum,
         buy-ratio change), multiplied by the absolute buy-ratio change.
         Rows whose buy ratio is below a volatility-rank-dependent threshold
         ("tail" rows) get an extra multiplier.
      3. Every column from steps 1 and 2 is scaled with lagged rolling
         20%/80% quantiles and clipped to [-3, 3].
      4. The row mean of the scaled columns is multiplied by
         ``log1p(volume)``, the buy ratio and a tail weight; values below
         0.76 are replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``close``, ``volume`` and ``taker_buy_volume`` columns.
        The frame is read only; no column is added to it.

    Returns
    -------
    pandas.Series
        Factor values indexed like ``df``. Rows that are still in a warm-up
        window of any component, or whose value is below 0.76, are NaN.
    """
    eps = 1e-7  # keeps the divisions below away from a zero denominator

    close = df['close']
    volume = df['volume']
    # Held in a local Series rather than written to df['buy_ratio'], so the
    # caller's frame is not mutated (and no SettingWithCopyWarning is raised
    # when df is a slice of a larger frame).
    buy_ratio = df['taker_buy_volume'] / (volume + eps)

    price_change = close.pct_change()
    volatility = price_change.ewm(span=96, min_periods=24).std()
    vp_ratio = volume / (volatility + eps)

    periods = [96, 288, 672, 1440, 2880]
    vp_matrix = np.column_stack([
        vp_ratio.ewm(span=p, min_periods=int(p / 4)).mean()
        for p in periods
    ])

    # Percentile rank of the current volatility within its trailing window.
    vol_rank = volatility.rolling(window=672, min_periods=168).rank(pct=True)

    # Volatility-dependent tail threshold: tighter (0.20) when the rank is
    # low, looser (0.30) when it is high, 0.25 otherwise. NaN ranks fail both
    # comparisons and therefore fall back to 0.25.
    dynamic_tail_threshold = np.where(
        vol_rank < 0.3,
        0.20,
        np.where(vol_rank > 0.7, 0.30, 0.25)
    )

    # Neither of these depends on the horizon, so compute them once.
    is_tail = buy_ratio < dynamic_tail_threshold
    tail_boost = 1.8 - vol_rank * 0.6

    divergence_matrix = np.zeros((len(df), len(periods)))
    for i, p in enumerate(periods):
        price_momentum = close.pct_change(p)
        volume_momentum = volume.pct_change(p)
        flow_momentum = buy_ratio.diff(p)

        # A divergence is a strict sign disagreement with the price move.
        price_volume_div = np.sign(price_momentum) * np.sign(volume_momentum) < 0
        price_flow_div = np.sign(price_momentum) * np.sign(flow_momentum) < 0
        div_strength = (price_volume_div.astype(int) + price_flow_div.astype(int)) * np.abs(flow_momentum)

        # Amplify the divergence on tail rows.
        divergence_matrix[:, i] = np.where(is_tail, div_strength * tail_boost, div_strength)

    combined_matrix = np.concatenate((vp_matrix, divergence_matrix), axis=1)

    # Quantile scaling, column by column.
    scaled_std = np.zeros_like(combined_matrix)
    window = 2800
    min_periods = 168
    for i in range(combined_matrix.shape[1]):
        col = pd.Series(combined_matrix[:, i])

        # shift(1): the quantiles only see values strictly before the
        # current row.
        lagged = col.shift(1).rolling(window=window, min_periods=min_periods)
        rolling_q20 = lagged.quantile(0.20)
        rolling_q80 = lagged.quantile(0.80)

        denominator = rolling_q80 - rolling_q20
        # When the quantile spread is (almost) zero, fall back to 1% of the
        # recent mean absolute level of the column.
        adaptive_threshold = 0.01 * col.abs().rolling(288, min_periods=72).mean()
        denominator = np.where(denominator < 1e-5, adaptive_threshold, denominator)

        scaled_col = (col - rolling_q20) / (denominator + eps)
        scaled_std[:, i] = np.clip(scaled_col, -3, 3)

    # Final factor: tail rows get a weight between 1.5 and 2 depending on the
    # volatility rank, all other rows get 1.
    tail_weight = np.where(is_tail, 2 - vol_rank * 0.5, 1.0)
    factor_vals = np.mean(scaled_std, axis=1) * np.log1p(volume) * buy_ratio * tail_weight

    # Keep the original timestamps; values below 0.76 are masked to NaN.
    factor_series = pd.Series(
        np.where(factor_vals < 0.76, np.nan, factor_vals),
        index=df.index
    )

    return factor_series

In [23]:
# Evaluate the candidate factor on df; the result is a Series indexed like df.
sig = factor(df)

In [24]:
# Store the new factor as column 'sig'. Assigning a Series aligns it on the
# index of `factors`; labels missing from `sig` become NaN.
factors['sig'] = sig

In [25]:
# Pairwise correlation between all columns of `factors`, including 'sig'
# (DataFrame.corr defaults to Pearson and skips NaN pairwise).
corr_matrix = factors.corr()

In [27]:
# Correlation of the new factor with every other column. The row is looked
# up by label rather than by position, so the result does not depend on
# 'sig' being the last column of the correlation matrix.
sig_corr = corr_matrix.loc['sig']

# 1. Absolute correlations, strongest first (NaN sorts to the end)
abs_corr = sig_corr.abs().sort_values(ascending=False)

# 2. Build the correlation report
corr_report = pd.DataFrame({
    'Factor': abs_corr.index,
    'Correlation': sig_corr[abs_corr.index],
    'Absolute_Correlation': abs_corr.values
})

# 3. Drop the raw bar columns and the factor itself
base_columns = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 
                'turnover', 'trade_count', 'taker_buy_volume', 'taker_buy_turnover']
# .copy() makes factor_corr an independent frame, so adding the 'Rank'
# column below is not a chained assignment on a slice of corr_report.
factor_corr = corr_report[
    ~corr_report['Factor'].isin(base_columns) & 
    (corr_report['Factor'] != 'sig')
].copy()

# 4. Rank by absolute correlation. Non-finite values are treated as 0 so
#    they receive the lowest rank instead of breaking the integer cast.
abs_corr_clean = (
    factor_corr['Absolute_Correlation']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
factor_corr['Rank'] = abs_corr_clean.rank(ascending=False, method='min').astype(int)

# 5. Display options for a readable printout
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.4f}'.format)

# 6. Print the ranked correlation report
print("\n" + "="*80)
print("新因子 'VCF' 与因子库中老因子的相关性绝对值排名")
print("="*80)
print(factor_corr[['Rank', 'Factor', 'Correlation', 'Absolute_Correlation']].reset_index(drop=True))


新因子 'VCF' 与因子库中老因子的相关性绝对值排名
     Rank                             Factor  Correlation  Absolute_Correlation
0       1                           c_chu019       0.4276                0.4276
1       2                           c_chu001       0.3564                0.3564
2       3                           c_chu060       0.3399                0.3399
3       4                           c_chu028       0.3191                0.3191
4       5                         c_hide_013       0.3152                0.3152
5       6                         c_hide_019       0.3091                0.3091
6       7                         c_hide_007       0.3089                0.3089
7       8                         c_hide_012       0.3083                0.3083
8       9                           c_chu055       0.3073                0.3073
9      10                         c_hide_006       0.3048                0.3048
10     11                         c_hide_025       0.3031                0.3031
11     12  

In [28]:
# 7. Extra analysis: list library factors that are highly correlated with
#    the new factor.
high_corr_threshold = 0.6  # |corr| above this counts as highly correlated
high_corr_factors = factor_corr.loc[factor_corr['Absolute_Correlation'] > high_corr_threshold]

separator = "-"*80
if high_corr_factors.empty:
    # Nothing exceeds the threshold.
    print("\n" + separator)
    print(f"良好：没有发现高度相关因子 (|corr| > {high_corr_threshold})")
    print(separator)
else:
    # Warn and list every offending factor with its signed correlation and rank.
    print("\n" + separator)
    print(f"警告：发现 {len(high_corr_factors)} 个高度相关因子 (|corr| > {high_corr_threshold})")
    print(separator)
    for _, row in high_corr_factors.iterrows():
        print(f"因子 '{row['Factor']}': 相关性 = {row['Correlation']:.4f} (绝对值排名 #{row['Rank']})")


--------------------------------------------------------------------------------
良好：没有发现高度相关因子 (|corr| > 0.6)
--------------------------------------------------------------------------------
