In [7]:
import numpy as np
import pandas as pd

import sys
sys.path.append('/public/src')

from factor_evaluation_server import FactorEvaluation, DataService
# DataService is a project-local helper imported from factor_evaluation_server.
# NOTE(review): presumably ds[...] returns a time-indexed DataFrame of 15-minute
# ETHUSDT bars, so the label slice keeps rows from 2021-10-01 onward — confirm.
ds = DataService()
df = ds['ETHUSDT_15m_2020_2025']['2021-10-01':]

# Load the existing factor library from a pickle on the shared volume.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# because it comes from a trusted, internally produced location.
factor_path = "/public/data/factor_data/ETHUSDT_15m_2020_2025_factor_data.pkl"
factors = pd.read_pickle(factor_path)

# Display the first rows as the cell output for a quick sanity check.
factors.head()

Unnamed: 0_level_0,open_time,open,high,low,close,volume,close_time,turnover,trade_count,taker_buy_volume,...,c_hide_023,c_hide_024,c_hide_025,c_hide_026,c_hide_027,c_hide_028,c_hide_029,c_hide_030,c_hide_031,c_hide_032
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 00:00:00,1633046400000,2999.45,3026.26,2995.88,3011.48,37953.687,1633047299999,114253757.6028,42570,20397.659,...,0.0073,0.0105,0.0051,0.0008,0.0026,0.0001,0.0029,0.0074,0.0104,0.005
2021-10-01 00:15:00,1633047300000,3011.48,3023.91,3005.6,3022.95,13523.609,1633048199999,40776341.6975,21048,7160.391,...,0.008,0.003,0.0029,0.0178,0.0162,0.0111,0.015,0.019,0.0081,0.0082
2021-10-01 00:30:00,1633048200000,3022.95,3040.54,3020.55,3029.37,28750.477,1633049099999,87119698.8131,39255,15659.277,...,0.0033,0.0011,0.0055,0.0173,0.0096,0.0078,0.0096,0.0111,0.0089,0.0024
2021-10-01 00:45:00,1633049100000,3029.11,3029.68,3014.8,3017.92,15488.049,1633049999999,46764677.9905,23806,5829.698,...,0.0053,0.009,0.0088,0.018,0.0122,0.0106,0.0151,0.0054,0.0017,0.0019
2021-10-01 01:00:00,1633050000000,3017.91,3023.39,2996.0,3000.99,25731.294,1633050899999,77375000.6151,34185,10056.814,...,0.0037,0.0039,0.0003,0.013,0.0146,0.0069,0.0098,0.0106,0.003,0.0066


In [8]:
import math

def factor(df, window_size=96, flip_quantile=0.107, zero_quantile=0.92):
    """
    Compute the post-processed RSRJV factor (normalised up/down realised jump
    variation asymmetry) from the ``close`` column of ``df``.

    Raw statistic, evaluated for every bar ``i >= window_size`` over the
    ``n = window_size - 1`` close-to-close log returns ``r`` that end at bar
    ``i`` (i.e. the returns inside a window of ``window_size`` closes)::

        RV    = sum(r^2)
        BV    = mu1^-2 * n/(n-1) * sum(|r_j| * |r_{j+1}|),   mu1 = sqrt(2/pi)
        RV+   = sum(r^2 for r > 0),   RV- = sum(r^2 for r < 0)
        RJVP  = max(RV+ - BV/2, 0),   RJVN = max(RV- - BV/2, 0)
        RSRJV = (RJVP - RJVN) / RV    (0 when RV is not > 0)

    Post-processing applied to the whole series, in this order:

    1. the raw values are negated;
    2. values at or below the ``flip_quantile`` quantile have their sign
       flipped;
    3. values at or above the ``zero_quantile`` quantile (computed after
       step 2) are set to 0.

    NOTE: both thresholds are quantiles of the complete output series, so the
    post-processed value at a given bar depends on the entire sample,
    including later bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``close`` column; its index becomes the result index.
    window_size : int, default 96
        Number of closes per window (``window_size - 1`` returns). Must be
        at least 3 so that the ``n/(n-1)`` bipower scaling is defined.
    flip_quantile : float, default 0.107
        Quantile level at or below which values are sign-flipped.
    zero_quantile : float, default 0.92
        Quantile level at or above which values are zeroed.

    Returns
    -------
    pandas.Series
        Factor values indexed like ``df``; the first ``window_size`` entries
        are NaN.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 3.
    """
    if window_size < 3:
        raise ValueError("window_size must be >= 3")

    # log_returns[k] is the log return from bar k to bar k + 1.
    close_prices = df['close'].values
    log_returns = np.log(close_prices[1:]) - np.log(close_prices[:-1])

    # Element-wise quantities do not depend on the window, so compute them
    # once and slice them inside the loop.
    squared_returns = log_returns ** 2
    abs_returns = np.abs(log_returns)
    adjacent_products = abs_returns[:-1] * abs_returns[1:]

    # Every window holds exactly n returns, so the bipower scale is constant.
    n = window_size - 1
    mu1 = math.sqrt(2 / math.pi)
    bv_scale = (1 / (mu1 ** 2)) * (n / (n - 1))

    rsrjv_values = np.full(len(df), np.nan)

    for i in range(window_size, len(df)):
        start_idx = i - window_size + 1

        window_returns = log_returns[start_idx:i]
        window_squares = squared_returns[start_idx:i]

        # Total realised variance and bipower variation of the window.
        rv = np.sum(window_squares)
        bv = bv_scale * np.sum(adjacent_products[start_idx:i - 1])

        # Realised semivariances from positive / negative returns.
        rv_plus = np.sum(window_squares[window_returns > 0])
        rv_minus = np.sum(window_squares[window_returns < 0])

        # Signed jump variations, floored at zero.
        rjvp = max(rv_plus - bv / 2, 0)
        rjvn = max(rv_minus - bv / 2, 0)

        # Normalise the asymmetry; a window without positive RV yields 0.
        rsrjv_values[i] = (rjvp - rjvn) / rv if rv > 0 else 0

    # The factor is the negated raw statistic.
    factor_series = -pd.Series(rsrjv_values, index=df.index)

    # Flip the sign of the bottom tail (values <= the flip_quantile quantile).
    bottom_threshold = factor_series.quantile(flip_quantile)
    bottom_mask = factor_series <= bottom_threshold
    factor_series.loc[bottom_mask] = -factor_series.loc[bottom_mask]

    # Zero out the top tail; the threshold is taken after the flip above.
    top_threshold = factor_series.quantile(zero_quantile)
    factor_series.loc[factor_series >= top_threshold] = 0

    return factor_series

In [9]:
# Evaluate the factor on the price frame with the function's default settings.
sig = factor(df)

In [10]:
# Attach the new factor to the factor table as column 'sig'. pandas aligns the
# Series to the frame's index on assignment; labels missing from `sig` become NaN.
# NOTE(review): assumes `df` and `factors` share the same time index — confirm.
factors['sig'] = sig

In [11]:
# Pairwise correlation between all columns of the factor table, 'sig' included
# (DataFrame.corr defaults to Pearson correlation).
corr_matrix = factors.corr()

In [12]:
# Correlation of the new factor with every column of the factor table.
# The row is selected by label so the result does not silently depend on
# 'sig' happening to be the last column of the correlation matrix.
sig_corr = corr_matrix.loc['sig']

# 1. Order the columns by absolute correlation, strongest first.
abs_corr = sig_corr.abs().sort_values(ascending=False)

# 2. Assemble the report: factor name, signed correlation, absolute correlation.
corr_report = pd.DataFrame({
    'Factor': abs_corr.index,
    'Correlation': sig_corr[abs_corr.index],
    'Absolute_Correlation': abs_corr.values
})

# 3. Drop the raw market-data columns and the new factor itself.
#    .copy() makes the result an independent frame, so adding the 'Rank'
#    column below does not write into a slice of corr_report.
base_columns = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 
                'turnover', 'trade_count', 'taker_buy_volume', 'taker_buy_turnover']
factor_corr = corr_report[
    ~corr_report['Factor'].isin(base_columns) & 
    (corr_report['Factor'] != 'sig')
].copy()

# 4. Rank by absolute correlation. Non-finite values are treated as 0 so that
#    the integer cast cannot fail and such factors rank last.
abs_corr_clean = (
    factor_corr['Absolute_Correlation']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
factor_corr['Rank'] = abs_corr_clean.rank(ascending=False, method='min').astype(int)

# 5. Display options for the printed report (these are global pandas settings).
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.4f}'.format)

# 6. Print the ranked correlation report.
# NOTE(review): the title names the factor 'VCF' while the column analysed is
# 'sig' (RSRJV) — the label looks copied from another notebook; confirm.
print("\n" + "="*80)
print("新因子 'VCF' 与因子库中老因子的相关性绝对值排名")
print("="*80)
print(factor_corr[['Rank', 'Factor', 'Correlation', 'Absolute_Correlation']].reset_index(drop=True))


新因子 'VCF' 与因子库中老因子的相关性绝对值排名
     Rank                             Factor  Correlation  Absolute_Correlation
0       1               ret_rsi_bb_ma_signal      -0.2375                0.2375
1       2  ret_ma120_bolling_cross_sig_price      -0.2364                0.2364
2       3                           c_chu006      -0.1764                0.1764
3       4                           c_chu053      -0.1573                0.1573
4       5              ret_ma120_bbi_signals      -0.1424                0.1424
5       6                           c_chu021       0.1381                0.1381
6       7                           c_chu061      -0.1316                0.1316
7       8         ret_dc_bbi_cross_sig_price      -0.1175                0.1175
8       9     ret_ma20_ma120_cross_sig_price      -0.1094                0.1094
9      10                           c_chu015       0.1073                0.1073
10     11                           c_chu057      -0.1065                0.1065
11     12  

In [13]:
# 7. Extra check: list library factors whose absolute correlation with the new
#    factor exceeds the threshold.
high_corr_threshold = 0.6  # |corr| strictly above this counts as "highly correlated"
exceeds_threshold = factor_corr['Absolute_Correlation'] > high_corr_threshold
high_corr_factors = factor_corr[exceeds_threshold]

divider = "-" * 80
print("\n" + divider)
if high_corr_factors.empty:
    # Nothing above the threshold.
    print(f"良好：没有发现高度相关因子 (|corr| > {high_corr_threshold})")
    print(divider)
else:
    # Summary line first, then one line per offending factor.
    print(f"警告：发现 {len(high_corr_factors)} 个高度相关因子 (|corr| > {high_corr_threshold})")
    print(divider)
    for name, corr, rank in zip(high_corr_factors['Factor'],
                                high_corr_factors['Correlation'],
                                high_corr_factors['Rank']):
        print(f"因子 '{name}': 相关性 = {corr:.4f} (绝对值排名 #{rank})")


--------------------------------------------------------------------------------
良好：没有发现高度相关因子 (|corr| > 0.6)
--------------------------------------------------------------------------------
