# 跳跃关联动量因子

## 导入模块

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import feather
import statsmodels.api as sm
from tqdm.notebook import tqdm
import os
import sys
%load_ext line_profiler

## 读入日线数据

### 日线数据

In [2]:
price_1d = feather.read_dataframe('../data/StockPriceK1d_20241231.feather')
price_1d = price_1d[(price_1d['date'] >= '2019-01-01') & (price_1d['date'] <= '2024-12-31')]

### 跳跃收益数据

In [3]:
jump = feather.read_dataframe('../data/jump/jump.feather')
price_1d = pd.merge(
    price_1d,
    jump[['issue', 'date', 'ret_jump', 'ret_nojump', 'ret_posjump', 'ret_negjump']],
    on=['issue', 'date'],
    how='left'
)

In [4]:
price_1d.loc[
    (price_1d['ret'] < 0.01) & (price_1d['ret'] > -0.01),
    ['ret_jump', 'ret_nojump', 'ret_posjump', 'ret_negjump']
] = np.nan

price_1d['log_ret'] = np.log(1 + price_1d['ret'])
price_1d['ret_nojump'] = price_1d['ret_nojump'].fillna(price_1d['log_ret'])
price_1d = price_1d.fillna(0.)
price_1d['ret_without_posjump'] = price_1d['ret_nojump'] + price_1d['ret_negjump']

In [5]:
price_1d = price_1d.set_index('issue')

### 调仓日

In [6]:
start_date = '2019-08-01'
end_date = '2025-01-01'
df_adj = feather.read_dataframe('../data/adj_date_daily.feather')
adj_date = df_adj.loc[
    (df_adj['adj_date'] >= start_date) &
    (df_adj['adj_date'] <= end_date), 'adj_date'
].to_list()

## 计算过去 20 天收益率

In [7]:
%%time
ret_cols = ['log_ret', 'ret_nojump', 'ret_posjump', 'ret_negjump', 'ret_without_posjump']
ret_20_cols = [col + '_20' for col in ret_cols]
price_1d[ret_20_cols] = (
    price_1d
        .groupby('issue')[ret_cols]
        .transform(lambda x: x.rolling(20).apply(np.sum, raw=True))
)
price_1d['ret_20'] = np.exp(price_1d['log_ret_20']) - 1

CPU times: total: 2min
Wall time: 2min 4s


## 跳跃关联动量

### 读入相关性 & 矩阵展平 & 稀疏化处理

In [8]:
def get_correlation(date, factor_type: str):
    year = date.year
    date_str = date.strftime('%Y%m%d')
    corr = feather.read_dataframe(f'../data/corr_daily/{year}/corr_{factor_type}_{date_str}.feather')
    np.fill_diagonal(corr.values, 0)
    corr = (
        corr
            .stack()
            .rename('corr')
            .rename_axis(['issue_i', 'issue_j'])
    )
    corr = corr.reset_index()
    
    med = corr.loc[corr['corr'] > 0, 'corr'].median()
    corr.loc[corr['corr'] < med, 'corr'] = 0

    return corr

In [9]:
def get_correlation_sparse(date, factor_type: str, upper_only: bool = True):
    year = date.year
    date_str = date.strftime('%Y%m%d')
    corr = feather.read_dataframe(f'../data/corr_daily/{year}/corr_{factor_type}_{date_str}.feather')
    corr_np = corr.values
    np.fill_diagonal(corr_np, 0.0)

    pos = corr_np > 0
    med = np.nanmedian(corr_np[pos]) if np.any(pos) else 0.0
    mask = (corr_np >= med)
    idx_i, idx_j = np.where(mask)
    
    return pd.DataFrame({
        'issue_i': corr.index.values[idx_i],
        'issue_j': corr.columns.values[idx_j],
        'corr': corr_np[mask]
    })

### 计算绝对动量

In [10]:
def peer_ret_calc(corr_ret:pd.DataFrame, ret_cols=['ret_20']):
    nume = corr_ret[ret_cols].mul(corr_ret['corr'], axis=0).sum()
    deno = corr_ret['corr'].sum()
    if deno == 0:
        return None
    peer_ret = nume / deno
    return peer_ret

def get_peer_abs_ret(date, corr:pd.DataFrame, ret_cols=['ret_20'], peer_cols=None):
    prc_date = price_1d.loc[price_1d['date'] == date, ret_cols].reset_index()
    corr_ret = pd.merge(
        corr,
        prc_date,
        left_on='issue_j',
        right_on='issue',
        how='left'
    )
    corr_ret = corr_ret.fillna(0.)
    
    peer_ret = (
        corr_ret
            .groupby('issue_i')[['corr'] + ret_cols]
            .apply(peer_ret_calc, ret_cols=ret_cols)
    )
    peer_ret = peer_ret.dropna()
    if peer_cols != None:
        map_ret_peer = {ret: peer for ret, peer in zip(ret_cols, peer_cols)}
    else:
        map_ret_peer = {ret: ret.replace('_20', '').replace('ret', 'peer') for ret in ret_cols}
    peer_ret = peer_ret.rename(columns=map_ret_peer)
    peer_ret = peer_ret.rename_axis('issue')
    peer_ret = peer_ret.reset_index()

    peer_ret = pd.merge(
        peer_ret,
        prc_date[['issue', 'ret_20']],
        on='issue',
        how='left'
    )
    peer_ret = peer_ret.dropna(subset='ret_20')
    peer_ret['date'] = date
    return peer_ret

In [11]:
def get_peer_abs_ret_fast(date, corr: pd.DataFrame,
                          ret_cols=['ret_20'], peer_cols=None):
    # 1) 取当日收益，并索引到 issue，避免后面 merge
    prc = price_1d.loc[price_1d['date'] == date, ret_cols]

    # 2) 用 join 替换 merge（基于索引的连接更快），只引入需要列
    #    corr 只保留必要列，避免大表拷贝
    c = corr[['issue_i', 'issue_j', 'corr']].copy()
    c = c.join(prc, on='issue_j', how='left')          # 在右侧按 issue_j 对应到 ret 列

    # 3) 只对 ret 列做 fillna(0)（不少数据里 ret 缺失要当 0 处理）
    if ret_cols:
        c[ret_cols] = c[ret_cols].fillna(0.0)

    # 4) 计算分子与分母：对每个 issue_i 汇总 corr*ret 与 corr
    #    注意：一次性对所有 ret_cols 做按行乘，再 groupby.sum
    weighted_ret_sum = c[ret_cols].multiply(c['corr'], axis=0)\
                                  .groupby(c['issue_i'], observed=True, sort=False).sum()
    corr_sum = c.groupby('issue_i', observed=True, sort=False)['corr'].sum()

    # 5) 加权均值（分母为 0 的行会产生 NaN，正好用于过滤）
    peer_ret = weighted_ret_sum.div(corr_sum, axis=0)

    # 6) 列名映射
    if peer_cols is not None:
        rename_map = {ret: peer for ret, peer in zip(ret_cols, peer_cols)}
    else:
        rename_map = {ret: ret.replace('_20', '').replace('ret', 'peer') for ret in ret_cols}
    peer_ret = peer_ret.rename(columns=rename_map)

    # 7) 把 index 变回列名 issue
    peer_ret.index.name = 'issue'
    peer_ret = peer_ret.reset_index()

    # 8) 只把需要的真实收益拼回（利用索引 reindex 避免再次 merge）
    #    这里把 ret_20 拼到 issue 上；如果你需要别的列也可以同理处理
    r20 = prc['ret_20'].rename('ret_20')
    peer_ret['ret_20'] = r20.reindex(peer_ret['issue']).values

    # 9) 丢掉 ret_20 为空的（极少量无法匹配的）
    peer_ret = peer_ret.dropna(subset=['peer', 'ret_20'])
    peer_ret['date'] = date
    return peer_ret

### 计算相对动量

In [12]:
def get_peer_relative_ret(peer_ret:pd.DataFrame, peer_cols=['peer_ret'], relative_cols=None, plot=False):
    x = peer_ret['ret_20']
    x = sm.add_constant(x)
    y = peer_ret[peer_cols]
    result = sm.OLS(y, x).fit()

    relative_ret = result.resid
    if relative_cols != None:
        map_peer_relative = {peer: relative for peer, relative in zip(peer_cols, relative_cols)}
    else:
        map_peer_relative = {peer: peer.replace('peer', 'relative') for peer in peer_cols}
    relative_ret = relative_ret.rename(columns=map_peer_relative)
    return relative_ret

### 规模运算 & 保存

In [13]:
def get_peer_ret_factor(date:np.datetime64, factor_type:str, ret_cols=['ret_20'], peer_cols=None, relative_cols=None):
    if peer_cols is None:
        peer_cols = [ret_col.replace('_20', '').replace('ret', 'peer') for ret_col in ret_cols]
    if relative_cols is None:
        relative_cols = [peer_col.replace('peer', 'relative') for peer_col in peer_cols]

    corr = get_correlation_sparse(date, factor_type)
    prd = get_peer_abs_ret_fast(
        date, corr,
        ret_cols=ret_cols, peer_cols=peer_cols
    )
    
    rrd = get_peer_relative_ret(
        prd,
        peer_cols=peer_cols, relative_cols=relative_cols
    )
    prd = pd.concat([prd, rrd], axis=1)
    return prd

#### 计算所有日期所有因子并保存

In [None]:
ret_cols = ['ret_20', 'ret_nojump_20', 'ret_posjump_20', 'ret_negjump_20', 'ret_without_posjump_20']
peer_cols = [ret_col.replace('_20', '').replace('ret', 'peer') for ret_col in ret_cols]
relative_cols = [peer_col.replace('peer', 'relative') for peer_col in peer_cols]
dirname = '../data/peer_ret_daily/'
os.makedirs(dirname, exist_ok=True)

for date in tqdm(adj_date):
    prd_num = get_peer_ret_factor(
        date, 'num',
        ret_cols=ret_cols,
        peer_cols=peer_cols,
        relative_cols=relative_cols
    )
    prd_size = get_peer_ret_factor(
        date, 'size',
        ret_cols=ret_cols,
        peer_cols=peer_cols,
        relative_cols=relative_cols
    )
    year = date.year
    date_str = date.strftime('%Y%m%d')
    os.makedirs(dirname + f'/{year}/', exist_ok=True)
    feather.write_dataframe(prd_num, dirname + f'/{year}/prd_num_{date_str}.feather')
    feather.write_dataframe(prd_size, dirname + f'/{year}/prd_size_{date_str}.feather')
    del prd_num, prd_size

  0%|          | 0/1315 [00:00<?, ?it/s]

#### 整合因子

In [None]:
dirname = '../data/peer_ret_daily/'
peer_ret_num = None
peer_ret_size = None
for date in adj_date:
    year = date.year
    date_str = date.strftime('%Y%m%d')
    prd_num = feather.read_dataframe(dirname + f'/{year}/prd_num_{date_str}.feather')
    peer_ret_num = pd.concat([peer_ret_num, prd_num], axis=0)
    prd_size = feather.read_dataframe(dirname + f'/{year}/prd_size_{date_str}.feather')
    peer_ret_size = pd.concat([peer_ret_size, prd_size], axis=0)

for peer, relative in zip(peer_cols, relative_cols):
    partial_num = peer_ret_num[['date', 'issue', peer, relative]]
    partial_num = partial_num.rename(columns={peer: 'peer_ret', relative: 'peer_relative_ret'})
    feather.write_dataframe(partial_num, dirname + peer + '_num.feather')

    partial_size = peer_ret_size[['date', 'issue', peer, relative]]
    partial_size = partial_size.rename(columns={peer: 'peer_ret', relative: 'peer_relative_ret'})
    feather.write_dataframe(partial_size, dirname + peer + '_size.feather')

### 将非跳跃动量因子与负跳跃动量因子相加

In [None]:
peer_nojump_num = feather.read_dataframe('../data/peer_ret_daily/peer_nojump_num.feather')
peer_negjump_num = feather.read_dataframe('../data/peer_ret_daily/peer_negjump_num.feather')
peer_without_posjump_1_num = pd.DataFrame(data={
    'date': peer_nojump_num['date'],
    'issue': peer_nojump_num['issue'],
    'peer_relative_ret': peer_nojump_num['peer_relative_ret'] + peer_negjump_num['peer_relative_ret']
})
feather.write_dataframe(peer_without_posjump_1_num, '../data/peer_ret_daily/peer_without_posjump_1_num.feather')

peer_nojump_size = feather.read_dataframe('../data/peer_ret_daily/peer_nojump_size.feather')
peer_negjump_size = feather.read_dataframe('../data/peer_ret_daily/peer_negjump_size.feather')
peer_without_posjump_1_size = pd.DataFrame(data={
    'date': peer_nojump_size['date'],
    'issue': peer_nojump_size['issue'],
    'peer_relative_ret': peer_nojump_size['peer_relative_ret'] + peer_negjump_size['peer_relative_ret']
})
feather.write_dataframe(peer_without_posjump_1_size, '../data/peer_ret_daily/peer_without_posjump_1_size.feather')

## 等权复合频率 / 幅度因子

In [None]:
num = feather.read_dataframe('../data/peer_ret_daily/peer_without_posjump_num.feather')
size = feather.read_dataframe('../data/peer_ret_daily/peer_without_posjump_size.feather')
num = num.reset_index(drop=True)
size = size.reset_index(drop=True)
peer_without_posjump = pd.DataFrame(data={
    'date': num['date'],
    'issue': num['issue'],
    'peer_relative_ret': num['peer_relative_ret'] + size['peer_relative_ret']
})
feather.write_dataframe(peer_without_posjump, '../data/peer_ret_daily/peer_without_posjump.feather')