# 识别计算跳跃收益

## 导入模块

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import feather
import math
from scipy import stats
import swifter
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
%load_ext line_profiler

## 读入测试数据

In [2]:
price_1m = feather.read_dataframe('../data/2024/StockPriceK1m_20240102.feather')
price_1m['date'] = pd.to_datetime(price_1m['date'].astype(str))
price_1d = feather.read_dataframe('../data/StockPriceK1d_20240630.feather')
price_1d['date'] = pd.to_datetime(price_1d['date'])
idx_date = (price_1d['date'] >= '2019-01-01') & (price_1d['date'] <= '2024-12-31')
price_1d = price_1d[idx_date]
price_1d = price_1d.set_index(['issue', 'date'])

In [3]:
print(len(price_1m['issue'].unique()))

5096


## 定义跳跃统计量

In [4]:
def mu(p: float):
    return (2 ** (p / 2)) * math.gamma((p + 1) / 2) / np.sqrt(np.pi)

mu1 = mu(1)
mu6 = mu(6)

def JS(ret, log_ret):
    n_series, n_points = ret.shape
    
    abs_log_ret = np.abs(log_ret)
    
    window_size = 6
    windows = np.lib.stride_tricks.sliding_window_view(
        abs_log_ret, window_shape=window_size, axis=1
    )
    prod_6 = np.prod(windows, axis=-1)
    sum_prod_6 = np.sum(prod_6, axis=1)
    
    coef_Omega = (mu6 / 9) * ((n_points ** 3) * (mu1 ** -6) / (n_points - 5))
    Omega_SwV = coef_Omega * sum_prod_6
    
    SwV_N = 2 * np.sum(ret - log_ret, axis=1)
    
    window_size_2 = 2
    windows_2 = np.lib.stride_tricks.sliding_window_view(
        abs_log_ret, window_shape=window_size_2, axis=1
    )
    prod_2 = np.prod(windows_2, axis=-1)
    sum_prod_2 = np.sum(prod_2, axis=1)
    
    coef_V = 1 / mu1
    V_01 = coef_V * sum_prod_2
    
    RV_N = np.sum(log_ret ** 2, axis=1)
    
    valid_mask = (Omega_SwV != 0) & (SwV_N != 0)
    js = np.full(n_series, np.nan)
    
    if np.any(valid_mask):
        valid_idx = np.where(valid_mask)[0]
        js[valid_idx] = n_points * (V_01[valid_idx] / np.sqrt(Omega_SwV[valid_idx])) * (1 - RV_N[valid_idx] / SwV_N[valid_idx])
    
    return js[0] if n_series == 1 else js

def pvalue(js: float):
    cdf = stats.norm.cdf(js, loc=0, scale=1)
    return 2 * min(cdf, 1 - cdf)

## 识别跳跃, 计算收益

In [5]:
def jump_identify(ret, log_ret):
    n = len(ret)
    jump = np.full(n, False, dtype=bool)
    med = np.median(ret)
    log_med = np.median(log_ret)
    ret_c = ret.copy()
    log_ret_c = log_ret.copy()
    js0 = JS(ret_c.reshape(1, n), log_ret_c.reshape(1, n))
    p = pvalue(js0)
    js0_pre = js0
    
    while (p < 0.05):
        ret_mat = np.tile(ret_c, (n, 1))
        log_ret_mat = np.tile(log_ret_c, (n, 1))
        np.fill_diagonal(ret_mat, med)
        np.fill_diagonal(log_ret_mat, log_med)
        js = JS(ret_mat, log_ret_mat)
        
        js_diff = np.abs(js0) - np.abs(js)
        idx_max = np.argmax(js_diff)
        jump[idx_max] = True
        ret_c[idx_max] = med
        log_ret_c[idx_max] = log_med
        js0 = JS(ret_c.reshape(1, n), log_ret_c.reshape(1, n))
        p = pvalue(js0)
        if js0 == js0_pre:
            break
        js0_pre = js0

    return np.any(jump), log_ret[jump].sum()

In [6]:
time_start = np.append(np.arange(93000, 113000, 500), np.arange(130000, 150000, 500))
time_start = time_start[np.where(time_start % 10000 < 6000)]
time_end =  time_start + 400

def jump_identify_price(price_1m, price_1d, date, date_next):
    issue = price_1m.iloc[0]['issue']
    prc = price_1m.copy()
    start_price = prc.loc[price_1m['time'].isin(time_start), 'open'].to_numpy()
    end_price = prc.loc[price_1m['time'].isin(time_end), 'close'].to_numpy()
    
    close_today = price_1d.loc[(issue, date), 'close']
    open_next = price_1d.loc[(issue, date), 'open']
    start_price = np.append(start_price, close_today)
    end_price = np.append(end_price, close_today)
    
    ret = (end_price - start_price) / start_price
    log_ret = np.log(1 + ret)

    flag_jump, ret_jump = jump_identify(ret, log_ret)
    df_jump = pd.DataFrame({'issue': [issue], 'jump': [flag_jump], 'ret_jump': [ret_jump]})
    return df_jump

def jump_identify_parallel(price_1m, price_1d, date, date_next):
    groups = list(price_1m.groupby('issue'))
    total_groups = len(groups)
    
    results = Parallel(n_jobs=-1, backend='loky')(
        delayed(jump_identify_price)(group[1], price_1d, '2024-01-02', '2024-01-03')
        for group in tqdm(groups, total=total_groups, desc="Processing stocks")
    )
    
    return pd.concat(results)

## 性能测试

### 单公司单日性能测试

In [7]:
prc = price_1m.loc[price_1m['issue'] == '000014']
date = '2024-01-02'
date_next = '2024-01-03'
%timeit -n 10 -r 3 jump_identify_price(prc, price_1d, date, date_next)

The slowest run took 15.48 times longer than the fastest. This could mean that an intermediate result is being cached.
16.4 ms ± 19.2 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)


逐行运行时间

In [8]:
prc = price_1m.loc[price_1m['issue'] == '000014']
date = '2024-01-02'
date_next = '2024-01-03'
%lprun -f jump_identify_price jump_identify_price(prc, price_1d, date, date_next)

Timer unit: 1e-07 s

Total time: 0.0083049 s
File: C:\Users\admin\AppData\Local\Temp\ipykernel_19436\22860960.py
Function: jump_identify_price at line 5

Line #      Hits         Time  Per Hit   % Time  Line Contents
     5                                           def jump_identify_price(price_1m, price_1d, date, date_next):
     6         1       5782.0   5782.0      7.0      issue = price_1m.iloc[0]['issue']
     7         1       2888.0   2888.0      3.5      prc = price_1m.copy()
     8         1      12891.0  12891.0     15.5      start_price = prc.loc[price_1m['time'].isin(time_start), 'open'].to_numpy()
     9         1      11297.0  11297.0     13.6      end_price = prc.loc[price_1m['time'].isin(time_end), 'close'].to_numpy()
    10                                           
    11         1       4620.0   4620.0      5.6      close_today = price_1d.loc[(issue, date), 'close']
    12         1       3146.0   3146.0      3.8      open_next = price_1d.loc[(issue, date), 'open']


### 单线程 / 并行性能测试

apply 单线程运算

速度: 9.02s / 5096 公司

In [9]:
%%time
df_jump = price_1m.groupby('issue')[['issue', 'time', 'open', 'close']].apply(
    jump_identify_price,
    price_1d=price_1d,
    date='2024-01-02',
    date_next='2024-01-03'
)

CPU times: total: 9.53 s
Wall time: 9.84 s


使用 swifter 提速

速度: 6.03s / 5096 公司

In [10]:
%%time
df_jump = price_1m.swifter.groupby('issue')[['issue', 'time', 'open', 'close']].apply(
    jump_identify_price,
    price_1d=price_1d,
    date='2024-01-02',
    date_next='2024-01-03'
)

  0%|          | 0/8 [00:00<?, ?it/s]

2025-07-01 13:41:17,091	INFO worker.py:1917 -- Started a local Ray instance.


CPU times: total: 5.72 s
Wall time: 14.9 s


使用 joblib 并行运算

速度：18min4s / 5096 公司

In [11]:
# %%time
# df_jump = jump_identify_parallel(price_1m, price_1d, '2024-01-02', '2024-01-03')

## 处理筛选日线数据

In [12]:
price_1d

Unnamed: 0_level_0,Unnamed: 1_level_0,preclose,open,high,low,close,numTrades,volume,value,adj,ret,is_limit_buy,is_limit_sell
issue,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
000001,2019-01-02,9.38,9.39,9.42,9.16,9.19,25140,53938632,4.986951e+08,108.031388,-0.020256,0,0
000001,2019-01-03,9.19,9.18,9.33,9.15,9.28,19151,41553795,3.844577e+08,108.031388,0.009793,0,0
000001,2019-01-04,9.28,9.24,9.82,9.22,9.75,59551,148115906,1.422150e+09,108.031388,0.050647,0,0
000001,2019-01-07,9.75,9.84,9.85,9.63,9.74,34912,86568766,8.411664e+08,108.031388,-0.001026,0,0
000001,2019-01-08,9.74,9.73,9.74,9.62,9.66,21454,40238811,3.892478e+08,108.031388,-0.008214,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
689009,2024-06-24,42.14,41.44,42.10,38.52,38.81,24300,11157140,4.452130e+08,1.007044,-0.079022,0,0
689009,2024-06-25,38.81,39.07,39.87,38.70,39.27,15605,6005606,2.358702e+08,1.007044,0.011853,0,0
689009,2024-06-26,39.27,38.96,40.59,38.63,39.00,15963,6964262,2.744881e+08,1.007044,-0.006875,0,0
689009,2024-06-27,39.00,39.00,39.00,36.18,37.17,26804,10932445,4.051810e+08,1.007044,-0.046923,0,0


## 读入分钟行情, 保存每日股价跳跃信息