# 识别计算跳跃收益

## 导入模块

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import feather
import math
from scipy import stats
from tqdm.notebook import tqdm
import os
import swifter
from joblib import Parallel, delayed
%load_ext line_profiler

## 读入测试数据

In [2]:
price_1m = feather.read_dataframe('../data/StockPriceK1m/2024/StockPriceK1m_20240102.feather')
price_1m['date'] = pd.to_datetime(price_1m['date'].astype(str))
price_1d = feather.read_dataframe('../data/StockPriceK1d_20240630.feather')
price_1d['date'] = pd.to_datetime(price_1d['date'])

## 处理筛选日线数据

剔除上市不足 180 天、停牌、涨停股票

In [3]:
def IPO_time(price_1d, timedelta='180D'):
    start_date = price_1d['date'].min()
    idx_ipo = price_1d['date'] >= start_date + pd.Timedelta(timedelta)
    return price_1d[idx_ipo]

price_1d = price_1d.groupby('issue').apply(IPO_time)
price_1d = price_1d[(~price_1d['is_limit_buy'].astype(bool)) & (~price_1d['is_limit_sell'].astype(bool))]

In [4]:
idx_date = (price_1d['date'] >= '2019-01-01') & (price_1d['date'] <= '2024-12-31')
price_1d = price_1d[idx_date]
price_1d = price_1d.set_index(['issue', 'date'])

## 定义跳跃统计量

In [5]:
def mu(p: float):
    return (2 ** (p / 2)) * math.gamma((p + 1) / 2) / np.sqrt(np.pi)

mu1 = mu(1)
mu6 = mu(6)

def JS(ret, log_ret):
    n_series, n_points = ret.shape
    
    abs_log_ret = np.abs(log_ret)
    
    window_size = 6
    windows = np.lib.stride_tricks.sliding_window_view(
        abs_log_ret, window_shape=window_size, axis=1
    )
    prod_6 = np.prod(windows, axis=-1)
    sum_prod_6 = np.sum(prod_6, axis=1)
    
    coef_Omega = (mu6 / 9) * ((n_points ** 3) * (mu1 ** -6) / (n_points - 5))
    Omega_SwV = coef_Omega * sum_prod_6
    
    SwV_N = 2 * np.sum(ret - log_ret, axis=1)
    
    window_size_2 = 2
    windows_2 = np.lib.stride_tricks.sliding_window_view(
        abs_log_ret, window_shape=window_size_2, axis=1
    )
    prod_2 = np.prod(windows_2, axis=-1)
    sum_prod_2 = np.sum(prod_2, axis=1)
    
    coef_V = 1 / mu1
    V_01 = coef_V * sum_prod_2
    
    RV_N = np.sum(log_ret ** 2, axis=1)
    
    valid_mask = (Omega_SwV != 0) & (SwV_N != 0)
    js = np.full(n_series, np.nan)
    
    if np.any(valid_mask):
        valid_idx = np.where(valid_mask)[0]
        js[valid_idx] = n_points * (V_01[valid_idx] / np.sqrt(Omega_SwV[valid_idx])) * (1 - RV_N[valid_idx] / SwV_N[valid_idx])
    
    return js[0] if n_series == 1 else js

def pvalue(js: float):
    cdf = stats.norm.cdf(js, loc=0, scale=1)
    return 2 * min(cdf, 1 - cdf)

## 识别跳跃, 计算收益

In [6]:
def jump_identify(ret, log_ret):
    n = len(ret)
    jump = np.full(n, False, dtype=bool)
    med = np.median(ret)
    log_med = np.median(log_ret)
    ret_c = ret.copy()
    log_ret_c = log_ret.copy()
    js0 = JS(ret_c.reshape(1, n), log_ret_c.reshape(1, n))
    p = pvalue(js0)
    js0_pre = js0
    
    while (p < 0.05):
        ret_mat = np.tile(ret_c, (n, 1))
        log_ret_mat = np.tile(log_ret_c, (n, 1))
        np.fill_diagonal(ret_mat, med)
        np.fill_diagonal(log_ret_mat, log_med)
        js = JS(ret_mat, log_ret_mat)
        
        js_diff = np.abs(js0) - np.abs(js)
        idx_max = np.argmax(js_diff)
        jump[idx_max] = True
        ret_c[idx_max] = med
        log_ret_c[idx_max] = log_med
        js0 = JS(ret_c.reshape(1, n), log_ret_c.reshape(1, n))
        p = pvalue(js0)
        if js0 == js0_pre:
            break
        js0_pre = js0

    return np.any(jump), log_ret[jump].sum()

In [7]:
time_start = np.append(np.arange(93000, 113000, 500), np.arange(130000, 150000, 500))
time_start = time_start[np.where(time_start % 10000 < 6000)]
time_end =  time_start + 400

def jump_identify_price(price_1m, price_1d, date, date_next):
    issue = price_1m.iloc[0]['issue']
    prc = price_1m.copy()
    start_price = prc.loc[price_1m['time'].isin(time_start), 'open'].to_numpy()
    end_price = prc.loc[price_1m['time'].isin(time_end), 'close'].to_numpy()

    try:
        close_today = price_1d.loc[(issue, date), 'close']
        open_next = price_1d.loc[(issue, date), 'open']
    except KeyError:
        df_jump = pd.DataFrame({'issue': [issue], 'date': [date], 'jump': [False], 'ret_jump': [0.]})
        return df_jump
        
    start_price = np.append(start_price, close_today)
    end_price = np.append(end_price, close_today)
    
    ret = (end_price - start_price) / start_price
    log_ret = np.log(1 + ret)

    flag_jump, ret_jump = jump_identify(ret, log_ret)
    df_jump = pd.DataFrame({'issue': [issue], 'date': [date], 'jump': [flag_jump], 'ret_jump': [ret_jump]})
    return df_jump

def jump_identify_parallel(price_1m, price_1d, date, date_next):
    groups = list(price_1m.groupby('issue'))
    total_groups = len(groups)
    
    results = Parallel(n_jobs=-1, backend='loky')(
        delayed(jump_identify_price)(group[1], price_1d, '2024-01-02', '2024-01-03')
        for group in tqdm(groups, total=total_groups, desc="Processing stocks")
    )
    
    return pd.concat(results)

## 性能测试

### 单公司单日性能测试

In [8]:
prc = price_1m.loc[price_1m['issue'] == '000014']
date = '2024-01-02'
date_next = '2024-01-03'
%timeit -n 10 -r 3 jump_identify_price(prc, price_1d, date, date_next)

The slowest run took 16.06 times longer than the fastest. This could mean that an intermediate result is being cached.
19.5 ms ± 22.5 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)


逐行运行时间

In [9]:
prc = price_1m.loc[price_1m['issue'] == '000014']
date = '2024-01-02'
date_next = '2024-01-03'
%lprun -f jump_identify_price jump_identify_price(prc, price_1d, date, date_next)

Timer unit: 1e-07 s

Total time: 0.0094589 s
File: C:\Users\admin\AppData\Local\Temp\ipykernel_11892\4145610477.py
Function: jump_identify_price at line 5

Line #      Hits         Time  Per Hit   % Time  Line Contents
     5                                           def jump_identify_price(price_1m, price_1d, date, date_next):
     6         1       4498.0   4498.0      4.8      issue = price_1m.iloc[0]['issue']
     7         1       2530.0   2530.0      2.7      prc = price_1m.copy()
     8         1      12269.0  12269.0     13.0      start_price = prc.loc[price_1m['time'].isin(time_start), 'open'].to_numpy()
     9         1       8448.0   8448.0      8.9      end_price = prc.loc[price_1m['time'].isin(time_end), 'close'].to_numpy()
    10                                           
    11         1          4.0      4.0      0.0      try:
    12         1       4796.0   4796.0      5.1          close_today = price_1d.loc[(issue, date), 'close']
    13         1       3132.0   3132.

### 单线程 / 并行性能测试

apply 单线程运算

速度: 9.02s / 5096 公司

In [10]:
%%time
df_jump = price_1m.groupby('issue')[['issue', 'time', 'open', 'close']].apply(
    jump_identify_price,
    price_1d=price_1d,
    date='2024-01-02',
    date_next='2024-01-03'
)

CPU times: total: 9.27 s
Wall time: 9.63 s


使用 swifter 提速

速度: 6.03s / 5096 公司

In [11]:
%%time
df_jump = price_1m.swifter.groupby('issue')[['issue', 'time', 'open', 'close']].apply(
    jump_identify_price,
    price_1d=price_1d,
    date='2024-01-02',
    date_next='2024-01-03'
)

  0%|          | 0/8 [00:00<?, ?it/s]

2025-07-01 15:15:47,231	INFO worker.py:1917 -- Started a local Ray instance.


CPU times: total: 5.67 s
Wall time: 15.3 s


使用 joblib 并行运算

速度：18min4s / 5096 公司

In [12]:
# %%time
# df_jump = jump_identify_parallel(price_1m, price_1d, '2024-01-02', '2024-01-03')

## 读入分钟行情, 保存每日股价跳跃信息

In [13]:
%%time

time_series = price_1d.index.get_level_values('date')
time_series = time_series.unique().sort_values()
tqdm_time_series = tqdm(zip(time_series[:-1],time_series[1:]),
                        total=len(time_series) - 1, desc='Processing Daily Stocks', unit='days')

for date, date_next in tqdm_time_series:
    year = date.year
    date_str = date.strftime('%Y%m%d')
    price_1m = feather.read_dataframe(f'../data/StockPriceK1m/{year}/StockPriceK1m_{date_str}.feather')
    df_jump = price_1m.groupby('issue')[['issue', 'time', 'open', 'close']].apply(
        jump_identify_price,
        price_1d=price_1d,
        date=date, date_next=date_next
    )
    df_jump = df_jump.reset_index(drop=True)

    os.makedirs(f'../data/jump/{year}/', exist_ok=True)
    feather.write_dataframe(df_jump, f'../data/jump/{year}/jump_{date_str}.feather')

Processing Daily Stocks:   0%|          | 0/1330 [00:00<?, ?days/s]

CPU times: total: 3h 10min 53s
Wall time: 3h 13min 54s


In [17]:
jump = pd.DataFrame(columns=['issue', 'date', 'jump', 'ret_jump'])
for date in time_series[:-1]:
    year = date.year
    date_str = date.strftime('%Y%m%d')
    jump_daily = feather.read_dataframe(f'../data/jump/{year}/jump_{date_str}.feather')
    if jump.empty:
        jump = jump_daily
    else:
        jump = pd.concat([jump, jump_daily])
jump = jump[jump['jump']]
jump['year_mon'] = jump['date'].dt.year * 100 + jump['date'].dt.month
feather.write_dataframe(jump, '../data/jump/jump.feather')