# 关联跳跃和跳跃关联度

## 导入模块

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import feather
from tqdm.notebook import tqdm
import os
import sys
%load_ext line_profiler

## 读入数据

### 读入日线数据及处理

In [2]:
# Load the daily bar table, restrict it to dates in
# [2019-01-01, 2024-12-31], then keep only rows with |ret| > 0.01.
price_1d = feather.read_dataframe('../data/StockPriceK1d_20241231.feather')
in_sample = price_1d['date'].between('2019-01-01', '2024-12-31')
price_1d = price_1d[in_sample]
large_move = price_1d['ret'].abs() > 0.01
price_1d = price_1d[large_move]

### 股价跳跃数据及处理

In [3]:
# Load the jump table and make sure `date` is a datetime column.
jump = feather.read_dataframe('../data/jump/jump.feather')
jump['date'] = pd.to_datetime(jump['date'])

# Keep only the (issue, date) pairs that survived the daily-price filter.
jump = jump.merge(price_1d[['issue', 'date']], how='inner', on=['issue', 'date'])

# Direction of each jump: the sign of `ret_jump` (-1, 0 or 1).
jump['sign'] = np.sign(jump['ret_jump'])

### 调仓日和对应的关联度计算窗口起始日 (含调仓日在内向前取 120 个交易日，不足时取最早的交易日)

In [4]:
# Sorted distinct dates present in the filtered daily price table.
trade_date = price_1d['date'].sort_values().unique()
start_date = '2019-01-01'
end_date = '2025-01-01'

# Calendar month-ends spanning the sample.
mes = pd.date_range(start_date, end_date, freq='1ME')
adj_date = np.array([], dtype=np.datetime64)
calc_start_date = np.array([], dtype=np.datetime64)

for me in mes:
    # All available dates up to and including this month-end.
    trade_date_before = trade_date[trade_date <= me]
    # Rebalance date: the last available date on or before the month-end.
    ad = trade_date_before[-1]
    # Window start: the 120th most recent date counting `ad` itself, or the
    # earliest available date when fewer than 120 exist.
    csd = trade_date_before[-120:][0]
    adj_date = np.append(adj_date, ad)
    calc_start_date = np.append(calc_start_date, csd)

### 公司列表

In [5]:
# For every rebalance date, list the distinct issues that appear in the daily
# price table inside the window [calc_start_date, adj_date], then stack the
# per-date lists into one table indexed by rebalance date.
#
# The per-date frames are collected in a list and concatenated once at the
# end; calling pd.concat inside the loop re-copies the accumulated rows on
# every iteration.
issues_frames = []
tqdm_date = tqdm(zip(adj_date, calc_start_date), total=len(adj_date))
for ad, csd in tqdm_date:
    idx_date = (price_1d['date'] >= csd) & (price_1d['date'] <= ad)
    issues_date = price_1d.loc[idx_date, 'issue'].sort_values().unique()
    issues_date = pd.DataFrame({'date': ad, 'issue': issues_date})
    issues_frames.append(issues_date)

if issues_frames:
    issues = pd.concat(issues_frames)
else:
    # No rebalance dates at all: keep the empty two-column layout.
    issues = pd.DataFrame(columns=['date', 'issue'])

issues = issues.set_index('date')
feather.write_dataframe(issues, '../data/issues.feather')

  0%|          | 0/72 [00:00<?, ?it/s]

In [6]:
# Sorted distinct issues across all rebalance dates.
issues_tot = pd.unique(issues['issue'].sort_values())

## 关联跳跃

### 前一日后一日跳跃方向

In [7]:
# Build `jump_expand`: one row per (issue, date) on which the issue jumped on
# that date, on the previous calendar day, or on the next calendar day.
#
# NOTE(review): the shifts below are +/- one *calendar* day, so a jump on the
# last date before a weekend or holiday is not linked to the next date that
# actually appears in the data -- confirm this is the intended definition of
# "previous / next day".

# Sign of the jump one calendar day earlier, re-dated onto the following day.
jump_plus1 = jump[['issue', 'date', 'sign']].copy()
jump_plus1['date'] = jump_plus1['date'] + pd.Timedelta('1d')
jump_plus1 = jump_plus1.rename(columns={'sign': 'sign_yest'})
jump_expand = pd.merge(
    jump.reset_index(),  # keeps the original row label of each jump in 'index'
    jump_plus1,
    on=['issue', 'date'],
    how='outer'
)

# Sign of the jump one calendar day later, re-dated onto the preceding day.
jump_minus1 = jump[['issue', 'date', 'sign']].copy()
jump_minus1['date'] = jump_minus1['date'] - pd.Timedelta('1d')
jump_minus1 = jump_minus1.rename(columns={'sign': 'sign_tomo'})
jump_expand = pd.merge(
    jump_expand,
    jump_minus1,
    on=['issue', 'date'],
    how='outer'
)

# Rows created only by the outer merges have no original jump: mark them with
# index -1, sign 0, jump False and ret_jump 0.
jump_expand['index'] = jump_expand['index'].fillna(-1.).astype(int)
jump_expand[['sign', 'sign_yest', 'sign_tomo']] = jump_expand[['sign', 'sign_yest', 'sign_tomo']].fillna(0.)
# Go through the nullable boolean dtype so the result is a plain bool column
# without relying on fillna's deprecated silent downcast of object columns.
jump_expand['jump'] = jump_expand['jump'].astype('boolean').fillna(False).astype(bool)
jump_expand['ret_jump'] = jump_expand['ret_jump'].fillna(0.)
# Year-month key encoded as YYYYMM.
jump_expand['year_mon'] = jump_expand['date'].dt.year * 100 + jump_expand['date'].dt.month

  jump_expand['jump'] = jump_expand['jump'].fillna(False)


### 关联跳跃矩阵

index: 每一次跳跃

column: 每一家公司

In [8]:
# All-False indicator matrix: one row per row of `jump`, one column per issue
# in `issues_tot`. It is filled in place by `identify_corr`.
global jump_corr
jump_corr = pd.DataFrame(
    np.zeros((len(jump.index), len(issues_tot)), dtype=bool),
    index=jump.index,
    columns=issues_tot,
)

### 找出一日内的关联跳跃

In [9]:
def identify_corr(jump_date):
    """Flag co-jumps for one date in the global ``jump_corr`` matrix.

    Parameters
    ----------
    jump_date : pandas.DataFrame
        Rows of a single date with columns ``issue``, ``index``, ``sign``,
        ``sign_yest`` and ``sign_tomo``. ``index`` holds the row label in
        ``jump_corr`` of the jump the row belongs to.

    Returns
    -------
    None
        ``jump_corr`` is modified in place.
    """
    global jump_corr

    neighbour_signs = jump_date[['sign', 'sign_yest', 'sign_tomo']]
    for direction in (1, -1):
        # Rows of jump_corr to update: jumps on this date with this direction.
        jump_rows = jump_date.loc[jump_date['sign'] == direction, 'index']
        # Columns to flag: issues with a same-direction jump on this date or
        # on either adjacent day.
        is_related = (neighbour_signs == direction).any(axis=1)
        related_issues = jump_date.loc[is_related, 'issue'].to_numpy()
        jump_corr.loc[jump_rows, related_issues] = True

### 关联跳跃 & 保存

In [10]:
%%time
jump_expand.groupby('date')[['issue', 'index', 'sign', 'sign_yest', 'sign_tomo']].apply(identify_corr)

CPU times: total: 24.6 s
Wall time: 24.6 s


In [11]:
%%time
os.makedirs('../data/corr/', exist_ok=True)
feather.write_dataframe(jump_corr, '../data/corr/jump_corr.feather')

CPU times: total: 2min 23s
Wall time: 38.6 s


## 跳跃关联度

### 一家公司一个调仓日的跳跃关联度

In [12]:
def corr_calc(jump, jump_corr_date):
    """Compute count-based and size-weighted co-jump ratios for one group.

    Parameters
    ----------
    jump : pandas.DataFrame
        Rows of one group, with columns ``jump`` and ``ret_jump``. Its index
        labels select the matching rows of ``jump_corr_date``.
    jump_corr_date : pandas.DataFrame
        Co-jump indicator matrix; rows are looked up by ``jump.index`` and
        every column produces one ratio.

    Returns
    -------
    pandas.DataFrame
        Two rows labelled 0 and 1, with one float column per column of
        ``jump_corr_date`` followed by a ``type`` column:

        * ``type == 'num'``: column sums of the selected indicator rows
          divided by the number of non-null ``jump`` entries.
        * ``type == 'size'``: column sums of the indicator rows weighted by
          ``|ret_jump|``, divided by the total ``|ret_jump|``.
    """
    related = jump_corr_date.loc[jump.index]

    # NOTE(review): count() is the number of non-null entries, so rows with
    # jump == False (if any exist) are counted too -- confirm that is intended.
    sum_num = jump['jump'].count()
    corr_num = related.sum() / sum_num

    abs_ret_jump = jump['ret_jump'].abs()
    sum_size = abs_ret_jump.sum()
    corr_size = related.mul(abs_ret_jump, axis=0).sum() / sum_size

    # Stack the two ratio vectors as float rows and attach the label as its
    # own column; writing the label into the numeric Series would upcast
    # every ratio to object dtype.
    result = pd.DataFrame(
        np.vstack([corr_num.to_numpy(), corr_size.to_numpy()]),
        columns=related.columns,
    )
    result['type'] = ['num', 'size']
    return result

### 所有公司每个调仓日的跳跃关联度

In [13]:
%%time

# remove_num = []
# remove_size = []

tqdm_date = tqdm(
    zip(adj_date[-7:], calc_start_date[-7:]),
    total=len(adj_date[-7:]),
    desc='Processing Adjusting Date',
    unit='days'
)

os.makedirs('../data/corr/', exist_ok=True)
os.makedirs('../data/N_connect/', exist_ok=True)

for ad, csd in tqdm_date:
    ad_str = ad.strftime('%Y%m%d')
    issues_date = issues.loc[ad, 'issue'].to_numpy()
    jump_corr_date = jump_corr.loc[(jump['date'] <= ad) & (jump['date'] >= csd), issues_date]
    jump_date = jump[(jump['date'] <= ad) & (jump['date'] >= csd)]
    
    corr = (
        jump_date
            .groupby('issue')[['jump', 'ret_jump']]
            .apply(corr_calc, jump_corr_date=jump_corr_date)
            .reset_index()
    )
    
    corr_num = (
        corr[corr['type'] == 'num']
            .drop(columns=['level_1', 'type'])
            .set_index('issue')
            .reindex(index=issues_date, columns=issues_date, fill_value=0)
    )
    corr_size = (
        corr[corr['type'] == 'size']
            .drop(columns=['level_1', 'type'])
            .set_index('issue')
            .reindex(index=issues_date, columns=issues_date, fill_value=0)
    )
    feather.write_dataframe(corr_num, f'../data/corr/corr_num_{ad_str}.feather')
    feather.write_dataframe(corr_size, f'../data/corr/corr_size_{ad_str}.feather')
    
    # med_num = np.median(corr_num)
    # corr_num[corr_num <= med_num] = 0
    # med_size = np.median(corr_size)
    # corr_size[corr_size <= med_size] = 0
    # feather.write_dataframe(corr_num, f'../data/corr/sparse_corr_num_{ad_str}.feather')
    # feather.write_dataframe(corr_size, f'../data/corr/sparse_corr_size_{ad_str}.feather')
    
    # r_num = ((corr_num > 0).sum() <= 1).sum(axis=1)
    # remove_num.append(r_num / len(issues_date))
    # r_size = ((corr_size > 0).sum() <= 1).sum(axis=1)
    # remove_size.append(r_size / len(issues_date))
    
    # N_connect_num = (corr_num > 0).sum(axis=1)
    # N_connect_num = pd.concat([corr_num[['issue', 'date', 'type']], N_connect_num], axis=1)
    # feather.write_dataframe(N_connect_num, f'../data/N_connect/N_connect_num_{ad_str}.feather')
    # N_connect_size = (corr_size[issues] > 0).sum(axis=1)
    # N_connect_size = N_connect_size.rename('N_connect')
    # N_connect_size = pd.concat([corr_size[['issue', 'date', 'type']], N_connect_size], axis=1)
    # feather.write_dataframe(N_connect_size, f'../data/N_connect/N_connect_size_{ad_str}.feather')
        
    del corr, corr_num, corr_size
del jump_corr

Processing Adjusting Date:   0%|          | 0/7 [00:00<?, ?days/s]

CPU times: total: 8min 54s
Wall time: 8min 56s


### 合并保存

In [14]:
# N_connect_num = pd.DataFrame(columns=['issue', 'date', 'N_connect'])
# N_connect_size = pd.DataFrame(columns=['issue', 'date', 'N_connect'])
# for ad in adj_date:
#     ad_str = ad.strftime('%Y%m%d')
#     num_daily = feather.read_dataframe(f'../data/N_connect/N_connect_num_{ad_str}.feather')
#     size_daily = feather.read_dataframe(f'../data/N_connect/N_connect_size_{ad_str}.feather')
#     if N_connect_num.empty:
#         N_connect_num = num_daily
#     else:
#         N_connect_num = pd.concat([N_connect_num, num_daily])
#     if N_connect_size.empty:
#         N_connect_size = size_daily
#     else:
#         N_connect_size = pd.concat([N_connect_size, size_daily])
# feather.write_dataframe(N_connect_num, '../data/N_connect/N_connect_num.feather')
# feather.write_dataframe(N_connect_size, '../data/N_connect/N_connect_size.feather')