# W 式切割

## 导入模块

In [1]:
import numpy as np
import pandas as pd
import feather
import os
from tqdm.notebook import tqdm
from numpy.lib.stride_tricks import as_strided as stride

## 读入数据

In [2]:
# Date-range configuration. The loading cell filters rows to
# [start_date_redundancy, end_date].
# NOTE(review): start_date is not referenced in the visible cells, and the
# earlier start_date_redundancy presumably supplies warm-up history for the
# rolling window -- confirm.
start_date = '2023-01-01'
start_date_redundancy = pd.to_datetime('2022-12-01')
end_date = '2023-12-31'

In [3]:
# Load the cached 60-minute bars and restrict them to the configured date range.
price_60m = feather.read_dataframe('../data/StockPriceK60m_cache.feather')
# Take an explicit copy of the filtered rows so that the column assignment
# below writes to an independent frame instead of a slice of the loaded one
# (avoids pandas' chained-assignment / SettingWithCopy pitfall).
price_60m = price_60m[
    (price_60m['date'] >= start_date_redundancy) & (price_60m['date'] <= end_date)
].copy()
# Previous row's close within each issue; the first row of an issue has no
# predecessor and falls back to that row's `preclose`.
# NOTE(review): assumes rows are ordered chronologically within each issue -- confirm.
close_prev = (
    price_60m.groupby('issue')['close'].shift(1)
        .fillna(price_60m['preclose'])
)
# Simple return of each bar relative to the previous close.
price_60m['ret'] = price_60m['close'].div(close_prev).sub(1)
# The 145900 rows are removed only after the returns are computed, so the row
# that follows a 145900 row is still measured against that row's close.
price_60m = price_60m[price_60m['time'] != 145900].reset_index(drop=True)
price_60m

Unnamed: 0,datetime,date,time,issue,open,high,low,close,volume,value,num_trades,vwap1,vwap2,preclose,adj,is_limit_buy,is_limit_sell,filterFlag,ret
0,2022-12-01 10:29:00,2022-12-01,102900,000001,1524.466354,1556.368490,1495.982304,1501.679114,97910494.0,1.307261e+09,53650.0,1504.886163,1506.388110,1484.588684,113.936200,1.0,1.0,1.0,0.011512
1,2022-12-01 11:29:00,2022-12-01,112900,000001,1503.957838,1524.466354,1499.400390,1499.400390,38774081.0,5.154687e+08,24371.0,1501.791096,1502.077536,1484.588684,113.936200,1.0,1.0,1.0,-0.001517
2,2022-12-01 13:59:00,2022-12-01,135900,000001,1498.261028,1511.933372,1490.285494,1493.703580,26729831.0,3.521547e+08,21179.0,1491.884787,1493.084805,1484.588684,113.936200,1.0,1.0,1.0,-0.003799
3,2022-12-01 14:53:00,2022-12-01,145300,000001,1493.703580,1498.261028,1489.146132,1492.564218,33419321.0,4.383060e+08,21205.0,1493.027994,1494.005774,1484.588684,113.936200,1.0,1.0,1.0,-0.000763
4,2022-12-01 10:29:00,2022-12-01,102900,000002,3174.777339,3307.851838,3167.864378,3271.558793,89343918.0,1.678064e+09,79990.0,3275.758137,3270.878222,3223.168066,172.824025,1.0,1.0,1.0,0.015013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277927,2023-12-29 14:53:00,2023-12-29,145300,688981,53.180000,53.240000,52.900000,53.040000,2842269.0,1.508019e+08,6162.0,53.025741,53.023995,53.130000,1.000000,1.0,1.0,1.0,-0.002445
5277928,2023-12-29 10:29:00,2023-12-29,102900,689009,29.250000,30.140000,29.250000,29.960000,1736371.0,5.179270e+07,3193.0,29.951746,29.937856,29.200000,1.000000,1.0,1.0,1.0,0.026027
5277929,2023-12-29 11:29:00,2023-12-29,112900,689009,29.950000,30.080000,29.670000,29.680000,953149.0,2.850594e+07,1492.0,29.656756,29.639631,29.200000,1.000000,1.0,1.0,1.0,-0.009346
5277930,2023-12-29 13:59:00,2023-12-29,135900,689009,29.660000,29.680000,29.570000,29.660000,402061.0,1.191247e+07,810.0,29.657966,29.667571,29.200000,1.000000,1.0,1.0,1.0,-0.000674


## 获取单笔成交额分位数（按成交笔数加权）

In [4]:
def qcut_weight(price:pd.DataFrame, q:float) -> float:
    """Return the trade-count-weighted quantile of ``value_avg``.

    Rows are ordered by ``value_avg`` and each row's weight is the cumulative
    share of ``num_trades`` up to and including that row. The result is the
    ``value_avg`` of the first row whose cumulative share is at least ``q``.

    Parameters
    ----------
    price : pd.DataFrame
        Must contain the columns ``value_avg`` and ``num_trades``.
    q : float
        Cumulative-share threshold.

    Returns
    -------
    float
        The selected ``value_avg``, or NaN when ``num_trades`` sums to zero.
    """
    pairs = price[['value_avg', 'num_trades']].reset_index(drop=True)
    total_trades = pairs['num_trades'].sum()
    # Without any trades there are no weights to accumulate.
    if total_trades == 0:
        return np.nan
    ranked = pairs.sort_values('value_avg')
    ranked = ranked.assign(weight=ranked['num_trades'].cumsum() / total_trades)
    first_hit = ranked[ranked['weight'] >= q].iloc[0]
    return first_hit['value_avg']

In [5]:
# Quantile level passed to qcut_weight; `label` tags the output file name.
q = 13/16
label = '1316'
trade_date = price_60m['date'].sort_values().unique()
# Upper time bounds of the intraday periods; loop-invariant, so defined once.
trade_time = [102900, 112900, 135900, 145300]
# Collect the per-date results and concatenate once after the loop: calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
value_q_parts = []
for date in tqdm(trade_date):
    year = date.year
    date_str = date.strftime('%Y%m%d')
    price_1m = feather.read_dataframe(f'../data/StockPriceK1m/{year}/StockPriceK1m_{date_str}.feather')
    # Label every 1-minute row with the smallest bound that is >= its time;
    # rows later than the last bound keep 145900. Bounds are applied in
    # descending order so that smaller bounds overwrite larger ones.
    price_1m['period'] = 145900
    for tt in trade_time[::-1]:
        price_1m.loc[price_1m['time'] <= tt, 'period'] = tt
    # Value per trade of each 1-minute row.
    price_1m['value_avg'] = price_1m['value'].div(price_1m['num_trades'])
    # One weighted quantile per (issue, period).
    value_q_date = (
        price_1m
            .groupby(['issue', 'period'])[['value_avg', 'num_trades']]
            .apply(qcut_weight, q=q)
            .rename('value_q')
            .reset_index()
    )
    value_q_date['date'] = date
    value_q_parts.append(value_q_date)
value_q = pd.concat(value_q_parts)
# Attach the quantile to the matching 60-minute bar (bar `time` == `period`).
# The left join keeps bars without a match; the `period` key column stays in
# the merged frame.
price_60m = pd.merge(
    price_60m,
    value_q,
    left_on=['date', 'time', 'issue'],
    right_on=['date', 'period', 'issue'],
    how='left'
)

  0%|          | 0/264 [00:00<?, ?it/s]

## 理想反转因子

In [6]:
# Value per trade of each 60-minute bar.
price_60m['value_avg'] = price_60m['value'] / price_60m['num_trades']

rolling.apply：pandas 的 `rolling.apply` 每次只把单列的窗口传给函数，这里用 numpy 步幅视图实现可以同时访问多列的滚动窗口。

In [7]:
def roll_np(df: pd.DataFrame, apply_func: callable, window: int,
         return_col_num: int, **kwargs):
    """Apply ``apply_func`` to every full rolling window of rows of ``df``.

    The frame is converted with ``df.reset_index().values``, so the index is
    prepended as leading column(s) and the data columns of ``df`` start at
    position 1 for a single-level index. Each window is a 2-D view of shape
    ``(window, n_columns)`` onto that array; no data is copied.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows, in the order the windows should slide over them.
    apply_func : callable
        Called as ``apply_func(window_values, **kwargs)``; its result is
        assigned to one output row of width ``return_col_num``.
    window : int
        Number of consecutive rows per window; must be at least 1.
    return_col_num : int
        Number of columns in the result.
    **kwargs
        Forwarded to ``apply_func``.

    Returns
    -------
    np.ndarray
        Array of shape ``(len(df), return_col_num)``. Row ``i`` holds the
        result for the window ending at row ``i``; the first ``window - 1``
        rows (or all rows when ``df`` is shorter than ``window``) are NaN.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        # A zero window would start enumerating at index -1 and silently
        # overwrite the last output row.
        raise ValueError(f'window must be a positive integer, got {window}')

    v = df.reset_index().values
    dim0, dim1 = v.shape
    result_values = np.full((dim0, return_col_num), np.nan)
    if dim0 < window:
        # Not a single full window: as_strided would be asked for a negative
        # dimension, so return the all-NaN result instead.
        return result_values

    stride0, stride1 = v.strides
    # Repeating the row stride on the first two axes makes consecutive windows
    # overlap: window k covers rows k .. k + window - 1.
    stride_values = stride(v, (dim0 - (window - 1), window, dim1), (stride0, stride0, stride1))
    for idx, values in enumerate(stride_values, window - 1):
        result_values[idx, ] = apply_func(values, **kwargs)

    return result_values

In [8]:
# Rolling-window length in rows; m_series_calc reads this module-level value
# both as the minimum group length and as the window size.
d = 20

def m_np_calc(price) -> float:
    """Difference between the column-2 sums of the upper and lower halves.

    The rows of ``price`` are ordered by column 1 and split at
    ``len(price) // 2``; for an odd number of rows the extra row goes to the
    upper half. Returns the column-2 sum of the upper half minus the column-2
    sum of the lower half.
    """
    ranked = price[np.argsort(price[:, 1])]
    rets = ranked[:, 2]
    half = len(ranked) // 2
    low_sum = rets[:half].sum()
    high_sum = rets[half:].sum()
    return high_sum - low_sum

def m_series_calc(price:pd.DataFrame) -> pd.DataFrame:
    """Compute the rolling ``m`` value for the rows of one group.

    Parameters
    ----------
    price : pd.DataFrame
        Must contain the columns ``datetime``, ``value_q`` and ``ret``.

    Returns
    -------
    pd.DataFrame
        Columns ``datetime`` and ``m``, indexed like ``price``. ``m`` is the
        result of ``m_np_calc`` over a window of ``d`` rows (module-level
        ``d``); it is NaN for the first ``d - 1`` rows, and for every row when
        the group has fewer than ``d`` rows.
    """
    if len(price) < d:
        # Too short for even one full window: the scalar NaN is broadcast to
        # every row by the DataFrame constructor.
        m_values = np.nan
    else:
        # roll_np returns shape (n, 1); take the single column as a 1-D array.
        m_values = roll_np(
            df=price[['value_q', 'ret']],
            apply_func=m_np_calc,
            window=d, return_col_num=1
        )[:, 0]
    return pd.DataFrame({'datetime': price['datetime'], 'm': m_values})

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Run m_series_calc once per issue; the result is indexed by
# (issue, original row index).
factor_m = price_60m.groupby('issue')[['datetime', 'value_q', 'ret']].progress_apply(m_series_calc)
# Turn the index levels into columns and discard the unnamed row-index level.
factor_m = factor_m.reset_index().drop(columns='level_1')

  0%|          | 0/5141 [00:00<?, ?it/s]

## 保存

In [9]:
# Show the computed factor table as the cell output before it is written to disk.
factor_m

Unnamed: 0,issue,datetime,m
0,000001,2022-12-01 10:29:00,
1,000001,2022-12-01 11:29:00,
2,000001,2022-12-01 13:59:00,
3,000001,2022-12-01 14:53:00,
4,000001,2022-12-02 10:29:00,
...,...,...,...
5277927,689009,2023-12-28 14:53:00,-0.087488
5277928,689009,2023-12-29 10:29:00,-0.043888
5277929,689009,2023-12-29 11:29:00,-0.063315
5277930,689009,2023-12-29 13:59:00,-0.125435


In [10]:
# Create the output directory if it does not exist yet, then write the factor
# table to a feather file whose name embeds `label`.
os.makedirs('../data/factor_m_60m/', exist_ok=True)
feather.write_dataframe(factor_m, f'../data/factor_m_60m/factor_m_{label}_60m.feather')