# 因子回测

## 导入模块

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import ticker
from scipy import stats
import feather
import os
import statsmodels.api as sm
import sunlandsdatasdk as sd

## 读入日线数据

### 读入日线数据

In [3]:
price_1d = feather.read_dataframe('../data/StockPriceK1d_20241231.feather')

### 计算调仓日

In [4]:
trade_date = price_1d['date'].sort_values().unique()
start_date = '2019-01-01'
end_date = '2025-01-01'
mes = pd.date_range(start=start_date, end=end_date, freq='1ME')
adj_date = np.array([], dtype=np.datetime64)
for me in mes:
    trade_date_before = trade_date[trade_date <= me]
    ad = trade_date_before[-1]
    adj_date = np.append(adj_date, ad)
map_mon_adj = {ad.year * 100 + ad.month: ad for ad in adj_date}

### 计算每两个调仓日之间的收益

In [5]:
price_1d['year_mon'] = price_1d['date'].dt.year * 100 + price_1d['date'].dt.month

def ret_acc_prod(price_1d):
    ret = price_1d['ret']
    return (1 + ret).prod() - 1

def ret_acc_adj(price_1d):
    preclose = price_1d.iloc[0]['preclose'] * price_1d.iloc[0]['adj']
    close = price_1d.iloc[-1]['close'] * price_1d.iloc[-1]['adj']
    return (close - preclose) / preclose

def ret_acc(price_1d):
    preclose = price_1d.iloc[0]['preclose']
    close = price_1d.iloc[-1]['close']
    return (close - preclose) / preclose

price_adj = (
    price_1d[(price_1d['date'] >= start_date) & (price_1d['date'] <= end_date)]
        .groupby(['issue', 'year_mon'])[['preclose', 'close', 'adj', 'ret']]
        .apply(ret_acc_prod)
        .reset_index()
)
price_adj = price_adj.rename(columns={0: 'ret'})
price_adj['date'] = price_adj['year_mon'].apply(map_mon_adj.get)
price_adj['ret_next'] = price_adj.groupby('issue')['ret'].shift(-1)
price_adj = price_adj[['issue', 'date', 'ret', 'ret_next']]
price_adj = price_adj.dropna(subset='ret_next')

### 剔除股票

In [6]:
def newly_listed(date, timedelta='180D'):
    start_date = date.min()
    newly = (date <= start_date + pd.Timedelta(timedelta))
    return newly

price_1d['newly_listed'] = (
    price_1d
        .groupby('issue')['date']
        .transform(newly_listed)
)

price_adj = pd.merge(
    price_adj,
    price_1d[['issue', 'date', 'newly_listed', 'is_limit_sell', 'is_limit_buy']],
    on=['issue', 'date'],
    how='left'
)

price_adj = price_adj[
    (~price_adj['newly_listed']) &
    (~price_adj['is_limit_buy'].astype(bool)) &
    (~price_adj['is_limit_sell'].astype(bool))
]
price_adj = price_adj[['issue', 'date', 'ret', 'ret_next']]

## IC 测试

In [7]:
def IC_calc_once(factor, factor_col, ret_col):
    IC, p = stats.spearmanr(factor[factor_col], factor[ret_col])
    return IC

def IC_calc(factor, factor_col):
    f = factor.copy()
    f = pd.merge(
        f,
        price_adj,
        on=['issue', 'date'],
        how='inner'
    )  
    IC = (
        f
            .groupby('date')[[factor_col, 'ret_next']]
            .apply(IC_calc_once, factor_col=factor_col, ret_col='ret_next')
    )
    return IC

## 分组测试

In [12]:
def grouped_ret(f, factor_col):
    f_date = f.copy()
    q = f_date[factor_col].quantile(np.arange(0, 1.1, 0.1))
    qcut = pd.cut(
        f_date[factor_col],
        q,
        right=True,
        labels=np.arange(1, 11)
    )
    f_date['group'] = qcut
    ret = f_date.groupby('group', observed=False)['ret_next'].mean()
    return ret

def grouped_calc(factor, factor_col, factor_name=None, dirname=None):
    if factor_name == None:
        factor_name = factor_col
    f = pd.merge(
        factor,
        price_adj,
        on=['issue', 'date'],
        how='inner'
    )
    ret = (
        f
            .groupby('date')[[factor_col, 'ret', 'ret_next']]
            .apply(grouped_ret, factor_col=factor_col)
    )
    
    ret_annual = (1 + ret.mean()) ** 12 - 1
    plt.bar(ret_annual.index, ret_annual, color='#a60021')
    plt.xlabel('Group')
    plt.ylabel('Annualized Return')
    plt.gca().yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1, decimals=0))
    if dirname == None:
        plt.show()
    else:
        os.makedirs(dirname, exist_ok=True)
        plt.savefig(dirname + '/grouped_' + factor_name + '.png')
        plt.close()
    
    long = (1 + ret[10]).cumprod()
    long_short = (1 + ret[10] - ret[1]).cumprod()
    plt.plot(long, linewidth=3, color='#a60021', label='long')
    plt.plot(long_short, linewidth=3, color='#ffc000', label='long-short')
    plt.legend()
    plt.grid()
    plt.xlabel('Time')
    plt.ylabel('Net Value')
    if dirname == None:
        plt.show()
    else:
        os.makedirs(dirname, exist_ok=True)
        plt.savefig(dirname + '/nv_' + factor_name + '.png')
        plt.close()

    return ret

## 一键回测

In [13]:
def backtest(filename, factor_col, factor_name=None, dirname=None):
    if factor_name == None:
        factor_name = factor_col
    factor = feather.read_dataframe(filename)

    df_IC = pd.DataFrame(columns=['factor', 'IC_mean', 'IC_IR', 't', "max", 'min'])
    IC = IC_calc(factor, factor_col=factor_col)
    df_IC.loc[0] = [
        factor_name,
        IC.mean(),
        IC.mean() / IC.std(),
        np.sqrt(len(IC)) * IC.mean() / IC.std(),
        IC.max(),
        IC.min()
    ]

    sign = np.sign(IC.mean())
    factor[factor_col] *= sign

    df_grouped = pd.DataFrame(columns=['factor', 'long_annual', 'long_sharpe', 'long_short_annual'])
    grouped = grouped_calc(
        factor, factor_col=factor_col,
        factor_name=factor_name, dirname=dirname
    )
    mean_annual = (1 + grouped[10].mean()) ** 12 - 1
    std_annual = np.sqrt(12) * grouped[10].std()
    long_short = (1 + (grouped[10] - grouped[1]).mean()) ** 12 - 1
    df_grouped.loc[0]=[
        factor_name,
        mean_annual,
        mean_annual / std_annual,
        long_short
    ]

    return df_IC, df_grouped

## 点度中心性回测

In [10]:
dirname = '../data/N_connect_0_1/'
filename_num = dirname + '/neutral_N_connect_num.feather'
filename_size = dirname + '/neutral_N_connect_size.feather'
cols = ['N_connect', 'indus_factor', 'neutral_factor']
names_num = ['num', 'num_indus', 'num_neutral']
names_size = ['size', 'size_indus', 'size_neutral']
for col, name_num, name_size in zip(cols, names_num, names_size):
    IC_num, grouped_num = backtest(filename_num, col, name_num, dirname)
    IC_size, grouped_size = backtest(filename_size, col, name_size, dirname)
    IC = pd.concat([IC_num, IC_size])
    grouped = pd.concat([grouped_num, grouped_size])
    display(IC)
    display(grouped)

Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,num,0.032224,0.452363,3.618903,0.151475,-0.176551
0,size,0.031914,0.430881,3.447052,0.169751,-0.146404


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,num,0.167297,0.782904,0.086636
0,size,0.147323,0.691332,0.041487


Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,num_indus,0.033313,0.613168,4.905345,0.137594,-0.172949
0,size_indus,0.034927,0.644308,5.154463,0.152277,-0.11917


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,num_indus,0.172246,0.820287,0.084519
0,size_indus,0.162453,0.778706,0.072393


Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,num_neutral,0.026865,0.631019,5.048152,0.122659,-0.093227
0,size_neutral,0.030308,0.624561,4.996485,0.128182,-0.072891


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,num_neutral,0.153029,0.740824,0.057152
0,size_neutral,0.151248,0.727917,0.056459


In [14]:
dirname = '../data/N_connect_1_any/'
filename_num = dirname + '/neutral_N_connect_num.feather'
filename_size = dirname + '/neutral_N_connect_size.feather'
cols = ['N_connect', 'indus_factor', 'neutral_factor']
names_num = ['num', 'num_indus', 'num_neutral']
names_size = ['size', 'size_indus', 'size_neutral']
for col, name_num, name_size in zip(cols, names_num, names_size):
    IC_num, grouped_num = backtest(filename_num, col, name_num, dirname + '/backtest1/')
    IC_size, grouped_size = backtest(filename_size, col, name_size, dirname + 'backtest1/')
    IC = pd.concat([IC_num, IC_size])
    grouped = pd.concat([grouped_num, grouped_size])
    display(IC)
    display(grouped)

Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,num,-0.03056,-0.197708,-1.581668,0.263091,-0.48361
0,size,-0.029735,-0.19674,-1.573921,0.275136,-0.474753


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,num,0.121806,0.686993,0.023078
0,size,0.143556,0.796636,0.045774


Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,num_indus,-0.029872,-0.295697,-2.365578,0.161839,-0.283295
0,size_indus,-0.027219,-0.276944,-2.215551,0.171915,-0.260776


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,num_indus,0.127403,0.665032,0.042444
0,size_indus,0.143704,0.760711,0.06354


Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,num_neutral,-0.037929,-0.445513,-3.564107,0.12816,-0.239719
0,size_neutral,-0.03387,-0.391307,-3.130458,0.144331,-0.225552


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,num_neutral,0.156862,0.769084,0.098534
0,size_neutral,0.156007,0.778322,0.092502


## 跳跃收益因子回测

In [19]:
cols = ['ret_jump', 'indus_factor', 'neutral_factor']
names = ['ret_jump', 'ret_jump_indus', 'ret_jump_neutral']
for factor_col, factor_name in zip(cols, names):
    IC, grouped = backtest(
        '../data/ret_jump/neutral_ret_jump.feather',
        factor_col=factor_col,
        factor_name=factor_name,
        dirname='../data/ret_jump/'
    )
    print('-' * 10 + factor_name + '-' * 10)
    display(IC)
    display(grouped)

----------ret_jump----------


Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,ret_jump,-0.050036,-0.584903,-4.751778,0.150296,-0.25672


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,ret_jump,0.10604,0.452888,0.101336


----------ret_jump_indus----------


Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,ret_jump_indus,-0.041005,-0.653465,-5.308774,0.105706,-0.213272


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,ret_jump_indus,0.095235,0.401981,0.088022


----------ret_jump_neutral----------


Unnamed: 0,factor,IC_mean,IC_IR,t,max,min
0,ret_jump_neutral,-0.037666,-0.670778,-5.44943,0.088623,-0.172599


Unnamed: 0,factor,long_annual,long_sharpe,long_short_annual
0,ret_jump_neutral,0.093376,0.398901,0.081155


## 跳跃关联动量因子回测

In [None]:
filename_num = '../data/peer_ret/neutral_peer_ret_num.feather'
filename_size = '../data/peer_ret/neutral_peer_ret_size.feather'
IC_num, grouped_num = backtest(filename_num, 'neutral_factor', 'peer_relative_ret_num')
IC_size, grouped_size = backtest(filename_size, 'neutral_factor', 'peer_relative_ret_size')
IC = pd.concat([IC_num, IC_size])
grouped = pd.concat([grouped_num, grouped_size])
display(IC)
display(grouped)