# 因子中性化和填充

## 导入模块

In [1]:
import numpy as np
import pandas as pd
import feather
import statsmodels.api as sm
import sunlandsdatasdk as sd
import os
from tqdm.notebook import tqdm

## 读入交易日和公司列表

In [2]:
# Load the daily price table, keep only rows whose date lies inside the study
# window (both bounds inclusive) and index the result by (date, issue).
start_date = '2019-01-01'
end_date = '2024-12-31'
price_1d = feather.read_dataframe('../data/StockPriceK1d_20241231.feather')
in_window = price_1d['date'].between(start_date, end_date)
price_1d = price_1d[in_window].sort_values(['date', 'issue']).set_index(['date', 'issue'])
# Bare expression so that the notebook renders the frame as the cell output.
price_1d

Unnamed: 0_level_0,Unnamed: 1_level_0,preclose,open,high,low,close,numTrades,volume,value,adj,ret,is_limit_buy,is_limit_sell
date,issue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-01-02,000001,9.38,9.39,9.42,9.16,9.19,25140.0,53938632.0,4.986951e+08,108.031388,-0.020256,0.0,0.0
2019-01-02,000002,23.82,23.83,24.09,23.67,23.90,26541.0,24701028.0,5.893846e+08,142.667999,0.003359,0.0,0.0
2019-01-02,000004,16.03,16.05,16.24,16.01,16.06,241.0,142400.0,2.290041e+06,4.063862,0.001871,0.0,0.0
2019-01-02,000005,2.68,2.69,2.70,2.66,2.67,1187.0,2909600.0,7.788443e+06,9.267603,-0.003731,0.0,0.0
2019-01-02,000006,5.18,5.18,5.25,5.10,5.15,2643.0,6322964.0,3.273364e+07,34.226151,-0.005792,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31,688799,42.65,42.50,43.00,41.60,41.88,2138.0,1074960.0,4.539020e+07,1.068341,-0.018054,0.0,0.0
2024-12-31,688800,55.07,54.04,55.00,49.56,50.11,18510.0,12258477.0,6.272253e+08,1.419317,-0.090067,0.0,0.0
2024-12-31,688819,27.85,27.75,27.95,27.12,27.22,5191.0,2023063.0,5.555536e+07,1.076469,-0.022621,0.0,0.0
2024-12-31,688981,99.29,99.00,99.58,94.59,94.62,154959.0,87823797.0,8.446754e+09,1.000000,-0.047034,0.0,0.0


## 市值行业中性化

### 读入申万行业和市值

In [3]:
# Industry classification table, sorted and indexed by date so that one day's
# rows can be fetched with `.loc[date]`.
industry = (
    feather.read_dataframe('../data/industry_classes.feather')
        .sort_values('date')
        .set_index('date')
)
# Market capitalisation table with its natural log added as `log_market_cap`,
# sorted and indexed by date in the same way.
market_cap = (
    feather.read_dataframe('../data/market_cap.feather')
        .assign(log_market_cap=lambda caps: caps['market_cap'].apply(np.log))
        .sort_values('date')
        .set_index('date')
)
# Price table indexed by date only; `issue` becomes an ordinary column again.
prc = price_1d.reset_index(level='issue')

### 市值行业中性化

In [4]:
def neutral_calc_step(factor, factor_col):
    """Neutralise one date's factor against industry, then against market cap.

    Written to be used as the callable of ``groupby('date').apply``: the date
    is read from ``factor.name``. The cross-section is restricted, through
    inner merges on ``issue``, to the issues found for that date in the
    module-level ``prc``, ``industry`` and ``market_cap`` tables. The factor
    is demeaned within each ``L1`` industry group; the demeaned values are
    then regressed by OLS on a constant and ``log_market_cap`` and the
    residual is taken as the neutral factor.

    Args:
        factor: Rows of a single date, containing at least ``issue`` and
            ``factor_col``; ``factor.name`` must hold that date.
        factor_col: Name of the column with the raw factor values.

    Returns:
        A DataFrame with the columns ``issue``, ``factor_col``,
        ``indus_factor`` and ``neutral_factor``, or ``None`` when the date is
        absent from one of the lookup tables. ``neutral_factor`` is NaN for
        rows whose demeaned factor or log market cap is missing or infinite.
    """
    date = factor.name
    try:
        price_keys = prc.loc[date, 'issue']
        indus_keys = industry.loc[date, ['issue', 'L1']]
        cap_keys = market_cap.loc[date, ['issue', 'log_market_cap']]
    except KeyError:
        # The date is missing from one of the tables: skip this date.
        return None

    f = factor.copy()
    f = f.merge(price_keys, on='issue', how='inner')
    f = f.merge(indus_keys, on='issue', how='inner')

    # Step 1: remove the industry effect by subtracting the industry mean.
    f['indus_mean'] = f.groupby('L1')[factor_col].transform('mean')
    f['indus_factor'] = f[factor_col] - f['indus_mean']

    f = f.merge(cap_keys, on='issue', how='inner')

    # Step 2: regress on log market cap, using only rows where both sides of
    # the regression are finite. Otherwise a single missing or infinite value
    # (e.g. the log of a zero market cap) would spoil the fit for the whole
    # date instead of just the affected row.
    usable = (
        f[['log_market_cap', 'indus_factor']]
            .replace([np.inf, -np.inf], np.nan)
            .notna()
            .all(axis=1)
    )
    f['neutral_factor'] = np.nan
    if usable.any():
        sample = f.loc[usable]
        x = sm.add_constant(sample['log_market_cap'])
        y = sample['indus_factor']
        result = sm.OLS(y, x).fit()
        # The residuals keep the row labels of `sample`, so the assignment
        # aligns them back onto the matching rows of `f`.
        f.loc[usable, 'neutral_factor'] = result.resid

    return f[['issue', factor_col, 'indus_factor', 'neutral_factor']]

In [5]:
def neutral_calc_neo(factor, factor_col):
    """Neutralise one date's factor against industry and market cap jointly.

    Written to be used as the callable of ``groupby('date').apply``: the date
    is read from ``factor.name``. The cross-section is restricted, through
    inner merges on ``issue``, to the issues found for that date in the
    module-level ``prc``, ``industry`` and ``market_cap`` tables. The raw
    factor is regressed in a single OLS on a constant, ``log_market_cap`` and
    one dummy column per ``L1`` industry; the residual is the neutral factor.

    Args:
        factor: Rows of a single date, containing at least ``issue`` and
            ``factor_col``; ``factor.name`` must hold that date.
        factor_col: Name of the column with the raw factor values.

    Returns:
        A DataFrame with the columns ``issue``, ``factor_col`` and
        ``neutral_factor``, or ``None`` when the date is absent from one of
        the lookup tables. ``neutral_factor`` is NaN for rows whose factor or
        log market cap is missing or infinite.
    """
    date = factor.name
    try:
        price_keys = prc.loc[date, 'issue']
        indus_keys = industry.loc[date, ['issue', 'L1']]
        cap_keys = market_cap.loc[date, ['issue', 'log_market_cap']]
    except KeyError:
        # The date is missing from one of the tables: skip this date.
        return None

    f = factor.copy()
    f = f.merge(price_keys, on='issue', how='inner')
    f = f.merge(indus_keys, on='issue', how='inner')
    f = f.merge(cap_keys, on='issue', how='inner')

    # Fit only on rows where the response and the market-cap regressor are
    # finite. Otherwise a single missing or infinite value (e.g. the log of a
    # zero market cap) would spoil the fit for the whole date instead of just
    # the affected row.
    usable = (
        f[['log_market_cap', factor_col]]
            .replace([np.inf, -np.inf], np.nan)
            .notna()
            .all(axis=1)
    )
    f['neutral_factor'] = np.nan
    if usable.any():
        sample = f.loc[usable]
        dummies = pd.get_dummies(sample['L1'], prefix='ind')
        x = pd.concat([sample['log_market_cap'], dummies], axis=1)
        # get_dummies may yield bool/uint8 columns; OLS needs a float matrix.
        x = x.astype(float)
        # NOTE(review): a constant plus a dummy for every industry is
        # collinear; the residuals are presumably still well defined because
        # the default OLS fit uses a pseudo-inverse -- confirm.
        x = sm.add_constant(x)
        y = sample[factor_col]
        result = sm.OLS(y, x).fit()
        # The residuals keep the row labels of `sample`, so the assignment
        # aligns them back onto the matching rows of `f`.
        f.loc[usable, 'neutral_factor'] = result.resid

    return f[['issue', factor_col, 'neutral_factor']]

## 因子填充

In [6]:
def factor_filling(factor):
    """Align a factor table to the price index, forward-fill it and lag it.

    The table is indexed by ``(date, issue)`` and reindexed onto the index of
    the module-level ``price_1d`` frame, so it gets one row per row of the
    price table. Within each issue the values are forward-filled and then
    shifted down by one row, so each row carries the value of the previous
    row of the same issue.

    Args:
        factor: DataFrame with ``date`` and ``issue`` columns plus the value
            columns to fill.

    Returns:
        The filled and lagged table with ``date`` and ``issue`` restored as
        ordinary columns.
    """
    keys = ['date', 'issue']
    indexed = factor.sort_values(keys).set_index(keys)
    aligned = indexed.reindex(index=price_1d.index)
    carried = aligned.groupby('issue').ffill()
    lagged = carried.groupby('issue').shift(1)
    return lagged.reset_index()

## 因子中性化和填充

In [7]:
# For every raw factor file in each directory: neutralise it date by date,
# build the forward-filled/lagged version, and write both next to the input
# as `neutral_<name>` and `fill_neutral_<name>`.
dirs = ['../data/peer_ret_daily/']
for dirname in tqdm(dirs):
    # The factor column depends only on the directory name, so it is resolved
    # once per directory rather than once per file.
    if 'N_connect' in dirname:
        factor_col = 'N_connect'
    elif 'ret_jump' in dirname:
        factor_col = 'ret_jump'
    elif 'peer' in dirname:
        factor_col = 'peer_relative_ret'
    else:
        factor_col = None

    for filename in os.listdir(dirname):
        path = os.path.join(dirname, filename)
        if os.path.isdir(path):
            continue
        # Files with these markers are outputs of this cell; do not re-process.
        if 'fill' in filename or 'neutral' in filename:
            continue
        if factor_col is None:
            # Fail with a clear message instead of a NameError or, worse,
            # silently reusing the column chosen for a previous directory.
            raise ValueError(
                'cannot infer the factor column from directory %r' % dirname
            )

        factor = feather.read_dataframe(path)
        # NOTE(review): the bare reset_index() presumably also turns the inner
        # positional index produced by apply into a `level_1` column, which
        # then ends up in both output files -- confirm whether that is wanted.
        neutral = (
            factor
                .groupby('date')[['issue', factor_col]]
                .apply(neutral_calc_step, factor_col=factor_col)
                .reset_index()
        )
        fill_neutral = factor_filling(neutral)
        # os.path.join keeps both outputs inside `dirname` whether or not the
        # directory string ends with a path separator.
        feather.write_dataframe(
            neutral, os.path.join(dirname, 'neutral_' + filename)
        )
        feather.write_dataframe(
            fill_neutral, os.path.join(dirname, 'fill_neutral_' + filename)
        )

  0%|          | 0/1 [00:00<?, ?it/s]

## 检查

In [8]:
# ret_jump = feather.read_dataframe('../data/ret_jump_daily/neutral_ret_jump.feather')
# ret_jump[(ret_jump['issue'] == '000001') & (ret_jump['date'] >= '2019-08-29')]

In [9]:
# fill_ret_jump = feather.read_dataframe('../data/ret_jump_daily/fill_neutral_ret_jump.feather')
# fill_ret_jump[(fill_ret_jump['issue'] == '000001') & (fill_ret_jump['date'] >= '2019-08-29')]