In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from tqdm import tqdm
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
#neutralization函数构建
def neutralization(factor_df,tar_col,barra_col,mode):   
    result=pd.DataFrame() 
    for col in tar_col:
        #（1）去极值————按照均值上下3个标准差来取最大值和最小值，超过这个范围的就当作是极值，去掉
        edge_up = factor_df[col].mean()+3*factor_df[col].std()
        edge_low = factor_df[col].mean()-3*factor_df[col].std()
        factor_df[col]=factor_df[col].clip(edge_low,edge_up)

        #（2）标准化
        if int(mode)==1:
            factor_df[col] = (factor_df[col] - factor_df[col].min())/(factor_df[col].max() - factor_df[col].min())
        elif mode==2:
            factor_df[col] = (factor_df[col] - factor_df[tar_col].mean())/factor_df[col].std()
        elif mode==3:
            factor_df[col] = factor_df[col]/10**np.ceil(np.log10(factor_df[col].abs().max()))

        #（3）中性化
    #     为了平整数据、剔除风险和板块的线性影响。比如某个因子就是在小市值的股票上管用，其他情况都不管用，那因子暴露出来的结果肯定是分布不均匀的，
    #     中性化的方式是对因子值和barra因子、行业做线性回归，最后用剩下的残差替代因子值。这个残差肯定是跟barra和行业无关的。
        results = sm.OLS(factor_df[col], factor_df[barra_col]).fit()    #股票因子为因变量，barra信息为自变量
        result[col]=results.resid    #取残差项作为中性化后的因子值
    return result

In [4]:
date=pickle.load(open('./data/date.pkl','rb'))

output_folder = './new_factor_neutralization/factor_test'   #结果保存

if not os.path.exists(output_folder):
    os.makedirs(output_folder)
output_file=os.listdir(output_folder)

barra_path='./data/data_barra'     #Barra因子获取
barra_file=sorted(os.listdir(barra_path))

factor_path='./new_feature/factor_test'         #即将入库因子获取
factor_file=sorted(os.listdir(factor_path))

mode = 3    #标准化模式选择

for file in tqdm(barra_file):
    if file not in factor_file:
        continue
    date_tmp=file[:-4]
    factor=pd.read_csv(f'{factor_path}/{file}',index_col=0,header=0)   #读取因子
    barra_tmp=pd.read_csv(f'{barra_path}/{file}',index_col=0,header=0)   #读取barra


    target_list=barra_tmp.index.intersection(factor.index)    #对股票代码求交集

    barra_tmp=barra_tmp.loc[target_list,:]
    

    final=pd.concat([factor,barra_tmp],axis=1)
    
    final.replace([np.inf,-np.inf],np.nan,inplace=True)    #处理空值为0（可替代为均值等）
    final.fillna(0,inplace=True) 

    final_col=barra_tmp.columns.tolist()   
    col_name=factor.columns.tolist()      #获取因子名称
    data=neutralization(final,col_name,'size',mode)     #调用中性化函数
    data.to_csv(f'{output_folder}/{file}')     #保存结果

100%|█████████████████████████████████████████| 949/949 [00:27<00:00, 34.72it/s]


In [7]:
final_col

['size',
 'beta',
 'momentum',
 'residual_volatility',
 'non_linear_size',
 'book_to_price_ratio',
 'liquidity',
 'earnings_yield',
 'growth',
 'leverage']

In [6]:
final

Unnamed: 0_level_0,vwap,size,beta,momentum,residual_volatility,non_linear_size,book_to_price_ratio,liquidity,earnings_yield,growth,leverage
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
000001.XSHE,0.122667,1.063296,-0.176089,-0.711020,-0.602536,-1.248055,2.642980,0.036977,1.718258,0.302864,1.938893
000002.XSHE,0.184083,1.063296,0.310460,-1.187737,-0.081562,-1.248055,2.642980,0.672368,1.626614,-0.295445,1.996477
000004.XSHE,0.012891,-2.827903,-0.697807,1.206569,2.744198,-0.288351,-1.508455,1.605154,-2.939907,-2.899265,-0.928628
000005.XSHE,0.001431,-3.291918,-1.048058,-0.444092,0.707625,-1.664741,0.761868,-1.605220,-2.155739,-1.756629,-1.051760
000006.XSHE,0.023878,-1.767681,-0.741046,-0.182268,1.866016,2.027388,2.219277,0.731656,0.277272,-0.410180,1.975408
...,...,...,...,...,...,...,...,...,...,...,...
688799.XSHG,0.004681,-2.182259,0.291622,1.390719,-0.179561,1.562881,-0.573949,0.164546,-0.481455,-0.558332,-1.332210
688800.XSHG,0.006165,-1.658257,1.287896,-1.840018,1.551393,2.071821,-1.028135,0.385303,-0.818709,-0.302737,-0.841009
688819.XSHG,0.003112,-0.150760,0.042685,-0.400031,-0.575306,0.564675,-0.238636,0.415698,0.509849,0.751432,-0.390247
688981.XSHG,0.005380,1.063296,0.525434,0.985889,0.247993,-1.248055,-0.844221,1.879705,-1.487992,0.722739,-0.593271
