In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from smartbeta.ai_factor import AiFactor
from smartbeta.i18n import _
from data_provider.datafeed.universe import Universe
from smartbeta.backtest.factor_backtest import FactorBackTest
from smartbeta.analyst import ReturnAnalyzer
from smartbeta.neutralizer import neutralize
import pdb

In [12]:
# neutralize?

## 合成因子

借助 AiFactor 类，我们可以基于已有的因子库自定义合成因子。本例中的 RiskNeuFactor 对单个源因子做风险风格中性化。

In [6]:
import pandas as pd
from smartbeta.smartfactor import SmartFactor
from smartbeta.ai_factor import AiFactor
from data_provider.nestlib.market_info import Frequency

from smartbeta.factor_training_feed import standardization_rank

class RiskNeuFactor(AiFactor):
    """Composite factor that neutralizes one source factor against risk styles.

    On every date handed to ``_build_ai_model`` the first entry of the
    ``subFactors`` parameter is read from the prediction set and passed to
    ``neutralize`` together with the style list stored under the ``expose``
    key of the factor parameters.
    """

    def _x_rules(self):
        """Return the factor pre-processing rules.

        No pre-processing is configured, so the list is empty.  Rules are
        dictionaries of the form ``{'func': callable}``; for example
        ``{'func': standardization_rank}`` would convert factor values to
        ranks.  See the factor-standardization section of the factor
        synthesis documentation for more options.
        """
        return []

    def _build_ai_model(self, dateTime, training_set):
        """Produce the neutralized factor values for one date.

        Called on a rolling basis at the configured frequency.

        Parameters
        ----------
        dateTime : datetime
            The current date.
        training_set : pd.DataFrame
            The training set supplied by the framework.

        Returns
        -------
        Whatever ``neutralize`` returns for the selected factor column.
        The original notes describe it as a ``pd.Series`` keyed by security
        id with the factor value as data -- TODO confirm against
        ``smartbeta.neutralizer.neutralize``.
        """
        # Progress trace: one line per processed date.
        print(dateTime)

        # self.ff is a FactorTrainingFeed.  Per the original notes,
        # _get_training_XY returns (training X, training Y, prediction-period
        # factors), with the amount of history governed by lagCycleNumber.
        # No model is trained here, so only element [2] is used.
        training_xy = self.ff._get_training_XY(dateTime, training_set)
        factor_data = training_xy[2].loc[dateTime]

        # Read the parameters once; both the source factor name and the
        # exposure style list come from the same dictionary.
        params = self.get_factor_param()
        source_factor_name = params['subFactors'][0]['factor_name']

        factor_se = factor_data[source_factor_name]
        # The series is renamed before neutralization, as in the original
        # code; presumably neutralize() relies on this name -- TODO confirm.
        factor_se.name = 'factor_data'

        return neutralize(factor_se, dateTime, expose_style_list=params['expose'])
    
            

In [24]:
# Codes of the alpha191 factors that could not be generated/saved.
unsaved_191_factors_code = [
    '016', '027', '030', '050', '051', '055', '069', '073', '092', '115',
    '119', '121', '127', '128', '131', '135', '137', '138', '140', '143',
    '146', '147', '149', '151', '157', '165', '166', '181', '183', '190',
]

In [25]:
# Zero-padded three-character codes '001' .. '191', one per alpha191 factor.
# str.zfill does the padding that was previously hand-built in three ranges.
alpha_code = [str(n).zfill(3) for n in range(1, 192)]

In [26]:
# Collect the trailing code (text after the last '_') of every user factor
# whose name begins with 'neutralized'.
names = Universe().get_user_factor_names()
neu_code_li = [
    factor_name.rsplit('_', 1)[-1]
    for factor_name in names
    if factor_name.startswith('neutralized')
]

In [27]:
# Codes still to be processed: every alpha191 code minus the ones that could
# not be saved and the ones that already have a neutralized factor.
# Sorted so the work list has the same order on every run (a plain list(set)
# order varies between kernels).  The original note recorded len = 161.
ungen_191_factors_code = sorted(
    set(alpha_code) - set(unsaved_191_factors_code) - set(neu_code_li)
)

In [14]:
def generate_neu_factor(code, from_dt=20200518, to_dt=20210204):
    """Build the risk-neutralized version of one alpha191 factor and store it.

    The source factor is ``'shared_alpha191_risk_' + code`` and the result is
    stored under that name prefixed with ``'neutralized_'``.

    Parameters
    ----------
    code : str
        Factor code suffix, e.g. ``'001'``.
    from_dt, to_dt :
        Date range passed straight to ``generate_factor_and_store``.  The
        defaults are the values that were previously hard-coded.

    Returns
    -------
    bool
        True when the factor was generated and stored, False when any
        exception was raised (the code and message are printed).  A return
        value is used instead of a local error list so that callers,
        including ``Pool.map``, can see which codes failed.
    """
    source_factor_name = 'shared_alpha191_risk_' + code
    subFactors = [
        {'factor_name': source_factor_name, 'factor_direction': 1, 'frequency': Frequency.DAY}
    ]

    # Parameters of the composite factor.
    factor_parameters = {
        "subFactors": subFactors,
        "frequency": 'daily',  # frequency of the composite factor
        "lagCycleNumber": 0,  # window size
        "treat_null_factor_as_zero": False,  # whether null factor data is treated as 0
        # Style names forwarded to neutralize() by RiskNeuFactor.
        "expose": ['Beta', 'Momentum', 'ResidualVolatility', 'NLSize', 'Value','Liquidity', 'EarningsYield', 'Growth', 'Leverage']
    }

    try:
        ab = RiskNeuFactor(
            factor_name='neutralized_' + source_factor_name,  # name of the composite factor
            tickers='A',  # stock universe of the composite factor
            factor_parameters=factor_parameters
        )

        # Do not use the factor cache.
        ab.set_use_factor_cache(False)
        # Generate the factor and write it to storage.
        ab.generate_factor_and_store(from_dt, to_dt, echo=False)
        print('因子合成完毕，已成功入库!')
    except Exception as e:
        # Best effort: report the failing code and carry on with the others.
        print(code+':   '+str(e))
        return False
    return True

**因子合成并入库后，我们就可以在因子分析或因子回测中直接使用该复合因子，快去试试吧！**

In [None]:
from multiprocessing import Pool

# Run generate_neu_factor for every outstanding code across 10 worker
# processes; the results of map() are not used.
if __name__ == '__main__':
    with Pool(processes=10) as pool:
        pool.map(generate_neu_factor, ungen_191_factors_code)

In [4]:
# Start and end of the date window, as strings.
begin_day, end_day = '20200518', '20210204'

In [7]:
# Load one stored neutralized factor over the date window; the load() result
# is the cell's last expression and is therefore displayed.
neutralized_factor_159 = SmartFactor('neutralized_shared_alpha191_risk_159')
neutralized_factor_159.load(begin_day, end_day)

loading neutralized_shared_alpha191_risk_159 time cost 3.39s


Unnamed: 0,factor_value,security_code,tdate
0,4651.929353,000001.SZ,20200518
1,8596.570197,000002.SZ,20200518
2,-13978.940946,000004.SZ,20200518
3,-4693.834500,000005.SZ,20200518
4,18.749395,000006.SZ,20200518
5,-8867.522697,000007.SZ,20200518
6,21096.273066,000008.SZ,20200518
7,17952.853977,000009.SZ,20200518
8,7452.660161,000010.SZ,20200518
9,-8001.135903,000011.SZ,20200518
