In [1]:
import numpy as np
import pandas as pd
from WindPy import *
import talib as ta
from talib.abstract import *
w.start()
from collections import OrderedDict
from datetime import *
from  math  import *
import statsmodels.api as sm
import numpy.linalg as la   #用来做线性代数运算


Welcome to use Wind Quant API for Python (WindPy)!

COPYRIGHT (C) 2020 WIND INFORMATION CO., LTD. ALL RIGHTS RESERVED.
IN NO CIRCUMSTANCE SHALL WIND BE RESPONSIBLE FOR ANY DAMAGES OR LOSSES CAUSED BY USING WIND QUANT API FOR Python.


首先以**2021年8月1日**的沪深300成分股作为选股股票池，并选用*2010年1月1日至2021年12月31日*的数据建立模型。在此之前，必须剔除2010年之前尚未上市的股票，筛选后共有171只股票。

In [2]:
# Stock universe: the Wind "sectorconstituent" set for windcode 000300.SH on
# 2021-08-01; column 1 of the returned frame is used as the list of codes.
codes =list(w.wset("sectorconstituent", "date=2021-08-01;windcode=000300.SH",usedf=True)[1].iloc[:,1])
# Listing cut-off. The string is parsed to a datetime so it can be compared
# with the IPO_ISSUEDATE column below.
# NOTE(review): the name `date` shadows `datetime.date`, which
# `from datetime import *` put into the notebook namespace.
date = '2010-01-01'
date = datetime.strptime(date,'%Y-%m-%d')
# IPO issue date of every constituent (frame indexed by code).
df = w.wss(','.join(codes), "ipo_issuedate",usedf=True)[1]
# Keep only the codes whose IPO issue date is on or before the cut-off.
select_codes = df.loc[df['IPO_ISSUEDATE']<=date].index.tolist()
# Show how many stocks pass the filter.
len(select_codes)

171

In [3]:
# Valuation factors
def get_values_factor(dates,stocks):
    """Download valuation factors from Wind for every date except the last.

    For each queried date the nine multiples in ``ratio_codes`` are fetched
    in one request and stored as reciprocals (``1 / x``). PEG, EV/EBITDA
    and dividend yield are then appended without inversion.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``. The final element is
        never queried (the loop covers ``dates[:-1]``) -- presumably because
        it has no following period to predict; confirm with the caller.
    stocks : list of str
        Security codes, passed straight through to ``w.wss``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(date, code)``; columns ``EP_TTM`` ... ``FCFP_LYR``,
        ``PEG_TTM``, ``EV/EBITDA`` and ``DYR``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when fewer than two dates are supplied, since
        nothing is queried in that case.

    Notes
    -----
    A zero multiple or zero growth rate yields ``inf`` (with a NumPy
    warning) rather than an exception.
    """
    ratio_codes = "pe_ttm,pe_lyr,pb_lf,pb_lyr,pcf_ncf_ttm,pcf_ocf_ttm,ps_ttm,ps_lyr,val_mvtofcff"
    ratio_names = ['EP_TTM','EP_LYR','BP_LF','BP_LYR','NCF_TTM','OCF_TTM','SP_TTM','SP_LYR','FCFP_LYR']

    frames = OrderedDict()
    for date in dates[:-1]:
        option = "tradeDate=" + date

        # Raw multiples, one row per code; until inverted, the column
        # labelled 'EP_TTM' still holds the plain PE (TTM).
        reply = w.wss(stocks, ratio_codes, option)
        ratios = pd.DataFrame(reply.Data, index=ratio_names, columns=reply.Codes).T
        factors_value = 1 / ratios

        # PEG = PE (TTM) / fa_npgr_ttm (net-profit growth, which the original
        # author's note describes as expressed x100). PE is taken from the
        # request above instead of being downloaded a second time;
        # dtype=float turns a missing value (None) into NaN.
        growth = np.array(w.wss(stocks, "fa_npgr_ttm", option).Data[0], dtype=float)
        factors_value['PEG_TTM'] = ratios['EP_TTM'].values / growth

        # Enterprise-value multiple (stored as returned, not inverted).
        factors_value['EV/EBITDA'] = w.wss(stocks, "ev2_to_ebitda", option).Data[0]

        # Dividend yield.
        factors_value['DYR'] = w.wss(stocks, "dividendyield2", option).Data[0]

        frames[date] = factors_value
    return pd.concat(list(frames.values()), keys=list(frames.keys()))
# Size factors
def get_size_factor(dates,stocks):
    """Download the three Wind size fields for every date except the last.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``; the final element is
        skipped.
    stocks : list of str
        Security codes handed to ``w.wss``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(date, code)``; columns ``LN_MV``,
        ``LN_FLOAT_MV`` and ``LN_TOTAL_ASSETS`` (fields ``val_lnmv``,
        ``val_lnfloatmv``, ``val_lntotassets``).
    """
    column_labels = ['LN_MV','LN_FLOAT_MV','LN_TOTAL_ASSETS']

    def snapshot(day):
        # One request per date; fields arrive as rows, so transpose to get
        # one row per code.
        reply = w.wss(stocks, "val_lnmv,val_lnfloatmv,val_lntotassets", "tradeDate=" + day)
        table = pd.DataFrame(reply.Data, index=column_labels, columns=reply.Codes).T
        return table.iloc[:, :]

    per_date = OrderedDict((day, snapshot(day)) for day in dates[:-1])
    return pd.concat(per_date.values(), keys=per_date.keys())

# Leverage factors
def get_leverage_factors(dates,stocks,factors_codes,factors_names):
    """Download caller-specified Wind fields for every date except the last.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``; the final element is
        skipped.
    stocks : list of str
        Security codes handed to ``w.wss``.
    factors_codes : str
        Comma-separated Wind field codes to request.
    factors_names : list of str
        Column labels, one per requested field and in the same order.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(date, code)``; columns ``factors_names``.
    """
    snapshots = OrderedDict()
    for trade_day in dates[:-1]:
        reply = w.wss(stocks, factors_codes, "tradeDate=" + trade_day)
        # Fields arrive as rows; transpose so each code becomes a row.
        by_field = pd.DataFrame(reply.Data, index=factors_names, columns=reply.Codes)
        snapshots[trade_day] = by_field.T
    return pd.concat(snapshots.values(), keys=snapshots.keys())
# Technical factors
def get_Technical_factors(dates,stocks):
    """Download technical indicators from Wind for every date except the last.

    Five ``tech_*`` fields are fetched in one request per date; RSI, two MACD
    outputs and the three KDJ outputs are then fetched one request each and
    appended as extra columns.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``; the final element is
        skipped.
    stocks : list of str
        Security codes handed to ``w.wss``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(date, code)``; columns ``RVI``, ``RSTR12``,
        ``CYF``, ``CRY``, ``CR20``, ``RSI``, ``DEA``, ``MACD``, ``K``, ``D``,
        ``J``.
    """
    base_codes = "tech_rvi,tech_rstr12,tech_cyf,tech_cry,tech_cr20"
    base_names = ['RVI','RSTR12','CYF','CRY','CR20']
    # (column label, Wind field, option suffix) for each single-field request,
    # in the order the columns are appended. Per the original notes, MACD_IO=2
    # is the DEA line and MACD_IO=3 the MACD value; KDJ_IO=1/2/3 are K/D/J.
    indicator_specs = (
        ('RSI', "RSI", ";RSI_N=6;priceAdj=F;cycle=D"),
        ('DEA', "MACD", ";MACD_L=26;MACD_S=12;MACD_N=9;MACD_IO=2;priceAdj=F;cycle=D"),
        ('MACD', "MACD", ";MACD_L=26;MACD_S=12;MACD_N=9;MACD_IO=3;priceAdj=F;cycle=D"),
        ('K', "KDJ", ";KDJ_N=9;KDJ_M1=3;KDJ_M2=3;KDJ_IO=1;priceAdj=F;cycle=D"),
        ('D', "KDJ", ";KDJ_N=9;KDJ_M1=3;KDJ_M2=3;KDJ_IO=2;priceAdj=F;cycle=D"),
        ('J', "KDJ", ";KDJ_N=9;KDJ_M1=3;KDJ_M2=3;KDJ_IO=3;priceAdj=F;cycle=D"),
    )

    collected = OrderedDict()
    for day in dates[:-1]:
        prefix = "tradeDate=" + day
        reply = w.wss(stocks, base_codes, prefix)
        frame = pd.DataFrame(reply.Data, index=base_names, columns=reply.Codes).T
        for column, field, suffix in indicator_specs:
            frame[column] = w.wss(stocks, field, prefix + suffix).Data[0]
        collected[day] = frame
    return pd.concat(collected.values(), keys=collected.keys())
# Momentum factors
def get_Momentum_factors(dates,stocks):
    """Download eight momentum fields from Wind for every date except the last.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``; the final element is
        skipped.
    stocks : list of str
        Security codes handed to ``w.wss``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(date, code)``; columns ``REV_5D``, ``REV_10D``,
        ``REV_3M``, ``REV_6M``, ``REV_1Y``, ``REV_3Y``, ``REV_LAST1M_MAX``
        and ``LN_HIGH-LOW``.
    """
    wind_fields = "tech_revs5,tech_revs10,tech_revs60,tech_revs120,tech_revs250,tech_revs750,tech_revs1mmax,tech_lnhighlow20d"
    labels = ['REV_5D','REV_10D','REV_3M','REV_6M','REV_1Y','REV_3Y','REV_LAST1M_MAX','LN_HIGH-LOW']

    by_day = OrderedDict()
    for day in dates[:-1]:
        reply = w.wss(stocks, wind_fields, "tradeDate=" + day)
        # Transpose so that codes are rows and factors are columns.
        by_day[day] = pd.DataFrame(reply.Data, index=labels, columns=reply.Codes).T
    return pd.concat(by_day.values(), keys=by_day.keys())

# Growth factors
def get_growth_factors(dates,stocks):
    """Download ten growth-rate fields from Wind for every date except the last.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``; the final element is
        skipped.
    stocks : list of str
        Security codes handed to ``w.wss``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(date, code)``; one column per entry of
        ``factors_names`` below.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when fewer than two dates are supplied, since
        nothing is queried in that case. This matches the other ``get_*``
        factor loaders.
    """
    factors_codes = "fa_orgr_ttm,fa_nagr,fa_gpmgr_ttm,fa_npgr_ttm,fa_tagr,fa_ncgr_ttm,fa_cfigr_ttm,fa_cffgr_ttm,fa_cfogr_ttm,fa_oigr_ttm"
    # NOTE(review): 'finance_cash_folw_gr_TTM' and 'operete_profit_gr_TTM' are
    # misspelt, but they are column names that saved CSVs and downstream code
    # may already depend on, so they are kept as-is.
    factors_names = ['sales_gr_TTM','net_asset_gr_TTM','gross_margin_gr_TTM','net_profit_gr_TTM','total_asset_gr_TTM','net_cash_flow_gr_TTM','invest_cash_flow_gr_TTM','finance_cash_folw_gr_TTM','operate_cash_flow_gr_TTM','operete_profit_gr_TTM']

    dict_df = OrderedDict()
    for date in dates[:-1]:
        reply = w.wss(stocks, factors_codes, "tradeDate=" + date)
        # Transpose so that codes are rows and factors are columns.
        dict_df[date] = pd.DataFrame(reply.Data, index=factors_names, columns=reply.Codes).T
        # Disabled in the original: EPS growth ("yoyeps_basic") and ROE growth
        # ("growth_roe") requested with "rptDate=<date>;N=1".

    # Stack once, after the loop, so the result is defined (or a clear
    # ValueError is raised) even when no date was queried.
    return pd.concat(list(dict_df.values()), keys=list(dict_df.keys()))
# Auxiliary columns: industry label and market cap
def get_assisted_factors(dates,stocks):
    """Download the industry label and A-share market cap for each date but the last.

    These two columns are the ones ``get_feature_names`` excludes from the
    factor list.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``; the final element is
        skipped.
    stocks : list of str
        Security codes handed to ``w.wss``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(date, code)``; columns ``INDUSTRY_SW`` (field
        ``industry_sw`` with ``industryType=1``) and ``CAP`` (field
        ``mkt_cap_ashare`` with ``unit=1``).
    """
    labels = ['INDUSTRY_SW','CAP']
    gathered = OrderedDict()
    for day in dates[:-1]:
        options = "tradeDate=" + day + ';industryType=1;unit=1'
        reply = w.wss(stocks, "industry_sw,mkt_cap_ashare", options)
        gathered[day] = pd.DataFrame(reply.Data, index=labels, columns=reply.Codes).T
    return pd.concat(gathered.values(), keys=gathered.keys())
# Periodic trade-date sequence
def get_trade_date(start_date, end_date, period='M'):
    """Return Wind trading days between two dates as ``YYYY-MM-DD`` strings.

    Parameters
    ----------
    start_date, end_date : str
        Range bounds, forwarded unchanged to ``w.tdays``.
    period : str, default 'M'
        Forwarded to ``w.tdays`` as the ``period`` keyword. Per the original
        author's note, ``'M'`` yields the last trading day of each month.

    Returns
    -------
    list of str
        Each element of ``Data[0]`` in the reply, formatted with
        ``strftime("%Y-%m-%d")``.
    """
    reply = w.tdays(start_date, end_date, period=period)
    formatted = []
    for day in reply.Data[0]:
        formatted.append(day.strftime("%Y-%m-%d"))
    return formatted
def get_feature_names(data):
    """Return the column names of ``data`` that are factors to be tested.

    Every column is kept, in its original order, except the two auxiliary
    columns ``INDUSTRY_SW`` and ``CAP``.
    """
    auxiliary = ["INDUSTRY_SW",'CAP']
    fea_names = []
    for column in data.columns.tolist():
        if column in auxiliary:
            continue
        fea_names.append(column)
    return fea_names
def extreme_process_MAD(Data, n_mad=5, scale=1.4826):
    """Winsorise every factor column with the median-absolute-deviation rule.

    For each column returned by ``get_feature_names`` the values are clipped
    to ``median +/- n_mad * scale * MAD``, where ``MAD`` is the median of the
    absolute deviations from the column median. NaN entries are left as NaN.

    Columns are addressed by name, so the result does not depend on where
    ``INDUSTRY_SW`` / ``CAP`` sit in the frame.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame holding the factor columns. It is modified in place.
    n_mad : float, default 5
        Half-width of the accepted band, in scaled MADs.
    scale : float, default 1.4826
        Multiplier applied to the MAD.

    Returns
    -------
    pandas.DataFrame
        The same ``Data`` object, after clipping.
    """
    feature_names = get_feature_names(Data)
    if not feature_names:
        return Data

    features = Data[feature_names]
    median = features.median(axis=0)
    mad = features.sub(median, axis=1).abs().median(axis=0)
    half_width = n_mad * scale * mad

    # Per-column bounds are label-aligned with the columns (axis=1).
    Data[feature_names] = features.clip(lower=median - half_width,
                                        upper=median + half_width,
                                        axis=1)
    return Data
def fill_missing_value(Data):
    """Fill missing factor values with the mean of the row's industry.

    For every column returned by ``get_feature_names``, NaN entries are
    replaced by the mean of that column over all rows sharing the same
    ``INDUSTRY_SW`` value. A value stays NaN when its whole industry has no
    data for that column.

    Columns are addressed by name, so the result does not depend on the
    column order of ``Data``.

    NOTE(review): the mean is taken over every row of ``Data``; if ``Data``
    spans several dates the fill mixes dates -- confirm the caller passes one
    cross-section at a time if that matters.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame with the factor columns and an ``INDUSTRY_SW`` column. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``Data`` object, after filling.
    """
    feature_names = get_feature_names(Data)
    if not feature_names:
        return Data

    # Industry mean of each factor, broadcast back to one value per row.
    industry_means = Data.groupby('INDUSTRY_SW')[feature_names].transform('mean')
    Data[feature_names] = Data[feature_names].fillna(industry_means)
    return Data
# Cap-weighted standardisation
def data_scale_CAP(data):
    """Centre each factor on its cap-weighted mean and divide by its std.

    Weights are ``CAP / CAP.sum()``. For every column returned by
    ``get_feature_names`` the result is
    ``(x - sum(x * weight)) / x.std()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the factor columns and a ``CAP`` column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the factor columns rescaled.
    """
    scaled = data.copy()
    caps = scaled["CAP"]
    weights = caps / caps.sum()
    for column in get_feature_names(data):
        raw = scaled[column]
        weighted_mean = (raw * weights).sum()
        scaled[column] = (raw - weighted_mean) / raw.std()
    return scaled
# Industry neutralisation
def data_scale_neutral(data):
    """Remove the industry effect from every factor column.

    Each column returned by ``get_feature_names`` is replaced by its
    deviation from the mean of the row's ``INDUSTRY_SW`` group. This is the
    residual of an OLS regression of the factor on an intercept plus industry
    dummies, computed directly so that:

    * every industry is neutralised, including the one a ``drop_first``
      dummy encoding would drop;
    * a frame containing a single industry is handled;
    * a NaN only affects its own row instead of the whole column;
    * the result does not depend on the dtype ``pd.get_dummies`` returns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the factor columns and an ``INDUSTRY_SW`` column. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the factor columns replaced by their
        within-industry residuals.
    """
    feature_names = get_feature_names(data)
    data_ = data.copy()
    if not feature_names:
        return data_

    industry_means = data_.groupby('INDUSTRY_SW')[feature_names].transform('mean')
    # Where no industry mean exists (e.g. the row has no industry label),
    # subtract 0 so the raw value is kept.
    data_[feature_names] = data_[feature_names] - industry_means.fillna(0.0)
    return data_
# Dependent variable: percentage price change
def get_pct(dates,stocks):
    """Download ``pct_chg`` with ``cycle=M`` for every date except the last.

    Parameters
    ----------
    dates : list of str
        Dates appended verbatim to ``"tradeDate="``; the final element is
        skipped.
    stocks : list of str
        Security codes handed to ``w.wss``.

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``w.wss(..., usedf=True)`` (element 1 of each
        reply), stacked with the date as the outer index level.
    """
    monthly = OrderedDict()
    for day in dates[:-1]:
        options = "tradeDate=" + day + ";cycle=M"
        reply = w.wss(stocks, "pct_chg", options, usedf=True)
        # With usedf=True the frame is the second element of the reply.
        monthly[day] = reply[1]
    return pd.concat(monthly.values(), keys=monthly.keys())
def accuracy(data1,data2):
    """Return the fraction of positions at which two sequences are equal.

    The inputs are compared position by position after conversion with
    ``np.asarray``, so lists, NumPy arrays and pandas Series (whatever their
    index) are all accepted.

    Parameters
    ----------
    data1, data2 : array-like
        Sequences of equal shape, e.g. true and predicted labels.

    Returns
    -------
    float
        Number of matching positions divided by the number of elements.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    first = np.asarray(data1)
    second = np.asarray(data2)
    if first.shape != second.shape:
        raise ValueError(
            "accuracy() needs inputs of equal shape, got %s and %s"
            % (first.shape, second.shape)
        )
    if first.size == 0:
        raise ValueError("accuracy() is undefined for empty inputs")
    matches = int(np.count_nonzero(first == second))
    return matches / first.size



In [5]:
# Build the factor panel for 2015-01-01 .. 2020-12-31 on the selected universe.
start_date='20150101'
end_date='20201231'
# Periodic (period='M') trade dates; each loader below skips the last one.
dates=get_trade_date(start_date, end_date, period='M')
values_factor=get_values_factor(dates,select_codes)
# NOTE(review): size_factor is downloaded but is not part of the concat
# below -- confirm whether leaving it out is intentional.
size_factor=get_size_factor(dates,select_codes)
# Wind field codes for the leverage factors and the column labels they get.
factors_codes="fa_current,fa_quick,fa_blev,fa_debttoasset,fa_cfotocurliabs_ttm,fa_debttoequity"
factors_names=['CUR','QR','BOOK_LEVEL','DEBT_TO_ASSETS','CASH_FLOW_LIABILITY','DEBT_TO_EQUITY']
leverage_factors = get_leverage_factors(dates,select_codes,factors_codes,factors_names)
Technical_factors = get_Technical_factors(dates,select_codes)
Momentum_factors = get_Momentum_factors(dates,select_codes)
assisted_factors = get_assisted_factors(dates,select_codes)
growth_factors=get_growth_factors(dates,select_codes)
# Join the factor groups column-wise; INDUSTRY_SW and CAP end up last.
Data= pd.concat([values_factor,growth_factors,leverage_factors,Momentum_factors,Technical_factors,assisted_factors],axis=1)


In [6]:
# Save the 2015-2020 panel; to_csv defaults are used, so the row index is written too.
Data.to_csv('Newsvm_15_20.csv')

In [11]:
# Preview the first rows of the 2015-2020 panel.
Data.head()


Unnamed: 0,Unnamed: 1,EP_TTM,EP_LYR,BP_LF,BP_LYR,NCF_TTM,OCF_TTM,SP_TTM,SP_LYR,FCFP_LYR,PEG_TTM,...,CRY,CR20,RSI,DEA,MACD,K,D,J,INDUSTRY_SW,CAP
2015-01-30,000001.SZ,0.120824,0.095703,0.796337,0.822809,0.090356,-0.089589,0.436667,0.327926,0.100122,0.281571,...,8.319482e+24,0.584092,35.086174,0.043812,-0.202159,31.088801,30.210627,32.84515,银行,137025000000.0
2015-01-30,000002.SZ,0.106428,0.104372,0.608822,0.608649,0.081991,0.215316,0.932968,0.934872,0.256925,1.115144,...,8.248641e+24,0.913342,47.638434,0.289514,-0.173679,53.680004,55.462782,50.114446,房地产,127356000000.0
2015-01-30,000063.SZ,0.039983,0.020592,0.376944,0.377337,-0.019092,0.102375,1.232218,1.14108,-0.064829,0.039888,...,8.177552e+24,1.464124,41.614252,0.425063,-0.181417,55.653137,63.954917,39.049576,通信,53726700000.0
2015-01-30,000066.SZ,0.002263,0.002943,0.250449,0.254481,0.1047,0.281104,7.59029,7.770629,0.069066,9.785377,...,6.778457e+24,2.11828,47.815606,0.367338,0.004699,39.006314,55.637458,5.744024,计算机,10019400000.0
2015-01-30,000069.SZ,0.075357,0.076936,0.456795,0.490742,0.028738,-0.0467,0.525569,0.491401,-0.040159,-1.106441,...,180151200000.0,,58.209361,0.176799,-0.045605,92.5893,82.194105,113.37969,房地产,24526200000.0


In [None]:

# start_date='20200701'
# end_date='20211231'
# dates=get_trade_date(start_date, end_date, period='M')
# values_factor=get_values_factor(dates,select_codes)
# size_factor=get_size_factor(dates,select_codes)
# factors_codes="fa_current,fa_quick,fa_blev,fa_debttoasset,fa_cfotocurliabs_ttm,fa_debttoequity"
# factors_names=['CUR','QR','BOOK_LEVEL','DEBT_TO_ASSETS','CASH_FLOW_LIABILITY','DEBT_TO_EQUITY']
# leverage_factors = get_leverage_factors(dates,select_codes,factors_codes,factors_names)
# Technical_factors = get_Technical_factors(dates,select_codes)
# Momentum_factors = get_Momentum_factors(dates,select_codes)
# assisted_factors = get_assisted_factors(dates,select_codes)
# growth_factors=get_growth_factors(dates,select_codes)
# Data1= pd.concat([values_factor,growth_factors,leverage_factors,Momentum_factors,Technical_factors,assisted_factors],axis=1)


In [None]:
# Data1.to_csv('svm_20_21.csv')

In [9]:
# Build the factor panel for 2021-01-01 .. 2021-12-31 on the selected universe.
# NOTE(review): this repeats the 2015-2020 cell with only the dates and the
# result name changed; a shared helper taking (start_date, end_date) would
# remove the duplication.
start_date='20210101'
end_date='20211231'
# Periodic (period='M') trade dates; each loader below skips the last one.
dates=get_trade_date(start_date, end_date, period='M')
values_factor=get_values_factor(dates,select_codes)
# NOTE(review): size_factor is downloaded but is not part of the concat
# below -- confirm whether leaving it out is intentional.
size_factor=get_size_factor(dates,select_codes)
# Wind field codes for the leverage factors and the column labels they get.
factors_codes="fa_current,fa_quick,fa_blev,fa_debttoasset,fa_cfotocurliabs_ttm,fa_debttoequity"
factors_names=['CUR','QR','BOOK_LEVEL','DEBT_TO_ASSETS','CASH_FLOW_LIABILITY','DEBT_TO_EQUITY']
leverage_factors = get_leverage_factors(dates,select_codes,factors_codes,factors_names)
Technical_factors = get_Technical_factors(dates,select_codes)
Momentum_factors = get_Momentum_factors(dates,select_codes)
assisted_factors = get_assisted_factors(dates,select_codes)
growth_factors=get_growth_factors(dates,select_codes)
# Join the factor groups column-wise; INDUSTRY_SW and CAP end up last.
Data2= pd.concat([values_factor,growth_factors,leverage_factors,Momentum_factors,Technical_factors,assisted_factors],axis=1)


In [10]:
# Save the 2021 panel.
# NOTE(review): Data2 was built from 2021-01-01 .. 2021-12-31, while the file
# name reads "20_21" -- confirm the name downstream readers expect.
Data2.to_csv('svm_20_21.csv')

In [12]:
# Preview the first rows of the 2021 panel.
Data2.head()

Unnamed: 0,Unnamed: 1,EP_TTM,EP_LYR,BP_LF,BP_LYR,NCF_TTM,OCF_TTM,SP_TTM,SP_LYR,FCFP_LYR,PEG_TTM,...,CRY,CR20,RSI,DEA,MACD,K,D,J,INDUSTRY_SW,CAP
2021-01-29,000001.SZ,0.060194,0.062924,0.644584,0.812642,0.299755,-0.209352,0.33825,0.307885,,-4.598213,...,7.952696e+24,2.142526,70.939709,0.898653,0.230094,77.562155,78.647385,75.391694,银行,448079000000.0
2021-01-29,000002.SZ,0.125425,0.1204,0.627755,0.695389,0.220186,0.239528,1.193937,1.139496,-0.098837,8.159779,...,7.93012e+24,0.919544,21.911108,0.167273,-0.427492,15.545217,27.460237,-8.284823,房地产,270051000000.0
2021-01-29,000063.SZ,0.029404,0.034664,0.290647,0.291548,0.063589,0.056632,0.682659,0.610994,0.016527,-1.748847,...,7.827265e+24,0.847771,29.32928,0.271447,-0.727711,14.566158,27.223885,-10.749298,通信,111901000000.0
2021-01-29,000066.SZ,0.015294,0.021985,0.155111,0.17666,0.023442,-0.018304,0.219924,0.213813,-0.039934,-3.634978,...,8.038246e+24,0.782561,33.12483,0.382872,-0.658188,19.212243,23.450974,10.734782,计算机,50710100000.0
2021-01-29,000069.SZ,0.214765,0.228657,1.303356,1.452662,0.03202,-0.069047,1.177694,1.112238,-0.075735,2.713432,...,7.992214e+24,0.745938,30.975017,-0.006436,-0.049225,15.897448,28.693569,-9.694794,房地产,46430400000.0


In [14]:
# Build the factor panel for 2010-01-01 .. 2014-12-31 on the selected universe.
# NOTE(review): this repeats the 2015-2020 cell with only the dates and the
# result name changed; a shared helper taking (start_date, end_date) would
# remove the duplication.
start_date='20100101'
end_date='20141231'
# Periodic (period='M') trade dates; each loader below skips the last one.
dates=get_trade_date(start_date, end_date, period='M')
values_factor=get_values_factor(dates,select_codes)
# NOTE(review): size_factor is downloaded but is not part of the concat
# below -- confirm whether leaving it out is intentional.
size_factor=get_size_factor(dates,select_codes)
# Wind field codes for the leverage factors and the column labels they get.
factors_codes="fa_current,fa_quick,fa_blev,fa_debttoasset,fa_cfotocurliabs_ttm,fa_debttoequity"
factors_names=['CUR','QR','BOOK_LEVEL','DEBT_TO_ASSETS','CASH_FLOW_LIABILITY','DEBT_TO_EQUITY']
leverage_factors = get_leverage_factors(dates,select_codes,factors_codes,factors_names)
Technical_factors = get_Technical_factors(dates,select_codes)
Momentum_factors = get_Momentum_factors(dates,select_codes)
assisted_factors = get_assisted_factors(dates,select_codes)
growth_factors=get_growth_factors(dates,select_codes)
# Join the factor groups column-wise; INDUSTRY_SW and CAP end up last.
Data3= pd.concat([values_factor,growth_factors,leverage_factors,Momentum_factors,Technical_factors,assisted_factors],axis=1)




In [15]:
# Display the 2010-2014 panel (the notebook shows a truncated rendering).
# NOTE(review): Data3.head() would keep the saved output small.
Data3

Unnamed: 0,Unnamed: 1,EP_TTM,EP_LYR,BP_LF,BP_LYR,NCF_TTM,OCF_TTM,SP_TTM,SP_LYR,FCFP_LYR,PEG_TTM,...,CRY,CR20,RSI,DEA,MACD,K,D,J,INDUSTRY_SW,CAP
2010-01-29,000001.SZ,0.013867,0.009112,0.283262,0.303758,-0.064149,-0.229431,0.220895,0.215367,-0.317199,-0.934475,...,7.330568e+24,0.601909,40.679102,-0.154855,0.027383,37.374130,44.146017,23.830357,银行,6.34533e+10
2010-01-29,000002.SZ,0.045905,0.039273,0.340257,,0.074708,0.128499,0.467341,0.399159,-0.013718,-6.876091,...,7.632250e+24,0.384014,28.037944,-0.261729,-0.036921,14.844958,13.735378,17.064119,房地产,9.01879e+10
2010-01-29,000063.SZ,0.024815,0.020230,0.187205,0.205019,0.035129,0.056341,0.692237,0.539725,0.073643,1.193498,...,7.422045e+24,0.861125,32.976178,0.315580,-0.499569,17.045961,26.749536,-2.361188,通信,6.37251e+10
2010-01-29,000066.SZ,0.018792,0.001165,0.190177,0.215592,-0.002868,0.000856,0.408216,0.389139,-0.272620,0.075720,...,7.739198e+24,1.650590,58.051269,0.292211,-0.080614,55.216970,51.375066,62.900779,计算机,1.01489e+10
2010-01-29,000069.SZ,0.020689,0.019296,0.278921,0.220411,0.001522,0.055874,0.089973,0.073419,0.003596,3.327908,...,7.806521e+22,0.644256,50.207083,-0.276231,0.094665,52.090819,43.111073,70.050312,房地产,1.97199e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-11-28,601939.SH,0.194437,0.182679,1.019379,0.907153,0.078678,0.418651,0.476797,0.432839,0.438226,0.641600,...,6.254627e+24,2.265193,89.805110,0.037257,0.045779,71.772696,55.516723,104.284641,银行,4.50902e+10
2014-11-28,601988.SH,0.180301,0.168567,1.080057,0.992548,0.634101,0.731984,0.481530,0.437779,0.111661,0.568230,...,5.849512e+24,2.062827,87.964457,0.053632,0.028531,71.548699,57.417016,99.812066,银行,6.55762e+11
2014-11-28,601989.SH,0.021843,0.025580,0.516177,0.412235,0.124779,-0.004532,0.482152,0.446778,-0.056746,-5.749944,...,7.660864e+24,1.005525,68.664428,-0.044711,0.075420,71.012876,59.969240,93.100149,国防军工,1.03285e+11
2014-11-28,601998.SH,0.142005,0.137038,0.874840,0.789172,0.093990,-0.067545,0.421708,0.365753,0.826807,0.395190,...,7.286596e+24,2.351064,93.475924,0.113167,0.119996,81.939610,64.229605,117.359619,银行,1.94941e+11


In [16]:
# Save the 2010-2014 panel; to_csv defaults are used, so the row index is written too.
Data3.to_csv('svm_10_14.csv')