## **1. 导入库包**

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
# Apply the seaborn look to every figure in the notebook.  Newer Matplotlib
# releases renamed the bundled seaborn styles to 'seaborn-v0_8', so fall back
# to that name when the legacy alias cannot be found.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')

## **2. 获取数据和数据合并处理**

In [2]:
# Build `data_total`: one row per (stock, trading day) holding the technical
# factors returned by the platform plus price-derived return columns.
start_date = '2017-01-01'
end_date = '2019-01-21'

start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')

# Keep only the stocks returned for '000300.SH' on BOTH boundary dates.
stock_list1 = get_index_stocks('000300.SH', start_date)
stock_list2 = get_index_stocks('000300.SH', end_date)
stock_list = list(set(stock_list1).intersection(set(stock_list2)))

# The trading calendar does not depend on the stock, so compute it once
# instead of once per loop iteration.
trade_days = get_trade_days(start_date, end_date).strftime('%Y-%m-%d')

# Per-stock frames are collected here and concatenated once at the end.
frames = []

for stock in stock_list:

    q = query(
        factor.date,
        factor.symbol,
        # trend
        factor.bbi, factor.ma, factor.expma, factor.priceosc, factor.ddi,
        # counter-trend
        factor.bias, factor.cci, factor.dbcd, factor.dpo, factor.kdj,
        # energy
        factor.arbr, factor.cr, factor.psy, factor.vr_rate, factor.wad,
        # volume-price
        factor.mfi, factor.obv, factor.pvt, factor.wvad,
        # support / resistance
        factor.bbiboll, factor.boll, factor.cdp, factor.env, factor.mike,
        # volume
        factor.vr, factor.vma, factor.vmacd, factor.vosc, factor.vstd, factor.tapi,
        # overbought / oversold
        factor.adtm,
        # oscillators
        factor.mi, factor.micd, factor.rc, factor.rccd, factor.srmi,
        # relative strength
        factor.dptb, factor.jdqs, factor.jdrs, factor.zdzb, factor.atr, factor.mass,
        # date-based
        factor.up_n, factor.down_n
    ).filter(
        factor.symbol == stock,
        factor.date.in_(trade_days)
    )
    df = get_factors(q)

    # Look up one close price per factor row.  A failed lookup is recorded as
    # NaN (best effort); such rows are removed by the dropna() below.
    prices = []
    for symbol, day in zip(df['factor_symbol'], df['factor_date']):
        try:
            close = get_price(symbol, None, day, '1d', ['close'], True, None, 1, is_panel=1)['close']
            prices.append(close.iloc[0, 0])
        except Exception:
            prices.append(np.nan)
    df['price'] = prices

    df['return'] = df['price'] / df['price'].shift(1) - 1
    # NOTE(review): shift(1) copies the PREVIOUS row's value into the current
    # row, so despite the "next_" prefix these two columns hold lagged, not
    # forward-looking, values.  Behaviour is kept as-is because every later
    # cell depends on it -- confirm the intended direction/horizon.
    df['next_return'] = df['return'].shift(1)
    df['mtm5'] = df['price'] / df['price'].shift(5) - 1
    df['next_mtm5'] = df['mtm5'].shift(1)
    df.dropna(inplace=True)

    frames.append(df)

# Single concatenation instead of repeated DataFrame.append calls.
data_total = pd.concat(frames)
data_total.set_index(['factor_symbol', 'factor_date'], inplace=True)


NameError: name 'get_index_stocks' is not defined

In [None]:
# Preview the first rows of the merged factor / return table.
data_total.head()

## **3. 决策树训练数据**

In [None]:
# Binary label: 1 where the next_mtm5 column is at least 0.1, otherwise 0.
data_total['return_flag'] = np.where(data_total['next_mtm5'] >= 0.1, 1, 0)
# Fraction of rows labelled 1 (displayed as the cell result).
data_total['return_flag'].sum() / len(data_total)

In [None]:
# Feature matrix: the label slice of columns from 'factor_bbi' through
# 'factor_down_n' (both ends included).  `.loc` replaces the removed `.ix`.
x = data_total.loc[:, 'factor_bbi':'factor_down_n']
# Target: the 0/1 return_flag column.
y = data_total.loc[:, 'return_flag']
# NOTE(review): rows are split at random across stocks and dates; if the
# intent is a time-ordered out-of-sample test this may leak information --
# confirm.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=33)
features = x_train.columns

In [None]:
# Fraction of rows labelled 1 in the test split.
y_test.sum()/len(y_test)

In [None]:
# Decision tree on the training split.  max_features='sqrt' samples a random
# subset of features at every split, so the seed is fixed (same value as the
# train/test split) to make the fitted tree reproducible between runs.
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    max_features='sqrt',
    max_depth=120,
    class_weight='balanced',
    random_state=33,
)
clf.fit(x_train, y_train)

### **3.1 评价因子数据**

In [None]:
# Pair every feature name with the importance reported by the fitted tree.
# Building the frame column-wise keeps 'feature_importance' numeric instead
# of the object dtype produced by transposing a list of rows.
features_important = clf.feature_importances_
features_important_df = pd.DataFrame(
    {'feature': list(features), 'feature_importance': features_important},
    columns=['feature', 'feature_importance'],
)

# Horizontal bar chart, one bar per feature in column order.
ind = np.arange(len(features_important_df))
width = 0.35
fig, ax = plt.subplots(figsize=(20, 30))
ax.barh(ind, features_important_df['feature_importance'], width, color='IndianRed', label='Feature_importance')

ax.set_yticks(ind)
ax.set_yticklabels(list(features_important_df['feature']))
ax.tick_params(labelsize=17)
ax.set_title('Features_importance', size=25)
ax.legend()
plt.show()

In [None]:
def evaluation(clf,x_input_data,y_real_data):
    """Print a classifier's accuracy and return its 2x2 confusion matrix.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict`` and ``score``.
    x_input_data
        Feature rows to predict on.
    y_real_data
        True 0/1 labels for the rows of ``x_input_data``.

    Returns
    -------
    pandas.DataFrame
        Confusion matrix with rows ``real_zero`` / ``real_one`` and columns
        ``predict_zero`` / ``predict_one``.
    """
    predict = clf.predict(x_input_data)
    score = clf.score(x_input_data, y_real_data)
    print("The accuracy of the model is %f"%score)
    # Pin the label order so the matrix is always 2x2, even when the sample
    # or the predictions contain only one of the two classes.
    c = confusion_matrix(list(y_real_data), list(predict), labels=[0, 1])
    return pd.DataFrame(
        c,
        index=['real_zero', 'real_one'],
        columns=['predict_zero', 'predict_one'],
    )

### **3.2 样本内预测能力**

In [None]:
# In-sample check: accuracy printout and confusion matrix on the training split.
evaluation(clf,x_train,y_train)

## **4. 决策树测试数据**

### **4.1 样本外预测能力**

In [None]:
# Out-of-sample check: accuracy printout and confusion matrix on the test split.
evaluation(clf,x_test,y_test)

In [None]:
# Write the decision-tree predictions for the test rows back onto the full
# table; only rows of the test split are assigned.
predict = clf.predict(x_test)
data_total.loc[x_test.index, 'predict'] = predict
# Per stock (index level 0), lag the prediction by 1..5 rows.  A row is
# flagged as "holding" when any of those five lagged predictions is positive.
data_holding = pd.DataFrame(
    {i: data_total['predict'].groupby(level=0).shift(i) for i in range(1, 6)}
)
data_holding['total'] = data_holding.sum(axis=1)
data_total['flag_holding'] = data_holding['total'] > 0

# Returns of the flagged rows, pivoted to one row per date and one column per
# stock.
return_data = data_total.loc[data_total['flag_holding'], 'return']
return_cross = return_data.unstack().T
# Equal-weight portfolio: plain mean across the stocks held on each date.
df_portfolio = pd.DataFrame({'daily_return': return_cross.mean(axis=1)})
df_portfolio.sort_index(inplace=True)
df_portfolio['strategy_return'] = (df_portfolio['daily_return'] + 1).cumprod()
df_portfolio.index = pd.to_datetime(df_portfolio.index)

In [None]:
# Preview the first ten rows of the portfolio return table.
df_portfolio.head(10)

## **5. 决策树绩效展示**

In [None]:
# Close prices of the '000300.SH' benchmark over the sample window, turned
# into a cumulative net-value curve.
data_300 = get_price(['000300.SH'], start_date, end_date, '1d', ['close'], True, None, is_panel=1)['close']
data_300.columns = ['close300']
data_300['return300'] = data_300['close300'].pct_change()
data_300['net_value'] = (1 + data_300['return300']).cumprod()
# Align the benchmark to the portfolio dates; dates missing from the
# benchmark become NaN instead of raising.
df_portfolio['HS300'] = data_300['net_value'].reindex(df_portfolio.index)
df_portfolio[['strategy_return', 'HS300']].plot(figsize=(20, 8))
plt.title('Decision Tree Strategy', y=1.05, size=25)

## **6. 随机森林训练数据**

In [None]:
# Random forest of 20 trees on the training split.  Bootstrapping and
# max_features='sqrt' are both random, so the seed is fixed (same value as the
# train/test split) to make the fitted model reproducible between runs.
clf_randomforest = RandomForestClassifier(
    n_estimators=20,
    criterion='gini',
    max_features='sqrt',
    max_depth=120,
    class_weight='balanced',
    random_state=33,
)
clf_randomforest.fit(x_train, y_train)

### **6.1 评价因子数据**

In [None]:
# Pair every feature name with the importance reported by the fitted forest.
# Building the frame column-wise keeps 'feature_importance' numeric instead
# of the object dtype produced by transposing a list of rows.
features_important = clf_randomforest.feature_importances_
features_important_df = pd.DataFrame(
    {'feature': list(features), 'feature_importance': features_important},
    columns=['feature', 'feature_importance'],
)

# Horizontal bar chart, one bar per feature in column order.
ind = np.arange(len(features_important_df))
width = 0.35
fig, ax = plt.subplots(figsize=(20, 30))
ax.barh(ind, features_important_df['feature_importance'], width, color='IndianRed', label='Feature_importance')

ax.set_yticks(ind)
ax.set_yticklabels(list(features_important_df['feature']))
ax.tick_params(labelsize=17)
ax.set_title('Features_importance', size=25)
ax.legend()
plt.show()

In [None]:
# In-sample check of the random forest: accuracy printout and confusion matrix on the training split.
evaluation(clf_randomforest,x_train,y_train)

In [None]:
# Out-of-sample check of the random forest: accuracy printout and confusion matrix on the test split.
evaluation(clf_randomforest,x_test,y_test)

## **7. 随机森林绩效展示**

In [None]:
# Write the random-forest predictions for the test rows back onto the full
# table; only rows of the test split are assigned.
predict = clf_randomforest.predict(x_test)
data_total.loc[x_test.index, 'predict'] = predict
# Per stock (index level 0), lag the prediction by 1..5 rows.  A row is
# flagged as "holding" when any of those five lagged predictions is positive.
data_holding = pd.DataFrame(
    {i: data_total['predict'].groupby(level=0).shift(i) for i in range(1, 6)}
)
data_holding['total'] = data_holding.sum(axis=1)
data_total['flag_holding'] = data_holding['total'] > 0

# Returns of the flagged rows, pivoted to one row per date and one column per
# stock.
return_data = data_total.loc[data_total['flag_holding'], 'return']
return_cross = return_data.unstack().T
# Equal-weight portfolio: plain mean across the stocks held on each date.
df_portfolio = pd.DataFrame({'daily_return': return_cross.mean(axis=1)})
df_portfolio.sort_index(inplace=True)
df_portfolio['strategy_return'] = (df_portfolio['daily_return'] + 1).cumprod()
df_portfolio.index = pd.to_datetime(df_portfolio.index)

In [None]:
# Close prices of the '000300.SH' benchmark over the sample window, turned
# into a cumulative net-value curve.
data_300 = get_price(['000300.SH'], start_date, end_date, '1d', ['close'], True, None, is_panel=1)['close']
data_300.columns = ['close300']
data_300['return300'] = data_300['close300'].pct_change()
data_300['net_value'] = (1 + data_300['return300']).cumprod()
# Align the benchmark to the portfolio dates; dates missing from the
# benchmark become NaN instead of raising.
df_portfolio['HS300'] = data_300['net_value'].reindex(df_portfolio.index)
df_portfolio[['strategy_return', 'HS300']].plot(figsize=(20, 8))
plt.title('Random Forest Strategy', y=1.05, size=25)
