In [1]:
import pandas as pd

df = pd.read_csv('./data/train_df.csv', index_col=0)
train_data = df.loc[df.time < '2018-01-01']
backtest_data = df.loc[df.time >= '2018-02-01']

etf_list = df.code.drop_duplicates().values
df.head(10)

Unnamed: 0,time,code,open,close,high,low,volume,money
0,2014-01-02,510300.XSHG,2.402,2.401,2.408,2.389,185205422.0,444109600.0
1,2014-01-03,510300.XSHG,2.391,2.375,2.393,2.358,434406281.0,1030051000.0
2,2014-01-06,510300.XSHG,2.367,2.318,2.367,2.312,476781679.0,1106969000.0
3,2014-01-07,510300.XSHG,2.302,2.319,2.327,2.296,150496913.0,348214900.0
4,2014-01-08,510300.XSHG,2.319,2.317,2.341,2.304,202243149.0,469652300.0
5,2014-01-09,510300.XSHG,2.312,2.303,2.338,2.301,332507001.0,769178800.0
6,2014-01-10,510300.XSHG,2.301,2.286,2.305,2.28,365103225.0,834948900.0
7,2014-01-13,510300.XSHG,2.286,2.27,2.3,2.261,481785296.0,1095836000.0
8,2014-01-14,510300.XSHG,2.27,2.285,2.292,2.256,247485927.0,562495500.0
9,2014-01-15,510300.XSHG,2.284,2.285,2.288,2.266,329409654.0,750456400.0


In [112]:
import talib 

def get_target_level(target):
    if target > 0:
        return 1
    return -1

def cross_over(x, y, data):
    data[f'RSI_{x} - RSI_{y}'] = data[f'RSI_{x}'] - data[f'RSI_{y}']
    data[f'MA_{x} - MA_{y}'] = data[f'MA_{x}'] - data[f'MA_{y}']
    data[f'SMA_{x} - SMA_{y}'] = data[f'SMA_{x}'] - data[f'SMA_{y}']
    data[f'AROONOSC_{x} - AROONOSC_{y}'] = data[f'AROONOSC_{x}'] - data[f'AROONOSC_{y}']
    return y


def get_features(data):
    tech_data = pd.DataFrame(index=data.index);
    days = [2, 3, 4, 5, 10, 15, 20]
    for t in days:
        tech_data[f'BETA_{t}'] = talib.BETA(data.high, data.low, timeperiod=t)
        tech_data[f'SMA_{t}'] = talib.SMA(data.close,timeperiod=t)
        tech_data[f'RSI_{t}'] = talib.RSI(data.close, timeperiod=t)
        tech_data[f'MOM_{t}'] = talib.MOM(data.close, timeperiod=t)
        tech_data[f'MA_{t}'] = talib.MA(data.close, timeperiod=t)
        tech_data[f'DX_{t}'] = talib.DX(data.high, data.low, data.close, timeperiod=t)
        tech_data[f'volatility_{t}'] = data.close.pct_change(periods=t).std()
        tech_data[f'ADX_{t}'] = talib.ADX(data.high, data.low, data.close, timeperiod=t)
        tech_data[f'ADXR_{t}'] = talib.ADXR(data.high, data.low, data.close, timeperiod=t)
        tech_data[f'AROONOSC_{t}'] = talib.AROONOSC(data.high, data.low, timeperiod=t)
        tech_data[f'ROC_{t}'] = talib.ROC(data.close, timeperiod=t)
        tech_data[f'BIAS_{t}'] = (data['close'] - data['close'].rolling(t, min_periods=1).mean())/ data['close'].rolling(t, min_periods=1).mean()*100
        tech_data[f'BOLL_upper_{t}'], tech_data[f'BOLL_middle_{t}'], tech_data[f'BOLL_lower_{t}'] = talib.BBANDS(
                data.close,
                timeperiod=t,
                nbdevup=2,
                nbdevdn=2,
                matype=0)

    tech_data['SAR'] = talib.SAR(data.high, data.low)
    tech_data['AD'] = talib.AD(data.high, data.low, data.close, data.volume)
    tech_data['OBV'] = talib.OBV(data.close, data.volume)
    tech_data['code'] = data.code
    tech_data['target'] = data.close.pct_change().shift(-1).apply(get_target_level).fillna(0)
    tech_data['time'] = data.time
    
    # 计算金叉feature
    reduce(lambda x,y: cross_over(x, y, tech_data), days)
    
    features = list(set(tech_data.columns) - set(data.columns) - set(['target'])) 
    return tech_data.dropna(), features

data_sets = pd.DataFrame()
features = []
for i in etf_list:
    etf_data, etf_features = get_features(train_data[train_data.code == i])
    data_sets = pd.concat([data_sets, etf_data])
    features = etf_features

data_sets = data_sets.dropna()
data_sets = data_sets.set_index('code')
data_sets = data_sets.sort_values('time')
data_sets.head(5)

Unnamed: 0_level_0,BETA_2,SMA_2,RSI_2,MOM_2,MA_2,DX_2,volatility_2,ADX_2,ADXR_2,AROONOSC_2,...,ROC_20,BIAS_20,BOLL_upper_20,BOLL_middle_20,BOLL_lower_20,SAR,AD,OBV,target,time
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
510300.XSHG,2.435225,2.225,64.547138,0.003,2.225,18.41872,0.024943,44.989641,58.275102,50.0,...,-0.976476,0.701889,2.269721,2.21545,2.161179,2.179932,-455372500.0,-805090390.0,1,2014-04-01
510500.XSHG,7.885061,1.0805,61.525336,0.011,1.0805,97.771944,0.030692,92.685653,90.142508,-50.0,...,-3.798587,-0.96849,1.134333,1.09965,1.064967,1.11892,57249890.0,-134341567.0,1,2014-04-01
159915.XSHE,0.417114,1.309,62.082587,0.029,1.309,2.67861,0.035454,42.604084,62.566821,-50.0,...,-7.037298,-4.271894,1.468522,1.37995,1.291378,1.420991,2298721000.0,188755697.0,-1,2014-04-01
510050.XSHG,1.690491,1.731,65.851922,-0.002,1.731,25.732804,0.023653,45.192396,54.922191,-50.0,...,0.696864,1.817328,1.757882,1.70305,1.648218,1.673724,-1988693000.0,642079842.0,1,2014-04-01
159915.XSHE,1.620734,1.3065,27.24242,-0.005,1.3065,33.533933,0.035454,38.069009,40.336546,100.0,...,-9.460406,-5.913195,1.466832,1.3732,1.279568,1.409712,2205556000.0,40381506.0,1,2014-04-02


In [101]:
print('Features:')
display(features)

Features:


['AROONOSC_4',
 'SMA_20',
 'MA_10',
 'OBV',
 'DX_10',
 'MA_4',
 'BOLL_lower_10',
 'BIAS_10',
 'SMA_2',
 'AROONOSC_2',
 'BOLL_lower_8',
 'ROC_20',
 'volatility_4',
 'ROC_8',
 'ADXR_10',
 'volatility_12',
 'BOLL_upper_2',
 'volatility_10',
 'ROC_6',
 'BOLL_middle_20',
 'BETA_8',
 'DX_12',
 'MOM_4',
 'volatility_20',
 'BIAS_2',
 'volatility_2',
 'RSI_2',
 'ADXR_8',
 'MOM_12',
 'ADXR_20',
 'DX_20',
 'BOLL_upper_4',
 'BOLL_upper_8',
 'SMA_4',
 'BOLL_middle_4',
 'ROC_2',
 'BETA_12',
 'MOM_6',
 'BOLL_lower_2',
 'ADX_4',
 'ADXR_2',
 'RSI_4',
 'ADX_10',
 'BIAS_12',
 'BOLL_middle_2',
 'MA_2',
 'BETA_2',
 'BIAS_8',
 'DX_6',
 'SMA_10',
 'MA_12',
 'RSI_10',
 'ADX_2',
 'ADX_20',
 'BOLL_upper_10',
 'AROONOSC_10',
 'SMA_6',
 'RSI_8',
 'BETA_20',
 'BIAS_4',
 'DX_4',
 'BOLL_upper_6',
 'MOM_8',
 'ADXR_4',
 'SMA_8',
 'RSI_20',
 'SAR',
 'BOLL_middle_6',
 'MA_20',
 'MOM_10',
 'BOLL_lower_4',
 'MA_6',
 'DX_8',
 'ADXR_6',
 'RSI_6',
 'ADX_12',
 'BOLL_middle_8',
 'BIAS_6',
 'AROONOSC_8',
 'AD',
 'ROC_12',
 'ADX

In [102]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, Lasso
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
from sklearn.decomposition import PCA



date = data_sets.index.drop_duplicates()

# data_sets = data_sets.loc[data_sets.index.isin(date[::2])]


data_sets.tail(10)

Unnamed: 0_level_0,BETA_2,SMA_2,RSI_2,MOM_2,MA_2,DX_2,volatility_2,ADX_2,ADXR_2,AROONOSC_2,...,ROC_20,BIAS_20,BOLL_upper_20,BOLL_middle_20,BOLL_lower_20,SAR,AD,OBV,target,time
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
510300.XSHG,-1.435458,4.391,11.65027,-0.061,4.391,87.582456,0.024943,69.430086,60.353902,-100.0,...,-1.826381,-1.111755,4.467688,4.40295,4.338212,4.473,47010300000.0,34630330000.0,1,2017-12-27
510500.XSHG,0.184171,1.8275,24.140245,-0.005,1.8275,88.382665,0.030692,84.55726,82.644558,-100.0,...,-2.256851,-0.974468,1.867435,1.8369,1.806365,1.861,10719670000.0,6485471000.0,1,2017-12-27
510050.XSHG,-2.188196,3.581,45.753248,-0.048,3.581,72.262445,0.023653,77.539367,80.177828,-50.0,...,-0.111173,-0.300983,3.665537,3.60485,3.544163,3.559,32858410000.0,30534510000.0,1,2017-12-28
159915.XSHE,-6.058284,1.6335,43.185228,-0.007,1.6335,97.412793,0.035454,95.009275,93.807516,50.0,...,-1.56344,-1.720049,1.70229,1.66565,1.62901,1.685448,26855120000.0,13082630000.0,1,2017-12-28
510500.XSHG,0.000185,1.8235,53.509761,-0.008,1.8235,91.040388,0.030692,87.798824,86.178042,-50.0,...,-1.135749,-0.427595,1.866091,1.83585,1.805609,1.86012,10735700000.0,6565597000.0,1,2017-12-28
510300.XSHG,-3.021633,4.374,52.487223,-0.034,4.374,87.978065,0.024943,78.704075,74.067081,-100.0,...,0.228102,-0.214604,4.467748,4.40345,4.339152,4.47054,47095560000.0,34809660000.0,1,2017-12-28
510500.XSHG,1.01114,1.8375,82.354343,0.028,1.8375,68.334029,0.030692,78.066426,82.932625,50.0,...,-0.752284,0.645724,1.863626,1.83515,1.806674,1.858355,10819750000.0,6649652000.0,-1,2017-12-29
510300.XSHG,1.308844,4.401,64.102111,0.054,4.401,35.988749,0.024943,57.346412,68.025244,-50.0,...,0.455789,0.0806,4.468376,4.40445,4.340524,4.465678,47145630000.0,34909810000.0,-1,2017-12-29
510050.XSHG,0.630698,3.596,51.240359,0.03,3.596,27.560201,0.023653,52.549784,65.044576,0.0,...,0.699692,-0.224619,3.665118,3.6061,3.547082,3.559,32803980000.0,30752230000.0,-1,2017-12-29
159915.XSHE,-1.349516,1.6405,67.894128,0.014,1.6405,97.412793,0.035454,96.211034,95.610155,50.0,...,-2.548903,-1.172227,1.699923,1.6635,1.627077,1.680692,26948620000.0,13176130000.0,-1,2017-12-29


### Feature Selection

In [None]:
X = data_sets[features]
y = data_sets['target']


selector = RFECV(estimator=RandomForestRegressor(), step=1, cv=5)
selector = selector.fit(X, y)

feat_importances = pd.Series(selector.ranking_, index=X.columns)
feat_importances.nsmallest(10).plot(kind='barh')

print("Optimal number of features : %d" % selector.n_features_)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)
plt.show()

# 重新选择定义feature
real_feature = feat_importances.sort_values().head(selector.n_features_).index

X = data_sets[real_feature]
X_train, X_test = np.split(X, [int(.67 *len(X))])
y_train, y_test = np.split(y, [int(.67 *len(y))])


In [104]:
svm_parameters = {
#     'classification__C':(0.1, 1, 10, 20)
}
pipline = Pipeline([
    ('classification', SVC())
])

svm = GridSearchCV(pipline, svm_parameters, cv=3)


svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

print('参数:', svm.best_params_)
print('R方值:', accuracy_score(y_test, svm_pred))
print('Report:', classification_report(y_test, svm_pred))

参数: {}
R方值: 0.542833607907743
Report:               precision    recall  f1-score   support

          -1       0.54      0.42      0.47       593
           1       0.54      0.66      0.60       621

    accuracy                           0.54      1214
   macro avg       0.54      0.54      0.54      1214
weighted avg       0.54      0.54      0.54      1214



In [105]:
svm.fit(X, y)

GridSearchCV(cv=3, estimator=Pipeline(steps=[('classification', SVC())]),
             param_grid={})

### Backtesting

In [106]:
import datetime
import backtrader as bt
import backtrader.feeds as btfeed



class MLStrategy(bt.Strategy):
    def log(self, txt, dt=None):
        ''' Logging function for this strategy'''
        dt = dt or self.datas[0].datetime.date(0)
        print('%s, %s' % (dt.isoformat(), txt))
        
    def next(self):
        today = self.datas[0].datetime.date(0).isoformat()
        buy_list = []
        pred_data, _ = get_features(df.loc[df.time < today])
        pred_data = pred_data.dropna()
        pred_data['pred'] = svm.predict(pred_data[real_feature])
        for index, e in enumerate(etf_list):
            today_signal = pred_data.loc[pred_data.code == e].iloc[-1]

            if today_signal.pred == 1 and len(buy_list) < 4:
                buy_list.append(index)
            else:
                self.close(self.datas[index])
                
                
        for i in buy_list:    
            self.order_target_percent(self.datas[i], target=1.00 / len(buy_list))

        if len(self) % 30 == 0:
            pred_data, factors = get_features(df.loc[df.time < today])
            pred_data = pred_data.dropna()
            pred_data_x = pred_data[real_feature]
            pred_data_y = pred_data['target']
            svm.fit(pred_data_x, pred_data_y)
            print('fit model again')


In [116]:
# Create a cerebro entity
cerebro = bt.Cerebro()

for etf in etf_list:
    etf_data = backtest_data[backtest_data.code == etf]
    etf_data = etf_data[['time', 'open', 'high', 'low', 'close', 'volume']]
    etf_data = etf_data.set_index('time')
    etf_data.index = pd.to_datetime(etf_data.index)
    etf_data = bt.feeds.PandasData(dataname=etf_data)
    cerebro.adddata(etf_data, etf)
    if etf == '159915.XSHE':
        benchdata = etf_data

        
# Add a strategy
cerebro.addstrategy(MLStrategy)
cerebro.addanalyzer(bt.analyzers.PyFolio, _name='pyfolio')
cerebro.addobserver(bt.observers.Benchmark,
                            data=benchdata,
                            timeframe=bt.TimeFrame.NoTimeFrame)
cerebro.broker.setcash(100000.0)


print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())
results = cerebro.run()
strat = results[0]

print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())
%matplotlib notebook

cerebro.plot()

<backtrader.feeds.pandafeed.PandasData at 0x7f9cf2206310>

Starting Portfolio Value: 100000.00


ValueError: X.shape[1] = 1 should be equal to 4, the number of features at training time

### Backtesting Analyzers

In [72]:
import pyfolio as pf

pyfoliozer = strat.analyzers.getbyname('pyfolio')
returns, positions, transactions, gross_lev = pyfoliozer.get_pf_items()

# pf.create_full_tear_sheet(
#     returns=returns,
#     positions=positions,
#     transactions=transactions,
#     gross_lev=gross_lev,
#     live_start_date='2019-02-01',
# )

returns.plot()

ValueError: view limit minimum -0.45000000000001705 is less than 1 and is an invalid Matplotlib date value. This often happens if you pass a non-datetime value to an axis that has datetime units

In [195]:
rdf_parameters = {
    'classification__n_estimators':(140, 150, 160, 170, 180), 
    'classification__max_depth':[4, 5, 6, 7]
}

pipline = Pipeline([    
    ('feature_selection', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))),
    ('classification', RandomForestClassifier())
])

rdf = GridSearchCV(pipline, rdf_parameters, cv=5)


rdf.fit(X_train, y_train)
rdf_pred = rdf.predict(X_test)
print('参数:', rdf.best_params_)
print('准确率:', accuracy_score(y_test, rdf_pred))
print('分类报告:', classification_report(y_test, rdf_pred))

NameError: name 'RandomForestClassifier' is not defined

In [74]:
%matplotlib notebook
cerebro.plot()

<IPython.core.display.Javascript object>

[[<Figure size 640x480 with 11 Axes>]]