In [12]:
import pandas as pd

df = pd.read_csv('./data/train_df.csv', index_col=0)

train_data = df.loc[:'2018-01-01']
backtest_data = df.loc['2018-01-02':]

In [51]:
import talib as ta


def get_features(data):
    tech_data = data.loc[:, ['close', 'time']]
    days = [2, 3, 4, 5]
    
    def get_target_level(increase):
        if increase > 0:
            return 1
        return 0
    
    for i in days:
        tech_data[f'factor_MOM_{i}'] = ta.MOM(tech_data.close, i)
        tech_data[f'factor_RSI_{i}'] = ta.RSI(tech_data.close, i)
        tech_data[f'factor_NATR_{i}'] = ta.NATR(data.high, data.low, data.close, timeperiod=i)
    
    tech_data['factor_OBV'] = ta.OBV(data.close, data.volume)
    
    tech_data['target'] = tech_data.close.pct_change().shift(-1).apply(get_target_level)
    tech_data = tech_data.set_index('time')
    
    features = [c for c in  tech_data.columns if c.startswith('factor')] 
    return tech_data.dropna(), features


data_sets, features= get_features(train_data)
print('训练数据:')
display(data_sets.head(5))
print('指标:', features)

训练数据:


Unnamed: 0_level_0,close,factor_MOM_2,factor_RSI_2,factor_NATR_2,factor_MOM_3,factor_RSI_3,factor_NATR_3,factor_MOM_4,factor_RSI_4,factor_NATR_4,factor_MOM_5,factor_RSI_5,factor_NATR_5,factor_OBV,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-01-09,2.303,-0.016,0.97561,1.660877,-0.015,0.843882,1.773854,-0.072,0.955414,1.818281,-0.098,1.0,1.832393,-1110236000.0,0
2014-01-10,2.286,-0.031,0.419287,1.383421,-0.033,0.568586,1.555901,-0.032,0.741351,1.647255,-0.089,0.824742,1.695538,-1475339000.0,0
2014-01-13,2.27,-0.033,0.202224,1.555617,-0.047,0.389294,1.617266,-0.049,0.578654,1.673665,-0.048,0.683761,1.709604,-1957124000.0,1
2014-01-14,2.285,-0.001,49.358645,1.560449,-0.018,30.990308,1.596264,-0.032,21.981164,1.640881,-0.034,17.258567,1.673803,-1709638000.0,0
2014-01-15,2.285,0.015,49.358645,1.261625,-0.001,30.990308,1.385109,-0.018,21.981164,1.471361,-0.032,17.258567,1.531603,-1709638000.0,1


指标: ['factor_MOM_2', 'factor_RSI_2', 'factor_NATR_2', 'factor_MOM_3', 'factor_RSI_3', 'factor_NATR_3', 'factor_MOM_4', 'factor_RSI_4', 'factor_NATR_4', 'factor_MOM_5', 'factor_RSI_5', 'factor_NATR_5', 'factor_OBV']


In [52]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, LassoCV
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler


X = data_sets[features]
y = data_sets['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)


In [54]:
svm_parameters = {
    'classification__n_estimators':(115, 120, 140, 160)
}
pipline = Pipeline([    
    ('preprocessing', StandardScaler()),
#     ('feature_selection', SelectFromModel(LogisticRegression())),
    ('classification', AdaBoostClassifier(base_estimator=SVC(kernel='linear'), algorithm='SAMME'))
])

svm = GridSearchCV(pipline, svm_parameters, cv=5)


svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print('参数:', svm.best_params_)
print('准确率:', accuracy_score(y_test, svm_pred))
print('分类报告:', classification_report(y_test, svm_pred))

参数: {'classification__n_estimators': 115}
准确率: 0.48484848484848486
分类报告:               precision    recall  f1-score   support

           0       0.59      0.47      0.53        40
           1       0.38      0.50      0.43        26

    accuracy                           0.48        66
   macro avg       0.49      0.49      0.48        66
weighted avg       0.51      0.48      0.49        66



In [None]:
rdf_parameters = {
    'classification__n_estimators':(140, 160, 170, 180), 
    'classification__max_depth':[4, 6, 7, 8, 9]
}

pipline = Pipeline([    
    ('feature_selection', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))),
    ('classification', RandomForestClassifier(random_state=44))
])

rdf = GridSearchCV(pipline, rdf_parameters, cv=5)


rdf.fit(X_train, y_train)
rdf_pred = rdf.predict(X_test)
print('参数:', rdf.best_params_)
print('准确率:', accuracy_score(y_test, rdf_pred))
print('分类报告:', classification_report(y_test, rdf_pred))





In [None]:
import datetime
import backtrader as bt
import backtrader.feeds as btfeed

class TestStrategy(bt.Strategy):

    def log(self, txt, dt=None):
        ''' Logging function for this strategy'''
        dt = dt or self.datas[0].datetime.date(0)
        print('%s, %s' % (dt.isoformat(), txt))

    def __init__(self):
        self.dataclose = self.datas[0].close
        self.today = self.data0.datetime.date
        pred_data, factors = get_features(df.loc[df.time >= '2018-01-01'])
        signal = pd.DataFrame(index=pred_data.index)
        signal.loc[:, 'is_buy'] = rdf.predict(pred_data[factors])
        self.signal = signal
        
        
    def next(self):
        today = self.datas[0].datetime.date(0).isoformat()
        today_signal = self.signal.loc[today, 'is_buy']
        if today_signal > 0:
            print('Buy!')
            self.order_target_percent(target=1)
        else:
            print('Sell!')
            self.close()


In [None]:
# Create a cerebro entity
cerebro = bt.Cerebro()

# Add a strategy
cerebro.addstrategy(TestStrategy)


# Create a Data Feed
data = bt.feeds.GenericCSVData(
        dataname='data/train_df.csv',
        dtformat='%Y-%m-%d',
        fromdate=datetime.datetime(2018, 2, 1),
        todate=datetime.datetime(2020, 7, 1),
        datetime=1,
        high=5,
        low=6,
        open=3,
        close=4,
        volume=7,
        reverse=True)

cerebro.adddata(data)

cerebro.addanalyzer(bt.analyzers.PyFolio, _name='pyfolio')
cerebro.broker.setcash(100000.0)

print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())
results = cerebro.run()
strat = results[0]
pyfoliozer = strat.analyzers.getbyname('pyfolio')
returns, positions, transactions, gross_lev = pyfoliozer.get_pf_items()
print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())
cerebro.plot()

In [57]:
import pyfolio as pf
pf.create_full_tear_sheet(
    returns,
    positions=positions,
    transactions=transactions,
    round_trips=True)

Start date,2018-02-01,2018-02-01
End date,2020-06-30,2020-06-30
Total months,27,27
Unnamed: 0_level_3,Backtest,Unnamed: 2_level_3
Annual return,-5.8%,
Cumulative returns,-12.8%,
Annual volatility,8.9%,
Sharpe ratio,-0.62,
Calmar ratio,-0.28,
Stability,0.48,
Max drawdown,-20.3%,
Omega ratio,0.81,
Sortino ratio,-0.83,
Skew,-0.49,


IndexError: index -1 is out of bounds for axis 0 with size 0