In [83]:
import pandas as pd

df = pd.read_csv('./data/train_df.csv', index_col=0)

train_data = df.loc[:'2018-01-01']
backtest_data = df.loc['2018-01-02':]

In [181]:
import talib as ta


def get_features(data):
    tech_data = data.loc[:, ['close', 'time']]
    days = [2, 3, 4, 5]
    
    def get_target_level(increase):
        if increase > 0:
            return 1
        return 0
    
    for i in days:
        tech_data[f'factor_MOM_{i}'] = ta.MOM(tech_data.close, i)
        tech_data[f'factor_RSI_{i}'] = ta.RSI(tech_data.close, i)
        tech_data[f'factor_NATR_{i}'] = ta.NATR(data.high, data.low, data.close, timeperiod=i)
    
    tech_data['factor_OBV'] = ta.OBV(data.close, data.volume)
    
    tech_data['target'] = tech_data.close.pct_change().shift(-1).apply(get_target_level)
    tech_data = tech_data.set_index('time')
    
    features = [c for c in  tech_data.columns if c.startswith('factor')] 
    return tech_data.dropna(), features


data_sets, features= get_features(train_data)
print('训练数据:')
display(data_sets.head(5))
print('指标:', features)

训练数据:


Unnamed: 0_level_0,close,factor_MOM_2,factor_RSI_2,factor_NATR_2,factor_MOM_3,factor_RSI_3,factor_NATR_3,factor_MOM_4,factor_RSI_4,factor_NATR_4,factor_MOM_5,factor_RSI_5,factor_NATR_5,factor_OBV,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-01-09,2.303,-0.016,0.97561,1.660877,-0.015,0.843882,1.773854,-0.072,0.955414,1.818281,-0.098,1.0,1.832393,-1110236000.0,0
2014-01-10,2.286,-0.031,0.419287,1.383421,-0.033,0.568586,1.555901,-0.032,0.741351,1.647255,-0.089,0.824742,1.695538,-1475339000.0,0
2014-01-13,2.27,-0.033,0.202224,1.555617,-0.047,0.389294,1.617266,-0.049,0.578654,1.673665,-0.048,0.683761,1.709604,-1957124000.0,1
2014-01-14,2.285,-0.001,49.358645,1.560449,-0.018,30.990308,1.596264,-0.032,21.981164,1.640881,-0.034,17.258567,1.673803,-1709638000.0,0
2014-01-15,2.285,0.015,49.358645,1.261625,-0.001,30.990308,1.385109,-0.018,21.981164,1.471361,-0.032,17.258567,1.531603,-1709638000.0,1


指标: ['factor_MOM_2', 'factor_RSI_2', 'factor_NATR_2', 'factor_MOM_3', 'factor_RSI_3', 'factor_NATR_3', 'factor_MOM_4', 'factor_RSI_4', 'factor_NATR_4', 'factor_MOM_5', 'factor_RSI_5', 'factor_NATR_5', 'factor_OBV']


In [185]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, LassoCV
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler


X = data_sets[features]
y = data_sets['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)


In [186]:
svm_parameters = {
    'classification__n_estimators':(115, 120, 140, 160)
}
pipline = Pipeline([    
    ('preprocessing', StandardScaler()),
#     ('feature_selection', SelectFromModel(LogisticRegression())),
    ('classification', AdaBoostClassifier(base_estimator=SVC(kernel='linear'), algorithm='SAMME'))
])

svm = GridSearchCV(pipline, svm_parameters, cv=5)


svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print('参数:', svm.best_params_)
print('准确率:', accuracy_score(y_test, svm_pred))
print('分类报告:', classification_report(y_test, svm_pred))

参数: {'classification__n_estimators': 115}
准确率: 0.45454545454545453
分类报告:               precision    recall  f1-score   support

           0       0.64      0.33      0.44        42
           1       0.36      0.67      0.47        24

    accuracy                           0.45        66
   macro avg       0.50      0.50      0.45        66
weighted avg       0.54      0.45      0.45        66



In [187]:
import datetime
import backtrader as bt
import backtrader.feeds as btfeed
from backtrader_plotting import Bokeh
from backtrader_plotting.schemes import Tradimo

class TestStrategy(bt.Strategy):
    def log(self, txt, dt=None):
        ''' Logging function for this strategy'''
        dt = dt or self.datas[0].datetime.date(0)
        print('%s, %s' % (dt.isoformat(), txt))

    def __init__(self):
        self.dataclose = self.datas[0].close
        self.today = self.data0.datetime.date
        pred_data, factors = get_features(df.loc[df.time >= '2018-01-01'])
        signal = pd.DataFrame(index=pred_data.index)
        signal.loc[:, 'is_buy'] = svm.predict(pred_data[factors])
        self.signal = signal
        
        
    def next(self):
        today = self.datas[0].datetime.date(0).isoformat()
        today_signal = self.signal.loc[today, 'is_buy']
        if today_signal > 0:
            print('Buy!')
            self.order_target_percent(target=1)
        else:
            print('Sell!')
            self.order_target_percent(target=0)


In [None]:
# Create a cerebro entity
cerebro = bt.Cerebro()

# Add a strategy
cerebro.addstrategy(TestStrategy)


# Create a Data Feed
data = bt.feeds.GenericCSVData(
        dataname='data/train_df.csv',
        dtformat='%Y-%m-%d',
        fromdate=datetime.datetime(2018, 2, 1),
        todate=datetime.datetime(2020, 7, 1),
        datetime=1,
        high=5,
        low=6,
        open=3,
        close=4,
        volume=7,
        reverse=True)

cerebro.adddata(data)

cerebro.addanalyzer(bt.analyzers.PyFolio, _name='pyfolio')
cerebro.broker.setcash(100000.0)

print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())
results = cerebro.run()
strat = results[0]
pyfoliozer = strat.analyzers.getbyname('pyfolio')
returns, positions, transactions, gross_lev = pyfoliozer.get_pf_items()
print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())

cerebro.plot()

Starting Portfolio Value: 100000.00
Buy!
Buy!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Buy!
Sell!
Buy!
Buy!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Buy!
Buy!
Buy!
Sell!
Buy!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Buy!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Buy!
Buy!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Buy!
Sell!
Sell!
Sell!
Sell!
Buy!
Buy!
Buy!
Sell

In [160]:
import pyfolio as pf
%matplotlib inline

# returns.index = returns.index.strftime('%Y-%m-%d')
# positions.index = positions.index.strftime('%Y-%m-%d')
# transactions.index = transactions.index.strftime('%Y-%m-%d')

pf.create_returns_tear_sheet(
    returns=returns,
    positions=positions,
    live_start_date='2019-01-01',
    transactions=transactions)

  np.abs(np.percentile(returns, 5))


Start date,2018-02-01,2018-02-01,2018-02-01,2018-02-01
End date,2020-06-30,2020-06-30,2020-06-30,2020-06-30
In-sample months,10,10,10,10
Out-of-sample months,17,17,17,17
Unnamed: 0_level_4,In-sample,Out-of-sample,All,Unnamed: 4_level_4
Annual return,-0.9%,1.8%,0.7%,
Cumulative returns,-0.8%,2.5%,1.7%,
Annual volatility,14.2%,8.5%,11.0%,
Sharpe ratio,0.01,0.25,0.12,
Calmar ratio,-0.11,0.19,0.08,
Stability,0.00,0.12,0.24,
Max drawdown,-8.3%,-9.3%,-9.3%,
Omega ratio,1.00,1.14,1.05,
Sortino ratio,0.01,0.35,0.18,
Skew,0.43,-0.53,0.19,


AttributeError: 'numpy.int64' object has no attribute 'to_pydatetime'

In [67]:
rdf_parameters = {
    'classification__n_estimators':(140, 150, 160, 170, 180), 
    'classification__max_depth':[4, 5, 6, 7]
}

pipline = Pipeline([    
    ('feature_selection', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))),
    ('classification', RandomForestClassifier())
])

rdf = GridSearchCV(pipline, rdf_parameters, cv=5)


rdf.fit(X_train, y_train)
rdf_pred = rdf.predict(X_test)
print('参数:', rdf.best_params_)
print('准确率:', accuracy_score(y_test, rdf_pred))
print('分类报告:', classification_report(y_test, rdf_pred))



KeyboardInterrupt: 

[sudo] password for jovyan: 
