In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [29]:
class TestProject(object):
#Test Project — Algorithmic Trading
#Prediction-based Trading & Event-based Backtesting
# author: Milica Medic Kiralj
    
    def __init__(self):
        self.get_data()
        
    '''Retrieve and prepare data'''   
    def get_data(self):
        self.raw = pd.read_csv('http://hilpisch.com/ref_eikon_eod_data.csv', index_col=0, parse_dates=True).dropna()
        symbol = '.SPX'
        self.raw = pd.DataFrame(self.raw[symbol].iloc[:1000])
        self.raw.rename(columns={symbol:'price'}, inplace = True)
        self.data = self.raw.dropna()
        
    def select_data(self):
        data = self.data.copy()    
        return data
    
    def prepare_data_features(self):
        data = self.select_data()
        '''Log Return'''
        data['return']=np.log(data['price']/data['price'].shift(1))
        data.dropna(inplace=True)
        
        '''Direction'''
        data['direction']= np.where(data['return'] > 0, 1, 0)
        
        ''' 5 categories returns'''
        self.bins = [ -0.006, -0.0003, 0.0003, 0.006]
        data['bins'] = np.digitize(data['return'], bins=self.bins)
        
        '''Simple Moving Average feature 20-day SMA'''
        data['sma_20'] = data['price'].rolling(20).mean()

        '''Simple Moving Average feature 60-day SMA'''
        data['sma_60'] = data['price'].rolling(60).mean()
        
        '''SMA difference'''
        data['sma_diff'] = data['sma_20']-data['sma_60']
        
        '''Exponentially Weighted Moving Average feature 20-day EWMA'''
        data['ewma_20'] = data['price'].ewm(halflife=20).mean()
        
        '''Exponentially Weighted Moving Average feature 20-day EWMA'''
        data['ewma_60'] = data['price'].ewm(halflife=60).mean()

        ''' EWMA difference'''
        data['ewma_diff'] = data['ewma_20'] - data['ewma_60']
        
        '''Rolling volatility short window'''
        data['vol_20'] = data['return'].rolling(20).std()

        '''Rolling volatility long window'''
        data['vol_60'] = data['return'].rolling(60).std()
        
        data.dropna(inplace=True)
        
        '''Split into train (70%) & test data'''
        self.train, self.test = train_test_split(data, test_size=0.3, shuffle=False)
        
        '''Normalize the training features data to have: zero mean and standard deviation of one'''
        self.cols = ['sma_20','sma_60','sma_diff','ewma_20','ewma_60','ewma_diff','vol_20','vol_60']
        self.mu, self.std = self.train[self.cols].mean(), self.train[self.cols].std() 
        
        '''Normalize the train cols'''
        self.train[self.cols] = (self.train[self.cols] - self.mu) / self.std
        
        '''Normalize the test cols'''
        self.test[self.cols] = (self.test[self.cols] - self.mu) / self.std
        
        self.cols.extend(['return', 'direction', 'bins'])
        
        lags = 5
        self.cols_ = []
        for col in self.cols:
            for lag in range(1, lags + 1):
                col_ = col + f'_lag_{lag}'
                self.train[col_] = self.train[col].shift(lag)
                self.test[col_] = self.test[col].shift(lag)
                self.cols_.append(col_)
        self.train.dropna(inplace=True)
        self.test.dropna(inplace=True)
        print (self.train, self.test)
    
    def fit_and_train_models(self):
        self.prepare_data_features()
        self.models = {'gauss': GaussianNB(),
          'logreg': LogisticRegression(C=1, solver='lbfgs', max_iter=500),
          'dtc': DecisionTreeClassifier(max_depth=7),
          'svm': SVC(C=1, gamma='auto', kernel='linear'),
          'mlp': MLPClassifier(hidden_layer_sizes=[64], shuffle=False,
                              max_iter=5000)}

        for m in self.models:
            self.model = self.models[m]
            self.model.fit(self.train[self.cols_], self.train['direction'])
            self.train['p_' + m] = self.model.predict(self.train[self.cols_])
            self.train['p_' + m] = np.where(self.train['p_' + m] == 1, 1, -1)
            self.train['s_train_' + m] = self.train['p_' + m] * self.train['return']
            self.perf_train = self.train[['return', 's_train_' + m]].sum().apply(np.exp)
            print(self.perf_train)

        for m in self.models:
            self.model = self.models[m]
            self.model.fit(self.train[self.cols_], self.train['direction'])
            self.test['p_' + m] = self.model.predict(self.test[self.cols_])
            self.test['p_' + m] = np.where(self.test['p_' + m] == 1, 1, -1)
            self.test['s_test_' + m] = self.test['p_' + m] * self.test['return']
            self.perf_test = self.test[['return', 's_test_' + m]].sum().apply(np.exp)
            print(self.perf_test)
    
        

        

In [30]:
prediction = TestProject()
prediction.fit_and_train_models()

              price    return  direction  bins    sma_20    sma_60  sma_diff  \
Date                                                                           
2010-04-08  1186.44  0.003369          1     3 -0.909862 -1.303077  0.962009   
2010-04-09  1194.37  0.006662          1     4 -0.888496 -1.293065  0.998997   
2010-04-12  1196.48  0.001765          1     3 -0.865986 -1.284319  1.043175   
2010-04-13  1197.30  0.000685          1     3 -0.843331 -1.275910  1.088778   
2010-04-14  1210.65  0.011088          1     4 -0.818546 -1.263063  1.128113   
...             ...       ...        ...   ...       ...       ...       ...   
2012-11-01  1427.59  0.010867          1     4  1.680280  1.826635 -0.098745   
2012-11-02  1414.20 -0.009424          0     0  1.662467  1.828847 -0.160136   
2012-11-05  1417.26  0.002161          1     3  1.641095  1.831437 -0.233605   
2012-11-06  1428.39  0.007823          1     4  1.625340  1.835842 -0.294992   
2012-11-07  1394.53 -0.023990          0