In [1]:
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
import datetime as dt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pylab as plt
import warnings
warnings.filterwarnings("ignore")

In [11]:
### Object to do everything fast and simple

class x_y_creator(object):
    
    def __init__(self, tickers_list, start, end, n_fwd, tick, lags, avg_vol_days, vol_lags,\
                 day_of_week, month, first_friday):
        self.lags = lags
        self.dfw = day_of_week
        self.m = month
        self.ff = first_friday
        self.tick = tick
        self.n_fwd = n_fwd
        self.st = start
        self.ed = end
        self.avd = avg_vol_days
        self.vlags = vol_lags
        self.tickers = tickers_list
        
    def get_prices(self, tickers_list, start, end, what_price):
        df = pd.DataFrame()
        for ticker in tickers_list:
            tmp = data.DataReader(ticker, 'yahoo', start, end)
            df[ticker] = tmp[what_price]
        return df 
        
    def data_feed(self):
        self.prices = self.get_prices(self.tickers, self.st, self.ed, 'Adj Close')
        if avg_vol_days != None:
            self.volume = self.get_prices(self.tickers, self.st, self.ed, 'Volume')
        return self
    
    
    def get_x(self):
        self.x = self.prices.pct_change(self.n_fwd)
        self.x_base = self.x
        if avg_vol_days != None:
            self.vx = self.volume.rolling(window=self.avd).mean().add_prefix('Vol_')
        return self
    
    def get_lags(self):
        if self.lags != 0 :
            lagged_x = []
            for i in range(1,self.lags+1):
                name = 'Lag'+str(i)+'_'
                xl = self.x.shift(i)
                xl = xl.add_prefix(name)
                lagged_x.append(xl)
            
            xal = pd.concat(lagged_x, axis=1)
            self.x = self.x.join(xal)
            
            if self.avd != None and self.vlags > 0:
                lagged_vx = []
                for i in range(1,self.vlags+1):
                    name = 'Lag'+str(i)+'_'
                    vxl = self.vx.shift(i)
                    vxl = vxl.add_prefix(name)
                    lagged_vx.append(vxl)
                vaxl = pd.concat(lagged_vx, axis=1)
                self.vx = self.vx.join(vaxl).dropna()
                self.x = self.x.join(self.vx)
            self.x = self.x
            return self
        else:
            return self
        
    def get_dummies(self):
        if self.dfw == True:
            self.x['WeekDay'] = self.x.index.weekday_name
            self.x = pd.get_dummies(self.x)
            
        if self.m == True:
            self.x['Month'] = self.x.index.strftime('%b')
            self.x = pd.get_dummies(self.x)
            
        if self.ff == True:
            self.x['First_Friday'] = pd.Series(self.x.index, index=self.x.index).apply(is_first_friday)
        
        return self
    
    def is_first_friday(date):
        if date.weekday() == 4 and date.day <= 7: # should by thu
            return 1
        else:
            return 0
    
    def get_y(self):
        self.y = self.x_base[self.tick].shift(-self.n_fwd)
        return self
    
    def get_both(self):
        self.data_feed()
        self.get_x()
        self.get_y()
        self.get_lags()
        self.get_dummies()
        x = self.x.dropna()
        y = self.y.dropna()
        x = x[x.index.isin(y.index)]
        y = y[y.index.isin(x.index)]
        y = np.where(y>=0, 1, 0)
        return x, y

In [2]:
tickers = ['SPY', 'IWM','TLT','EEM', 'IYR','LQD','TIP','GLD','OIH','FXE'] # 'JNK',, 'GSG'
st = dt.datetime(2011, 1, 1)
ed = dt.datetime(2016, 1, 1)

In [4]:
horizon = [1, 2, 3, 5, 10, 20, 40, 60]#, 120, 250]
#Scores = pd.DataFrame(index=tickers, columns =[str(x)+' days' for x in horizon])
Scores_base  = pd.DataFrame(index=tickers, columns =[str(x)+' days' for x in horizon])
Scores_train = pd.DataFrame(index=tickers, columns =[str(x)+' days' for x in horizon])
Scores_test  = pd.DataFrame(index=tickers, columns =[str(x)+' days' for x in horizon])

In [23]:
in_ = True
day_of_week = False
first_friday = False
month = False
lags = 3
avg_vol_days = None
vol_lags = 0
for tick in tickers:
    
    for n_fwd in horizon:

            creator = x_y_creator(tickers, st, ed, n_fwd, tick, lags, avg_vol_days, vol_lags,\
                 day_of_week, month, first_friday)
            x, y = creator.get_both()
            
            X_train, X_test, y_train, y_test = train_test_split(x, y, 
                    train_size = 0.7)

            ##### added pipeline #### 
            mlp = MLPClassifier()
            
            pipe_mlp = Pipeline([('scl', StandardScaler()),
                         ('clf', mlp)])

            param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
            #param_range = [0.01, 10.0]

            param_grid = [{'clf__alpha'     : param_range, # regularization strength on L2
                           'clf__activation': ['relu','logistic', 'tanh'],
                           'clf__solver'    : ['sgd'],
                           'clf__hidden_layer_sizes' : [(10,10), (10,10,10)]}] # 'lbfgs','adam'

            gs = GridSearchCV(estimator = pipe_mlp,
                              param_grid = param_grid,
                              scoring='accuracy',
                              cv = 3,
                              n_jobs = -1)

            gs = gs.fit(X_train, y_train)

            best_mlp = gs.best_estimator_
            best_mlp.fit(X_train, y_train)

            train_score = best_mlp.score(X_train, y_train)
            test_score  = best_mlp.score(X_test, y_test)

        #        train_score = f1_score(y_train, best_mlp.predict(X_train) ) 
        #        test_score  = f1_score(y_test , best_mlp.predict(X_test)  )

            #Scores[str(n_fwd)+' days'][tick] = [train_score, test_score]
            Scores_base[str(n_fwd)+' days'][tick]  = y.sum()/len(y)
            Scores_train[str(n_fwd)+' days'][tick] = train_score
            Scores_test[str(n_fwd)+' days'][tick]  = test_score
            
            if in_:
                print('Ticker, Base%, TrainScore, TestScore, next_days_forward ')
                print(tick,',%.3f, %.3f, %.3f, %.f' % (y_test.sum()/len(y_test), train_score, test_score, n_fwd)) 
                in_ = False
            else:
                print(tick,',%.3f, %.3f, %.3f, %.f' % (y_test.sum()/len(y_test), train_score, test_score, n_fwd))

Ticker, Base%, TrainScore, TestScore, next_days_forward 
SPY ,0.545, 0.567, 0.503, 1
SPY ,0.577, 0.577, 0.577, 2
SPY ,0.557, 0.611, 0.568, 3
SPY ,0.527, 0.621, 0.524, 5
SPY ,0.604, 0.612, 0.593, 10
SPY ,0.641, 0.682, 0.649, 20
SPY ,0.742, 0.828, 0.802, 40
SPY ,0.771, 0.815, 0.792, 60
IWM ,0.527, 0.545, 0.527, 1
IWM ,0.548, 0.566, 0.548, 2
IWM ,0.560, 0.571, 0.560, 3
IWM ,0.561, 0.551, 0.532, 5
IWM ,0.563, 0.625, 0.628, 10
IWM ,0.540, 0.668, 0.586, 20
IWM ,0.657, 0.809, 0.779, 40
IWM ,0.695, 0.836, 0.818, 60
TLT ,0.489, 0.568, 0.476, 1
TLT ,0.561, 0.586, 0.524, 2
TLT ,0.560, 0.589, 0.555, 3
TLT ,0.529, 0.569, 0.527, 5
TLT ,0.566, 0.634, 0.553, 10
TLT ,0.575, 0.642, 0.652, 20
TLT ,0.558, 0.769, 0.697, 40
TLT ,0.639, 0.626, 0.639, 60
EEM ,0.505, 0.560, 0.476, 1
EEM ,0.505, 0.550, 0.513, 2
EEM ,0.480, 0.546, 0.501, 3
EEM ,0.492, 0.588, 0.594, 5
EEM ,0.480, 0.632, 0.590, 10
EEM ,0.433, 0.664, 0.652, 20
EEM ,0.453, 0.679, 0.677, 40
EEM ,0.540, 0.772, 0.739, 60
IYR ,0.537, 0.550, 0.537, 1
IYR

In [24]:
name = 'NN_X_Lag3.xlsx'
writer = pd.ExcelWriter(name)
Scores_base.to_excel(writer, sheet_name='Base')
Scores_train.to_excel(writer, sheet_name='Train')
Scores_test.to_excel(writer, sheet_name='Test')