## Reading in packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy
import datetime
import ta
import pathlib
import shutil
import tempfile
import vaex
from IPython import display
from IPython.display import clear_output
import pyodbc

# Tensorflow related
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import regularizers
import tensorflow.compat.v2.feature_column as fc

#!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

print(tf.__version__)
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
shutil.rmtree(logdir, ignore_errors=True)
print(logdir)

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, log_loss


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.exceptions import ConvergenceWarning 
from sklearn import ensemble
# ConvergenceWarning('ignore')
# Do you wanna see?
verbose = True

import sys
sys.path.append('../')
#sys.path.append('...../')

from utils.data_extraction import load_data_final,load_data_and_save
from utils.data_cleaning import HFDataCleaning
from utils.generate_features import candleCreateNP_vect_final,\
                                    generateFeatures_final,\
                                    generateFeatures_multi_v2

from utils.preprocessing_features_and_labels import extract_labels,\
                                                    align_features_and_labels,\
                                                    pre_processing_initial,\
                                                    pre_processing_extended,\
                                                    pre_processing,\
                                                    extract_labels_multi_final,\
                                                    align_features_and_labels_multi_final,\
                                                    align_features_and_labels_multi_v5

from utils.models import make_input_fn
from utils.models import performanceTesting,scoreFunction
from utils.plotting import plot_confusion_matrix

2.2.0
C:\Users\PC\AppData\Local\Temp\tmp47865n6z\tensorboard_logs


## Extracting data

In [2]:
# Do we extract new data or read in?
readIn = True
# run load_data()
if readIn:
    
    # Listing the data files 
    path = '../../../Google Drev/Thesis/Data/TAQ/AggregatedTAQ'
#     path = 'F:/AggregatedTAQ/round3'
    datafiles = os.listdir(path)
    content = np.concatenate([['\n\n'],[str(j)+': '+i+'\n' for j,i in enumerate(datafiles) if 'csv' in i],['\n\n']])
    
    # Asking for user input
    file = input('Which one do you want to load? %s'%''.join(content))
    data = pd.read_csv(path + '/' + datafiles[int(file)],
                       header = None,
                       names=['open','high','low','close',
                              'spread_open','spread_high','spread_low','spread_close',
                              'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                              'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                              'Ticker'])
    # Using the choice of the user to determine the correct market file
    key = re.split('[_.]',datafiles[int(file)])[-2]
    marketDataFile = [file for file in os.listdir(path+'/round5_market_tickers') if key in file]
    
    # Reading in the market data
    tempData = pd.read_csv(path+'/round5_market_tickers/'+marketDataFile[0]
                           ,header = None
                           ,names=['open','high','low','close',
                                  'spread_open','spread_high','spread_low','spread_close',
                                  'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                                  'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                                  'Ticker'])
    # Adding the market data to the ticker data
    data = pd.concat([data,tempData],axis=0)
    # Lower casing all column names
#     data.columns = data.columns.str.lower()
else:
    
    # print(os.listdir())
    try:
        path = 'a:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
    except:
        path = 't:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
        
    # Sample type
    data_sample = 'full' # or 'stable'
    # allFiles = os.listdir(path)
    # print(len(allFiles), allFiles[:5], allFiles[-5:])
    # print(allFiles[-10:])

    #dates = np.array(['2020040' + str(i) if i < 10 else '202004' + str(i) for i in np.arange(1,16)]).astype(int)
    dates = np.array(['20200501']).astype(int)#,'20200402','20200403','20200406','20200407'

    # Provide a list of tickers of interest
    
    tickers = sorted(['TSLA','FB'])#'MSFT'
    
    # Do we need data on trades, quotes or both?
    dataNeeded = 'quotes' # 'trades', 'quotes' or 'both'
    
    if dataNeeded == 'trades':
        tradeData = load_data_final(dates, tickers, dataNeeded, path, verbose)
    elif dataNeeded == 'quotes':
        quoteData = load_data_final(dates,
                                    tickers,
                                    dataNeeded,
                                    path,
                                    verbose,
                                    extract_candles = False,
                                    aggHorizon = 1,
                                    extra_features_from_quotes = None,
                                    data_sample = data_sample)
    elif dataNeeded == 'both':
        tradeData, quoteData = load_data_final(dates, tickers, dataNeeded, path, verbose)

# Reading in sector information
stockInfo = pd.read_csv('../utils/stockInfo_v1.csv',header=[0,1])
stockInfo.columns = ['ticker','sector','exchange','marketCap']

# Creating a table with stock information based on the tickers available in the data.
uniqueTickers = data.Ticker.unique()
stockTable = stockInfo[stockInfo.ticker.isin(uniqueTickers)]
stockTable.head(10)

Which one do you want to load? 

0: aggregateTAQ_May2020_10sec (1).csv
1: aggregateTAQ_May2020_30sec (1).csv
2: aggregateTAQ_May2020_60sec.csv
8: trueAggregateTAQ_60sec.csv


2


Unnamed: 0,ticker,sector,exchange,marketCap
12,AAPL,Technology,NMS,1578173000000.0
20,ABBV,Healthcare,NYQ,174261200000.0
34,ABT,Healthcare,NYQ,163141000000.0
126,AEP,Utilities,NYQ,40895510000.0
379,AMT,Real Estate,NYQ,117125900000.0
428,APD,Basic Materials,NYQ,54643950000.0
697,BA,Industrials,NYQ,102035600000.0
699,BABA,Consumer Cyclical,NYQ,593653600000.0
700,BAC,Financial Services,NYQ,202055000000.0
870,BHP,Basic Materials,NYQ,125819400000.0


# reading in the market data (done automatically atm)

In [47]:
key = re.split('[_.]',datafiles[int(file)])[-2]
marketDataFile = [file for file in os.listdir(path+'/round5_market_tickers') if key in file]
tempData = pd.read_csv(path+'/round5_market_tickers/'+marketDataFile
                       ,header = None
                       ,names=['open','high','low','close',
                              'spread_open','spread_high','spread_low','spread_close',
                              'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                              'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                              'Ticker'])

data = pd.concat([data,tempData],axis=0)

['aggregateTAQ_60sec.csv']

'60sec'

In [17]:
temp = data.merge(stockTable[['ticker','sector']],left_on='Ticker',right_on='ticker',how='left').drop('ticker',axis=1)
data = temp.set_index(data.index)
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,Consumer Defensive
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,Consumer Defensive
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,Consumer Defensive
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,Consumer Defensive


In [120]:
os.listdir('../../')

['.ipynb_checkpoints',
 '.tmp.drivedownload',
 '20191111_213304.jpg',
 'CryptoExtraction',
 'CryptoExtraction.zip',
 'dsjn.PNG',
 'Getting started with Python.gdoc',
 'Introduction To Programming',
 'Sisse',
 'Thesis',
 'Thesis_UCPH']

In [121]:
# Exporting the final data
# data.to_csv('../../trueAggregateTAQ_60sec.csv')

In [17]:
tempData

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
20200501,0,239.575,239.945,239.060,239.495,0.09,0.79,0.01,0.09,5.0,18.0,1.0,3.0,5.0,23.0,1.0,10.0,DIA
20200501,1,239.495,239.650,239.420,239.610,0.09,0.12,0.02,0.06,8.0,18.0,1.0,1.0,10.0,19.0,1.0,5.0,DIA
20200501,2,239.615,240.045,239.520,239.990,0.07,0.25,0.01,0.06,1.0,19.0,1.0,5.0,10.0,16.0,1.0,5.0,DIA
20200501,3,239.985,240.165,239.910,240.070,0.07,0.10,0.01,0.06,5.0,15.0,1.0,1.0,5.0,16.0,1.0,1.0,DIA
20200501,4,240.075,240.345,240.020,240.085,0.07,0.11,0.01,0.07,1.0,16.0,1.0,3.0,10.0,14.0,1.0,5.0,DIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,91.500,91.755,91.485,91.740,0.42,0.93,0.39,0.50,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK
20200529,386,91.740,91.740,91.740,91.740,0.50,0.50,0.50,0.50,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,XNTK
20200529,387,91.580,91.830,91.580,91.715,0.18,0.68,0.18,0.45,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK
20200529,388,91.595,91.880,91.595,91.750,0.21,0.78,0.21,0.52,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK


In [50]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,91.500,91.755,91.485,91.740,0.42,0.93,0.39,0.50,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK
20200529,386,91.740,91.740,91.740,91.740,0.50,0.50,0.50,0.50,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,XNTK
20200529,387,91.580,91.830,91.580,91.715,0.18,0.68,0.18,0.45,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK
20200529,388,91.595,91.880,91.595,91.750,0.21,0.78,0.21,0.52,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK


In [23]:
test = pd.concat([data,tempData],axis=0)
test

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,91.500,91.755,91.485,91.740,0.42,0.93,0.39,0.50,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK
20200529,386,91.740,91.740,91.740,91.740,0.50,0.50,0.50,0.50,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,XNTK
20200529,387,91.580,91.830,91.580,91.715,0.18,0.68,0.18,0.45,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK
20200529,388,91.595,91.880,91.595,91.750,0.21,0.78,0.21,0.52,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK


In [32]:
test.loc[20200501].Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'IDU', 'INTC', 'IYC', 'IYE',
       'IYG', 'IYH', 'IYJ', 'IYK', 'IYM', 'IYR', 'IYW', 'IYZ', 'JNJ',
       'KO', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA',
       'NVS', 'PBR', 'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS',
       'V', 'WMT', 'DIA', 'QQQ', 'SPY', 'XNTK'], dtype=object)

In [19]:
def generateFeatures_multi_final(data,
                                 listOfFeatures=[],
                                 feature_lags=1,
                                 #stockTable = None,
                                 sectorETFS=None):

    # try:
    #if (stockTable is not None) & ('sector' not in data.columns.str.lower()):
# if (stockTable != None) & ('sector' not in data.columns.str.lower()):
    # Appending the stock information to the data.
        #data = data.merge(stockTable[['ticker','sector']],left_on='Ticker',right_on='ticker',how='left')
    # except:
        # None
    if 'sector' in [ele.lower() for ele in listOfFeatures]:
        
        table = pd.pivot_table(sectorETFS.reset_index()[['level_0','level_1','close','Ticker']],
                               index=['level_0','level_1'],columns='Ticker')
        table.columns = table.columns.get_level_values(1)
        
        tempSector = pd.DataFrame(np.concatenate([np.array([0 for i in np.arange(table.shape[1])])\
                                          .reshape((1,table.shape[1])),
                                          ((table.values[1:]/table.values[0:-1])-1)*100]),
                          index=table.index,
                          columns=table.columns).shift(1).fillna(0)
        

    multi_features = pd.DataFrame()
    # print(data.columns)
    for ticker_iter, ticker_name in enumerate(data.Ticker.unique()):

        featuresPD = pd.DataFrame()
        dataPD = data[data.Ticker==ticker_name].copy(deep=True)

        for feature in listOfFeatures:

            # Past observations
            if feature.lower() == 'pastobs':
                featuresPD['open'] = dataPD.open
                featuresPD['high'] = dataPD.high
                featuresPD['low'] = dataPD.low
                featuresPD['close'] = dataPD.close

            elif feature.lower() == 'spread':
                featuresPD['spread_open'] = dataPD.spread_open
                featuresPD['spread_high'] = dataPD.spread_high
                featuresPD['spread_low'] = dataPD.spread_low
                featuresPD['spread_close'] = dataPD.spread_close

            elif feature.lower() == 'bidsize':
                featuresPD['bidsize_open'] = dataPD.bidsize_open
                featuresPD['bidsize_high'] = dataPD.bidsize_high
                featuresPD['bidsize_low'] = dataPD.bidsize_low
                featuresPD['bidsize_close'] = dataPD.bidsize_close

            elif feature.lower() == 'ofrsize':
                featuresPD['ofrsize_open'] = dataPD.ofrsize_open
                featuresPD['ofrsize_high'] = dataPD.ofrsize_high
                featuresPD['ofrsize_low'] = dataPD.ofrsize_low
                featuresPD['ofrsize_close'] = dataPD.ofrsize_close

            # Stochastic K
            elif feature.lower() == 'stok':

                tempFeatures= ta.momentum.stoch(dataPD.high,
                                                dataPD.low,
                                                dataPD.close)

                # Adding the feature
                featuresPD['stok'] = tempFeatures

            # Stochastic D
            elif feature.lower() == 'stod':

                tempFeatures= ta.momentum.stoch_signal(dataPD.high,
                                                       dataPD.low,
                                                       dataPD.close)
                # Adding the feature
                featuresPD['stod'] = tempFeatures

            # Slow Stochastic D
            elif feature.lower() == 'sstod':

                tempFeatures= ta.trend.sma_indicator(ta.momentum.stoch_signal(dataPD.high,
                                                                              dataPD.low,
                                                                              dataPD.close))
                # Adding the feature
                featuresPD['sstod'] = tempFeatures

            # Williams %R
            elif feature.lower() == 'wilr':

                tempFeatures= ta.momentum.wr(dataPD.high,
                                             dataPD.low,
                                             dataPD.close)
                # Adding the feature
                featuresPD['wilr'] = tempFeatures

            # Rate Of Change
            elif feature.lower() == 'roc':

                tempFeatures= ta.momentum.roc(dataPD.close)

                # Adding the feature
                featuresPD['roc'] = tempFeatures

            # Relative Strength Index
            elif feature.lower() == 'rsi':

                tempFeatures= ta.momentum.rsi(dataPD.close)

                # Adding the feature
                featuresPD['rsi'] = tempFeatures

            # Average True Range
            elif feature.lower() == 'atr':

                tempFeatures= ta.volatility.average_true_range(dataPD.high,
                                                               dataPD.low,
                                                               dataPD.close)
                # Adding the feature
                featuresPD['atr'] = tempFeatures

            # Commodity Channel Index
            elif feature.lower() == 'cci':

                tempFeatures= ta.trend.cci(dataPD.high,
                                           dataPD.low,
                                           dataPD.close)
                # Adding the feature
                featuresPD['cci'] = tempFeatures

             # Detrended Price Ocillator
            elif feature.lower() == 'dpo':

                tempFeatures= ta.trend.dpo(dataPD.close)

                # Adding the feature
                featuresPD['dpo'] = tempFeatures

            # Simple Moving Average
            elif feature.lower() == 'sma':

                tempFeatures= ta.trend.sma_indicator(dataPD.close)

                # Adding the feature
                featuresPD['sma'] = tempFeatures

            # Exponential Moving Average
            elif feature.lower() == 'ema':

                tempFeatures= ta.trend.ema_indicator(dataPD.close)

                # Adding the feature
                featuresPD['ema'] = tempFeatures

            # Moving Average Convergence Divergence
            elif feature.lower() == 'macd':

                # note: having all 3 causes multicollinearity. Maybe not a problem in ML, let's see :-)
                # macd is the difference between two EMAs
                # macd_signal is an EMA of the above macd line
                # macd_diff is the so-called histogram (just bars really) of the time-wise difference between macd and macd_signal

                # Adding the features
                featuresPD['macd'] = ta.trend.macd(dataPD.close)

            # Moving Average Convergence Divergence Difference
            elif feature.lower() == 'macd_diff':
                # Adding the features
                featuresPD['macd_diff'] = ta.trend.macd_diff(dataPD.close)

            # Moving Average Convergence Divergence Signal
            elif feature.lower() == 'macd_signal':
                # Adding the features
                featuresPD['macd_signal'] = ta.trend.macd_signal(dataPD.close)

             # Disparity 5
            elif feature.lower() == 'dis5':

                tempFeatures= (dataPD.close/ta.trend.sma_indicator(dataPD.close,5))*100

                # Adding the feature
                featuresPD['dis5'] = tempFeatures

            # Disparity 10
            elif feature.lower() == 'dis10':

                tempFeatures= (dataPD.close/ta.trend.sma_indicator(dataPD.close,10))*100

                # Adding the feature
                featuresPD['dis10'] = tempFeatures

            # Bollinger Bands
            elif feature.lower() == 'bb':

                # Define Bollinger Bands function to extract from
                bb_function = ta.volatility.BollingerBands(close=dataPD.close, n=20, ndev=2)

                # Adding the features
                featuresPD['bb_mavg'] = bb_function.bollinger_mavg()
                featuresPD['bb_hband'] = bb_function.bollinger_hband()
                featuresPD['bb_lband'] = bb_function.bollinger_lband()
                featuresPD['bb_pband'] = bb_function.bollinger_pband()
                featuresPD['bb_wband'] = bb_function.bollinger_wband()

            # Sector return
            elif feature.lower() == 'sector':
                
                returnPD = pd.DataFrame({'return':np.concatenate([[0],(((dataPD.close.values[1:]/\
                                                             dataPD.close.values[0:-1]))-1)*100])},
                                            index=dataPD.index)\
                                            .shift(1).fillna(0)
                
                relativeReturns = pd.DataFrame(returnPD.values - tempSector.values,
                                               columns=tempSector.columns,
                                               index=tempSector.index)
                
                featuresPD[['relReturns_'+i for i in relativeReturns.columns]] = relativeReturns
#                 featuresPD[relativeReturns.columns.str()] = relativeReturns
#                 colnames = 
#                 featuresPD['relative_return'] = pd.concat([APPLE,relativeReturns],axis=1)
                
                

        # if we want any lags:
        if feature_lags > 0:

            # collect names of all raw features (before any lagging)
            all_raw_features = featuresPD.columns

            # loop through each lag and shift all features at once
            for roll_i in np.arange(feature_lags + 1): # + 1 as we treat feature_lags = 1 as having both lag0 and lag1

                # define new column name (feature_name_ + lagX) where X = roll_i is the shifting parameter
                new_col_names = [feature_name + '_lag' + str(roll_i) for feature_name in all_raw_features]

                # Shift/roll all raw features with the shifting parameter roll_i and save as new columns.
                # The shift parameter must be negative (we want lag0 to be the 'newest'/'latest')
                featuresPD[new_col_names] = featuresPD[all_raw_features].shift( - (feature_lags - roll_i))

            # remove all raw features
            featuresPD = featuresPD.loc[:, ~featuresPD.columns.isin(all_raw_features)]

            # Adjust price feature
        if 'pastobs' in listOfFeatures:
            if feature_lags > 0:
                priceCols = np.concatenate([[c for c in featuresPD.columns if c.startswith(t,0,len(t))] for t in ['open','high','low','close']])#[0:len(t)] == t
                # print(priceCols)
                tempClose = featuresPD.close_lag0.copy(deep=True)
    #             print('\n')

    #             featuresPD.loc[:,priceCols] = featuresPD.loc[:,priceCols] - featuresPD.close_lag0
                featuresPD.loc[:,priceCols] = featuresPD.loc[:,priceCols].subtract(featuresPD.close_lag0,axis=0)
    #             print('\n')
    #             print([featuresPD.loc[:,priceCols] - featuresPD.close_lag0][0:5])
    #             print(tempClose)
                featuresPD.loc[:,'close_lag0'] = tempClose
            else:
    #             tempClose = copy.deepcopy(featuresPD.close.values)
                tempClose = featuresPD.close.copy(deep=True)

    #             print(tempClose)
    #             featuresPD.loc[:,['open','high','low','close']] = featuresPD.loc[:,['open','high','low','close']] - featuresPD.close
                featuresPD.loc[:,['open','high','low','close']] = featuresPD.loc[:,['open','high','low','close']].subtract(featuresPD.close,axis=0)
    #             print('\n')
    #             print(featuresPD.loc[:,['open','high','low','close']])
    #             print(featuresPD.close)
    #             print([featuresPD.loc[:,['open','high','low','close']] - featuresPD.close][0:5])
                featuresPD.loc[:,'close'] = tempClose

        featuresPD['ticker'] = ticker_name

        # append
        multi_features = pd.concat([multi_features, featuresPD])
        # print(ticker_name + " done")

    # Finally adding sector dummies if needed
    # Sector Dummies
#     if 'sector' in [ele.lower() for ele in listOfFeatures]:

#         ## Adding Sector dummies
#         sectors = data.pop('sector')
#         multi_features = pd.concat([multi_features, pd.get_dummies(sectors
#                                                                 , prefix='sector'
#                                                                 , drop_first=False)]
#                                 , axis=1)

    return multi_features

### Dropping ETFS and market indices

In [4]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'IDU', 'INTC', 'IYC', 'IYE',
       'IYG', 'IYH', 'IYJ', 'IYK', 'IYM', 'IYR', 'IYW', 'IYZ', 'JNJ',
       'KO', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA',
       'NVS', 'PBR', 'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS',
       'V', 'WMT', 'DIA', 'QQQ', 'SPY', 'XNTK'], dtype=object)

In [5]:
# Removing the XNTK ticker
data = data[~data.Ticker.isin(['XNTK'])]

In [6]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'IDU', 'INTC', 'IYC', 'IYE',
       'IYG', 'IYH', 'IYJ', 'IYK', 'IYM', 'IYR', 'IYW', 'IYZ', 'JNJ',
       'KO', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA',
       'NVS', 'PBR', 'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS',
       'V', 'WMT', 'DIA', 'QQQ', 'SPY'], dtype=object)

In [7]:
etfs = ['IYH','IYM','IYK','IYJ','IYG','IYW','IYC','IYR','IDU','IYZ','IYE','IYF','SPY','DIA','QQQ']

# Extracting the sector ETFs to a separate variable
sectorETFS = data[data.Ticker.isin(etfs)]

# Removing the ETFs
data = data[~data.Ticker.isin(etfs)]

In [8]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT


In [124]:
sectorETFS.reset_index()

Unnamed: 0,level_0,level_1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
0,20200501,0,141.805,142.910,138.520,140.490,1.93,12.37,0.09,0.76,1.0,6.0,1.0,5.0,5.0,16.0,1.0,6.0,IDU,Utilities
1,20200501,1,140.490,140.600,140.280,140.335,0.76,0.81,0.56,0.59,5.0,6.0,1.0,5.0,5.0,7.0,1.0,6.0,IDU,Utilities
2,20200501,2,140.335,140.340,140.205,140.210,0.59,0.63,0.48,0.54,5.0,7.0,1.0,6.0,5.0,10.0,1.0,5.0,IDU,Utilities
3,20200501,3,140.210,140.275,139.440,139.465,0.54,0.57,0.26,0.43,6.0,7.0,1.0,6.0,6.0,10.0,1.0,6.0,IDU,Utilities
4,20200501,4,139.465,139.860,139.315,139.415,0.43,0.76,0.07,0.43,5.0,10.0,1.0,5.0,6.0,16.0,1.0,6.0,IDU,Utilities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109195,20200529,385,304.610,304.855,304.505,304.800,0.02,0.04,0.01,0.02,2.0,109.0,1.0,9.0,9.0,74.0,1.0,2.0,SPY,
109196,20200529,386,304.800,304.850,304.690,304.725,0.02,0.06,0.01,0.01,9.0,104.0,1.0,1.0,3.0,114.0,1.0,5.0,SPY,
109197,20200529,387,304.725,304.890,304.620,304.800,0.01,0.04,0.01,0.02,1.0,102.0,1.0,5.0,6.0,53.0,1.0,9.0,SPY,
109198,20200529,388,304.800,304.910,304.715,304.745,0.02,0.14,0.01,0.01,6.0,66.0,1.0,9.0,9.0,138.0,1.0,29.0,SPY,


In [57]:
tempSectorETFS = sectorETFS.reset_index().copy(deep=True)
tempSectorETFS.close.values[1:]-tempSectorETFS.close.values[0:-1]

array([-0.155, -0.125, -0.745, ...,  0.075, -0.055, -0.58 ])

In [125]:
sectorETFS.reset_index()[['level_0','level_1','close','Ticker']]

Unnamed: 0,level_0,level_1,close,Ticker
0,20200501,0,140.490,IDU
1,20200501,1,140.335,IDU
2,20200501,2,140.210,IDU
3,20200501,3,139.465,IDU
4,20200501,4,139.415,IDU
...,...,...,...,...
109195,20200529,385,304.800,SPY
109196,20200529,386,304.725,SPY
109197,20200529,387,304.800,SPY
109198,20200529,388,304.745,SPY


In [58]:
tempSectorETFS[['level_0','level_1','close','Ticker']].groupby(['level_0','level_1','Ticker']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,close
level_0,level_1,Ticker,Unnamed: 3_level_1
20200501,0,DIA,239.495
20200501,0,IDU,140.490
20200501,0,IYC,200.125
20200501,0,IYE,19.565
20200501,0,IYG,112.445
...,...,...,...
20200529,389,IYR,77.475
20200529,389,IYW,252.030
20200529,389,IYZ,28.145
20200529,389,QQQ,232.895


In [59]:

table = pd.pivot_table(tempSectorETFS[['level_0','level_1','close','Ticker']],
                       index=['level_0','level_1'],columns='Ticker')
table.columns = table.columns.get_level_values(1)


In [60]:
table

Unnamed: 0_level_0,Ticker,DIA,IDU,IYC,IYE,IYG,IYH,IYJ,IYK,IYM,IYR,IYW,IYZ,QQQ,SPY
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20200501,0,239.495,140.490,200.125,19.565,112.445,210.020,135.510,117.875,79.590,73.940,229.520,27.045,214.845,285.395
20200501,1,239.610,140.335,200.515,19.710,112.330,210.230,135.415,117.915,79.680,73.905,230.020,27.045,215.205,285.555
20200501,2,239.990,140.210,201.095,19.690,112.475,210.095,135.565,118.280,79.875,73.970,230.090,27.055,215.335,285.860
20200501,3,240.070,139.465,201.165,19.780,112.540,209.825,135.630,118.200,80.010,73.935,230.190,27.050,215.350,285.760
20200501,4,240.085,139.415,201.075,19.685,112.600,210.040,135.645,118.305,80.045,73.940,230.950,27.045,215.755,285.940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,254.690,148.625,218.055,20.405,120.455,221.120,147.990,125.065,87.765,77.605,252.605,28.135,233.470,304.800
20200529,386,254.670,148.715,218.025,20.415,120.390,221.105,147.895,125.045,87.715,77.575,252.405,28.155,233.340,304.725
20200529,387,254.775,148.580,218.050,20.425,120.405,221.125,148.010,125.005,87.755,77.575,252.595,28.145,233.395,304.800
20200529,388,254.720,148.550,218.005,20.415,120.465,221.100,147.945,125.015,87.680,77.505,252.485,28.175,233.375,304.745


In [76]:
np.concatenate([np.array([0 for i in np.arange(table.shape[1])]).reshape((1,table.shape[1])),
                (table.values[1:]/table.values[0:-1])-1])

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 4.80177039e-04, -1.10328137e-03,  1.94878201e-03, ...,
         0.00000000e+00,  1.67562661e-03,  5.60626500e-04],
       [ 1.58591044e-03, -8.90725763e-04,  2.89255168e-03, ...,
         3.69754114e-04,  6.04075184e-04,  1.06809546e-03],
       ...,
       [ 4.12298268e-04, -9.07776620e-04,  1.14665749e-04, ...,
        -3.55176700e-04,  2.35707551e-04,  2.46123554e-04],
       [-2.15876754e-04, -2.01911428e-04, -2.06374685e-04, ...,
         1.06590869e-03, -8.56916386e-05, -1.80446194e-04],
       [-2.15923367e-03, -2.62537866e-03, -2.75223045e-03, ...,
        -1.06477374e-03, -2.05677558e-03, -1.90323057e-03]])

In [83]:
tempSector = pd.DataFrame(np.concatenate([np.array([0 for i in np.arange(table.shape[1])])\
                                          .reshape((1,table.shape[1])),
                                          ((table.values[1:]/table.values[0:-1])-1)*100]),
                          index=table.index,
                          columns=table.columns).shift(1).fillna(0)

In [84]:
tempSector

Unnamed: 0_level_0,Ticker,DIA,IDU,IYC,IYE,IYG,IYH,IYJ,IYK,IYM,IYR,IYW,IYZ,QQQ,SPY
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20200501,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,2,0.048018,-0.110328,0.194878,0.741119,-0.102272,0.099990,-0.070106,0.033934,0.113080,-0.047336,0.217846,0.000000,0.167563,0.056063
20200501,3,0.158591,-0.089073,0.289255,-0.101471,0.129084,-0.064215,0.110771,0.309545,0.244729,0.087951,0.030432,0.036975,0.060408,0.106810
20200501,4,0.033335,-0.531346,0.034809,0.457085,0.057791,-0.128513,0.047947,-0.067636,0.169014,-0.047316,0.043461,-0.018481,0.006966,-0.034982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,-0.070662,-0.047146,0.034437,-0.049032,-0.087018,-0.011319,-0.013513,0.056119,-0.039927,0.025825,-0.019795,0.089270,0.060025,-0.009848
20200529,386,0.053034,0.148243,0.087210,0.098111,-0.087094,0.122255,0.000000,0.208325,0.159772,0.180727,0.023758,0.374599,0.040707,0.062375
20200529,387,-0.007853,0.060555,-0.013758,0.049008,-0.053962,-0.006784,-0.064194,-0.015992,-0.056970,-0.038657,-0.079175,0.071086,-0.055682,-0.024606
20200529,388,0.041230,-0.090778,0.011467,0.048984,0.012460,0.009045,0.077758,-0.031988,0.045602,0.000000,0.075276,-0.035518,0.023571,0.024612


In [85]:
APPLE = data[data.Ticker=='AAPL']
APPLE

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,319.255,319.415,318.830,319.095,0.11,0.17,0.01,0.05,2.0,485.0,1.0,1.0,1.0,22.0,1.0,7.0,AAPL
20200529,386,319.095,319.295,318.810,318.845,0.05,0.22,0.01,0.03,1.0,20.0,1.0,2.0,6.0,11.0,1.0,1.0,AAPL
20200529,387,318.845,319.605,318.695,319.460,0.03,0.11,0.01,0.06,1.0,10.0,1.0,2.0,1.0,9.0,1.0,6.0,AAPL
20200529,388,319.465,319.555,318.660,318.675,0.05,0.12,0.01,0.03,1.0,38.0,1.0,1.0,6.0,19.0,1.0,1.0,AAPL


In [86]:
tempSector

Unnamed: 0_level_0,Ticker,DIA,IDU,IYC,IYE,IYG,IYH,IYJ,IYK,IYM,IYR,IYW,IYZ,QQQ,SPY
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20200501,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,2,0.048018,-0.110328,0.194878,0.741119,-0.102272,0.099990,-0.070106,0.033934,0.113080,-0.047336,0.217846,0.000000,0.167563,0.056063
20200501,3,0.158591,-0.089073,0.289255,-0.101471,0.129084,-0.064215,0.110771,0.309545,0.244729,0.087951,0.030432,0.036975,0.060408,0.106810
20200501,4,0.033335,-0.531346,0.034809,0.457085,0.057791,-0.128513,0.047947,-0.067636,0.169014,-0.047316,0.043461,-0.018481,0.006966,-0.034982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,-0.070662,-0.047146,0.034437,-0.049032,-0.087018,-0.011319,-0.013513,0.056119,-0.039927,0.025825,-0.019795,0.089270,0.060025,-0.009848
20200529,386,0.053034,0.148243,0.087210,0.098111,-0.087094,0.122255,0.000000,0.208325,0.159772,0.180727,0.023758,0.374599,0.040707,0.062375
20200529,387,-0.007853,0.060555,-0.013758,0.049008,-0.053962,-0.006784,-0.064194,-0.015992,-0.056970,-0.038657,-0.079175,0.071086,-0.055682,-0.024606
20200529,388,0.041230,-0.090778,0.011467,0.048984,0.012460,0.009045,0.077758,-0.031988,0.045602,0.000000,0.075276,-0.035518,0.023571,0.024612


In [91]:
APPLE_return = pd.DataFrame({'return':np.concatenate([[0],(((APPLE.close.values[1:]/\
                                                             APPLE.close.values[0:-1]))-1)*100])},index=APPLE.index)\
                            .shift(1).fillna(0)
# APPLE_return.subtract(tempSector,axis=1)

APPLE_return

Unnamed: 0,Unnamed: 1,return
20200501,0,0.000000
20200501,1,0.000000
20200501,2,-0.082970
20200501,3,-0.152239
20200501,4,0.178460
...,...,...
20200529,385,-0.207868
20200529,386,-0.048551
20200529,387,-0.078347
20200529,388,0.192884


In [98]:
relativeReturns = pd.DataFrame(APPLE_return.values - tempSector.values,
                               columns=tempSector.columns,
                               index=tempSector.index)

In [99]:
relativeReturns

Unnamed: 0_level_0,Ticker,DIA,IDU,IYC,IYE,IYG,IYH,IYJ,IYK,IYM,IYR,IYW,IYZ,QQQ,SPY
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20200501,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,2,-0.130988,0.027358,-0.277849,-0.824090,0.019302,-0.182961,-0.012865,-0.116905,-0.196050,-0.035635,-0.300816,-0.082970,-0.250533,-0.139033
20200501,3,-0.310830,-0.063166,-0.441494,-0.050767,-0.281323,-0.088023,-0.263009,-0.461784,-0.396968,-0.240189,-0.182671,-0.189214,-0.212646,-0.259048
20200501,4,0.145125,0.709806,0.143651,-0.278625,0.120669,0.306973,0.130513,0.246096,0.009446,0.225777,0.134999,0.196941,0.171494,0.213442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,-0.137206,-0.160722,-0.242305,-0.158836,-0.120850,-0.196549,-0.194355,-0.263987,-0.167941,-0.233692,-0.188073,-0.297137,-0.267893,-0.198020
20200529,386,-0.101585,-0.196794,-0.135761,-0.146663,0.038542,-0.170806,-0.048551,-0.256876,-0.208323,-0.229278,-0.072309,-0.423150,-0.089258,-0.110926
20200529,387,-0.070494,-0.138902,-0.064589,-0.127354,-0.024385,-0.071563,-0.014153,-0.062355,-0.021376,-0.039689,0.000828,-0.149432,-0.022665,-0.053740
20200529,388,0.151654,0.283661,0.181417,0.143900,0.180424,0.183838,0.115126,0.224872,0.147281,0.192884,0.117608,0.228401,0.169313,0.168271


['relReturns_DIA',
 'relReturns_IDU',
 'relReturns_IYC',
 'relReturns_IYE',
 'relReturns_IYG',
 'relReturns_IYH',
 'relReturns_IYJ',
 'relReturns_IYK',
 'relReturns_IYM',
 'relReturns_IYR',
 'relReturns_IYW',
 'relReturns_IYZ',
 'relReturns_QQQ',
 'relReturns_SPY']

In [131]:
temp = pd.DataFrame()


In [132]:
temp[['relReturns_'+i for i in relativeReturns.columns]] = relativeReturns

In [133]:
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,relReturns_DIA,relReturns_IDU,relReturns_IYC,relReturns_IYE,relReturns_IYG,relReturns_IYH,relReturns_IYJ,relReturns_IYK,relReturns_IYM,relReturns_IYR,relReturns_IYW,relReturns_IYZ,relReturns_QQQ,relReturns_SPY
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20200501,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,2,-0.130988,0.027358,-0.277849,-0.824090,0.019302,-0.182961,-0.012865,-0.116905,-0.196050,-0.035635,-0.300816,-0.082970,-0.250533,-0.139033
20200501,3,-0.310830,-0.063166,-0.441494,-0.050767,-0.281323,-0.088023,-0.263009,-0.461784,-0.396968,-0.240189,-0.182671,-0.189214,-0.212646,-0.259048
20200501,4,0.145125,0.709806,0.143651,-0.278625,0.120669,0.306973,0.130513,0.246096,0.009446,0.225777,0.134999,0.196941,0.171494,0.213442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,-0.137206,-0.160722,-0.242305,-0.158836,-0.120850,-0.196549,-0.194355,-0.263987,-0.167941,-0.233692,-0.188073,-0.297137,-0.267893,-0.198020
20200529,386,-0.101585,-0.196794,-0.135761,-0.146663,0.038542,-0.170806,-0.048551,-0.256876,-0.208323,-0.229278,-0.072309,-0.423150,-0.089258,-0.110926
20200529,387,-0.070494,-0.138902,-0.064589,-0.127354,-0.024385,-0.071563,-0.014153,-0.062355,-0.021376,-0.039689,0.000828,-0.149432,-0.022665,-0.053740
20200529,388,0.151654,0.283661,0.181417,0.143900,0.180424,0.183838,0.115126,0.224872,0.147281,0.192884,0.117608,0.228401,0.169313,0.168271


In [102]:
pd.concat([APPLE,relativeReturns],axis=1)

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,...,IYG,IYH,IYJ,IYK,IYM,IYR,IYW,IYZ,QQQ,SPY
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,...,0.019302,-0.182961,-0.012865,-0.116905,-0.196050,-0.035635,-0.300816,-0.082970,-0.250533,-0.139033
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,...,-0.281323,-0.088023,-0.263009,-0.461784,-0.396968,-0.240189,-0.182671,-0.189214,-0.212646,-0.259048
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,...,0.120669,0.306973,0.130513,0.246096,0.009446,0.225777,0.134999,0.196941,0.171494,0.213442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,319.255,319.415,318.830,319.095,0.11,0.17,0.01,0.05,2.0,485.0,...,-0.120850,-0.196549,-0.194355,-0.263987,-0.167941,-0.233692,-0.188073,-0.297137,-0.267893,-0.198020
20200529,386,319.095,319.295,318.810,318.845,0.05,0.22,0.01,0.03,1.0,20.0,...,0.038542,-0.170806,-0.048551,-0.256876,-0.208323,-0.229278,-0.072309,-0.423150,-0.089258,-0.110926
20200529,387,318.845,319.605,318.695,319.460,0.03,0.11,0.01,0.06,1.0,10.0,...,-0.024385,-0.071563,-0.014153,-0.062355,-0.021376,-0.039689,0.000828,-0.149432,-0.022665,-0.053740
20200529,388,319.465,319.555,318.660,318.675,0.05,0.12,0.01,0.03,1.0,38.0,...,0.180424,0.183838,0.115126,0.224872,0.147281,0.192884,0.117608,0.228401,0.169313,0.168271


In [93]:
APPLE_return

Unnamed: 0,Unnamed: 1,return
20200501,0,0.000000
20200501,1,0.000000
20200501,2,-0.082970
20200501,3,-0.152239
20200501,4,0.178460
...,...,...
20200529,385,-0.207868
20200529,386,-0.048551
20200529,387,-0.078347
20200529,388,0.192884


In [95]:
APPLE_return.values - tempSector.values

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.13098804,  0.0273578 , -0.27784854, ..., -0.08297034,
        -0.250533  , -0.13903299],
       ...,
       [-0.07049389, -0.13890166, -0.06458858, ..., -0.14943241,
        -0.0226649 , -0.05374027],
       [ 0.15165386,  0.28366135,  0.18141711, ...,  0.22840136,
         0.16931293,  0.16827133],
       [-0.22413949, -0.22553602, -0.2250897 , ..., -0.35231803,
        -0.237158  , -0.22768255]])

In [72]:
tempSectorETFS.loc[0,'close']

140.49

In [73]:
tempSectorETFS.loc[1:,'close']

1         140.335
2         140.210
3         139.465
4         139.415
5         139.130
           ...   
109195    304.800
109196    304.725
109197    304.800
109198    304.745
109199    304.165
Name: close, Length: 109199, dtype: float64

In [74]:
tempSectorETFS.loc[1:,'close'].values-tempSectorETFS.loc[0:-1,'close'].values

ValueError: operands could not be broadcast together with shapes (109199,) (0,) 

In [18]:
tempSectorETFS.close.loc[1:]-tempSectorETFS.close.loc[0:-1]

1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
         ..
85795   NaN
85796   NaN
85797   NaN
85798   NaN
85799   NaN
Name: close, Length: 85799, dtype: float64

In [8]:
stockTable[stockTable.ticker.isin(etfs)]

Unnamed: 0,ticker,sector,exchange,marketCap
4015,IDU,Utilities,,
4336,IYC,Consumer Cyclical,,
4337,IYE,Energy,,
4339,IYG,Financial Services,,
4340,IYH,Healthcare,,
4341,IYJ,Industrials,,
4342,IYK,Consumer Defensive,,
4344,IYM,Basic Materials,,
4345,IYR,Real Estate,,
4347,IYW,Technology,,


In [9]:
data.merge(stockTable[['ticker','sector']],left_on='Ticker',right_on='ticker',how='left')

Unnamed: 0,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,ticker,sector
0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,AAPL,Technology
1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,AAPL,Technology
2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,AAPL,Technology
3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,AAPL,Technology
4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428995,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,WMT,Consumer Defensive
428996,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,WMT,Consumer Defensive
428997,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,WMT,Consumer Defensive
428998,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,WMT,Consumer Defensive


## Generating Features

In [16]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT


In [20]:
########### Generate Features ################

n_feature_lags = 1

# features = generateFeatures_multi_final(data = data, 
#                                   listOfFeatures = [
#                                                     'pastobs',
#                                                     'spread',
#                                                     'bidsize',
#                                                     'ofrsize',
# #                                                     'stok',
# #                                                     'stod',
# #                                                     'sstod',
# #                                                     'wilr',
# #                                                     'roc',
# #                                                     'rsi',
# #                                                     'atr',
# #                                                     'cci',
# #                                                     'dpo',
# #                                                     'sma',
# #                                                     'ema',
# #                                                     'macd',
# #                                                       'macd_diff',
# #                                                       'macd_signal',
# #                                                     'dis5',
# #                                                     'dis10',
#                                                       'sector'
#                                                    ], 
#                                    feature_lags = n_feature_lags
#                                      ,stockTable=stockTable)
features = generateFeatures_multi_final(data = data, 
                                  listOfFeatures = [
                                                    'pastobs',
                                                    'spread',
                                                    'bidsize',
                                                    'ofrsize',
#                                                     'stok',
#                                                     'stod',
#                                                     'sstod',
#                                                     'wilr',
#                                                     'roc',
#                                                     'rsi',
#                                                     'atr',
#                                                     'cci',
#                                                     'dpo',
#                                                     'sma',
#                                                     'ema',
#                                                     'macd',
#                                                       'macd_diff',
#                                                       'macd_signal',
#                                                     'dis5',
#                                                     'dis10',
                                                      'sector'
                                                   ], 
                                   feature_lags = n_feature_lags
                                     ,sectorETFS=sectorETFS)

########### Generate Labels ################

n_classes = 2
# extract first 4 columns as the lag0 or raw OHLC prices (used for labelling)
price_candles = data[['open','high','low','close','Ticker']]

########### Align Data ################

# from imported function (see testing_preprocessing_features_and_labels.ipynb for thorough experimenting with all the cut-offs):    
X, y,indices = align_features_and_labels_multi_final(price_candles = price_candles, 
                                                 all_features = features,
                                                 prediction_horizon = 1, 
                                                 n_feature_lags = n_feature_lags, 
                                                 n_classes = n_classes, # 5,
                                                 safe_burn_in = False, 
                                                 data_sample = 'full',
                                                 splitType='global',
                                                 noise=False,ticker_dummies=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


AAPL done
ABBV done
ABT done
AEP done
AMT done
APD done
BA done
BABA done
BAC done
BHP done
BP done
CCI done
CHL done
COST done
CSGP done
D done
DIS done
ECL done
ENB done
EXC done
FB done
FMX done
GOOG done
INTC done
JNJ done
KO done
LFC done
LIN done
LMT done
MA done
MCD done
MSFT done
NKE done
NVDA done
NVS done
Number of NaNs in label: 1. 1 is expected
Returns that lead to NaNs in label: [0.0907158]
PBR done
PEP done
PFE done
PLD done
PSA done
PTR done
PYPL done
RTX done
SHW done
SNP done
SO done
SRE done
T done
TM done
TSLA done
TSM done
UNP done
UPS done
V done
WMT done


In [21]:
features

Unnamed: 0,Unnamed: 1,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,relReturns_IYH_lag1,relReturns_IYJ_lag1,relReturns_IYK_lag1,relReturns_IYM_lag1,relReturns_IYR_lag1,relReturns_IYW_lag1,relReturns_IYZ_lag1,relReturns_QQQ_lag1,relReturns_SPY_lag1,ticker
20200501,0,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,AAPL
20200501,1,0.455,1.125,-0.300,288.580,0.07,0.49,0.01,0.30,1.0,50.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,AAPL
20200501,2,-0.610,0.220,-0.815,289.095,0.49,0.49,0.01,0.17,1.0,25.0,...,-0.182961,-0.012865,-0.116905,-0.196050,-0.035635,-0.300816,-0.082970,-0.250533,-0.139033,AAPL
20200501,3,-1.220,0.115,-1.380,290.320,0.16,0.33,0.01,0.10,13.0,71.0,...,-0.088023,-0.263009,-0.461784,-0.396968,-0.240189,-0.182671,-0.189214,-0.212646,-0.259048,AAPL
20200501,4,0.235,0.355,-0.565,290.085,0.10,0.42,0.01,0.05,2.0,86.0,...,0.306973,0.130513,0.246096,0.009446,0.225777,0.134999,0.196941,0.171494,0.213442,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,0.090,0.090,-0.075,123.995,0.01,0.06,0.01,0.01,1.0,8.0,...,0.096102,0.098296,0.028664,0.124710,0.058959,0.104578,-0.004486,0.024758,0.094631,WMT
20200529,386,-0.340,0.020,-0.350,124.335,0.01,0.07,0.01,0.05,4.0,16.0,...,-0.001238,0.121017,-0.087308,-0.038755,-0.059710,0.097258,-0.253582,0.080310,0.058642,WMT
20200529,387,0.260,0.280,-0.015,124.075,0.05,0.12,0.01,0.01,3.0,6.0,...,-0.077826,-0.020416,-0.068618,-0.027639,-0.045952,-0.005434,-0.155695,-0.028928,-0.060003,WMT
20200529,388,0.220,0.370,-1.045,123.855,0.01,2.43,0.01,0.21,1.0,20.0,...,0.265159,0.196447,0.306193,0.228602,0.274205,0.198929,0.309722,0.250634,0.249592,WMT


In [11]:
features.columns

Index(['open_lag0', 'high_lag0', 'low_lag0', 'close_lag0', 'spread_open_lag0',
       'spread_high_lag0', 'spread_low_lag0', 'spread_close_lag0',
       'bidsize_open_lag0', 'bidsize_high_lag0', 'bidsize_low_lag0',
       'bidsize_close_lag0', 'ofrsize_open_lag0', 'ofrsize_high_lag0',
       'ofrsize_low_lag0', 'ofrsize_close_lag0', 'open_lag1', 'high_lag1',
       'low_lag1', 'close_lag1', 'spread_open_lag1', 'spread_high_lag1',
       'spread_low_lag1', 'spread_close_lag1', 'bidsize_open_lag1',
       'bidsize_high_lag1', 'bidsize_low_lag1', 'bidsize_close_lag1',
       'ofrsize_open_lag1', 'ofrsize_high_lag1', 'ofrsize_low_lag1',
       'ofrsize_close_lag1', 'ticker', 'sector_Basic Materials',
       'sector_Communication Services', 'sector_Consumer Cyclical',
       'sector_Consumer Defensive', 'sector_Energy',
       'sector_Financial Services', 'sector_Healthcare', 'sector_Industrials',
       'sector_Real Estate', 'sector_Technology', 'sector_Utilities'],
      dtype='object')

In [5]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT


In [8]:
data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1'])

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,0,81.000,82.110,80.580,81.010,2.00,2.06,0.01,0.24,12.0,12.0,1.0,1.0,1.0,9.0,1.0,1.0,ABBV
20200501,0,91.025,91.435,90.490,91.220,1.89,1.89,0.03,0.24,1.0,5.0,1.0,1.0,1.0,5.0,1.0,1.0,ABT
20200501,0,83.455,83.980,82.250,82.470,3.09,4.14,0.10,0.14,1.0,7.0,1.0,1.0,7.0,8.0,1.0,1.0,AEP
20200501,0,234.350,234.575,232.215,232.950,3.30,3.87,0.27,2.90,1.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,389,50.455,50.495,50.325,50.325,0.01,0.05,0.01,0.05,5.0,38.0,1.0,2.0,7.0,24.0,1.0,9.0,TSM
20200529,389,169.960,170.030,169.610,169.675,0.04,0.36,0.01,0.23,10.0,11.0,1.0,1.0,1.0,10.0,1.0,2.0,UNP
20200529,389,100.095,100.110,99.670,99.670,0.05,0.13,0.01,0.12,1.0,11.0,1.0,7.0,1.0,11.0,1.0,1.0,UPS
20200529,389,195.855,195.920,194.195,194.195,0.03,2.39,0.01,2.39,2.0,7.0,1.0,1.0,1.0,9.0,1.0,2.0,V


In [11]:
int(data.shape[0]*0.8)

343200.0

In [13]:
data.reset_index().sort_values(['level_0',
                                'level_1',
                                'Ticker']).set_index(['level_0',
                                                      'level_1']).iloc[int(data.shape[0]*0.8):,:]

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20200526,0,323.560,324.195,323.380,323.875,0.46,0.46,0.01,0.11,8.0,18.0,1.0,1.0,1.0,241.0,1.0,1.0,AAPL
20200526,0,93.625,93.625,93.060,93.295,0.35,0.38,0.01,0.03,1.0,8.0,1.0,1.0,1.0,10.0,1.0,1.0,ABBV
20200526,0,92.915,93.255,92.815,93.025,0.77,0.77,0.01,0.19,3.0,8.0,1.0,1.0,3.0,12.0,1.0,2.0,ABT
20200526,0,79.450,80.200,78.745,79.645,0.66,2.07,0.21,0.21,1.0,5.0,1.0,1.0,2.0,5.0,1.0,2.0,AEP
20200526,0,248.750,248.750,246.555,248.300,2.10,3.45,0.04,0.90,2.0,5.0,1.0,1.0,1.0,2.0,1.0,2.0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,389,50.455,50.495,50.325,50.325,0.01,0.05,0.01,0.05,5.0,38.0,1.0,2.0,7.0,24.0,1.0,9.0,TSM
20200529,389,169.960,170.030,169.610,169.675,0.04,0.36,0.01,0.23,10.0,11.0,1.0,1.0,1.0,10.0,1.0,2.0,UNP
20200529,389,100.095,100.110,99.670,99.670,0.05,0.13,0.01,0.12,1.0,11.0,1.0,7.0,1.0,11.0,1.0,1.0,UPS
20200529,389,195.855,195.920,194.195,194.195,0.03,2.39,0.01,2.39,2.0,7.0,1.0,1.0,1.0,9.0,1.0,2.0,V


In [35]:
data.index.get_level_values(0).unique()

Int64Index([20200501, 20200504, 20200505, 20200506, 20200507, 20200508,
            20200511, 20200512, 20200513, 20200514, 20200515, 20200518,
            20200519, 20200520, 20200521, 20200522, 20200526, 20200527,
            20200528, 20200529],
           dtype='int64')

In [36]:
data = data.reset_index().sort_values(['level_0',
                                'level_1',
                                'Ticker']).set_index(['level_0',
                                                      'level_1'])
first_val_day = int(np.floor(data.index.get_level_values(0).unique().shape[0]*0.8))

X_train = data.loc[(data.index.get_level_values(0).unique()[:first_val_day],)]
X_test = data.loc[(data.index.get_level_values(0).unique()[first_val_day:],)]

# first_val_day = int(np.floor(data.reset_index().loc[:,'level_0'].unique().shape[0]*0.8))

# data.reset_index().loc[:,'level_0'].unique()[0:first_val_day]
# # data.reset_index().loc[:,'level_0'].unique().shape

In [37]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,0,81.000,82.110,80.580,81.010,2.00,2.06,0.01,0.24,12.0,12.0,1.0,1.0,1.0,9.0,1.0,1.0,ABBV
20200501,0,91.025,91.435,90.490,91.220,1.89,1.89,0.03,0.24,1.0,5.0,1.0,1.0,1.0,5.0,1.0,1.0,ABT
20200501,0,83.455,83.980,82.250,82.470,3.09,4.14,0.10,0.14,1.0,7.0,1.0,1.0,7.0,8.0,1.0,1.0,AEP
20200501,0,234.350,234.575,232.215,232.950,3.30,3.87,0.27,2.90,1.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200522,389,49.775,49.840,49.765,49.805,0.01,0.02,0.01,0.01,14.0,88.0,1.0,2.0,33.0,58.0,1.0,17.0,TSM
20200522,389,165.010,165.245,165.000,165.140,0.04,0.21,0.01,0.10,1.0,5.0,1.0,1.0,5.0,7.0,1.0,2.0,UNP
20200522,389,96.590,96.780,96.590,96.685,0.06,0.17,0.01,0.15,2.0,7.0,1.0,1.0,2.0,12.0,1.0,1.0,UPS
20200522,389,190.635,190.980,190.580,190.710,0.03,0.39,0.01,0.06,1.0,6.0,1.0,3.0,1.0,6.0,1.0,1.0,V


In [39]:
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20200526,0,323.560,324.195,323.380,323.875,0.46,0.46,0.01,0.11,8.0,18.0,1.0,1.0,1.0,241.0,1.0,1.0,AAPL
20200526,0,93.625,93.625,93.060,93.295,0.35,0.38,0.01,0.03,1.0,8.0,1.0,1.0,1.0,10.0,1.0,1.0,ABBV
20200526,0,92.915,93.255,92.815,93.025,0.77,0.77,0.01,0.19,3.0,8.0,1.0,1.0,3.0,12.0,1.0,2.0,ABT
20200526,0,79.450,80.200,78.745,79.645,0.66,2.07,0.21,0.21,1.0,5.0,1.0,1.0,2.0,5.0,1.0,2.0,AEP
20200526,0,248.750,248.750,246.555,248.300,2.10,3.45,0.04,0.90,2.0,5.0,1.0,1.0,1.0,2.0,1.0,2.0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,389,50.455,50.495,50.325,50.325,0.01,0.05,0.01,0.05,5.0,38.0,1.0,2.0,7.0,24.0,1.0,9.0,TSM
20200529,389,169.960,170.030,169.610,169.675,0.04,0.36,0.01,0.23,10.0,11.0,1.0,1.0,1.0,10.0,1.0,2.0,UNP
20200529,389,100.095,100.110,99.670,99.670,0.05,0.13,0.01,0.12,1.0,11.0,1.0,7.0,1.0,11.0,1.0,1.0,UPS
20200529,389,195.855,195.920,194.195,194.195,0.03,2.39,0.01,2.39,2.0,7.0,1.0,1.0,1.0,9.0,1.0,2.0,V


In [18]:
data.reset_index().loc[:,'level_0'].unique()[first_val_day:]

array([20200526, 20200527, 20200528, 20200529], dtype=int64)

In [21]:

data.reset_index().sort_values(['level_0',
                                'level_1',
                                'Ticker']).set_index(['level_0',
                                                      'level_1']).index.get_level_values(0)

Int64Index([20200501, 20200501, 20200501, 20200501, 20200501, 20200501,
            20200501, 20200501, 20200501, 20200501,
            ...
            20200529, 20200529, 20200529, 20200529, 20200529, 20200529,
            20200529, 20200529, 20200529, 20200529],
           dtype='int64', name='level_0', length=429000)

In [28]:
list(data.reset_index().loc[:,'level_0'].unique()[first_val_day:])

[20200526, 20200527, 20200528, 20200529]

In [29]:
data.reset_index().loc[:,'level_0'].unique()[first_val_day:]

AttributeError: 'numpy.ndarray' object has no attribute 'tuple'

In [30]:
data[(list(data.reset_index().loc[:,'level_0'].unique()[first_val_day:]),)]

TypeError: '([20200526, 20200527, 20200528, 20200529],)' is an invalid key

In [33]:
data.reset_index().loc[(data.reset_index().loc[:,'level_0'].unique()[first_val_day:],)]

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20200526,0,323.560,324.195,323.380,323.875,0.46,0.46,0.01,0.11,8.0,18.0,1.0,1.0,1.0,241.0,1.0,1.0,AAPL
20200526,0,93.625,93.625,93.060,93.295,0.35,0.38,0.01,0.03,1.0,8.0,1.0,1.0,1.0,10.0,1.0,1.0,ABBV
20200526,0,92.915,93.255,92.815,93.025,0.77,0.77,0.01,0.19,3.0,8.0,1.0,1.0,3.0,12.0,1.0,2.0,ABT
20200526,0,79.450,80.200,78.745,79.645,0.66,2.07,0.21,0.21,1.0,5.0,1.0,1.0,2.0,5.0,1.0,2.0,AEP
20200526,0,248.750,248.750,246.555,248.300,2.10,3.45,0.04,0.90,2.0,5.0,1.0,1.0,1.0,2.0,1.0,2.0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,389,50.455,50.495,50.325,50.325,0.01,0.05,0.01,0.05,5.0,38.0,1.0,2.0,7.0,24.0,1.0,9.0,TSM
20200529,389,169.960,170.030,169.610,169.675,0.04,0.36,0.01,0.23,10.0,11.0,1.0,1.0,1.0,10.0,1.0,2.0,UNP
20200529,389,100.095,100.110,99.670,99.670,0.05,0.13,0.01,0.12,1.0,11.0,1.0,7.0,1.0,11.0,1.0,1.0,UPS
20200529,389,195.855,195.920,194.195,194.195,0.03,2.39,0.01,2.39,2.0,7.0,1.0,1.0,1.0,9.0,1.0,2.0,V


In [40]:
X

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,sector_Consumer Cyclical,sector_Consumer Defensive,sector_Energy,sector_Financial Services,sector_Healthcare,sector_Industrials,sector_Real Estate,sector_Technology,sector_Utilities,ticker
0,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,1,0,AAPL
1,0.455,1.125,-0.300,288.580,0.07,0.49,0.01,0.30,1.0,50.0,...,0,0,0,0,0,0,0,1,0,AAPL
2,-0.610,0.220,-0.815,289.095,0.49,0.49,0.01,0.17,1.0,25.0,...,0,0,0,0,0,0,0,1,0,AAPL
3,-1.220,0.115,-1.380,290.320,0.16,0.33,0.01,0.10,13.0,71.0,...,0,0,0,0,0,0,0,1,0,AAPL
4,0.235,0.355,-0.565,290.085,0.10,0.42,0.01,0.05,2.0,86.0,...,0,0,0,0,0,0,0,1,0,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428885,-0.110,0.010,-0.175,123.950,0.02,0.07,0.01,0.02,3.0,6.0,...,0,1,0,0,0,0,0,0,0,WMT
428886,-0.150,0.010,-0.190,124.100,0.02,0.07,0.01,0.04,1.0,11.0,...,0,1,0,0,0,0,0,0,0,WMT
428887,0.090,0.090,-0.075,123.995,0.01,0.06,0.01,0.01,1.0,8.0,...,0,1,0,0,0,0,0,0,0,WMT
428888,-0.340,0.020,-0.350,124.335,0.01,0.07,0.01,0.05,4.0,16.0,...,0,1,0,0,0,0,0,0,0,WMT


## Splitting the data

In [5]:
# Let's have a proper split (along tickers & dates)
train_size = 0.8
data_splits = pd.DataFrame()
data_splits = X.index.to_series().groupby(X['ticker']).agg(['first','last']).reset_index()

data_splits['val_size'] = ((1-train_size) * (data_splits['last'] - data_splits['first'])).astype(int)
data_splits['val_start_idx'] = data_splits['last'] - data_splits['val_size']
data_splits['val_end_idx'] = data_splits['last'] + 1 # to get the last observation included

data_splits['train_start_idx'] =  data_splits['first']
data_splits['train_end_idx'] = data_splits['val_start_idx']

# Store ranges

train_ranges = [list(x) for x in zip(data_splits['train_start_idx'], data_splits['train_end_idx'])]
val_ranges = [list(x) for x in zip(data_splits['val_start_idx'], data_splits['val_end_idx'])]


if verbose:
    data_splits

INFO:MainThread:numexpr.utils:NumExpr defaulting to 4 threads.


## Adding ticker dummies

In [6]:
## Adding ticker dummies
tickers = X.pop('ticker')
X = pd.concat([X, pd.get_dummies(tickers, prefix='ticker', drop_first=False)], axis=1)

In [13]:
X.columns

Index(['open_lag0', 'high_lag0', 'low_lag0', 'close_lag0', 'spread_open_lag0',
       'spread_high_lag0', 'spread_low_lag0', 'spread_close_lag0',
       'bidsize_open_lag0', 'bidsize_high_lag0', 'bidsize_low_lag0',
       'bidsize_close_lag0', 'ofrsize_open_lag0', 'ofrsize_high_lag0',
       'ofrsize_low_lag0', 'ofrsize_close_lag0', 'open_lag1', 'high_lag1',
       'low_lag1', 'close_lag1', 'spread_open_lag1', 'spread_high_lag1',
       'spread_low_lag1', 'spread_close_lag1', 'bidsize_open_lag1',
       'bidsize_high_lag1', 'bidsize_low_lag1', 'bidsize_close_lag1',
       'ofrsize_open_lag1', 'ofrsize_high_lag1', 'ofrsize_low_lag1',
       'ofrsize_close_lag1', 'sector_Basic Materials',
       'sector_Communication Services', 'sector_Consumer Cyclical',
       'sector_Consumer Defensive', 'sector_Energy',
       'sector_Financial Services', 'sector_Healthcare', 'sector_Industrials',
       'sector_Real Estate', 'sector_Technology', 'sector_Utilities',
       'ticker_AAPL', 'ticker_ABBV

In [21]:
train_ranges[0]

[0, 6238]

## Constructing our final train/validation sets

In [39]:
train_ds = pd.concat([X.iloc[start:end, :] for (start, end) in train_ranges]).reset_index(drop=True)
train_y = pd.concat([y.iloc[start:end] for (start, end) in train_ranges]).reset_index(drop=True)

validate_ds = pd.concat([X.iloc[start:end, :] for (start, end) in val_ranges]).reset_index(drop=True)
val_y = pd.concat([y.iloc[start:end] for (start, end) in val_ranges]).reset_index(drop=True)

train_ds.shape, train_y.shape, validate_ds.shape, val_y.shape, train_y.shape[0] + val_y.shape[0]

((343090, 98), (343090, 1), (85800, 98), (85800, 1), 428890)

In [40]:
train_ranges[0]

[0, 6238]

In [41]:
train_ranges[0][1]+10

6248

## Pre-processing

In [44]:
{i:colname for i,colname in enumerate(train_ds.columns)}

{0: 'open_lag0',
 1: 'high_lag0',
 2: 'low_lag0',
 3: 'close_lag0',
 4: 'spread_open_lag0',
 5: 'spread_high_lag0',
 6: 'spread_low_lag0',
 7: 'spread_close_lag0',
 8: 'bidsize_open_lag0',
 9: 'bidsize_high_lag0',
 10: 'bidsize_low_lag0',
 11: 'bidsize_close_lag0',
 12: 'ofrsize_open_lag0',
 13: 'ofrsize_high_lag0',
 14: 'ofrsize_low_lag0',
 15: 'ofrsize_close_lag0',
 16: 'open_lag1',
 17: 'high_lag1',
 18: 'low_lag1',
 19: 'close_lag1',
 20: 'spread_open_lag1',
 21: 'spread_high_lag1',
 22: 'spread_low_lag1',
 23: 'spread_close_lag1',
 24: 'bidsize_open_lag1',
 25: 'bidsize_high_lag1',
 26: 'bidsize_low_lag1',
 27: 'bidsize_close_lag1',
 28: 'ofrsize_open_lag1',
 29: 'ofrsize_high_lag1',
 30: 'ofrsize_low_lag1',
 31: 'ofrsize_close_lag1',
 32: 'sector_Basic Materials',
 33: 'sector_Communication Services',
 34: 'sector_Consumer Cyclical',
 35: 'sector_Consumer Defensive',
 36: 'sector_Energy',
 37: 'sector_Financial Services',
 38: 'sector_Healthcare',
 39: 'sector_Industrials',
 40: 

In [47]:
# Creating one ppdict for individual preprocessings
# ppdict1 = {'open':'minmax',
#           'high':'log',
#           'low':'log',
#           'close':'std'}
splitpoint = 32

# Standardize some features
ppdict1 = {i:'std' for i in train_ds.columns[0:splitpoint]} 
# Keep some in actual levels (Dummies in this case).
ppdict2 = {i:'act' for i in train_ds.columns[splitpoint:]} 

# Merging the two
ppdict = {**ppdict1,**ppdict2}

In [54]:
train_ds,validate_ds = pre_processing(train_ds,
                                    validate_ds,
                                    ppdict,
                                    100,
                                    verbose =True)

Pre-Processing Procedure:  act
Columns Processed: ['sector_Basic Materials' 'sector_Communication Services'
 'sector_Consumer Cyclical' 'sector_Consumer Defensive' 'sector_Energy'
 'sector_Financial Services' 'sector_Healthcare' 'sector_Industrials'
 'sector_Real Estate' 'sector_Technology' 'sector_Utilities' 'ticker_AAPL'
 'ticker_ABBV' 'ticker_ABT' 'ticker_AEP' 'ticker_AMT' 'ticker_APD'
 'ticker_BA' 'ticker_BABA' 'ticker_BAC' 'ticker_BHP' 'ticker_BP'
 'ticker_CCI' 'ticker_CHL' 'ticker_COST' 'ticker_CSGP' 'ticker_D'
 'ticker_DIS' 'ticker_ECL' 'ticker_ENB' 'ticker_EXC' 'ticker_FB'
 'ticker_FMX' 'ticker_GOOG' 'ticker_INTC' 'ticker_JNJ' 'ticker_KO'
 'ticker_LFC' 'ticker_LIN' 'ticker_LMT' 'ticker_MA' 'ticker_MCD'
 'ticker_MSFT' 'ticker_NKE' 'ticker_NVDA' 'ticker_NVS' 'ticker_PBR'
 'ticker_PEP' 'ticker_PFE' 'ticker_PLD' 'ticker_PSA' 'ticker_PTR'
 'ticker_PYPL' 'ticker_RTX' 'ticker_SHW' 'ticker_SNP' 'ticker_SO'
 'ticker_SRE' 'ticker_T' 'ticker_TM' 'ticker_TSLA' 'ticker_TSM'
 'ticker_UNP' 't

In [53]:
ppX_train.iloc[:,0].mean(),ppX_train.iloc[:,0].std()

(-1.8927265537610815e-16, 1.000001457346533)

## Prepping for models

In [8]:
N_VALIDATION = val_y.shape[0] #int(1e3)
N_TRAIN = train_y.shape[0] #int(1e4)
# BUFFER_SIZE = int(1e4)
BATCH_SIZE = 256 #512 #32
MAX_EPOCHS = 500

STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

N_REPEAT = int(N_TRAIN / ((STEPS_PER_EPOCH * MAX_EPOCHS) / BATCH_SIZE))
FEATURES = X.shape[1]

N_TRAIN, N_VALIDATION, N_TRAIN + N_VALIDATION, STEPS_PER_EPOCH, N_REPEAT, STEPS_PER_EPOCH * MAX_EPOCHS

(343090, 85800, 428890, 1340, 131, 670000)

## A Logistic Regression model in TF/Keras

In [55]:
METRICS = [
      #keras.metrics.TruePositives(name='tp'),
      #keras.metrics.FalsePositives(name='fp'),
      #keras.metrics.TrueNegatives(name='tn'),
      #keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      #keras.metrics.Precision(name='precision'),
      #keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

# def make_model(metrics = METRICS, output_bias=None):
#   if output_bias is not None:
#     output_bias = tf.keras.initializers.Constant(output_bias)
#   model = keras.Sequential([
#       keras.layers.Dense(
#           16, activation='relu',
#           input_shape=(train_features.shape[-1],)),
#       keras.layers.Dropout(0.5),
#       keras.layers.Dense(1, activation='sigmoid',
#                          bias_initializer=output_bias),
#   ])

#   model.compile(
#       optimizer=keras.optimizers.Adam(lr=1e-3),
#       loss=keras.losses.BinaryCrossentropy(),
#       metrics=metrics)

#   return model

# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,))
# })

model = keras.Sequential([
#     keras.layers.Flatten(input_shape=(28, 28)),
#     keras.layers.Dense(128, activation='relu'),
#     keras.layers.Dense(10)
    keras.layers.Dense(1,
                       input_shape=(FEATURES,),
                       activation='sigmoid',
                       kernel_regularizer=regularizers.l2(1))
])

model.summary()

# with final activation (Keras/TF tutorial advises against this practice, but they also use it later in the tutorial)
# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,), activation='sigmoid')
# })

#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy', ])
model.compile(
              optimizer=keras.optimizers.Adam(), #lr=1e-3
              loss=keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS)

early_stopping = tf.keras.callbacks.EarlyStopping(
                                                monitor='val_auc', 
                                                verbose=1,
                                                patience=100,
                                                mode='max',
                                                restore_best_weights=True)

def get_callbacks(run_id):
      return [
             tfdocs.modeling.EpochDots(),
             early_stopping,
             tf.keras.callbacks.TensorBoard(logdir), #/run_id),
      ]

baseline_history = model.fit(
                            train_ds, #train_features,
                            train_y, #train_labels,
                            batch_size=512, #BATCH_SIZE,
                            epochs=1000, #EPOCHS,
                            callbacks = get_callbacks(run_id = 'first'), #[early_stopping],
                            validation_data=(validate_ds, val_y),
                            verbose=0) #(val_features, val_labels))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 99        
Total params: 99
Trainable params: 99
Non-trainable params: 0
_________________________________________________________________





Epoch: 0, accuracy:0.5352,  auc:0.5034,  loss:0.8996,  val_accuracy:0.5456,  val_auc:0.5453,  val_loss:0.6876,  
....................................................................................................
Epoch: 100, accuracy:0.5480,  auc:0.5440,  loss:0.6873,  val_accuracy:0.5454,  val_auc:0.5459,  val_loss:0.6879,  
..................Restoring model weights from the end of the best epoch.
Epoch 00118: early stopping


In [56]:
model.evaluate(validate_ds,  val_y, verbose=2)

2682/2682 - 6s - loss: 0.6879 - accuracy: 0.5457 - auc: 0.5513


[0.6878659725189209, 0.5456876754760742, 0.5513222217559814]

In [11]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [12]:
import datetime
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%tensorboard --logdir logs

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 9296.