## Reading in packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy
import datetime
import ta
import pathlib
import shutil
import tempfile
import vaex
from IPython import display
from IPython.display import clear_output
import pyodbc

# Tensorflow related
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import regularizers
import tensorflow.compat.v2.feature_column as fc

#!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

print(tf.__version__)
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
shutil.rmtree(logdir, ignore_errors=True)
print(logdir)

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, log_loss


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.exceptions import ConvergenceWarning 
from sklearn import ensemble
# ConvergenceWarning('ignore')
# Do you wanna see?
verbose = True

import sys
sys.path.append('../')
#sys.path.append('...../')

from utils.data_extraction import load_data_final,load_data_and_save
from utils.data_cleaning import HFDataCleaning
from utils.generate_features import candleCreateNP_vect_final,\
                                    generateFeatures_final,\
                                    generateFeatures_multi_final

from utils.preprocessing_features_and_labels import extract_labels,\
                                                    align_features_and_labels,\
                                                    pre_processing_initial,\
                                                    pre_processing_extended,\
                                                    pre_processing,\
                                                    extract_labels_multi_final,\
                                                    align_features_and_labels_multi_final,\
                                                    align_features_and_labels_multi_v5

from utils.models import make_input_fn
from utils.models import performanceTesting,scoreFunction
from utils.plotting import plot_confusion_matrix

2.2.0
C:\Users\PC\AppData\Local\Temp\tmpndlv6fiu\tensorboard_logs


## Extracting data

In [2]:
# Do we extract new data or read in?
readIn = True
# run load_data()
if readIn:
    
    # Listing the data files 
    path = '../../../Google Drev/Thesis/Data/TAQ/AggregatedTAQ'
#     path = 'F:/AggregatedTAQ/round3'
    datafiles = os.listdir(path)
    content = np.concatenate([['\n\n'],[str(j)+': '+i+'\n' for j,i in enumerate(datafiles) if 'csv' in i],['\n\n']])
    
    # Asking for user input
    file = input('Which one do you want to load? %s'%''.join(content))
    if int(file) <= 2:
        data = pd.read_csv(path + '/' + datafiles[int(file)],
                           header = None,
                           names=['open','high','low','close',
                                  'spread_open','spread_high','spread_low','spread_close',
                                  'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                                  'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                                  'Ticker'])
        # Using the choice of the user to determine the correct market file
        key = re.split('[_.]',datafiles[int(file)])[-2]
        marketDataFile = [file for file in os.listdir(path+'/round5_market_tickers') if key in file]

        # Reading in the market data
        tempData = pd.read_csv(path+'/round5_market_tickers/'+marketDataFile[0]
                               ,header = None
                               ,names=['open','high','low','close',
                                      'spread_open','spread_high','spread_low','spread_close',
                                      'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                                      'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                                      'Ticker'])
        # Adding the market data to the ticker data
        data = pd.concat([data,tempData],axis=0)
        # Lower casing all column names
    #     data.columns = data.columns.str.lower()
        
        
    else:
        data = pd.read_csv(path + '/' + datafiles[int(file)],
                           header = 0,
                           index_col=[0,1]
#                            names=['open','high','low','close',
#                                   'spread_open','spread_high','spread_low','spread_close',
#                                   'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
#                                   'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
#                                   'Ticker']
                          )
    
else:
    
    # print(os.listdir())
    try:
        path = 'a:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
    except:
        path = 't:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
        
    # Sample type
    data_sample = 'full' # or 'stable'
    # allFiles = os.listdir(path)
    # print(len(allFiles), allFiles[:5], allFiles[-5:])
    # print(allFiles[-10:])

    #dates = np.array(['2020040' + str(i) if i < 10 else '202004' + str(i) for i in np.arange(1,16)]).astype(int)
    dates = np.array(['20200501']).astype(int)#,'20200402','20200403','20200406','20200407'

    # Provide a list of tickers of interest
    
    tickers = sorted(['TSLA','FB'])#'MSFT'
    
    # Do we need data on trades, quotes or both?
    dataNeeded = 'quotes' # 'trades', 'quotes' or 'both'
    
    if dataNeeded == 'trades':
        tradeData = load_data_final(dates, tickers, dataNeeded, path, verbose)
    elif dataNeeded == 'quotes':
        quoteData = load_data_final(dates,
                                    tickers,
                                    dataNeeded,
                                    path,
                                    verbose,
                                    extract_candles = False,
                                    aggHorizon = 1,
                                    extra_features_from_quotes = None,
                                    data_sample = data_sample)
    elif dataNeeded == 'both':
        tradeData, quoteData = load_data_final(dates, tickers, dataNeeded, path, verbose)

# Reading in sector information
stockInfo = pd.read_csv('../utils/stockInfo_v1.csv',header=[0,1])
stockInfo.columns = ['ticker','sector','exchange','marketCap']

# Creating a table with stock information based on the tickers available in the data.
uniqueTickers = data.Ticker.unique()
stockTable = stockInfo[stockInfo.ticker.isin(uniqueTickers)]
stockTable.head(10)

Which one do you want to load? 

0: aggregateTAQ_May2020_10sec (1).csv
1: aggregateTAQ_May2020_30sec (1).csv
2: aggregateTAQ_May2020_60sec.csv
8: trueAggregateTAQ_60sec.csv


8


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ticker,sector,exchange,marketCap
12,AAPL,Technology,NMS,1578173000000.0
20,ABBV,Healthcare,NYQ,174261200000.0
34,ABT,Healthcare,NYQ,163141000000.0
126,AEP,Utilities,NYQ,40895510000.0
379,AMT,Real Estate,NYQ,117125900000.0
428,APD,Basic Materials,NYQ,54643950000.0
697,BA,Industrials,NYQ,102035600000.0
699,BABA,Consumer Cyclical,NYQ,593653600000.0
700,BAC,Financial Services,NYQ,202055000000.0
870,BHP,Basic Materials,NYQ,125819400000.0


# reading in the market data (done automatically atm)

In [3]:
data.columns

Index(['open', 'high', 'low', 'close', 'spread_open', 'spread_high',
       'spread_low', 'spread_close', 'bidsize_open', 'bidsize_high',
       'bidsize_low', 'bidsize_close', 'ofrsize_open', 'ofrsize_high',
       'ofrsize_low', 'ofrsize_close', 'Ticker', 'sector'],
      dtype='object')

### Dropping ETFS and market indices

In [4]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'IDU', 'INTC', 'IYC', 'IYE',
       'IYG', 'IYH', 'IYJ', 'IYK', 'IYM', 'IYR', 'IYW', 'IYZ', 'JNJ',
       'KO', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA',
       'NVS', 'PBR', 'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS',
       'V', 'WMT', 'DIA', 'QQQ', 'SPY', 'XNTK'], dtype=object)

In [5]:
# Removing the XNTK ticker
data = data[~data.Ticker.isin(['XNTK'])]

In [6]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'IDU', 'INTC', 'IYC', 'IYE',
       'IYG', 'IYH', 'IYJ', 'IYK', 'IYM', 'IYR', 'IYW', 'IYZ', 'JNJ',
       'KO', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA',
       'NVS', 'PBR', 'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS',
       'V', 'WMT', 'DIA', 'QQQ', 'SPY'], dtype=object)

In [7]:
# Removing the XNTK ticker
data = data[~data.Ticker.isin(['XNTK'])]

etfs = ['IYH','IYM','IYK','IYJ','IYG','IYW','IYC','IYR','IDU','IYZ','IYE','IYF','SPY','DIA','QQQ']

# Extracting the sector ETFs to a separate variable
sectorETFS = data[data.Ticker.isin(etfs)]

# Removing the ETFs
data = data[~data.Ticker.isin(etfs)]

In [8]:
data.columns

Index(['open', 'high', 'low', 'close', 'spread_open', 'spread_high',
       'spread_low', 'spread_close', 'bidsize_open', 'bidsize_high',
       'bidsize_low', 'bidsize_close', 'ofrsize_open', 'ofrsize_high',
       'ofrsize_low', 'ofrsize_close', 'Ticker', 'sector'],
      dtype='object')

In [9]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,Consumer Defensive
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,Consumer Defensive
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,Consumer Defensive
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,Consumer Defensive


In [28]:
########### Generate Features ################

n_feature_lags = 1

# features = generateFeatures_multi_final(data = data, 
#                                   listOfFeatures = [
#                                                     'pastobs',
#                                                     'spread',
#                                                     'bidsize',
#                                                     'ofrsize',
# #                                                     'stok',
# #                                                     'stod',
# #                                                     'sstod',
# #                                                     'wilr',
# #                                                     'roc',
# #                                                     'rsi',
# #                                                     'atr',
# #                                                     'cci',
# #                                                     'dpo',
# #                                                     'sma',
# #                                                     'ema',
# #                                                     'macd',
# #                                                       'macd_diff',
# #                                                       'macd_signal',
# #                                                     'dis5',
# #                                                     'dis10',
#                                                       'sector'
#                                                    ], 
#                                    feature_lags = n_feature_lags
#                                      ,stockTable=stockTable)
features = generateFeatures_multi_final(data = data, 
#                                   listOfFeatures = [
#                                                     'pastobs',
#                                                     'spread',
#                                                     'bidsize',
#                                                     'ofrsize',
# #                                                     'stok',
# #                                                     'stod',
# #                                                     'sstod',
# #                                                     'wilr',
# #                                                     'roc',
# #                                                     'rsi',
# #                                                     'atr',
# #                                                     'cci',
# #                                                     'dpo',
# #                                                     'sma',
# #                                                     'ema',
# #                                                     'macd',
# #                                                       'macd_diff',
# #                                                       'macd_signal',
# #                                                     'dis5',
# #                                                     'dis10',
#                                                       'sector'
#                                                    ]
                                        listOfFeatures = [
                                                        'pastobs',
                                                        'spread',
                                                        'bidsize',
                                                        'ofrsize',
                                                        'pastreturns',
                                                        'intradaytime',
                                                        'stok',
                                                        'stod',
                                                        'sstod',
#                                                         'wilr',
                                                        'roc',
                                                        'rsi',
                                                        'atr',
                                                        'cci',
                                                        'dpo',
                                                        'sma',
                                                        'ema',
                                                        'macd',
                                                        'macd_diff',
                                                        'macd_signal',
                                                        'dis5',
                                                        'dis10',
                                                        'sector'
                                                        ]
                                        , 
                                   feature_lags = n_feature_lags
                                     ,sectorETFS=sectorETFS
                                       ,pastobs_in_percentage = False)

########### Generate Labels ################

n_classes = 2
# extract first 4 columns as the lag0 or raw OHLC prices (used for labelling)
price_candles = data[['open','high','low','close','Ticker']]

########### Align Data ################

# from imported function (see testing_preprocessing_features_and_labels.ipynb for thorough experimenting with all the cut-offs):    
X, y,indices = align_features_and_labels_multi_final(price_candles = price_candles, 
                                                 all_features = features,
                                                 prediction_horizon = 1, 
                                                 n_feature_lags = n_feature_lags, 
                                                 n_classes = n_classes, # 5,
                                                 safe_burn_in = False, 
                                                 data_sample = 'full',
                                                 splitType='global',
                                                 noise=False,ticker_dummies=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


AAPL done
ABBV done
ABT done
AEP done
AMT done
APD done
BA done
BABA done
BAC done
BHP done
BP done
CCI done
CHL done
COST done
CSGP done
D done
DIS done
ECL done
ENB done
EXC done
FB done
FMX done
GOOG done
INTC done
JNJ done
KO done
LFC done
LIN done
LMT done
MA done
MCD done
MSFT done
NKE done
NVDA done
NVS done
Number of NaNs in label: 1. 1 is expected
Returns that lead to NaNs in label: [0.0907158]
PBR done
PEP done
PFE done
PLD done
PSA done
PTR done
PYPL done
RTX done
SHW done
SNP done
SO done
SRE done
T done
TM done
TSLA done
TSM done
UNP done
UPS done
V done
WMT done


In [30]:
hej = None

if hej != None:
    print('hej')

In [32]:
128//2

64

In [11]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'INTC', 'JNJ', 'KO', 'LFC',
       'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA', 'NVS', 'PBR',
       'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX', 'SHW', 'SNP',
       'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS', 'V', 'WMT'],
      dtype=object)

In [11]:
temp = features[features.ticker=='LFC'].iloc[33:,:].drop('ticker',axis=1)
temp2 = features[features.ticker=='LFC'].drop('ticker',axis=1)
# sum(temp.isna())

In [12]:
nulls = temp.isna().sum(axis=1).values
nulls[nulls>0]

array([ 1,  2,  2, ...,  2,  1, 52], dtype=int64)

In [13]:
np.array(temp.dtypes)

array([dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'),

In [18]:
np.where((np.sum(np.isnan(temp.values), axis=1) == 0) == False)

(array([  70,   71,   72, ..., 7682, 7683, 7766], dtype=int64),)

In [19]:
temp.iloc[70,:].values

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.02250000e+01,
        1.00000000e-02,  1.00000000e-02,  1.00000000e-02,  1.00000000e-02,
        1.10000000e+01,  1.10000000e+01,  1.10000000e+01,  1.10000000e+01,
        8.00000000e+00,  8.00000000e+00,  6.00000000e+00,  6.00000000e+00,
        0.00000000e+00,  1.12500000e+01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,             nan,  0.00000000e+00,  3.78696281e+01,
        7.05419572e-04, -4.76190476e+01, -2.50000000e-03,  1.02250000e+01,
        1.02257130e+01, -1.29979085e-03, -2.33252869e-04, -1.06653798e-03,
        1.00000000e+02,  1.00000000e+02, -3.76159826e-04,  8.58922053e-04,
       -4.25521264e-04, -5.21240553e-04, -6.21504040e-04, -3.11780507e-04,
       -7.07266230e-04, -1.69183268e-04, -5.67715890e-04,  4.06311370e-04,
       -1.04083093e-03, -1.86254424e-04, -7.20645326e-04, -4.56548842e-04,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e-02,  

In [20]:
temp.iloc[70,10:30]

bidsize_low_lag0      11.000000
bidsize_close_lag0    11.000000
ofrsize_open_lag0      8.000000
ofrsize_high_lag0      8.000000
ofrsize_low_lag0       6.000000
ofrsize_close_lag0     6.000000
pastreturns_lag0       0.000000
intradaytime_lag0     11.250000
stok_lag0              0.000000
stod_lag0              0.000000
sstod_lag0             0.000000
wilr_lag0                   NaN
roc_lag0               0.000000
rsi_lag0              37.869628
atr_lag0               0.000705
cci_lag0             -47.619048
dpo_lag0              -0.002500
sma_lag0              10.225000
ema_lag0              10.225713
macd_lag0             -0.001300
Name: (20200501, 103), dtype: float64

In [27]:
temp2.iloc[0:20,0:4]

Unnamed: 0,Unnamed: 1,open_lag0,high_lag0,low_lag0,close_lag0
20200501,0,0.04,0.04,-0.045,10.29
20200501,1,0.005,0.005,-0.025,10.285
20200501,2,0.01,0.01,-0.015,10.28
20200501,3,0.0,0.005,-0.025,10.28
20200501,4,-0.005,0.005,-0.02,10.285
20200501,5,0.01,0.08,0.0,10.275
20200501,6,0.02,0.02,-0.01,10.255
20200501,7,0.0,0.0,0.0,10.255
20200501,8,0.02,0.02,0.0,10.235
20200501,9,0.0,0.01,0.0,10.235


In [26]:
import ta
# ta.momentum.stoch(temp2.high_lag0,
#                 temp2.low_lag0,
#                 temp2.close_lag0,fillna=True).reset_index().loc[100:120,:]
ta.momentum.wr(temp2.high_lag0,
                temp2.low_lag0,
                temp2.close_lag0
                             ,fillna=True
                            ).reset_index().loc[100:120,:]

Unnamed: 0,level_0,level_1,wr
100,20200501,100,102150.0
101,20200501,101,102150.0
102,20200501,102,102150.0
103,20200501,103,102150.0
104,20200501,104,102150.0
105,20200501,105,102150.0
106,20200501,106,102150.0
107,20200501,107,102150.0
108,20200501,108,102150.0
109,20200501,109,102150.0


In [59]:
temp2[['open_lag0','high_lag0','low_lag0','close_lag0']].iloc[100:120,:]

Unnamed: 0,Unnamed: 1,open_lag0,high_lag0,low_lag0,close_lag0
20200501,100,0.0,0.0,0.0,10.225
20200501,101,0.0,0.0,0.0,10.225
20200501,102,0.0,0.0,0.0,10.225
20200501,103,0.0,0.0,0.0,10.225
20200501,104,0.0,0.0,0.0,10.225
20200501,105,0.0,0.0,0.0,10.225
20200501,106,0.0,0.0,0.0,10.225
20200501,107,0.0,0.0,0.0,10.225
20200501,108,0.0,0.0,0.0,10.225
20200501,109,0.0,0.0,0.0,10.225


In [61]:
data[data.Ticker=='LFC'].iloc[100:120,0:4]

Unnamed: 0,Unnamed: 1,open,high,low,close
20200501,100,10.225,10.225,10.225,10.225
20200501,101,10.225,10.225,10.225,10.225
20200501,102,10.225,10.225,10.225,10.225
20200501,103,10.225,10.225,10.225,10.225
20200501,104,10.225,10.225,10.225,10.225
20200501,105,10.225,10.225,10.225,10.225
20200501,106,10.225,10.225,10.225,10.225
20200501,107,10.225,10.225,10.225,10.225
20200501,108,10.225,10.225,10.225,10.225
20200501,109,10.225,10.225,10.225,10.225


# Validating Alignment

In [10]:
table = pd.pivot_table(sectorETFS.reset_index()[['level_0','level_1','close','Ticker']],
                               index=['level_0','level_1'],columns='Ticker')
        
table.columns = table.columns.get_level_values(1)

tempSector_wshift = pd.DataFrame(np.concatenate([np.array([0 for i in np.arange(table.shape[1])])\
                                  .reshape((1,table.shape[1])),
                                  ((table.values[1:]/table.values[0:-1])-1)*100]),
                  index=table.index,
                  columns=table.columns).shift(1).fillna(0)
tempSector_woshift = pd.DataFrame(np.concatenate([np.array([0 for i in np.arange(table.shape[1])])\
                                  .reshape((1,table.shape[1])),
                                  ((table.values[1:]/table.values[0:-1])-1)*100]),
                  index=table.index,
                  columns=table.columns).fillna(0)

In [11]:
tempSector_wshift

Unnamed: 0_level_0,Ticker,DIA,IDU,IYC,IYE,IYG,IYH,IYJ,IYK,IYM,IYR,IYW,IYZ,QQQ,SPY
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20200501,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,2,0.048018,-0.110328,0.194878,0.741119,-0.102272,0.099990,-0.070106,0.033934,0.113080,-0.047336,0.217846,0.000000,0.167563,0.056063
20200501,3,0.158591,-0.089073,0.289255,-0.101471,0.129084,-0.064215,0.110771,0.309545,0.244729,0.087951,0.030432,0.036975,0.060408,0.106810
20200501,4,0.033335,-0.531346,0.034809,0.457085,0.057791,-0.128513,0.047947,-0.067636,0.169014,-0.047316,0.043461,-0.018481,0.006966,-0.034982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,-0.070662,-0.047146,0.034437,-0.049032,-0.087018,-0.011319,-0.013513,0.056119,-0.039927,0.025825,-0.019795,0.089270,0.060025,-0.009848
20200529,386,0.053034,0.148243,0.087210,0.098111,-0.087094,0.122255,0.000000,0.208325,0.159772,0.180727,0.023758,0.374599,0.040707,0.062375
20200529,387,-0.007853,0.060555,-0.013758,0.049008,-0.053962,-0.006784,-0.064194,-0.015992,-0.056970,-0.038657,-0.079175,0.071086,-0.055682,-0.024606
20200529,388,0.041230,-0.090778,0.011467,0.048984,0.012460,0.009045,0.077758,-0.031988,0.045602,0.000000,0.075276,-0.035518,0.023571,0.024612


In [12]:
tempSector_woshift

Unnamed: 0_level_0,Ticker,DIA,IDU,IYC,IYE,IYG,IYH,IYJ,IYK,IYM,IYR,IYW,IYZ,QQQ,SPY
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20200501,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20200501,1,0.048018,-0.110328,0.194878,0.741119,-0.102272,0.099990,-0.070106,0.033934,0.113080,-0.047336,0.217846,0.000000,0.167563,0.056063
20200501,2,0.158591,-0.089073,0.289255,-0.101471,0.129084,-0.064215,0.110771,0.309545,0.244729,0.087951,0.030432,0.036975,0.060408,0.106810
20200501,3,0.033335,-0.531346,0.034809,0.457085,0.057791,-0.128513,0.047947,-0.067636,0.169014,-0.047316,0.043461,-0.018481,0.006966,-0.034982
20200501,4,0.006248,-0.035851,-0.044739,-0.480283,0.053314,0.102466,0.011060,0.088832,0.043745,0.006763,0.330162,-0.018484,0.188066,0.062990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,0.053034,0.148243,0.087210,0.098111,-0.087094,0.122255,0.000000,0.208325,0.159772,0.180727,0.023758,0.374599,0.040707,0.062375
20200529,386,-0.007853,0.060555,-0.013758,0.049008,-0.053962,-0.006784,-0.064194,-0.015992,-0.056970,-0.038657,-0.079175,0.071086,-0.055682,-0.024606
20200529,387,0.041230,-0.090778,0.011467,0.048984,0.012460,0.009045,0.077758,-0.031988,0.045602,0.000000,0.075276,-0.035518,0.023571,0.024612
20200529,388,-0.021588,-0.020191,-0.020637,-0.048960,0.049832,-0.011306,-0.043916,0.008000,-0.085465,-0.090235,-0.043548,0.106591,-0.008569,-0.018045


In [13]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,Consumer Defensive
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,Consumer Defensive
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,Consumer Defensive
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,Consumer Defensive


## Apple example

In [15]:
apple = data[data.Ticker=='AAPL']
apple

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,319.255,319.415,318.830,319.095,0.11,0.17,0.01,0.05,2.0,485.0,1.0,1.0,1.0,22.0,1.0,7.0,AAPL,Technology
20200529,386,319.095,319.295,318.810,318.845,0.05,0.22,0.01,0.03,1.0,20.0,1.0,2.0,6.0,11.0,1.0,1.0,AAPL,Technology
20200529,387,318.845,319.605,318.695,319.460,0.03,0.11,0.01,0.06,1.0,10.0,1.0,2.0,1.0,9.0,1.0,6.0,AAPL,Technology
20200529,388,319.465,319.555,318.660,318.675,0.05,0.12,0.01,0.03,1.0,38.0,1.0,1.0,6.0,19.0,1.0,1.0,AAPL,Technology


In [21]:
apple_return = ((apple.close.values[1:]/apple.close.values[0:-1])-1)*100
apple_return = pd.DataFrame({'return':np.concatenate([[0],apple_return])})
apple_return

Unnamed: 0,return
0,0.000000
1,-0.082970
2,-0.152239
3,0.178460
4,0.423736
...,...
7795,-0.048551
7796,-0.078347
7797,0.192884
7798,-0.245727


In [26]:
apple_return.shift(1)

Unnamed: 0,return
0,
1,0.000000
2,-0.082970
3,-0.152239
4,0.178460
...,...
7795,-0.207868
7796,-0.048551
7797,-0.078347
7798,0.192884


In [23]:
IDU = sectorETFS[sectorETFS.Ticker=='IDU']
IDU

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,141.805,142.910,138.520,140.490,1.93,12.37,0.09,0.76,1.0,6.0,1.0,5.0,5.0,16.0,1.0,6.0,IDU,Utilities
20200501,1,140.490,140.600,140.280,140.335,0.76,0.81,0.56,0.59,5.0,6.0,1.0,5.0,5.0,7.0,1.0,6.0,IDU,Utilities
20200501,2,140.335,140.340,140.205,140.210,0.59,0.63,0.48,0.54,5.0,7.0,1.0,6.0,5.0,10.0,1.0,5.0,IDU,Utilities
20200501,3,140.210,140.275,139.440,139.465,0.54,0.57,0.26,0.43,6.0,7.0,1.0,6.0,6.0,10.0,1.0,6.0,IDU,Utilities
20200501,4,139.465,139.860,139.315,139.415,0.43,0.76,0.07,0.43,5.0,10.0,1.0,5.0,6.0,16.0,1.0,6.0,IDU,Utilities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,148.400,148.635,148.375,148.625,0.08,0.11,0.04,0.05,10.0,15.0,1.0,1.0,10.0,11.0,1.0,10.0,IDU,Utilities
20200529,386,148.630,148.735,148.610,148.715,0.06,0.08,0.03,0.05,1.0,11.0,1.0,1.0,10.0,11.0,1.0,10.0,IDU,Utilities
20200529,387,148.710,148.710,148.575,148.580,0.06,0.08,0.02,0.06,10.0,13.0,1.0,10.0,10.0,11.0,1.0,10.0,IDU,Utilities
20200529,388,148.575,148.610,148.510,148.550,0.05,0.09,0.05,0.06,10.0,13.0,1.0,10.0,1.0,11.0,1.0,10.0,IDU,Utilities


In [24]:
idu_return = ((IDU.close.values[1:]/IDU.close.values[0:-1])-1)*100
idu_return = pd.DataFrame({'return':np.concatenate([[0],idu_return])})
idu_return

Unnamed: 0,return
0,0.000000
1,-0.110328
2,-0.089073
3,-0.531346
4,-0.035851
...,...
7795,0.148243
7796,0.060555
7797,-0.090778
7798,-0.020191


In [25]:
apple_return - idu_return

INFO:MainThread:numexpr.utils:NumExpr defaulting to 4 threads.


Unnamed: 0,return
0,0.000000
1,0.027358
2,-0.063166
3,0.709806
4,0.459587
...,...
7795,-0.196794
7796,-0.138902
7797,0.283661
7798,-0.225536


# Conclusion: Additional shift not needed - removed from final function.

# Adding intraday times as features

In [36]:
datafiles

['aggregateTAQ_May2020_10sec (1).csv',
 'aggregateTAQ_May2020_30sec (1).csv',
 'aggregateTAQ_May2020_60sec.csv',
 'aggregateTAQ_May2020_60sec.gsheet',
 'desktop.ini',
 'round1',
 'round4_only_largest_exchange',
 'round5_market_tickers',
 'trueAggregateTAQ_60sec.csv']

In [42]:
(30/60)/60

0.008333333333333333

In [56]:
candles_per_hour = data.index.get_level_values(1).unique().shape[0]/6.5

1/(candles_per_hour)

# denumerator

0.016666666666666666

In [68]:
intradaytime=9.5+(1/candles_per_hour)+data.index.get_level_values(1).unique()/candles_per_hour
intradaytime

Float64Index([ 9.516666666666667,  9.533333333333335,               9.55,
               9.566666666666668,  9.583333333333334,  9.600000000000001,
               9.616666666666667,  9.633333333333335,               9.65,
               9.666666666666668,
              ...
              15.850000000000001, 15.866666666666667, 15.883333333333333,
              15.900000000000002, 15.916666666666668, 15.933333333333334,
              15.950000000000001, 15.966666666666669, 15.983333333333334,
                            16.0],
             dtype='float64', length=390)

In [72]:
np.repeat(np.array(intradaytime),axis=0,repeats=2)

array([ 9.51666667,  9.51666667,  9.53333333,  9.53333333,  9.55      ,
        9.55      ,  9.56666667,  9.56666667,  9.58333333,  9.58333333,
        9.6       ,  9.6       ,  9.61666667,  9.61666667,  9.63333333,
        9.63333333,  9.65      ,  9.65      ,  9.66666667,  9.66666667,
        9.68333333,  9.68333333,  9.7       ,  9.7       ,  9.71666667,
        9.71666667,  9.73333333,  9.73333333,  9.75      ,  9.75      ,
        9.76666667,  9.76666667,  9.78333333,  9.78333333,  9.8       ,
        9.8       ,  9.81666667,  9.81666667,  9.83333333,  9.83333333,
        9.85      ,  9.85      ,  9.86666667,  9.86666667,  9.88333333,
        9.88333333,  9.9       ,  9.9       ,  9.91666667,  9.91666667,
        9.93333333,  9.93333333,  9.95      ,  9.95      ,  9.96666667,
        9.96666667,  9.98333333,  9.98333333, 10.        , 10.        ,
       10.01666667, 10.01666667, 10.03333333, 10.03333333, 10.05      ,
       10.05      , 10.06666667, 10.06666667, 10.08333333, 10.08

In [90]:
days = data.index.get_level_values(0).unique().shape[0]
# print(days)
nparray = np.tile(intradaytime.values,(days,1)).flatten()

# np.array(np.arange(len(nparray)))[nparray==9.55]
np.array(np.arange(len(nparray)))[nparray==9.55][1:]-np.array(np.arange(len(nparray)))[nparray==9.55][0:-1]

array([390, 390, 390, 390, 390, 390, 390, 390, 390, 390, 390, 390, 390,
       390, 390, 390, 390, 390, 390])

In [92]:
test = pd.DataFrame()
test.loc[:,'test'] = nparray

In [93]:
test

Unnamed: 0,test
0,9.516667
1,9.533333
2,9.550000
3,9.566667
4,9.583333
...,...
7795,15.933333
7796,15.950000
7797,15.966667
7798,15.983333


In [None]:
pd.np.tile()

In [60]:
pd.DataFrame({'intradaytime':9.5+(1/candles_per_hour)+data.index.get_level_values(1).unique()/candles_per_hour},
             index=apple.index)

ValueError: Shape of passed values is (390, 1), indices imply (7800, 1)

# Converting the "diff's" to percentage

In [95]:
apple.iloc[:,0:4]

Unnamed: 0,Unnamed: 1,open,high,low,close
20200501,0,286.250,289.260,285.870,289.260
20200501,1,289.260,289.350,288.365,289.020
20200501,2,289.035,289.705,288.280,288.580
20200501,3,288.485,289.315,288.280,289.095
20200501,4,289.100,290.435,288.940,290.320
...,...,...,...,...,...
20200529,385,319.255,319.415,318.830,319.095
20200529,386,319.095,319.295,318.810,318.845
20200529,387,318.845,319.605,318.695,319.460
20200529,388,319.465,319.555,318.660,318.675


In [96]:
apple.close

20200501  0      289.260
          1      289.020
          2      288.580
          3      289.095
          4      290.320
                  ...   
20200529  385    319.095
          386    318.845
          387    319.460
          388    318.675
          389    317.920
Name: close, Length: 7800, dtype: float64

In [97]:
apple.iloc[:,0:4].subtract(apple.close,axis=0)

Unnamed: 0,Unnamed: 1,open,high,low,close
20200501,0,-3.010,0.000,-3.390,0.0
20200501,1,0.240,0.330,-0.655,0.0
20200501,2,0.455,1.125,-0.300,0.0
20200501,3,-0.610,0.220,-0.815,0.0
20200501,4,-1.220,0.115,-1.380,0.0
...,...,...,...,...,...
20200529,385,0.160,0.320,-0.265,0.0
20200529,386,0.250,0.450,-0.035,0.0
20200529,387,-0.615,0.145,-0.765,0.0
20200529,388,0.790,0.880,-0.015,0.0


In [98]:
apple.iloc[:,0:4].values

array([[286.25 , 289.26 , 285.87 , 289.26 ],
       [289.26 , 289.35 , 288.365, 289.02 ],
       [289.035, 289.705, 288.28 , 288.58 ],
       ...,
       [318.845, 319.605, 318.695, 319.46 ],
       [319.465, 319.555, 318.66 , 318.675],
       [318.675, 319.38 , 317.5  , 317.92 ]])

In [119]:
(apple.iloc[:,0:4].divide(apple.close,axis=0)-1)# apple.close.values

Unnamed: 0,Unnamed: 1,open,high,low,close
20200501,0,-1.040586,0.000000,-1.171956,0.0
20200501,1,0.083039,0.114179,-0.226628,0.0
20200501,2,0.157669,0.389840,-0.103957,0.0
20200501,3,-0.211003,0.076100,-0.281914,0.0
20200501,4,-0.420226,0.039611,-0.475338,0.0
...,...,...,...,...,...
20200529,385,0.050142,0.100284,-0.083047,0.0
20200529,386,0.078408,0.141134,-0.010977,0.0
20200529,387,-0.192512,0.045389,-0.239467,0.0
20200529,388,0.247901,0.276143,-0.004707,0.0


In [121]:
((apple.iloc[:,0:4].divide(apple.close,axis=0)-1)*100).iloc[0:10,0].sum()

-2.2397900180812447

In [112]:
apple.close.values.shape

(7800,)

In [114]:
apple.iloc[:,0:4].values.shape

(7800, 4)

In [115]:
apple.iloc[:,0:4].values.T.shape

(4, 7800)

In [117]:
pd.DataFrame((apple.iloc[:,0:4].values.T / apple.close.values).T,columns=['open','high','low','close'])

Unnamed: 0,open,high,low,close
0,0.989594,1.000000,0.988280,1.0
1,1.000830,1.001142,0.997734,1.0
2,1.001577,1.003898,0.998960,1.0
3,0.997890,1.000761,0.997181,1.0
4,0.995798,1.000396,0.995247,1.0
...,...,...,...,...
7795,1.000501,1.001003,0.999170,1.0
7796,1.000784,1.001411,0.999890,1.0
7797,0.998075,1.000454,0.997605,1.0
7798,1.002479,1.002761,0.999953,1.0


# Examinating the align function

In [11]:
features.columns

Index(['open', 'high', 'low', 'close', 'spread_open', 'spread_high',
       'spread_low', 'spread_close', 'bidsize_open', 'bidsize_high',
       'bidsize_low', 'bidsize_close', 'ofrsize_open', 'ofrsize_high',
       'ofrsize_low', 'ofrsize_close', 'relReturns_DIA', 'relReturns_IDU',
       'relReturns_IYC', 'relReturns_IYE', 'relReturns_IYG', 'relReturns_IYH',
       'relReturns_IYJ', 'relReturns_IYK', 'relReturns_IYM', 'relReturns_IYR',
       'relReturns_IYW', 'relReturns_IYZ', 'relReturns_QQQ', 'relReturns_SPY',
       'ticker', 'd_sector_Basic Materials', 'd_sector_Communication Services',
       'd_sector_Consumer Cyclical', 'd_sector_Consumer Defensive',
       'd_sector_Energy', 'd_sector_Financial Services', 'd_sector_Healthcare',
       'd_sector_Industrials', 'd_sector_Real Estate', 'd_sector_Technology',
       'd_sector_Utilities'],
      dtype='object')

In [10]:
features

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,...,d_sector_Communication Services,d_sector_Consumer Cyclical,d_sector_Consumer Defensive,d_sector_Energy,d_sector_Financial Services,d_sector_Healthcare,d_sector_Industrials,d_sector_Real Estate,d_sector_Technology,d_sector_Utilities
20200501,0,-3.010,0.000,-3.390,289.260,0.50,0.50,0.01,0.24,6.0,95.0,...,0,0,0,0,0,0,0,0,1,0
20200501,1,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,0,1,0
20200501,2,0.455,1.125,-0.300,288.580,0.07,0.49,0.01,0.30,1.0,50.0,...,0,0,0,0,0,0,0,0,1,0
20200501,3,-0.610,0.220,-0.815,289.095,0.49,0.49,0.01,0.17,1.0,25.0,...,0,0,0,0,0,0,0,0,1,0
20200501,4,-1.220,0.115,-1.380,290.320,0.16,0.33,0.01,0.10,13.0,71.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,-0.150,0.010,-0.190,124.100,0.02,0.07,0.01,0.04,1.0,11.0,...,0,0,1,0,0,0,0,0,0,0
20200529,386,0.090,0.090,-0.075,123.995,0.01,0.06,0.01,0.01,1.0,8.0,...,0,0,1,0,0,0,0,0,0,0
20200529,387,-0.340,0.020,-0.350,124.335,0.01,0.07,0.01,0.05,4.0,16.0,...,0,0,1,0,0,0,0,0,0,0
20200529,388,0.260,0.280,-0.015,124.075,0.05,0.12,0.01,0.01,3.0,6.0,...,0,0,1,0,0,0,0,0,0,0


In [13]:
features['close'].reset_index()

Unnamed: 0,level_0,level_1,close
0,20200501,0,289.260
1,20200501,1,289.020
2,20200501,2,288.580
3,20200501,3,289.095
4,20200501,4,290.320
...,...,...,...
428995,20200529,385,124.100
428996,20200529,386,123.995
428997,20200529,387,124.335
428998,20200529,388,124.075


In [16]:
features.shape,data.shape

((429000, 42), (429000, 18))

In [19]:
data.loc[:,'Ticker']=='AAPL'

20200501  0       True
          1       True
          2       True
          3       True
          4       True
                 ...  
20200529  385    False
          386    False
          387    False
          388    False
          389    False
Name: Ticker, Length: 429000, dtype: bool

In [18]:
features

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,...,d_sector_Communication Services,d_sector_Consumer Cyclical,d_sector_Consumer Defensive,d_sector_Energy,d_sector_Financial Services,d_sector_Healthcare,d_sector_Industrials,d_sector_Real Estate,d_sector_Technology,d_sector_Utilities
20200501,0,-3.010,0.000,-3.390,289.260,0.50,0.50,0.01,0.24,6.0,95.0,...,0,0,0,0,0,0,0,0,1,0
20200501,1,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,0,1,0
20200501,2,0.455,1.125,-0.300,288.580,0.07,0.49,0.01,0.30,1.0,50.0,...,0,0,0,0,0,0,0,0,1,0
20200501,3,-0.610,0.220,-0.815,289.095,0.49,0.49,0.01,0.17,1.0,25.0,...,0,0,0,0,0,0,0,0,1,0
20200501,4,-1.220,0.115,-1.380,290.320,0.16,0.33,0.01,0.10,13.0,71.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,-0.150,0.010,-0.190,124.100,0.02,0.07,0.01,0.04,1.0,11.0,...,0,0,1,0,0,0,0,0,0,0
20200529,386,0.090,0.090,-0.075,123.995,0.01,0.06,0.01,0.01,1.0,8.0,...,0,0,1,0,0,0,0,0,0,0
20200529,387,-0.340,0.020,-0.350,124.335,0.01,0.07,0.01,0.05,4.0,16.0,...,0,0,1,0,0,0,0,0,0,0
20200529,388,0.260,0.280,-0.015,124.075,0.05,0.12,0.01,0.01,3.0,6.0,...,0,0,1,0,0,0,0,0,0,0


In [20]:
features[['close','relReturns_IDU']][data.loc[:,'Ticker']=='AAPL']

  """Entry point for launching an IPython kernel.


ValueError: cannot handle a non-unique multi-index!

In [22]:
tempdata

Unnamed: 0,level_0,level_1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
0,20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
1,20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
2,20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
3,20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
4,20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428995,20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,Consumer Defensive
428996,20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,Consumer Defensive
428997,20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,Consumer Defensive
428998,20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,Consumer Defensive


In [23]:
tempfeatures

Unnamed: 0,level_0,level_1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,...,d_sector_Communication Services,d_sector_Consumer Cyclical,d_sector_Consumer Defensive,d_sector_Energy,d_sector_Financial Services,d_sector_Healthcare,d_sector_Industrials,d_sector_Real Estate,d_sector_Technology,d_sector_Utilities
0,20200501,0,-3.010,0.000,-3.390,289.260,0.50,0.50,0.01,0.24,...,0,0,0,0,0,0,0,0,1,0
1,20200501,1,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,...,0,0,0,0,0,0,0,0,1,0
2,20200501,2,0.455,1.125,-0.300,288.580,0.07,0.49,0.01,0.30,...,0,0,0,0,0,0,0,0,1,0
3,20200501,3,-0.610,0.220,-0.815,289.095,0.49,0.49,0.01,0.17,...,0,0,0,0,0,0,0,0,1,0
4,20200501,4,-1.220,0.115,-1.380,290.320,0.16,0.33,0.01,0.10,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428995,20200529,385,-0.150,0.010,-0.190,124.100,0.02,0.07,0.01,0.04,...,0,0,1,0,0,0,0,0,0,0
428996,20200529,386,0.090,0.090,-0.075,123.995,0.01,0.06,0.01,0.01,...,0,0,1,0,0,0,0,0,0,0
428997,20200529,387,-0.340,0.020,-0.350,124.335,0.01,0.07,0.01,0.05,...,0,0,1,0,0,0,0,0,0,0
428998,20200529,388,0.260,0.280,-0.015,124.075,0.05,0.12,0.01,0.01,...,0,0,1,0,0,0,0,0,0,0


In [28]:
sectorETFS

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,141.805,142.910,138.520,140.490,1.93,12.37,0.09,0.76,1.0,6.0,1.0,5.0,5.0,16.0,1.0,6.0,IDU,Utilities
20200501,1,140.490,140.600,140.280,140.335,0.76,0.81,0.56,0.59,5.0,6.0,1.0,5.0,5.0,7.0,1.0,6.0,IDU,Utilities
20200501,2,140.335,140.340,140.205,140.210,0.59,0.63,0.48,0.54,5.0,7.0,1.0,6.0,5.0,10.0,1.0,5.0,IDU,Utilities
20200501,3,140.210,140.275,139.440,139.465,0.54,0.57,0.26,0.43,6.0,7.0,1.0,6.0,6.0,10.0,1.0,6.0,IDU,Utilities
20200501,4,139.465,139.860,139.315,139.415,0.43,0.76,0.07,0.43,5.0,10.0,1.0,5.0,6.0,16.0,1.0,6.0,IDU,Utilities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,304.610,304.855,304.505,304.800,0.02,0.04,0.01,0.02,2.0,109.0,1.0,9.0,9.0,74.0,1.0,2.0,SPY,
20200529,386,304.800,304.850,304.690,304.725,0.02,0.06,0.01,0.01,9.0,104.0,1.0,1.0,3.0,114.0,1.0,5.0,SPY,
20200529,387,304.725,304.890,304.620,304.800,0.01,0.04,0.01,0.02,1.0,102.0,1.0,5.0,6.0,53.0,1.0,9.0,SPY,
20200529,388,304.800,304.910,304.715,304.745,0.02,0.14,0.01,0.01,6.0,66.0,1.0,9.0,9.0,138.0,1.0,29.0,SPY,


In [43]:
features.columns

Index(['open', 'high', 'low', 'close', 'spread_open', 'spread_high',
       'spread_low', 'spread_close', 'bidsize_open', 'bidsize_high',
       'bidsize_low', 'bidsize_close', 'ofrsize_open', 'ofrsize_high',
       'ofrsize_low', 'ofrsize_close', 'relReturns_DIA', 'relReturns_IDU',
       'relReturns_IYC', 'relReturns_IYE', 'relReturns_IYG', 'relReturns_IYH',
       'relReturns_IYJ', 'relReturns_IYK', 'relReturns_IYM', 'relReturns_IYR',
       'relReturns_IYW', 'relReturns_IYZ', 'relReturns_QQQ', 'relReturns_SPY',
       'ticker', 'd_sector_Basic Materials', 'd_sector_Communication Services',
       'd_sector_Consumer Cyclical', 'd_sector_Consumer Defensive',
       'd_sector_Energy', 'd_sector_Financial Services', 'd_sector_Healthcare',
       'd_sector_Industrials', 'd_sector_Real Estate', 'd_sector_Technology',
       'd_sector_Utilities'],
      dtype='object')

In [51]:
tempdata = data.reset_index().copy(deep=True)
tempIDU = sectorETFS[sectorETFS.Ticker == 'IDU'].close.reset_index().copy(deep=True)
tempfeatures = features.reset_index().copy(deep=True)
tempfeatures = tempfeatures[tempfeatures.ticker=='AAPL'][['level_0',
                                                  'level_1',
                                                  'close',
                                                  'relReturns_IDU']].reset_index(drop=True).copy(deep=True)

testdata = tempfeatures.copy(deep=True)#[['level_0','level_1','close','relReturns_IDU']][tempfeatures.ticker=='AAPL']
# # tempIDU = features[data.Ticker=='IDU'].close
testdata = testdata.merge(tempIDU,
                          on = ['level_0','level_1'],
                          how='left')
testdata.index = data[data.Ticker=='AAPL'].index
testdata = testdata.drop(['level_0','level_1'],axis=1)

In [53]:
testdata

Unnamed: 0,Unnamed: 1,close_x,relReturns_IDU,close_y
20200501,0,289.260,0.000000,140.490
20200501,1,289.020,0.000274,140.335
20200501,2,288.580,-0.000632,140.210
20200501,3,289.095,0.007098,139.465
20200501,4,290.320,0.004596,139.415
...,...,...,...,...
20200529,385,319.095,-0.001968,148.625
20200529,386,318.845,-0.001389,148.715
20200529,387,319.460,0.002837,148.580
20200529,388,318.675,-0.002255,148.550


In [56]:
(testdata.close_x.values[1:]/testdata.close_x.values[0:-1])-1

array([-0.0008297 , -0.00152239,  0.0017846 , ...,  0.00192884,
       -0.00245727, -0.00236918])

In [62]:
testdata.loc[:,'returns_x'] = np.concatenate([[np.nan],(testdata.close_x.values[1:]/testdata.close_x.values[0:-1])-1])
testdata.loc[:,'returns_y'] = np.concatenate([[np.nan],(testdata.close_y.values[1:]/testdata.close_y.values[0:-1])-1])

In [63]:
testdata

Unnamed: 0,Unnamed: 1,close_x,relReturns_IDU,close_y,returns_x,returns_y
20200501,0,289.260,0.000000,140.490,,
20200501,1,289.020,0.000274,140.335,-0.000830,-0.001103
20200501,2,288.580,-0.000632,140.210,-0.001522,-0.000891
20200501,3,289.095,0.007098,139.465,0.001785,-0.005313
20200501,4,290.320,0.004596,139.415,0.004237,-0.000359
...,...,...,...,...,...,...
20200529,385,319.095,-0.001968,148.625,-0.000486,0.001482
20200529,386,318.845,-0.001389,148.715,-0.000783,0.000606
20200529,387,319.460,0.002837,148.580,0.001929,-0.000908
20200529,388,318.675,-0.002255,148.550,-0.002457,-0.000202


In [61]:
np.where((np.sum(np.isnan(testdata.values), axis=1) == 0) == True)[0][0]

0

In [68]:
def extract_labels_multi_final(data = None,
                            classes = 5,
                            splits=None):

    # this version takes data in a direct returns for a specific ticker
    # per version 6 we no longer use group_style, as the "splits" fully describes splits for both equal and non-equal

    labels = pd.cut(data, bins=splits, labels=False, right=False, include_lowest=True)

    # we need right=False (open right-handside in split interval) to get median into the positive class
    # this makes the last point nan, we fix it here
    if sum(np.isnan(labels)) > 0:
        print(f'Number of NaNs in label: {sum(np.isnan(labels))}. 1 is expected')
        print(f'Returns that lead to NaNs in label: {data[np.where(np.isnan(labels))]}')
        assert sum(np.isnan(labels)) <= 1, "There should be max 1 NaN"

        if data[np.where(np.isnan(labels))] >= splits[-1]:
            labels[np.where(np.isnan(labels))] = classes - 1 # assign last label id
        else:
            print(data[np.where(np.isnan(labels))], splits[-1])
            raise ValueError('There is a label NaN where its underlying return is not max of dataset, which it should be')

    return labels

In [73]:
def align_features_and_labels_multi_temp(price_candles,
                                            all_features,
                                            prediction_horizon,
                                            n_feature_lags,
                                            n_classes,
                                            label_split = [],
                                            safe_burn_in = False,
                                            data_sample = 'full',
                                            splitType='global',
                                            noise = False,
                                            ticker_dummies = False):

    all_burned_in_features = pd.DataFrame()
    all_burned_in_indices = pd.DataFrame()
    all_labels = pd.DataFrame()

    dailyIndices = pd.DataFrame({'days':price_candles.index.get_level_values(0),
                                  'timestamps':price_candles.index.get_level_values(1),
                                  'ticker':price_candles.Ticker})

    if splitType.lower() == 'global':
        # Making the splits for the labels based on all tickers
        # returns = ((price_candles['close'].values[1:] / price_candles['close'].values[:-1]) -1) * 100
    #         returns = np.concatenate([((price_candles[price_candles.Ticker==ticker]['close'].values[1:]/\
    #                          price_candles[price_candles.Ticker==ticker]['close'].values[:-1])-1) for ticker\
    #                           in price_candles.Ticker.unique()])

        returns = []
        tickers = []

        for ticker in price_candles.Ticker.unique():

            ticker_returns = (price_candles[price_candles.Ticker==ticker]['close'].values[1:]/\
                                 price_candles[price_candles.Ticker==ticker]['close'].values[:-1]) - 1
            ticker_names = [ticker for i in range(len(ticker_returns))]

            returns.append(ticker_returns)
            tickers.append(ticker_names)

        # concatenate returns and add noise
        returns = np.concatenate(returns)
        if noise:
            returns[returns==0] = np.random.normal(0,1,sum(returns==0))/1000000

        tickers = np.concatenate(tickers)

        if label_split == []:
            # equal-sized bins according to n_classes
            _, splits = pd.qcut(returns, q=n_classes, labels=False, retbins=True)
        elif label_split != []:
            _, splits = pd.qcut(returns, q=label_split, labels=False, retbins=True)

        #print(splits)

        returns = pd.DataFrame({'returns': returns, 'Ticker': tickers})

    keepCheck = []

    for ticker_iter, ticker_name in enumerate(all_features.ticker.unique()):
        ticker_features = all_features[all_features.ticker==ticker_name].copy(deep=True)

        ticker_indices = dailyIndices[dailyIndices.ticker==ticker_name].copy(deep=True)
        # removing the "ticker" variable from ticker_features as np.isnan() does not like non-numericals
        #ticker_features = ticker_features.iloc[:, ticker_features.columns != 'ticker']
        ticker_features.drop('ticker', axis=1, inplace=True)
        # extract first 4 columns as the lag0 or raw OHLC prices (used for labelling)
        #ticker_prices = price_candles[price_candles.Ticker==ticker_name]['close'].values # candles.iloc[:, :4].values
        ticker_returns = returns[returns.Ticker==ticker_name]['returns'].values

        if not safe_burn_in:
            assert data_sample == 'full'
            # we assume data_sample is full and that we can continue features from yesterday's values.
            # that we have a single burn-in at the beginning and that's it

            # get first index that has no NaNs (the sum checks for True across columns, we look for sum == 0 and where that is first True)
            print(ticker_features.iloc[0,:],'\n')
            burned_in_idx = np.where((np.sum(np.isnan(ticker_features.values), axis=1) == 0) == True)[0][0]
            print(np.isnan(ticker_features.values),'\n')
            print(burned_in_idx,'\n')
            keepCheck.append(burned_in_idx)
            # calculate end-point cut-off to match with labels
            end_point_cut = max(prediction_horizon, n_feature_lags + 1)
            print(end_point_cut,'\n')
            # slice away the observations used for burn-in (taking off 1 at the end to match with labels [slice off "prediction_horizon"])
            burned_in_features = ticker_features.iloc[burned_in_idx : -end_point_cut, :] #.reset_index(drop=True) # features[burned_in_idx:] latter is sligthly faster but maybe not as precise
            burned_in_indices = ticker_indices.iloc[burned_in_idx : -end_point_cut, :]
            # slice away the burned-in indices from labels
            labels = extract_labels_multi_final(data = ticker_returns[(burned_in_idx+n_feature_lags):],
                                                classes = n_classes,
                                                splits = splits)
            # labels, returns, thresholds = extract_labels(data = candles[burned_in_idx + n_feature_lags : , :],
            #                                             classes = n_classes, group_style = 'equal')
            print(labels[0:10],labels[-10:],'\n')
            # check if there are remaining NaNs are burn-in (means error)
            remaining_nans = np.where(np.isnan(burned_in_features.values))[0].size
            if remaining_nans > 0:
                raise ValueError('Had NaN in burned_in_features after burn-in')

        # Adding the ticker
        burned_in_features.loc[:,'ticker'] = ticker_name

        # Adding the burned in data
        all_burned_in_features = pd.concat([all_burned_in_features, burned_in_features.reset_index(drop=True)])
        all_burned_in_indices = pd.concat([all_burned_in_indices, burned_in_indices.reset_index(drop=True)])
        all_labels = pd.concat([all_labels, pd.Series(labels)])
        print(ticker_name + " done")

    # Returning the ticker as dummies
    if ticker_dummies:

        tickers = all_burned_in_features.pop('ticker')
        all_burned_in_features = pd.concat([all_burned_in_features, pd.get_dummies(tickers, prefix='d_ticker', drop_first=False)], axis=1)
    #     print('Are all burned_in_idx the same?', all(keepCheck==keepCheck[0]))
    #     print(dailyIndicies.head(50))
    return all_burned_in_features.reset_index(drop=True),\
            all_labels.reset_index(drop=True),\
            all_burned_in_indices.reset_index(drop=True)

In [79]:
n_classes = 2
# extract first 4 columns as the lag0 or raw OHLC prices (used for labelling)
price_candles = data[['open','high','low','close','Ticker']]

########### Align Data ################

# from imported function (see testing_preprocessing_features_and_labels.ipynb for thorough experimenting with all the cut-offs):    
X, y,indices = align_features_and_labels_multi_temp(price_candles = price_candles[price_candles.Ticker=='AAPL'], 
                                                 all_features = features[features.ticker=='AAPL'],
                                                 prediction_horizon = 1, 
                                                 n_feature_lags = n_feature_lags, 
                                                 n_classes = n_classes, # 5,
                                                 safe_burn_in = False, 
                                                 data_sample = 'full',
                                                 splitType='global',
                                                 noise=False,
                                                    ticker_dummies=False)

open_lag0                 0.240
high_lag0                 0.330
low_lag0                 -0.655
close_lag0              289.020
spread_open_lag0          0.240
                         ...   
d_sector_Healthcare       0.000
d_sector_Industrials      0.000
d_sector_Real Estate      0.000
d_sector_Technology       1.000
d_sector_Utilities        0.000
Name: (20200501, 0), Length: 71, dtype: float64 

[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [ True  True  True ... False False False]] 

0 

2 

Number of NaNs in label: 1. 1 is expected
Returns that lead to NaNs in label: [0.01684919]
[0. 1. 1. 0. 1. 1. 1. 1. 1. 0.] [0. 1. 0. 0. 0. 0. 0. 1. 0. 0.] 

AAPL done


In [85]:
X.values

(7798, 72)

In [83]:
X[['close_lag0','relReturns_IDU_lag0','relReturns_IDU_lag1']]

Unnamed: 0,close_lag0,relReturns_IDU_lag0,relReturns_IDU_lag1
0,289.020,0.000274,0.000000
1,288.580,-0.000632,0.000274
2,289.095,0.007098,-0.000632
3,290.320,0.004596,0.007098
4,290.085,0.001235,0.004596
...,...,...,...
7793,319.250,-0.001607,-0.000808
7794,319.095,-0.001968,-0.001607
7795,318.845,-0.001389,-0.001968
7796,319.460,0.002837,-0.001389


In [77]:
y

Unnamed: 0,0
0,0.0
1,0.0
2,1.0
3,1.0
4,0.0
...,...
7794,0.0
7795,0.0
7796,1.0
7797,0.0


## Splitting the data

## Adding ticker dummies

In [6]:
## Adding ticker dummies
tickers = X.pop('ticker')
X = pd.concat([X, pd.get_dummies(tickers, prefix='ticker', drop_first=False)], axis=1)

In [13]:
X.columns

Index(['open_lag0', 'high_lag0', 'low_lag0', 'close_lag0', 'spread_open_lag0',
       'spread_high_lag0', 'spread_low_lag0', 'spread_close_lag0',
       'bidsize_open_lag0', 'bidsize_high_lag0', 'bidsize_low_lag0',
       'bidsize_close_lag0', 'ofrsize_open_lag0', 'ofrsize_high_lag0',
       'ofrsize_low_lag0', 'ofrsize_close_lag0', 'open_lag1', 'high_lag1',
       'low_lag1', 'close_lag1', 'spread_open_lag1', 'spread_high_lag1',
       'spread_low_lag1', 'spread_close_lag1', 'bidsize_open_lag1',
       'bidsize_high_lag1', 'bidsize_low_lag1', 'bidsize_close_lag1',
       'ofrsize_open_lag1', 'ofrsize_high_lag1', 'ofrsize_low_lag1',
       'ofrsize_close_lag1', 'sector_Basic Materials',
       'sector_Communication Services', 'sector_Consumer Cyclical',
       'sector_Consumer Defensive', 'sector_Energy',
       'sector_Financial Services', 'sector_Healthcare', 'sector_Industrials',
       'sector_Real Estate', 'sector_Technology', 'sector_Utilities',
       'ticker_AAPL', 'ticker_ABBV

## Constructing our final train/validation sets

In [39]:
train_ds = pd.concat([X.iloc[start:end, :] for (start, end) in train_ranges]).reset_index(drop=True)
train_y = pd.concat([y.iloc[start:end] for (start, end) in train_ranges]).reset_index(drop=True)

validate_ds = pd.concat([X.iloc[start:end, :] for (start, end) in val_ranges]).reset_index(drop=True)
val_y = pd.concat([y.iloc[start:end] for (start, end) in val_ranges]).reset_index(drop=True)

train_ds.shape, train_y.shape, validate_ds.shape, val_y.shape, train_y.shape[0] + val_y.shape[0]

((343090, 98), (343090, 1), (85800, 98), (85800, 1), 428890)

In [40]:
train_ranges[0]

[0, 6238]

In [41]:
train_ranges[0][1]+10

6248

## Pre-processing

In [44]:
{i:colname for i,colname in enumerate(train_ds.columns)}

{0: 'open_lag0',
 1: 'high_lag0',
 2: 'low_lag0',
 3: 'close_lag0',
 4: 'spread_open_lag0',
 5: 'spread_high_lag0',
 6: 'spread_low_lag0',
 7: 'spread_close_lag0',
 8: 'bidsize_open_lag0',
 9: 'bidsize_high_lag0',
 10: 'bidsize_low_lag0',
 11: 'bidsize_close_lag0',
 12: 'ofrsize_open_lag0',
 13: 'ofrsize_high_lag0',
 14: 'ofrsize_low_lag0',
 15: 'ofrsize_close_lag0',
 16: 'open_lag1',
 17: 'high_lag1',
 18: 'low_lag1',
 19: 'close_lag1',
 20: 'spread_open_lag1',
 21: 'spread_high_lag1',
 22: 'spread_low_lag1',
 23: 'spread_close_lag1',
 24: 'bidsize_open_lag1',
 25: 'bidsize_high_lag1',
 26: 'bidsize_low_lag1',
 27: 'bidsize_close_lag1',
 28: 'ofrsize_open_lag1',
 29: 'ofrsize_high_lag1',
 30: 'ofrsize_low_lag1',
 31: 'ofrsize_close_lag1',
 32: 'sector_Basic Materials',
 33: 'sector_Communication Services',
 34: 'sector_Consumer Cyclical',
 35: 'sector_Consumer Defensive',
 36: 'sector_Energy',
 37: 'sector_Financial Services',
 38: 'sector_Healthcare',
 39: 'sector_Industrials',
 40: 

In [47]:
# Creating one ppdict for individual preprocessings
# ppdict1 = {'open':'minmax',
#           'high':'log',
#           'low':'log',
#           'close':'std'}
splitpoint = 32

# Standardize some features
ppdict1 = {i:'std' for i in train_ds.columns[0:splitpoint]} 
# Keep some in actual levels (Dummies in this case).
ppdict2 = {i:'act' for i in train_ds.columns[splitpoint:]} 

# Merging the two
ppdict = {**ppdict1,**ppdict2}

In [54]:
train_ds,validate_ds = pre_processing(train_ds,
                                    validate_ds,
                                    ppdict,
                                    100,
                                    verbose =True)

Pre-Processing Procedure:  act
Columns Processed: ['sector_Basic Materials' 'sector_Communication Services'
 'sector_Consumer Cyclical' 'sector_Consumer Defensive' 'sector_Energy'
 'sector_Financial Services' 'sector_Healthcare' 'sector_Industrials'
 'sector_Real Estate' 'sector_Technology' 'sector_Utilities' 'ticker_AAPL'
 'ticker_ABBV' 'ticker_ABT' 'ticker_AEP' 'ticker_AMT' 'ticker_APD'
 'ticker_BA' 'ticker_BABA' 'ticker_BAC' 'ticker_BHP' 'ticker_BP'
 'ticker_CCI' 'ticker_CHL' 'ticker_COST' 'ticker_CSGP' 'ticker_D'
 'ticker_DIS' 'ticker_ECL' 'ticker_ENB' 'ticker_EXC' 'ticker_FB'
 'ticker_FMX' 'ticker_GOOG' 'ticker_INTC' 'ticker_JNJ' 'ticker_KO'
 'ticker_LFC' 'ticker_LIN' 'ticker_LMT' 'ticker_MA' 'ticker_MCD'
 'ticker_MSFT' 'ticker_NKE' 'ticker_NVDA' 'ticker_NVS' 'ticker_PBR'
 'ticker_PEP' 'ticker_PFE' 'ticker_PLD' 'ticker_PSA' 'ticker_PTR'
 'ticker_PYPL' 'ticker_RTX' 'ticker_SHW' 'ticker_SNP' 'ticker_SO'
 'ticker_SRE' 'ticker_T' 'ticker_TM' 'ticker_TSLA' 'ticker_TSM'
 'ticker_UNP' 't

In [53]:
ppX_train.iloc[:,0].mean(),ppX_train.iloc[:,0].std()

(-1.8927265537610815e-16, 1.000001457346533)

## Prepping for models

In [8]:
N_VALIDATION = val_y.shape[0] #int(1e3)
N_TRAIN = train_y.shape[0] #int(1e4)
# BUFFER_SIZE = int(1e4)
BATCH_SIZE = 256 #512 #32
MAX_EPOCHS = 500

STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

N_REPEAT = int(N_TRAIN / ((STEPS_PER_EPOCH * MAX_EPOCHS) / BATCH_SIZE))
FEATURES = X.shape[1]

N_TRAIN, N_VALIDATION, N_TRAIN + N_VALIDATION, STEPS_PER_EPOCH, N_REPEAT, STEPS_PER_EPOCH * MAX_EPOCHS

(343090, 85800, 428890, 1340, 131, 670000)

## A Logistic Regression model in TF/Keras

In [55]:
METRICS = [
      #keras.metrics.TruePositives(name='tp'),
      #keras.metrics.FalsePositives(name='fp'),
      #keras.metrics.TrueNegatives(name='tn'),
      #keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      #keras.metrics.Precision(name='precision'),
      #keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

# def make_model(metrics = METRICS, output_bias=None):
#   if output_bias is not None:
#     output_bias = tf.keras.initializers.Constant(output_bias)
#   model = keras.Sequential([
#       keras.layers.Dense(
#           16, activation='relu',
#           input_shape=(train_features.shape[-1],)),
#       keras.layers.Dropout(0.5),
#       keras.layers.Dense(1, activation='sigmoid',
#                          bias_initializer=output_bias),
#   ])

#   model.compile(
#       optimizer=keras.optimizers.Adam(lr=1e-3),
#       loss=keras.losses.BinaryCrossentropy(),
#       metrics=metrics)

#   return model

# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,))
# })

model = keras.Sequential([
#     keras.layers.Flatten(input_shape=(28, 28)),
#     keras.layers.Dense(128, activation='relu'),
#     keras.layers.Dense(10)
    keras.layers.Dense(1,
                       input_shape=(FEATURES,),
                       activation='sigmoid',
                       kernel_regularizer=regularizers.l2(1))
])

model.summary()

# with final activation (Keras/TF tutorial advises against this practice, but they also use it later in the tutorial)
# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,), activation='sigmoid')
# })

#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy', ])
model.compile(
              optimizer=keras.optimizers.Adam(), #lr=1e-3
              loss=keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS)

early_stopping = tf.keras.callbacks.EarlyStopping(
                                                monitor='val_auc', 
                                                verbose=1,
                                                patience=100,
                                                mode='max',
                                                restore_best_weights=True)

def get_callbacks(run_id):
      return [
             tfdocs.modeling.EpochDots(),
             early_stopping,
             tf.keras.callbacks.TensorBoard(logdir), #/run_id),
      ]

baseline_history = model.fit(
                            train_ds, #train_features,
                            train_y, #train_labels,
                            batch_size=512, #BATCH_SIZE,
                            epochs=1000, #EPOCHS,
                            callbacks = get_callbacks(run_id = 'first'), #[early_stopping],
                            validation_data=(validate_ds, val_y),
                            verbose=0) #(val_features, val_labels))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 99        
Total params: 99
Trainable params: 99
Non-trainable params: 0
_________________________________________________________________





Epoch: 0, accuracy:0.5352,  auc:0.5034,  loss:0.8996,  val_accuracy:0.5456,  val_auc:0.5453,  val_loss:0.6876,  
....................................................................................................
Epoch: 100, accuracy:0.5480,  auc:0.5440,  loss:0.6873,  val_accuracy:0.5454,  val_auc:0.5459,  val_loss:0.6879,  
..................Restoring model weights from the end of the best epoch.
Epoch 00118: early stopping


In [56]:
model.evaluate(validate_ds,  val_y, verbose=2)

2682/2682 - 6s - loss: 0.6879 - accuracy: 0.5457 - auc: 0.5513


[0.6878659725189209, 0.5456876754760742, 0.5513222217559814]

In [11]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [12]:
import datetime
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%tensorboard --logdir logs

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 9296.