# The aim of this notebook is to implement the work presented in the article [here](https://towardsdatascience.com/how-to-train-multiple-machine-learning-models-and-run-other-data-tasks-in-parallel-by-combining-2fa9670dd579)

## Reading in packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy
import datetime
import ta
import pathlib
import shutil
import tempfile
import vaex
from IPython import display
from IPython.display import clear_output
import pyodbc

# Tensorflow related
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import regularizers
import tensorflow.compat.v2.feature_column as fc

#!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

print(tf.__version__)
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
shutil.rmtree(logdir, ignore_errors=True)
print(logdir)

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, log_loss


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.exceptions import ConvergenceWarning 
from sklearn import ensemble
# ConvergenceWarning('ignore')
# Do you wanna see?
verbose = True

import sys
sys.path.append('../')
#sys.path.append('...../')

from utils.data_extraction import load_data_final,load_data_and_save
from utils.data_cleaning import HFDataCleaning
from utils.generate_features import candleCreateNP_vect_final,\
                                    generateFeatures_final,\
                                    generateFeatures_multi_v2

from utils.preprocessing_features_and_labels import extract_labels,\
                                                    align_features_and_labels,\
                                                    pre_processing_initial,\
                                                    pre_processing_extended,\
                                                    pre_processing,\
                                                    extract_labels_multi_final,\
                                                    align_features_and_labels_multi_final,\
                                                    align_features_and_labels_multi_v5

from utils.models import make_input_fn
from utils.models import performanceTesting,scoreFunction
from utils.plotting import plot_confusion_matrix

2.2.0
C:\Users\PC\AppData\Local\Temp\tmp5w_xaxtv\tensorboard_logs


In [79]:
def _concat_rows(parts):
    """Stack a list of frames row-wise with a fresh RangeIndex (empty list -> empty frame)."""
    if not parts:
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True)


def align_features_and_labels_multi_v7(price_candles,
                                        all_features,
                                        prediction_horizon,
                                        n_feature_lags,
                                        n_classes,
                                        safe_burn_in = False,
                                        data_sample = 'full',
                                        splitType='global',
                                        noise = False,
                                        ticker_dummies = False):
    """Align per-ticker features with return-based class labels.

    For every ticker in ``all_features`` the leading rows that still contain
    NaNs are dropped (burn-in), the tail is trimmed by
    ``max(prediction_horizon, n_feature_lags + 1)`` rows, and labels are
    produced by ``extract_labels_multi_final`` from the ticker's close-to-close
    returns using class boundaries computed over ALL tickers' returns.

    Parameters
    ----------
    price_candles : pd.DataFrame
        Must expose a two-level index (level 0 is stored as 'days', level 1 as
        'timestemps') plus the columns 'close' and 'Ticker'.
    all_features : pd.DataFrame
        Feature matrix with a 'ticker' column; all other columns must be
        numeric because they are passed to ``np.isnan``.
    prediction_horizon : int
        Only used to size the tail cut-off.
    n_feature_lags : int
        Used for the tail cut-off and as the offset into the returns when
        slicing the data handed to the label extractor.
    n_classes : int
        Number of quantile bins / label classes.
    safe_burn_in : bool, default False
        Only ``False`` is implemented.
    data_sample : str, default 'full'
        Must be ``'full'``.
    splitType : str, default 'global'
        Only ``'global'`` (case-insensitive) is implemented.
    noise : bool, default False
        If True, zero returns are replaced by N(0, 1) / 1e6 draws before the
        quantile boundaries are computed.
    ticker_dummies : bool, default False
        If True, the 'ticker' column is replaced by ``d_ticker_*`` dummies.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        Features, labels (single column ``0``) and the matching
        days/timestemps/ticker rows, each with a fresh ``RangeIndex``.

    Raises
    ------
    ValueError
        If ``splitType`` is not 'global', or NaNs remain after burn-in.
    NotImplementedError
        If ``safe_burn_in`` is True.
    IndexError
        If a ticker has no row that is free of NaNs.
    """
    # Fail fast on configurations that previously crashed later with an
    # UnboundLocalError (names were only bound on the implemented path).
    if splitType.lower() != 'global':
        raise ValueError("Unsupported splitType %r: only 'global' is implemented" % (splitType,))
    if safe_burn_in:
        raise NotImplementedError('safe_burn_in=True is not implemented in this version')
    assert data_sample == 'full', "data_sample must be 'full' when safe_burn_in is False"

    dailyIndices = pd.DataFrame({'days':price_candles.index.get_level_values(0),
                                 'timestemps':price_candles.index.get_level_values(1),
                                 'ticker':price_candles.Ticker})

    # ---- returns per ticker, pooled to derive the global class boundaries ----
    returns_parts = []
    ticker_parts = []
    for ticker in price_candles.Ticker.unique():
        close = price_candles.loc[price_candles.Ticker == ticker, 'close'].values
        ticker_returns = (close[1:] / close[:-1]) - 1
        returns_parts.append(ticker_returns)
        ticker_parts.append([ticker] * len(ticker_returns))

    pooled_returns = np.concatenate(returns_parts)
    if noise:
        # Break ties at exactly zero so the quantile edges are well defined.
        zero_mask = pooled_returns == 0
        pooled_returns[zero_mask] = np.random.normal(0, 1, np.count_nonzero(zero_mask))/1000000

    pooled_tickers = np.concatenate(ticker_parts)

    _, splits = pd.qcut(pooled_returns, q=n_classes, labels=False, retbins=True)

    returns = pd.DataFrame({'returns': pooled_returns, 'Ticker': pooled_tickers})

    # ---- per-ticker burn-in, trimming and labelling ----
    # The cut-off is loop invariant: trim enough rows at the end to match the labels.
    end_point_cut = max(prediction_horizon, n_feature_lags + 1)

    feature_parts = []
    index_parts = []
    label_parts = []

    for ticker_name in all_features.ticker.unique():
        # np.isnan() cannot handle the non-numeric 'ticker' column, so drop it here.
        ticker_features = all_features[all_features.ticker == ticker_name].drop('ticker', axis=1)
        ticker_indices = dailyIndices[dailyIndices.ticker == ticker_name]
        ticker_returns = returns.loc[returns.Ticker == ticker_name, 'returns'].values

        # First row without any NaN marks the end of the burn-in period.
        complete_rows = np.flatnonzero(np.sum(np.isnan(ticker_features.values), axis=1) == 0)
        if complete_rows.size == 0:
            raise IndexError('No NaN-free feature row found for ticker %s' % (ticker_name,))
        burned_in_idx = complete_rows[0]

        # Explicit copy: the frame is modified below and must not be a view of a slice.
        burned_in_features = ticker_features.iloc[burned_in_idx : -end_point_cut, :].copy()
        burned_in_indices = ticker_indices.iloc[burned_in_idx : -end_point_cut, :]

        labels = extract_labels_multi_final(data = ticker_returns[(burned_in_idx+n_feature_lags):],
                                            classes = n_classes,
                                            group_style = 'equal',
                                            splits = splits)

        # Any NaN left after the burn-in indicates a gap inside the sample.
        if np.isnan(burned_in_features.values).any():
            raise ValueError('Had NaN in burned_in_features after burn-in')

        burned_in_features['ticker'] = ticker_name

        feature_parts.append(burned_in_features)
        index_parts.append(burned_in_indices)
        # Row-wise stacking of unnamed label series yields a single column 0.
        label_parts.append(pd.Series(labels).to_frame(name=0))
        print(ticker_name + " done")

    # Concatenate once instead of growing the frames inside the loop.
    all_burned_in_features = _concat_rows(feature_parts)
    all_burned_in_indices = _concat_rows(index_parts)
    all_labels = _concat_rows(label_parts)

    # Returning the ticker as dummies
    if ticker_dummies:
        tickers = all_burned_in_features.pop('ticker')
        all_burned_in_features = pd.concat([all_burned_in_features,
                                            pd.get_dummies(tickers, prefix='d_ticker', drop_first=False)],
                                           axis=1)

    return all_burned_in_features.reset_index(drop=True),\
            all_labels.reset_index(drop=True),\
            all_burned_in_indices.reset_index(drop=True)

## Extracting data

In [3]:
# Load the candle data: either read a pre-aggregated CSV chosen interactively
# (readIn = True) or extract quotes/trades from the HDF5 store (readIn = False).
readIn = True

if readIn:

    # Listing the data files
    path = '../../../Google Drev/Thesis/Data/TAQ/AggregatedTAQ'
#     path = 'F:/AggregatedTAQ/round3'
    datafiles = os.listdir(path)
    # The number shown is the position in `datafiles`, so it can be used directly as an index.
    content = np.concatenate([['\n\n'],[str(j)+': '+i+'\n' for j,i in enumerate(datafiles) if 'csv' in i],['\n\n']])

    # Asking for user input
    file = input('Which one do you want to load? %s'%''.join(content))
    data = pd.read_csv(path + '/' + datafiles[int(file)],
                       header = None,
                       names=['open','high','low','close',
                              'spread_open','spread_high','spread_low','spread_close',
                              'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                              'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                              'Ticker'])
    # Lower casing all column names
#     data.columns = data.columns.str.lower()
else:

    # Locate the HDF5 store: try drive a: first and fall back to t:.
    # Only filesystem errors trigger the fallback, so a KeyboardInterrupt or a
    # programming error is no longer silently swallowed.
    try:
        path = 'a:/taqhdf5'
        os.listdir(path)
    except OSError:
        path = 't:/taqhdf5'
        os.listdir(path)

    # Sample type
    data_sample = 'full' # or 'stable'

    #dates = np.array(['2020040' + str(i) if i < 10 else '202004' + str(i) for i in np.arange(1,16)]).astype(int)
    dates = np.array(['20200501']).astype(int)#,'20200402','20200403','20200406','20200407'

    # Provide a list of tickers of interest
    tickers = sorted(['TSLA','FB'])#'MSFT'

    # Do we need data on trades, quotes or both?
    dataNeeded = 'quotes' # 'trades', 'quotes' or 'both'

    # NOTE(review): this branch binds tradeData / quoteData but never `data`,
    # which the stock-table code below reads - confirm how `data` should be
    # derived before running with readIn = False.
    if dataNeeded == 'trades':
        tradeData = load_data_final(dates, tickers, dataNeeded, path, verbose)
    elif dataNeeded == 'quotes':
        quoteData = load_data_final(dates,
                                    tickers,
                                    dataNeeded,
                                    path,
                                    verbose,
                                    extract_candles = False,
                                    aggHorizon = 1,
                                    extra_features_from_quotes = None,
                                    data_sample = data_sample)
    elif dataNeeded == 'both':
        tradeData, quoteData = load_data_final(dates, tickers, dataNeeded, path, verbose)

# Reading in sector information (the two header rows are replaced by flat column names)
stockInfo = pd.read_csv('../utils/stockInfo_v1.csv',header=[0,1])
stockInfo.columns = ['ticker','sector','exchange','marketCap']

# Creating a table with stock information based on the tickers available in the data.
uniqueTickers = data.Ticker.unique()
stockTable = stockInfo[stockInfo.ticker.isin(uniqueTickers)]
stockTable.head(10)

Which one do you want to load? 

0: aggregateTAQ_May2020_10sec (1).csv
1: aggregateTAQ_May2020_30sec (1).csv
2: aggregateTAQ_May2020_60sec.csv


2


Unnamed: 0,ticker,sector,exchange,marketCap
12,AAPL,Technology,NMS,1578173000000.0
20,ABBV,Healthcare,NYQ,174261200000.0
34,ABT,Healthcare,NYQ,163141000000.0
126,AEP,Utilities,NYQ,40895510000.0
379,AMT,Real Estate,NYQ,117125900000.0
428,APD,Basic Materials,NYQ,54643950000.0
697,BA,Industrials,NYQ,102035600000.0
699,BABA,Consumer Cyclical,NYQ,593653600000.0
700,BAC,Financial Services,NYQ,202055000000.0
870,BHP,Basic Materials,NYQ,125819400000.0


### Dropping ETFs

In [4]:
# Sector ETF tickers that must not be treated as individual stocks.
etfs = [
    'IYH', 'IYM', 'IYK', 'IYJ', 'IYG', 'IYW',
    'IYC', 'IYR', 'IDU', 'IYZ', 'IYE', 'IYF',
]

# Keep the ETF rows aside, then continue with the remaining tickers only.
# Note: `data` is overwritten, so re-running this cell leaves sectorETFS empty.
sectorETFS = data.loc[data.Ticker.isin(etfs)]
data = data.loc[~data.Ticker.isin(etfs)]

## Generating Features

In [5]:
########### Generate Features ################

n_feature_lags = 1

# Feature groups currently switched off (append to listOfFeatures to enable):
#   'stok', 'stod', 'sstod', 'wilr', 'roc', 'rsi', 'atr', 'cci', 'dpo',
#   'sma', 'ema', 'macd', 'macd_diff', 'macd_signal', 'dis5', 'dis10'
features = generateFeatures_multi_v2(
    data=data,
    listOfFeatures=['pastobs', 'spread', 'bidsize', 'ofrsize', 'sector'],
    feature_lags=n_feature_lags,
    stockTable=stockTable,
)

########### Generate Labels ################

n_classes = 2
# Raw OHLC prices plus the ticker (lag0 prices, used for labelling)
price_candles = data.loc[:, ['open', 'high', 'low', 'close', 'Ticker']]

########### Align Data ################

# Imported function (see testing_preprocessing_features_and_labels.ipynb for
# thorough experimenting with all the cut-offs)
X, y, indices = align_features_and_labels_multi_final(
    price_candles=price_candles,
    all_features=features,
    prediction_horizon=1,
    n_feature_lags=n_feature_lags,
    n_classes=n_classes,  # 5,
    safe_burn_in=False,
    data_sample='full',
    splitType='global',
    noise=False,
    ticker_dummies=False,
)

# Alternative (disabled): the notebook-local align_features_and_labels_multi_v7
# takes the same keyword arguments; its result was assigned to
# X_temp, y_temp, indices.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


AAPL done
ABBV done
ABT done
AEP done
AMT done
APD done
BA done
BABA done
BAC done
BHP done
BP done
CCI done
CHL done
COST done
CSGP done
D done
DIS done
ECL done
ENB done
EXC done
FB done
FMX done
GOOG done
INTC done
JNJ done
KO done
LFC done
LIN done
LMT done
MA done
MCD done
MSFT done
NKE done
NVDA done
NVS done
Number of NaNs in label: 1. 1 is expected
Returns that lead to NaNs in label: [0.0907158]
PBR done
PEP done
PFE done
PLD done
PSA done
PTR done
PYPL done
RTX done
SHW done
SNP done
SO done
SRE done
T done
TM done
TSLA done
TSM done
UNP done
UPS done
V done
WMT done


In [6]:
indices

Unnamed: 0,days,timestemps,ticker
0,20200501,0,AAPL
1,20200501,1,AAPL
2,20200501,2,AAPL
3,20200501,3,AAPL
4,20200501,4,AAPL
...,...,...,...
428885,20200529,383,WMT
428886,20200529,384,WMT
428887,20200529,385,WMT
428888,20200529,386,WMT


NameError: name 'X_temp' is not defined

In [None]:
X

## Can we replicate the below now?

In [None]:
data

In [8]:
data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1'])

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL
20200501,0,81.000,82.110,80.580,81.010,2.00,2.06,0.01,0.24,12.0,12.0,1.0,1.0,1.0,9.0,1.0,1.0,ABBV
20200501,0,91.025,91.435,90.490,91.220,1.89,1.89,0.03,0.24,1.0,5.0,1.0,1.0,1.0,5.0,1.0,1.0,ABT
20200501,0,83.455,83.980,82.250,82.470,3.09,4.14,0.10,0.14,1.0,7.0,1.0,1.0,7.0,8.0,1.0,1.0,AEP
20200501,0,234.350,234.575,232.215,232.950,3.30,3.87,0.27,2.90,1.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,389,50.455,50.495,50.325,50.325,0.01,0.05,0.01,0.05,5.0,38.0,1.0,2.0,7.0,24.0,1.0,9.0,TSM
20200529,389,169.960,170.030,169.610,169.675,0.04,0.36,0.01,0.23,10.0,11.0,1.0,1.0,1.0,10.0,1.0,2.0,UNP
20200529,389,100.095,100.110,99.670,99.670,0.05,0.13,0.01,0.12,1.0,11.0,1.0,7.0,1.0,11.0,1.0,1.0,UPS
20200529,389,195.855,195.920,194.195,194.195,0.03,2.39,0.01,2.39,2.0,7.0,1.0,1.0,1.0,9.0,1.0,2.0,V


In [9]:
# Let's see number 1
data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1'])[['close']][(data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1']).index.get_level_values(1)==1)&(data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1']).index.get_level_values(0)==20200501)]

Unnamed: 0_level_0,Unnamed: 1_level_0,close
level_0,level_1,Unnamed: 2_level_1
20200501,1,289.02
20200501,1,81.76
20200501,1,91.22
20200501,1,82.37
20200501,1,233.54
20200501,1,222.465
20200501,1,142.89
20200501,1,196.92
20200501,1,23.345
20200501,1,38.985


In [10]:
X.loc[indices.sort_values(['days','timestemps','ticker']).index,:].head(66)

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,sector_Consumer Cyclical,sector_Consumer Defensive,sector_Energy,sector_Financial Services,sector_Healthcare,sector_Industrials,sector_Real Estate,sector_Technology,sector_Utilities,ticker
0,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,1,0,AAPL
7798,-0.750,0.310,-0.750,81.760,0.24,1.06,0.07,0.80,14.0,20.0,...,0,0,0,0,1,0,0,0,0,ABBV
15596,-0.030,0.160,-0.320,91.220,0.18,0.58,0.05,0.12,1.0,3.0,...,0,0,0,0,1,0,0,0,0,ABT
23394,0.105,0.105,-0.100,82.370,0.13,0.54,0.10,0.26,1.0,2.0,...,0,0,0,0,0,0,0,0,1,AEP
31192,-0.335,0.685,-0.380,233.540,2.39,3.99,1.42,2.10,1.0,1.0,...,0,0,0,0,0,0,1,0,0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46789,1.130,1.135,-0.180,141.770,0.54,0.83,0.01,0.36,2.0,6.0,...,0,0,0,0,0,1,0,0,0,BA
54587,0.210,0.380,-0.185,196.710,0.14,0.35,0.01,0.04,1.0,10.0,...,1,0,0,0,0,0,0,0,0,BABA
62385,-0.050,0.030,-0.060,23.395,0.01,0.04,0.01,0.01,14.0,37.0,...,0,0,0,1,0,0,0,0,0,BAC
70183,0.000,0.050,-0.050,38.990,0.02,0.08,0.01,0.04,1.0,5.0,...,0,0,0,0,0,0,0,0,0,BHP


In [11]:
X.loc[indices.sort_values(['days','timestemps','ticker']).index,:].head(66)['close_lag0'].values

array([ 289.02 ,   81.76 ,   91.22 ,   82.37 ,  233.54 ,  222.465,
        142.89 ,  196.92 ,   23.345,   38.985,   23.145,  156.54 ,
         39.255,  301.76 ,  638.615,   76.18 ,  105.765,  189.35 ,
         30.085,   36.375,  202.68 ,   63.23 , 1332.65 ,   58.82 ,
        148.905,   45.47 ,   10.29 ,  183.25 ,  387.765,  269.125,
        184.39 ,  176.515,   85.27 ,  286.32 ,   84.21 ,    6.735,
        131.27 ,   37.785,   87.1  ,  179.82 ,   34.92 ,  120.815,
         63.29 ,  527.2  ,   48.475,   55.845,  121.405,   30.685,
        121.725,  757.875,   52.44 ,  156.095,   93.445,  175.26 ,
        121.08 ,  288.58 ,   81.285,   91.17 ,   82.245,  235.13 ,
        223.605,  141.77 ,  196.71 ,   23.395,   38.99 ,   23.135])

In [12]:
data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1'])[['close']][(data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1']).index.get_level_values(1)==1)&(data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1']).index.get_level_values(0)==20200501)].values.T

array([[ 289.02 ,   81.76 ,   91.22 ,   82.37 ,  233.54 ,  222.465,
         142.89 ,  196.92 ,   23.345,   38.985,   23.145,  156.54 ,
          39.255,  301.76 ,  638.615,   76.18 ,  105.765,  189.35 ,
          30.085,   36.375,  202.68 ,   63.23 , 1332.65 ,   58.82 ,
         148.905,   45.47 ,   10.29 ,  183.25 ,  387.765,  269.125,
         184.39 ,  176.515,   85.27 ,  286.32 ,   84.21 ,    6.735,
         131.27 ,   37.785,   87.1  ,  179.82 ,   34.92 ,  120.815,
          63.29 ,  527.2  ,   48.475,   55.845,  121.405,   30.685,
         121.725,  757.875,   52.44 ,  156.095,   93.445,  175.26 ,
         121.08 ]])

In [13]:
X.loc[indices.sort_values(['days','timestemps','ticker']).index,:].head(66)['close_lag0'].values==data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1'])[['close']][(data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1']).index.get_level_values(1)==1)&(data.reset_index().sort_values(['level_0','level_1','Ticker']).set_index(['level_0','level_1']).index.get_level_values(0)==20200501)].values.T

  """Entry point for launching an IPython kernel.


False

In [14]:
indices.sort_values(['days','timestemps','ticker']).index

Int64Index([     0,   7798,  15596,  23394,  31192,  38990,  46788,  54586,
             62384,  70182,
            ...
            358707, 366505, 374303, 382101, 389899, 397697, 405495, 413293,
            421091, 428889],
           dtype='int64', length=428890)

In [15]:
indices.loc[7798,:]

days          20200501
timestemps           0
ticker            ABBV
Name: 7798, dtype: object

In [16]:
X.loc[7798,:][['close_lag0']]

close_lag0    81.76
Name: 7798, dtype: object

In [18]:
X.loc[indices.sort_values(['days','timestemps','ticker']).index,:].head(66)

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,sector_Consumer Cyclical,sector_Consumer Defensive,sector_Energy,sector_Financial Services,sector_Healthcare,sector_Industrials,sector_Real Estate,sector_Technology,sector_Utilities,ticker
0,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,1,0,AAPL
7798,-0.750,0.310,-0.750,81.760,0.24,1.06,0.07,0.80,14.0,20.0,...,0,0,0,0,1,0,0,0,0,ABBV
15596,-0.030,0.160,-0.320,91.220,0.18,0.58,0.05,0.12,1.0,3.0,...,0,0,0,0,1,0,0,0,0,ABT
23394,0.105,0.105,-0.100,82.370,0.13,0.54,0.10,0.26,1.0,2.0,...,0,0,0,0,0,0,0,0,1,AEP
31192,-0.335,0.685,-0.380,233.540,2.39,3.99,1.42,2.10,1.0,1.0,...,0,0,0,0,0,0,1,0,0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46789,1.130,1.135,-0.180,141.770,0.54,0.83,0.01,0.36,2.0,6.0,...,0,0,0,0,0,1,0,0,0,BA
54587,0.210,0.380,-0.185,196.710,0.14,0.35,0.01,0.04,1.0,10.0,...,1,0,0,0,0,0,0,0,0,BABA
62385,-0.050,0.030,-0.060,23.395,0.01,0.04,0.01,0.01,14.0,37.0,...,0,0,0,1,0,0,0,0,0,BAC
70183,0.000,0.050,-0.050,38.990,0.02,0.08,0.01,0.04,1.0,5.0,...,0,0,0,0,0,0,0,0,0,BHP


In [19]:
y.loc[indices.sort_values(['days','timestemps','ticker']).index,:].head(66)

Unnamed: 0,0
0,0.0
7798,0.0
15596,0.0
23394,0.0
31192,1.0
...,...
46789,1.0
54587,0.0
62385,1.0
70183,1.0


In [None]:
y

# Splitting the data


In [22]:
# data = data.reset_index().sort_values(['level_0',
#                                 'level_1',
#                                 'Ticker']).set_index(['level_0',
#                                                       'level_1'])
# Reorder the feature rows chronologically (day, then timestamp, then ticker);
# the original row labels are kept.
X = X.loc[indices.sort_values(by=['days', 'timestemps', 'ticker']).index]

# Position (within the sorted unique days) of the first validation day: 80/20 split by day.
first_val_day = int(np.floor(len(indices.days.unique()) * 0.8))



In [25]:
tempIndices = indices.sort_values(['days','timestemps','ticker'])
tempIndices

Unnamed: 0,days,timestemps,ticker
0,20200501,0,AAPL
7798,20200501,0,ABBV
15596,20200501,0,ABT
23394,20200501,0,AEP
31192,20200501,0,AMT
...,...,...,...
397697,20200529,387,TSM
405495,20200529,387,UNP
413293,20200529,387,UPS
421091,20200529,387,V


In [27]:
tempIndices.days.unique()[first_val_day]

20200526

In [32]:
X[tempIndices.days<tempIndices.days.unique()[first_val_day]]

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,sector_Consumer Cyclical,sector_Consumer Defensive,sector_Energy,sector_Financial Services,sector_Healthcare,sector_Industrials,sector_Real Estate,sector_Technology,sector_Utilities,ticker
0,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,1,0,AAPL
7798,-0.750,0.310,-0.750,81.760,0.24,1.06,0.07,0.80,14.0,20.0,...,0,0,0,0,1,0,0,0,0,ABBV
15596,-0.030,0.160,-0.320,91.220,0.18,0.58,0.05,0.12,1.0,3.0,...,0,0,0,0,1,0,0,0,0,ABT
23394,0.105,0.105,-0.100,82.370,0.13,0.54,0.10,0.26,1.0,2.0,...,0,0,0,0,0,0,0,0,1,AEP
31192,-0.335,0.685,-0.380,233.540,2.39,3.99,1.42,2.10,1.0,1.0,...,0,0,0,0,0,0,1,0,0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396139,0.035,0.165,-0.040,51.300,0.17,0.26,0.01,0.02,5.0,23.0,...,0,0,0,0,0,0,0,1,0,TSM
403937,0.890,0.965,-0.290,169.435,0.35,1.41,0.01,0.93,1.0,10.0,...,0,0,0,0,0,1,0,0,0,UNP
411735,0.105,0.235,-0.465,97.955,0.86,1.32,0.06,0.15,1.0,6.0,...,0,0,0,0,0,1,0,0,0,UPS
419533,-0.080,0.540,-0.245,195.705,0.75,1.08,0.01,0.07,1.0,4.0,...,0,0,0,1,0,0,0,0,0,V


In [33]:
X[tempIndices.days>=tempIndices.days.unique()[first_val_day]]

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,sector_Consumer Cyclical,sector_Consumer Defensive,sector_Energy,sector_Financial Services,sector_Healthcare,sector_Industrials,sector_Real Estate,sector_Technology,sector_Utilities,ticker
6240,0.700,0.705,-0.120,323.180,0.12,0.22,0.01,0.04,1.0,14.0,...,0,0,0,0,0,0,0,1,0,AAPL
14038,0.240,0.240,0.000,93.050,0.04,0.13,0.01,0.06,1.0,4.0,...,0,0,0,0,1,0,0,0,0,ABBV
21836,0.165,0.175,-0.115,92.855,0.18,0.33,0.01,0.03,1.0,4.0,...,0,0,0,0,1,0,0,0,0,ABT
29634,0.025,0.080,-0.065,79.620,0.21,0.21,0.01,0.08,4.0,4.0,...,0,0,0,0,0,0,0,0,1,AEP
37432,0.135,0.590,-1.060,247.855,0.28,2.77,0.04,0.99,1.0,2.0,...,0,0,0,0,0,0,1,0,0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397697,0.090,0.095,0.000,50.455,0.01,0.03,0.01,0.01,11.0,25.0,...,0,0,0,0,0,0,0,1,0,TSM
405495,0.015,0.070,-0.070,169.965,0.06,0.12,0.01,0.03,2.0,10.0,...,0,0,0,0,0,1,0,0,0,UNP
413293,0.075,0.095,-0.030,100.075,0.04,0.09,0.01,0.01,1.0,10.0,...,0,0,0,0,0,1,0,0,0,UPS
421091,0.000,0.120,-0.055,195.850,0.08,0.12,0.01,0.04,1.0,16.0,...,0,0,0,1,0,0,0,0,0,V


In [34]:
tempIndices[tempIndices.days>=tempIndices.days.unique()[first_val_day]]

Unnamed: 0,days,timestemps,ticker
6240,20200526,0,AAPL
14038,20200526,0,ABBV
21836,20200526,0,ABT
29634,20200526,0,AEP
37432,20200526,0,AMT
...,...,...,...
397697,20200529,387,TSM
405495,20200529,387,UNP
413293,20200529,387,UPS
421091,20200529,387,V


In [None]:
# Day-based split of the (chronologically sorted) feature matrix: every
# observation on or after the first validation day is held out.
# Uses the same masks as the two inspection cells above; the previous
# `X[in]` was a syntax error, so this cell could not run.
X_train = X[tempIndices.days < tempIndices.days.unique()[first_val_day]]
X_test = X[tempIndices.days >= tempIndices.days.unique()[first_val_day]]

In [23]:
first_val_day

16

In [21]:
X

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,sector_Consumer Cyclical,sector_Consumer Defensive,sector_Energy,sector_Financial Services,sector_Healthcare,sector_Industrials,sector_Real Estate,sector_Technology,sector_Utilities,ticker
0,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,1,0,AAPL
7798,-0.750,0.310,-0.750,81.760,0.24,1.06,0.07,0.80,14.0,20.0,...,0,0,0,0,1,0,0,0,0,ABBV
15596,-0.030,0.160,-0.320,91.220,0.18,0.58,0.05,0.12,1.0,3.0,...,0,0,0,0,1,0,0,0,0,ABT
23394,0.105,0.105,-0.100,82.370,0.13,0.54,0.10,0.26,1.0,2.0,...,0,0,0,0,0,0,0,0,1,AEP
31192,-0.335,0.685,-0.380,233.540,2.39,3.99,1.42,2.10,1.0,1.0,...,0,0,0,0,0,0,1,0,0,AMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397697,0.090,0.095,0.000,50.455,0.01,0.03,0.01,0.01,11.0,25.0,...,0,0,0,0,0,0,0,1,0,TSM
405495,0.015,0.070,-0.070,169.965,0.06,0.12,0.01,0.03,2.0,10.0,...,0,0,0,0,0,1,0,0,0,UNP
413293,0.075,0.095,-0.030,100.075,0.04,0.09,0.01,0.01,1.0,10.0,...,0,0,0,0,0,1,0,0,0,UPS
421091,0.000,0.120,-0.055,195.850,0.08,0.12,0.01,0.04,1.0,16.0,...,0,0,0,1,0,0,0,0,0,V


In [None]:
# Deliberate halt for "Run All": everything below this cell is exploratory.
# An explicit exception replaces the bare name `stop`, which only worked by
# triggering a NameError.
raise RuntimeError('Stopping here on purpose - the cells below are exploratory.')

In [None]:
int(data.shape[0]*0.8)

In [None]:
data.reset_index().sort_values(['level_0',
                                'level_1',
                                'Ticker']).set_index(['level_0',
                                                      'level_1']).iloc[int(data.shape[0]*0.8):,:]

In [None]:
data.index.get_level_values(0).unique()

In [None]:
# Sort the raw candles by day, timestamp and ticker, keeping (day, timestamp) as index.
data = (
    data
    .reset_index()
    .sort_values(by=['level_0', 'level_1', 'Ticker'])
    .set_index(['level_0', 'level_1'])
)

# The first 80% of the trading days go to training, the remaining days to testing.
first_val_day = int(np.floor(len(data.index.get_level_values(0).unique()) * 0.8))

X_train = data.loc[(data.index.get_level_values(0).unique()[:first_val_day],)]
X_test = data.loc[(data.index.get_level_values(0).unique()[first_val_day:],)]

# first_val_day = int(np.floor(data.reset_index().loc[:,'level_0'].unique().shape[0]*0.8))

# data.reset_index().loc[:,'level_0'].unique()[0:first_val_day]
# # data.reset_index().loc[:,'level_0'].unique().shape

In [None]:
# Rows of X_train whose Ticker column equals 'AAPL'.
X_train[X_train.Ticker=='AAPL']

In [None]:
# Display X_test (the rows selected from position first_val_day onward in the split cell).
X_test

In [None]:
# Distinct level_0 values from position first_val_day onward.
data.reset_index().loc[:,'level_0'].unique()[first_val_day:]

In [None]:

# First-level index values of `data` after ordering by both index levels and ticker.
(
    data.reset_index()
        .sort_values(['level_0', 'level_1', 'Ticker'])
        .set_index(['level_0', 'level_1'])
        .index
        .get_level_values(0)
)

In [None]:
# Same distinct level_0 values from position first_val_day onward, as a plain Python list.
list(data.reset_index().loc[:,'level_0'].unique()[first_val_day:])

In [None]:
# Distinct level_0 values from position first_val_day onward.
data.reset_index().loc[:,'level_0'].unique()[first_val_day:]

In [None]:
# Scratch attempt at selecting the later level_0 values with plain `[]` indexing
# (compare the `.loc[(values,)]` form used to build X_test).
# NOTE(review): looks like a discarded experiment — confirm it can be removed.
data[(list(data.reset_index().loc[:,'level_0'].unique()[first_val_day:]),)]

In [None]:
# Scratch variant of the same selection, applied with `.loc` after reset_index().
# NOTE(review): looks like a discarded experiment — confirm it can be removed.
data.reset_index().loc[(data.reset_index().loc[:,'level_0'].unique()[first_val_day:],)]

In [None]:
# Report how many rows of X belong to each ticker.
# (The filter previously compared against the literal 'AAPL', so every line
# printed the AAPL row count regardless of the ticker being reported.)
for ticker_name in X.ticker.unique():
    print(ticker_name,' - shape: ',X[X.ticker==ticker_name].shape[0])

In [None]:
# Column labels of X.
X.columns

In [None]:
# Values of every column of X except the last one.
X.iloc[:,:-1].values

In [None]:
# Shape of the value array underlying X.
X.values.shape

In [None]:
# One row per observation: 'container' holds that row's feature vector (all columns
# of X but the last) as a single object, next to the row's ticker.
# A 2-D array cannot be passed directly as a dict column value (each column must be
# 1-dimensional), so the matrix is first split into a list of per-row arrays.
test = pd.DataFrame({'container':list(X.iloc[:,:-1].values),'ticker':X.ticker})

# Testing a method based on no-resetting

In [None]:
# X_temp with its index moved into an 'index' column, ordered by that column and then by ticker.
# NOTE(review): X_temp is not defined in the visible cells — confirm where it comes from.
X_temp.reset_index().sort_values(['index','ticker'])

In [None]:
# Distinct values of the former index of X_temp.
X_temp.reset_index().loc[:,'index'].unique()

# Attempt to do it by day

In [None]:
# Only the open/high/low/close and Ticker columns of `data`.
data[['open','high','low','close','Ticker']]

## Splitting the data

In [29]:
# Let's have a proper split (along tickers & dates):
# for every ticker the leading rows go to training and the trailing rows to validation.
# NOTE(review): the index labels of X are used further down as positional bounds
# for .iloc, which presumably relies on X having a default 0..n-1 index — confirm.
train_size = 0.8

# First and last index label of each ticker's rows.
data_splits = (
    X.index.to_series()
     .groupby(X['ticker'])
     .agg(['first', 'last'])
     .reset_index()
)

# Validation block: the last (1 - train_size) share of each ticker's index span.
data_splits['val_size'] = ((1-train_size) * (data_splits['last'] - data_splits['first'])).astype(int)
data_splits['val_start_idx'] = data_splits['last'] - data_splits['val_size']
data_splits['val_end_idx'] = data_splits['last'] + 1 # to get the last observation included

# Training block: everything from the ticker's first row up to (excluding) the validation start.
data_splits['train_start_idx'] =  data_splits['first']
data_splits['train_end_idx'] = data_splits['val_start_idx']

# Store ranges as [start, end) pairs, one per ticker.
train_ranges = [list(x) for x in zip(data_splits['train_start_idx'], data_splits['train_end_idx'])]
val_ranges = [list(x) for x in zip(data_splits['val_start_idx'], data_splits['val_end_idx'])]


if verbose:
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the table has to be displayed explicitly.
    from IPython.display import display as show_frame
    show_frame(data_splits)

INFO:MainThread:numexpr.utils:NumExpr defaulting to 4 threads.


## Adding ticker dummies

In [None]:
## Adding ticker dummies
# Replace the 'ticker' column by one indicator column per ticker.
# Guarded so that re-running the cell is harmless: once 'ticker' has been popped
# off X, a second X.pop('ticker') would raise a KeyError.
if 'ticker' in X.columns:
    tickers = X.pop('ticker')
    X = pd.concat([X, pd.get_dummies(tickers, prefix='ticker', drop_first=False)], axis=1)

In [None]:
# Column labels of X.
X.columns

In [None]:
# [start, end) bounds of the first training range.
train_ranges[0]

## Constructing our final train/validation sets

In [30]:
def _stack_ranges(frame, ranges):
    """Concatenate the positional row slices [start, end) of `frame` and renumber the index from 0."""
    pieces = [frame.iloc[lo:hi] for lo, hi in ranges]
    return pd.concat(pieces).reset_index(drop=True)


# Features and targets for the training ranges ...
train_ds = _stack_ranges(X, train_ranges)
train_y = _stack_ranges(y, train_ranges)

# ... and for the validation ranges.
validate_ds = _stack_ranges(X, val_ranges)
val_y = _stack_ranges(y, val_ranges)

# Shapes of the four pieces plus the combined number of target rows.
train_ds.shape, train_y.shape, validate_ds.shape, val_y.shape, train_y.shape[0] + val_y.shape[0]

((343090, 44), (343090, 1), (85800, 44), (85800, 1), 428890)

In [None]:
# [start, end) bounds of the first training range.
train_ranges[0]

In [None]:
# End bound of the first training range, shifted by 10.
train_ranges[0][1]+10

## Pre-processing

In [None]:
# Position -> column name lookup for train_ds.
dict(enumerate(train_ds.columns))

In [None]:
# Per-column preprocessing map. An individual mapping could look like:
# ppdict1 = {'open':'minmax',
#           'high':'log',
#           'low':'log',
#           'close':'std'}

# Column position where the treatment switches from 'std' to 'act'.
splitpoint = 32

# Columns before the split point are standardized.
ppdict1 = dict.fromkeys(train_ds.columns[:splitpoint], 'std')
# Columns from the split point on are kept in actual levels (dummies in this case).
ppdict2 = dict.fromkeys(train_ds.columns[splitpoint:], 'act')

# Combine both parts; entries of ppdict2 are appended after those of ppdict1.
ppdict = dict(ppdict1)
ppdict.update(ppdict2)

In [None]:
# Run the per-column preprocessing map over both splits; the results overwrite
# train_ds / validate_ds, so re-running this cell processes already-processed data.
# NOTE(review): the meaning of the positional argument 100 is not visible here —
# confirm against pre_processing's signature.
train_ds,validate_ds = pre_processing(train_ds,
                                    validate_ds,
                                    ppdict,
                                    100,
                                    verbose =True)

In [None]:
# Sanity check: mean and standard deviation of the first pre-processed training column.
# The pre-processing cell stores its result in train_ds; `ppX_train` is not assigned
# anywhere in the visible cells and would be undefined (or stale) on a fresh run.
train_ds.iloc[:,0].mean(),train_ds.iloc[:,0].std()

## Prepping for models

In [None]:
# Sizes of the two splits, taken from the target frames.
N_VALIDATION = val_y.shape[0] #int(1e3)
N_TRAIN = train_y.shape[0] #int(1e4)
# BUFFER_SIZE = int(1e4)
BATCH_SIZE = 256 #512 #32
MAX_EPOCHS = 500

# Number of full batches in one pass over the training rows.
STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

N_REPEAT = int(N_TRAIN / ((STEPS_PER_EPOCH * MAX_EPOCHS) / BATCH_SIZE))
# Width of the model input. Read from train_ds, the frame that is actually passed
# to model.fit, so it stays correct even if X is changed after the split was built.
FEATURES = train_ds.shape[1]

N_TRAIN, N_VALIDATION, N_TRAIN + N_VALIDATION, STEPS_PER_EPOCH, N_REPEAT, STEPS_PER_EPOCH * MAX_EPOCHS

## A Logistic Regression model in TF/Keras

In [None]:
# Metrics reported during training and evaluation. The AUC metric is named 'auc',
# which is what the EarlyStopping callback refers to as 'val_auc'.
METRICS = [
      #keras.metrics.TruePositives(name='tp'),
      #keras.metrics.FalsePositives(name='fp'),
      #keras.metrics.TrueNegatives(name='tn'),
      #keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      #keras.metrics.Precision(name='precision'),
      #keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

# def make_model(metrics = METRICS, output_bias=None):
#   if output_bias is not None:
#     output_bias = tf.keras.initializers.Constant(output_bias)
#   model = keras.Sequential([
#       keras.layers.Dense(
#           16, activation='relu',
#           input_shape=(train_features.shape[-1],)),
#       keras.layers.Dropout(0.5),
#       keras.layers.Dense(1, activation='sigmoid',
#                          bias_initializer=output_bias),
#   ])

#   model.compile(
#       optimizer=keras.optimizers.Adam(lr=1e-3),
#       loss=keras.losses.BinaryCrossentropy(),
#       metrics=metrics)

#   return model

# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,))
# })

# Logistic regression written as a network: a single dense unit with a sigmoid
# activation over FEATURES inputs, with an L2 penalty (factor 1) on the kernel weights.
model = keras.Sequential([
#     keras.layers.Flatten(input_shape=(28, 28)),
#     keras.layers.Dense(128, activation='relu'),
#     keras.layers.Dense(10)
    keras.layers.Dense(1,
                       input_shape=(FEATURES,),
                       activation='sigmoid',
                       kernel_regularizer=regularizers.l2(1))
])

model.summary()

# with final activation (Keras/TF tutorial advises against this practice, but they also use it later in the tutorial)
# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,), activation='sigmoid')
# })

#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy', ])
# from_logits=False because the model's output layer already applies a sigmoid,
# i.e. the loss receives probabilities rather than raw logits.
model.compile(
              optimizer=keras.optimizers.Adam(), #lr=1e-3
              loss=keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS)

# Stop training once 'val_auc' has not increased (mode='max') for 100 epochs,
# then restore the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
                                                monitor='val_auc', 
                                                verbose=1,
                                                patience=100,
                                                mode='max',
                                                restore_best_weights=True)

def get_callbacks(run_id):
    """Build the list of Keras callbacks for one training run.

    Args:
        run_id: Name of the run. It is used as the TensorBoard sub-directory
            below `logdir`, so separate runs no longer write into the same folder.

    Returns:
        A list holding the tfdocs EpochDots callback, the shared
        `early_stopping` callback and a TensorBoard callback.
    """
    # str() + os.path.join accepts `logdir` as either a pathlib.Path or a plain string.
    run_logdir = os.path.join(str(logdir), run_id)
    return [
        tfdocs.modeling.EpochDots(),
        early_stopping,
        tf.keras.callbacks.TensorBoard(run_logdir),
    ]

# Fit the model on the training split and validate on the validation split after each epoch.
# NOTE(review): batch_size=512 and epochs=1000 are hard-coded here and differ from
# BATCH_SIZE (256) and MAX_EPOCHS (500) defined in the prep cell — confirm which is intended.
baseline_history = model.fit(
                            train_ds, #train_features,
                            train_y, #train_labels,
                            batch_size=512, #BATCH_SIZE,
                            epochs=1000, #EPOCHS,
                            callbacks = get_callbacks(run_id = 'first'), #[early_stopping],
                            validation_data=(validate_ds, val_y),
                            verbose=0) #(val_features, val_labels))

In [None]:
# Evaluate the fitted model on the validation split.
model.evaluate(validate_ds,  val_y, verbose=2)

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
import datetime
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%tensorboard --logdir logs