## Reading in packages

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy
import datetime
import ta
import pathlib
import shutil
import tempfile
import vaex as vx
from IPython import display
from IPython.display import clear_output
import pyodbc

# Tensorflow related
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import regularizers
import tensorflow.compat.v2.feature_column as fc

#!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

print(tf.__version__)
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
shutil.rmtree(logdir, ignore_errors=True)
print(logdir)

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, log_loss


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.exceptions import ConvergenceWarning 
from sklearn import ensemble
# ConvergenceWarning('ignore')
# Do you wanna see?
verbose = True

import sys
sys.path.append('../')
#sys.path.append('...../')

from utils.data_extraction import load_data_final,load_data_and_save
from utils.data_cleaning import HFDataCleaning
from utils.generate_features import candleCreateNP_vect_final,\
                                    generateFeatures_final,\
                                    generateFeatures_multi_final

from utils.preprocessing_features_and_labels import extract_labels,\
                                                    align_features_and_labels,\
                                                    pre_processing_initial,\
                                                    pre_processing_extended,\
                                                    pre_processing_final,\
                                                    extract_labels_multi_final,\
                                                    align_features_and_labels_multi_final,\
                                                    align_features_and_labels_multi_v5

from utils.models import make_input_fn
from utils.models import performanceTesting,scoreFunction
from utils.plotting import plot_confusion_matrix
from scipy.stats import kurtosis,skew,spearmanr,kendalltau

2.2.0
C:\Users\PC\AppData\Local\Temp\tmp7zbu0t88\tensorboard_logs


In [2]:
os.listdir('../../../Desktop')

['Atom.lnk',
 'AU.txt',
 'CryptoExtraction',
 'desktop.ini',
 'exchange-codes.txt',
 'Git-2.27.0-64-bit.exe',
 'Google Drev.lnk',
 'Kristian',
 'Kristian.zip',
 'Mødrup.txt',
 'Pensionsinfo - vejledning.pdf',
 'SissePensionsInfo.pdf',
 'SisseSkat2019.pdf',
 'SupportAssistLauncher.exe',
 'taqquote_20200501.h5',
 'Vejledning til SKAT.pdf']

In [3]:
import seaborn as sns
# sns.set_style(style="darkgrid")

## Extracting data

In [4]:
# Do we extract new data or read in?
readIn = True
# run load_data()
if readIn:
    
    # Listing the data files 
    path = '../../../Google Drev/Thesis/Data/TAQ/AggregatedTAQ'
#     path = 'F:/AggregatedTAQ/round3'
    datafiles = os.listdir(path)
    content = np.concatenate([['\n\n'],[str(j)+': '+i+'\n' for j,i in enumerate(datafiles) if 'csv' in i],['\n\n']])
    
    # Asking for user input
    file = input('Which one do you want to load? %s'%''.join(content))
    if int(file) <= 2:
        data = pd.read_csv(path + '/' + datafiles[int(file)],
                           header = None,
                           names=['open','high','low','close',
                                  'spread_open','spread_high','spread_low','spread_close',
                                  'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                                  'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                                  'Ticker'])
        # Using the choice of the user to determine the correct market file
        key = re.split('[_.]',datafiles[int(file)])[-2]
        marketDataFile = [file for file in os.listdir(path+'/round5_market_tickers') if key in file]

        # Reading in the market data
        tempData = pd.read_csv(path+'/round5_market_tickers/'+marketDataFile[0]
                               ,header = None
                               ,names=['open','high','low','close',
                                      'spread_open','spread_high','spread_low','spread_close',
                                      'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                                      'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                                      'Ticker'])
        # Adding the market data to the ticker data
        data = pd.concat([data,tempData],axis=0)
    else:
        data = pd.read_csv(path + '/' + datafiles[int(file)],
                           header = 0,
                           index_col=[0,1]
#                            names=['open','high','low','close',
#                                   'spread_open','spread_high','spread_low','spread_close',
#                                   'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
#                                   'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
#                                   'Ticker']
                          )
    
    # Lower casing all column names
#     data.columns = data.columns.str.lower()
else:
    
    # print(os.listdir())
    try:
        path = 'a:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
    except:
        path = 't:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
        
    # Sample type
    data_sample = 'full' # or 'stable'
    # allFiles = os.listdir(path)
    # print(len(allFiles), allFiles[:5], allFiles[-5:])
    # print(allFiles[-10:])

    #dates = np.array(['2020040' + str(i) if i < 10 else '202004' + str(i) for i in np.arange(1,16)]).astype(int)
    dates = np.array(['20200501']).astype(int)#,'20200402','20200403','20200406','20200407'

    # Provide a list of tickers of interest
    
    tickers = sorted(['TSLA','FB'])#'MSFT'
    
    # Do we need data on trades, quotes or both?
    dataNeeded = 'quotes' # 'trades', 'quotes' or 'both'
    
    if dataNeeded == 'trades':
        tradeData = load_data_final(dates, tickers, dataNeeded, path, verbose)
    elif dataNeeded == 'quotes':
        quoteData = load_data_final(dates,
                                    tickers,
                                    dataNeeded,
                                    path,
                                    verbose,
                                    extract_candles = False,
                                    aggHorizon = 1,
                                    extra_features_from_quotes = None,
                                    data_sample = data_sample)
    elif dataNeeded == 'both':
        tradeData, quoteData = load_data_final(dates, tickers, dataNeeded, path, verbose)

# Reading in sector information
stockInfo = pd.read_csv('../utils/stockInfo_v1.csv',header=[0,1])
stockInfo.columns = ['ticker','sector','exchange','marketCap']

# Creating a table with stock information based on the tickers available in the data.
uniqueTickers = data.Ticker.unique()
stockTable = stockInfo[stockInfo.ticker.isin(uniqueTickers)]
stockTable.head(10)

Which one do you want to load? 

0: aggregateTAQ_May2020_10sec.csv
1: aggregateTAQ_May2020_30sec.csv
2: aggregateTAQ_May2020_60sec.csv
8: trueAggregateTAQ_60sec.csv


8


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ticker,sector,exchange,marketCap
12,AAPL,Technology,NMS,1578173000000.0
20,ABBV,Healthcare,NYQ,174261200000.0
34,ABT,Healthcare,NYQ,163141000000.0
126,AEP,Utilities,NYQ,40895510000.0
379,AMT,Real Estate,NYQ,117125900000.0
428,APD,Basic Materials,NYQ,54643950000.0
697,BA,Industrials,NYQ,102035600000.0
699,BABA,Consumer Cyclical,NYQ,593653600000.0
700,BAC,Financial Services,NYQ,202055000000.0
870,BHP,Basic Materials,NYQ,125819400000.0


# reading in the market data (done automatically atm)

### Dropping ETFS and market indices

In [11]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'INTC', 'JNJ', 'KO', 'LFC',
       'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA', 'NVS', 'PBR',
       'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX', 'SHW', 'SNP',
       'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS', 'V', 'WMT'],
      dtype=object)

In [12]:
# Removing the XNTK ticker
data = data[~data.Ticker.isin(['XNTK'])]

In [13]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'INTC', 'JNJ', 'KO', 'LFC',
       'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA', 'NVS', 'PBR',
       'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX', 'SHW', 'SNP',
       'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS', 'V', 'WMT'],
      dtype=object)

In [14]:
# Removing the XNTK ticker
data = data[~data.Ticker.isin(['XNTK'])]

etfs = ['IYH','IYM','IYK','IYJ','IYG','IYW','IYC','IYR','IDU','IYZ','IYE','IYF','SPY','DIA','QQQ']

# Extracting the sector ETFs to a separate variable
sectorETFS = data[data.Ticker.isin(etfs)]

# Removing the ETFs
data = data[~data.Ticker.isin(etfs)]

In [15]:
data.columns

Index(['open', 'high', 'low', 'close', 'spread_open', 'spread_high',
       'spread_low', 'spread_close', 'bidsize_open', 'bidsize_high',
       'bidsize_low', 'bidsize_close', 'ofrsize_open', 'ofrsize_high',
       'ofrsize_low', 'ofrsize_close', 'Ticker', 'sector'],
      dtype='object')

In [16]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,Consumer Defensive
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,Consumer Defensive
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,Consumer Defensive
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,Consumer Defensive


# Calculating returns

In [19]:
for i,ticker in enumerate(data.Ticker.unique()):
    
    ## Price series
    temp_price = data[data.Ticker==ticker].close
    
    ## Returns
    simple_ticker_returns = ((temp_price.values[1:]/temp_price.values[:-1]) - 1)
    log_ticker_returns = (np.log(temp_price.values[1:]) - np.log(temp_price.values[:-1]))
    log_abs_ticker_returns = np.abs(log_ticker_returns)
    
    ## Returns over x period 
    simple_ticker_returns_x = [((temp_price.values[i:]/temp_price.values[:-i]) - 1)*100 for i in [3,5,10]]
    log_ticker_returns_x = [(np.log(temp_price.values[i:]) - np.log(temp_price.values[:-i]))*100 for i in [3,5,10]]
    log_abs_ticker_returns_x = np.abs(log_ticker_returns_x)
    
    ##### Autocorrelations
    ## Pearsons
    autocorr_log = np.correlate(log_ticker_returns,
                                log_ticker_returns,
                                mode='full')
    autocorr_log = autocorr_log[autocorr_log.size//2:]
#     if i == 0:
#         print(log_ticker_returns,'\n\n',np.abs(log_ticker_returns))
    autocorr_log_abs = np.correlate(np.abs(log_ticker_returns),
                                np.abs(log_ticker_returns),
                                mode='full')
    autocorr_log_abs = autocorr_log_abs[autocorr_log_abs.size//2:]
    
    autocorr_simple = np.correlate(simple_ticker_returns,
                                   simple_ticker_returns,
                                   mode='full')
    autocorr_simple = autocorr_simple[autocorr_simple.size//2:]
    
    ## Spearmans
    s_auto_log,p_val_l = spearmanr(log_ticker_returns[0:-1],log_ticker_returns[1:])
    s_auto_log_abs,p_val_l_abs = spearmanr(np.abs(log_ticker_returns[0:-1]),np.abs(log_ticker_returns[1:]))
    s_auto_simple,p_val_s = spearmanr(simple_ticker_returns[0:-1],simple_ticker_returns[1:])
    
    ## Kendalls
    k_auto_log,pval_l = kendalltau(log_ticker_returns[0:-1],log_ticker_returns[1:])
    k_auto_log_abs,pval_l_abs = kendalltau(np.abs(log_ticker_returns[0:-1]),np.abs(log_ticker_returns[1:]))
    k_auto_simple,pval_s = kendalltau(simple_ticker_returns[0:-1],simple_ticker_returns[1:])
    
    if i == 0:
        simple_returns = pd.DataFrame()
        log_returns = pd.DataFrame()
        log_abs_returns = pd.DataFrame()
        
        additional_simple = pd.DataFrame(columns=['Kurt','Skew','Corr_P','Corr_S',
                                                  'P_val_S','Corr_K','P_val_K',
                                                  'Avg spread','Med spread','Med Return',
                                                  'Med Return 3','Med Return 5','Med Return 10'])
        
        additional_log = pd.DataFrame(columns=['Kurt','Skew','Corr_P','Corr_S',
                                               'P_val_S','Corr_K','P_val_K',
                                               'Avg spread','Med spread','Med Return',
                                               'Med Return 3','Med Return 5','Med Return 10'])
        
        additional_log_abs = pd.DataFrame(columns=['Kurt','Skew','Corr_P','Corr_S',
                                               'P_val_S','Corr_K','P_val_K',
                                               'Avg spread','Med spread','Med Return'
                                                   ,'Med Return 3','Med Return 5','Med Return 10'])
        
        returns = pd.DataFrame({'simple_returns':simple_ticker_returns,
                                'log_returns':log_ticker_returns,
                                'log_abs_returns':log_abs_ticker_returns})
        returns['ticker'] = ticker
    else:
        temp = pd.DataFrame({'simple_returns':simple_ticker_returns,
                             'log_returns':log_ticker_returns,
                             'log_abs_returns':log_abs_ticker_returns})
        temp['ticker'] = ticker
        returns = pd.concat([returns,temp])
    
    simple_returns.loc[:,ticker] = simple_ticker_returns*100
    log_returns.loc[:,ticker] = log_ticker_returns*100
    log_abs_returns.loc[:,ticker] = log_abs_ticker_returns*100
    
    ## Spread as a percentage of the price
    spread_per_price = (data[data.Ticker==ticker].spread_close/temp_price)*100
    
    additional_simple.loc[ticker,['Kurt','Skew','Corr_P','Corr_S',
                                  'P_val_S','Corr_K','P_val_K',
                                  'Avg spread','Med spread','Med Return',
                                  'Med Return 3','Med Return 5',
                                  'Med Return 10']] = [kurtosis(simple_ticker_returns),
                                                         skew(simple_ticker_returns),
                                                         autocorr_simple[0],
                                                         s_auto_simple,
                                                         p_val_s,
                                                         k_auto_simple,
                                                         pval_s,
                                                         np.mean(spread_per_price),
                                                        np.median(spread_per_price),
                                                       np.median(simple_ticker_returns),
                                                       np.median(simple_ticker_returns_x[0]),
                                                       np.median(simple_ticker_returns_x[1]),
                                                       np.median(simple_ticker_returns_x[2])]
    additional_log.loc[ticker,['Kurt','Skew','Corr_P','Corr_S',
                               'P_val_S','Corr_K','P_val_K',
                               'Avg spread','Med spread','Med Return',
                               'Med Return 3','Med Return 5',
                               'Med Return 10']] = [kurtosis(log_ticker_returns),
                                                     skew(log_ticker_returns),
                                                     autocorr_log[0],
                                                      s_auto_log,
                                                      p_val_l,
                                                      k_auto_log,
                                                      pval_l,
                                                      np.mean(spread_per_price),
                                                     np.median(spread_per_price),
                                                    np.median(log_ticker_returns),
                                                    np.median(log_ticker_returns_x[0]),
                                                    np.median(log_ticker_returns_x[1]),
                                                    np.median(log_ticker_returns_x[2])]
    additional_log_abs.loc[ticker,['Kurt','Skew','Corr_P','Corr_S',
                                   'P_val_S','Corr_K','P_val_K',
                                   'Avg spread','Med spread','Med Return',
                                   'Med Return 3','Med Return 5',
                                   'Med Return 10']] = [kurtosis(log_abs_ticker_returns),
                                                         skew(log_abs_ticker_returns),
                                                         autocorr_log_abs[0],
                                                          s_auto_log_abs,
                                                          p_val_l_abs,
                                                          k_auto_log_abs,
                                                          pval_l_abs,
                                                          np.mean(spread_per_price),
                                                         np.median(spread_per_price),
                                                        np.median(log_abs_ticker_returns),
                                                        np.median(log_abs_ticker_returns_x[0]),
                                                        np.median(log_abs_ticker_returns_x[1]),
                                                        np.median(log_abs_ticker_returns_x[2])]
#         simple_returns = (data.close.values[1:]/data.close.values[0:-1])-1
# log_returns = np.log(data.close.values[1:]) - np.log(data.close.values[0:-1])

[-0.00083005 -0.00152355  0.00178301 ...  0.00192698 -0.0024603
 -0.002372  ] 

 [0.00083005 0.00152355 0.00178301 ... 0.00192698 0.0024603  0.002372  ]


INFO:MainThread:numexpr.utils:NumExpr defaulting to 4 threads.


# Adding time

In [20]:
simple_returns.loc[:,'Day'] = data[data.Ticker == 'AAPL'].index[1:].get_level_values(0)

for i,d in enumerate(simple_returns.Day.unique().astype(str)):

    if i == 0:
        number_of_obs = simple_returns[simple_returns.Day == int(d)].shape[0]
        
        container = pd.DataFrame()
        
        container.loc[:,'timestamp'] = pd.date_range(d+' 09:31',periods = number_of_obs,freq='T')
        
    else:
        number_of_obs = simple_returns[simple_returns.Day == int(d)].shape[0]
        
        temp = pd.DataFrame({'timestamp':pd.date_range(d+' 09:30',periods = number_of_obs,freq='T')})
        
        container = pd.concat([container,temp],axis=0)
        
container = container.reset_index(drop=True)

total_obs = simple_returns[simple_returns.Day==20200504].shape[0]
perhour = simple_returns[simple_returns.Day==20200504].shape[0]/6.5
bins_hc = np.linspace(0,perhour*7,8)

# Weekdays
weekdays = {0: 'Monday'
             ,1:'Tuesday'
             ,2:'Wednesday'
             ,3:'Thursday'
             ,4:'Friday'}

# temp_return = log_returns.copy(deep=True)
log_returns.loc[:,'Day'] = data[data.Ticker == 'AAPL'].index[1:].get_level_values(0)
log_returns.loc[:,'Dt'] = pd.to_datetime(log_returns.Day,format='%Y%m%d')
log_returns.loc[:,'Weekday'] = log_returns.Dt.dt.dayofweek.apply(lambda x: weekdays[x])
log_returns.loc[:,'Hour'] = data[data.Ticker == 'AAPL'].index[1:].get_level_values(1)
log_returns.loc[:,'Hour category'] = pd.cut(log_returns.Hour,
                                           bins=bins_hc,#[0,60,120,180,240,300,360,420],
                                           right=True,
                                           include_lowest=True,labels=['09.30-10.30',
                                                                       '10.30-11.30',
                                                                       '11.30-12.30',
                                                                       '12.30-13.30',
                                                                       '13.30-14.30',
                                                                       '14.30-15.30',
                                                                       '15.30-16.00'])
log_returns.loc[:,'H:M'] = container.timestamp.dt.strftime('%H:%M')


simple_returns.loc[:,'Day'] = data[data.Ticker == 'AAPL'].index[1:].get_level_values(0)
simple_returns.loc[:,'Dt'] = pd.to_datetime(log_returns.Day,format='%Y%m%d')
simple_returns.loc[:,'Weekday'] = log_returns.Dt.dt.dayofweek.apply(lambda x: weekdays[x])
simple_returns.loc[:,'Hour'] = data[data.Ticker == 'AAPL'].index[1:].get_level_values(1)
simple_returns.loc[:,'Hour category'] = pd.cut(log_returns.Hour,
                                           bins=bins_hc,#[0,60,120,180,240,300,360,420],
                                           right=True,
                                           include_lowest=True,labels=['09.30-10.30',
                                                                       '10.30-11.30',
                                                                       '11.30-12.30',
                                                                       '12.30-13.30',
                                                                       '13.30-14.30',
                                                                       '14.30-15.30',
                                                                       '15.30-16.00'])
simple_returns.loc[:,'H:M'] = container.timestamp.dt.strftime('%H:%M')

### Prepping to look at other metrics as well
data.loc[:,'Day'] = data.index.get_level_values(0)
data.loc[:,'Dt'] = pd.to_datetime(data.Day,format='%Y%m%d')
data.loc[:,'Weekday'] = data.Dt.dt.dayofweek.apply(lambda x: weekdays[x])
data.loc[:,'Hour'] = data.index.get_level_values(1)
data.loc[:,'Hour category'] = pd.cut(data.Hour,
                                           bins=bins_hc,#[0,60,120,180,240,300,360,420],
                                           right=True,
                                           include_lowest=True,labels=['09.30-10.30',
                                                                       '10.30-11.30',
                                                                       '11.30-12.30',
                                                                       '12.30-13.30',
                                                                       '13.30-14.30',
                                                                       '14.30-15.30',
                                                                       '15.30-16.00'])

data.loc[:,'H:M'] = container.timestamp.dt.strftime('%H:%M')


log_returns

Unnamed: 0,AAPL,ABBV,ABT,AEP,AMT,APD,BA,BABA,BAC,BHP,...,UNP,UPS,V,WMT,Day,Dt,Weekday,Hour,Hour category,H:M
0,-0.083005,0.921552,0.000000,-0.121330,0.252953,1.266613,0.368092,0.137206,-0.299401,0.000000,...,-0.460197,0.385997,0.219915,0.037172,20200501,2020-05-01,Friday,1,09.30-10.30,09:31
1,-0.152355,-0.582663,-0.054828,-0.151870,0.678518,0.511132,-0.786908,-0.106699,0.213950,0.012825,...,0.348538,-0.026757,0.196657,0.276295,20200501,2020-05-01,Friday,2,09.30-10.30,09:32
2,0.178301,0.196645,-0.005484,-0.707708,0.008506,-0.488659,0.982671,-0.099180,0.042735,0.089726,...,-0.006384,-0.166055,0.199113,-0.078275,20200501,2020-05-01,Friday,3,09.30-10.30,09:33
3,0.422841,0.116575,0.010969,-0.024493,-0.257614,-0.488800,-0.549808,0.104263,-0.064109,0.064041,...,0.114855,0.026802,0.130630,-0.020609,20200501,2020-05-01,Friday,4,09.30-10.30,09:34
4,-0.080978,-0.055203,-0.071317,-0.098033,0.125696,-0.117488,-0.179252,-0.066106,-0.192575,0.064000,...,-0.226645,-0.456610,0.189961,0.226463,20200501,2020-05-01,Friday,5,09.30-10.30,09:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7794,-0.048563,0.357008,0.235156,0.157798,0.141253,0.109358,-0.078936,0.186294,-0.247678,0.106090,...,-0.047035,0.034950,0.035724,0.120943,20200529,2020-05-29,Friday,385,15.30-16.00,15:55
7795,-0.078377,0.210362,0.319779,0.029195,0.038665,0.020620,-0.030905,-0.099152,0.082628,-0.084863,...,-0.020585,0.004992,-0.096996,-0.084645,20200529,2020-05-29,Friday,386,15.30-16.00,15:56
7796,0.192698,-0.070071,0.255089,-0.105140,0.071490,-0.059810,0.075529,0.120904,-0.123967,0.021222,...,-0.029414,-0.029955,0.030640,0.273829,20200529,2020-05-29,Friday,387,15.30-16.00,15:57
7797,-0.246030,0.123939,0.148502,-0.040918,-0.148834,0.010315,0.106330,0.007250,0.165255,0.021218,...,0.002942,-0.059937,0.000000,-0.209331,20200529,2020-05-29,Friday,388,15.30-16.00,15:58


In [None]:
sns.set_context("paper", font_scale=1.5)

In [21]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,...,ofrsize_low,ofrsize_close,Ticker,sector,Day,Dt,Weekday,Hour,Hour category,H:M
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,...,1.0,4.0,AAPL,Technology,20200501,2020-05-01,Friday,0,09.30-10.30,
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,1.0,1.0,AAPL,Technology,20200501,2020-05-01,Friday,1,09.30-10.30,
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,...,1.0,1.0,AAPL,Technology,20200501,2020-05-01,Friday,2,09.30-10.30,
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,...,1.0,1.0,AAPL,Technology,20200501,2020-05-01,Friday,3,09.30-10.30,
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,...,1.0,1.0,AAPL,Technology,20200501,2020-05-01,Friday,4,09.30-10.30,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,...,1.0,1.0,WMT,Consumer Defensive,20200529,2020-05-29,Friday,385,15.30-16.00,
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,...,1.0,2.0,WMT,Consumer Defensive,20200529,2020-05-29,Friday,386,15.30-16.00,
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,...,1.0,2.0,WMT,Consumer Defensive,20200529,2020-05-29,Friday,387,15.30-16.00,
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,...,1.0,4.0,WMT,Consumer Defensive,20200529,2020-05-29,Friday,388,15.30-16.00,


In [22]:
subset_data = data.iloc[0:50,:]

In [24]:
pip install plotly==4.14.1

Collecting plotly==4.14.1
  Downloading plotly-4.14.1-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11435 sha256=bdb6177db813a8be948ab7ba44ddd6b33f4100e002234c2b457ac8ff6d0cabcb
  Stored in directory: c:\users\pc\appdata\local\pip\cache\wheels\f9\8d\8d\f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.14.1 retrying-1.3.3
Note: you may need to restart the kernel to use updated packages.


In [23]:
import plotly.graph_objects as go
# import pandas as pd

# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/finance-charts-apple.csv')

fig = go.Figure(data=[go.Candlestick(x=subset_data['Dt'],
                open=subset_data['open'], high=subset_data['high'],
                low=subset_data['low'], close=subset_data['close'])
                     ])

fig.update_layout(xaxis_rangeslider_visible=False)
fig.show()

ModuleNotFoundError: No module named 'plotly'

In [10]:
########### Generate Features ################

n_feature_lags = 1

# features = generateFeatures_multi_final(data = data, 
#                                   listOfFeatures = [
#                                                     'pastobs',
#                                                     'spread',
#                                                     'bidsize',
#                                                     'ofrsize',
# #                                                     'stok',
# #                                                     'stod',
# #                                                     'sstod',
# #                                                     'wilr',
# #                                                     'roc',
# #                                                     'rsi',
# #                                                     'atr',
# #                                                     'cci',
# #                                                     'dpo',
# #                                                     'sma',
# #                                                     'ema',
# #                                                     'macd',
# #                                                       'macd_diff',
# #                                                       'macd_signal',
# #                                                     'dis5',
# #                                                     'dis10',
#                                                       'sector'
#                                                    ], 
#                                    feature_lags = n_feature_lags
#                                      ,stockTable=stockTable)
features = generateFeatures_multi_final(data = data, 
                                  listOfFeatures = [
                                                    'pastobs',
                                                    'spread',
                                                    'bidsize',
                                                    'ofrsize',
                                                    'stok',
                                                    'stod',
                                                    'sstod',
#                                                     'wilr',
                                                    'roc',
                                                    'rsi',
                                                    'atr',
                                                    'cci',
                                                    'dpo',
                                                    'sma',
                                                    'ema',
                                                    'macd',
                                                      'macd_diff',
                                                      'macd_signal',
                                                    'dis5',
                                                    'dis10',
                                                      'sector'
                                                   ], 
                                   feature_lags = n_feature_lags
                                     ,sectorETFS=sectorETFS)

########### Generate Labels ################

n_classes = 2
# extract first 4 columns as the lag0 or raw OHLC prices (used for labelling)
price_candles = data[['open','high','low','close','Ticker']]

########### Align Data ################

# from imported function (see testing_preprocessing_features_and_labels.ipynb for thorough experimenting with all the cut-offs):    
X, y,indices = align_features_and_labels_multi_final(price_candles = price_candles, 
                                                 all_features = features,
                                                 prediction_horizon = 1, 
                                                 n_feature_lags = n_feature_lags, 
                                                 n_classes = n_classes, # 5,
                                                 safe_burn_in = False, 
                                                 data_sample = 'full',
                                                 splitType='global',
                                                 noise=False,ticker_dummies=False)

INFO:MainThread:numexpr.utils:NumExpr defaulting to 4 threads.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


AAPL done
ABBV done
ABT done
AEP done
AMT done
APD done
BA done
BABA done
BAC done
BHP done
BP done
CCI done
CHL done
COST done
CSGP done
D done
DIS done
ECL done
ENB done
EXC done
FB done
FMX done
GOOG done
INTC done
JNJ done
KO done
LFC done
LIN done
LMT done
MA done
MCD done
MSFT done
NKE done
NVDA done
NVS done
Number of NaNs in label: 1. 1 is expected
Returns that lead to NaNs in label: [0.0907158]
PBR done
PEP done
PFE done
PLD done
PSA done
PTR done
PYPL done
RTX done
SHW done
SNP done
SO done
SRE done
T done
TM done
TSLA done
TSM done
UNP done
UPS done
V done
WMT done


## Splitting the data

## Adding ticker dummies

In [14]:
## Adding ticker dummies
tickers = X.pop('ticker')
X = pd.concat([X, pd.get_dummies(tickers, prefix='ticker', drop_first=False)], axis=1)

In [15]:
X.columns

Index(['open_lag0', 'high_lag0', 'low_lag0', 'close_lag0', 'spread_open_lag0',
       'spread_high_lag0', 'spread_low_lag0', 'spread_close_lag0',
       'bidsize_open_lag0', 'bidsize_high_lag0',
       ...
       'ticker_SO', 'ticker_SRE', 'ticker_T', 'ticker_TM', 'ticker_TSLA',
       'ticker_TSM', 'ticker_UNP', 'ticker_UPS', 'ticker_V', 'ticker_WMT'],
      dtype='object', length=156)

## Constructing our final train/validation sets

In [16]:
# train_ds = pd.concat([X.iloc[start:end, :] for (start, end) in train_ranges]).reset_index(drop=True)
# train_y = pd.concat([y.iloc[start:end] for (start, end) in train_ranges]).reset_index(drop=True)

# validate_ds = pd.concat([X.iloc[start:end, :] for (start, end) in val_ranges]).reset_index(drop=True)
# val_y = pd.concat([y.iloc[start:end] for (start, end) in val_ranges]).reset_index(drop=True)

# train_ds.shape, train_y.shape, validate_ds.shape, val_y.shape, train_y.shape[0] + val_y.shape[0]

# Let's have a proper split (along tickers & dates)
train_size = 0.8

# Sort the indices
tempIndices = indices.sort_values(['days','timestamps','ticker'])

# Sorting the data
X = X.loc[tempIndices.index,:]#.head(66)
y = y.loc[tempIndices.index,:]

# extracting the first date for the validation data.
first_val_day = int(np.floor(indices.days.unique().shape[0]*0.8))

# Splitting the data
X_train = X[tempIndices.days<tempIndices.days.unique()[first_val_day]].reset_index(drop=True)
y_train = y[tempIndices.days<tempIndices.days.unique()[first_val_day]].reset_index(drop=True)

X_test = X[tempIndices.days>=tempIndices.days.unique()[first_val_day]].reset_index(drop=True)
y_test = y[tempIndices.days>=tempIndices.days.unique()[first_val_day]].reset_index(drop=True)


print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(341385, 156)
(341385, 1)
(85690, 156)
(85690, 1)


In [17]:
X_train

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,ticker_SO,ticker_SRE,ticker_T,ticker_TM,ticker_TSLA,ticker_TSM,ticker_UNP,ticker_UPS,ticker_V,ticker_WMT
0,-0.545,0.205,-0.635,296.440,0.09,0.20,0.01,0.06,1.0,12.0,...,0,0,0,0,0,0,0,0,0,0
1,-0.200,0.040,-0.255,82.805,0.21,0.26,0.01,0.07,1.0,8.0,...,0,0,0,0,0,0,0,0,0,0
2,0.000,0.030,-0.040,90.855,0.07,0.10,0.01,0.09,1.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,0.100,0.140,-0.110,81.510,0.06,0.25,0.03,0.16,1.0,9.0,...,0,0,0,0,0,0,0,0,0,0
4,0.010,0.070,-0.045,234.230,0.26,0.30,0.11,0.18,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341380,0.035,0.165,-0.040,51.300,0.17,0.26,0.01,0.02,5.0,23.0,...,0,0,0,0,0,1,0,0,0,0
341381,0.890,0.965,-0.290,169.435,0.35,1.41,0.01,0.93,1.0,10.0,...,0,0,0,0,0,0,1,0,0,0
341382,0.105,0.235,-0.465,97.955,0.86,1.32,0.06,0.15,1.0,6.0,...,0,0,0,0,0,0,0,1,0,0
341383,-0.080,0.540,-0.245,195.705,0.75,1.08,0.01,0.07,1.0,4.0,...,0,0,0,0,0,0,0,0,1,0


In [21]:
np.where((np.sum(np.isinf(X_train.values), axis=1) == 0) == False),
np.where((np.sum(np.isnan(X_train.values), axis=1) == 0) == False)#X_train


(array([], dtype=int64),)

## Pre-processing

In [None]:
{i:colname for i,colname in enumerate(train_ds.columns)}

In [22]:
# Creating one ppdict for individual preprocessings
# ppdict1 = {'open':'minmax',
#           'high':'log',
#           'low':'log',
#           'close':'std'}
# splitpoint = 32

# # Standardize some features
# ppdict1 = {i:'std' for i in train_ds.columns[0:splitpoint]} 
# # Keep some in actual levels (Dummies in this case).
# ppdict2 = {i:'act' for i in train_ds.columns[splitpoint:]}

pre_procesing_applied = 'std'

# Merging the two
# ppdict = {**ppdict1,**ppdict2}

if  pre_procesing_applied == 'None':
    # do nothing here
    pass

elif  pre_procesing_applied == 'std':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'std' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'minmax':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'minmax' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(X_train,X_test,pp_dict)

elif pre_procesing_applied == 'pow':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'pow' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'quantgau':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'quantgau' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'individual':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    
    # ppdict1 = {i:'power' for i in X_train.columns if 'd_' != i[0:2]}


    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'stacked':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    
    for j in ['pow','std','minmax']:

        ppdict1 = {i:j for i in X_train.columns if 'd_' != i[0:2]}

        # Keep some in actual levels (Dummies in this case).
        ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

        # Merging the two
        ppdict = {**ppdict1,**ppdict2}

        X_train,X_test = pre_processing(X_train,X_test,ppdict)

if pre_procesing_applied not in ['None','stacked']:
    X_train,X_test = pre_processing(X_train,X_test,ppdict)

In [53]:
ppX_train.iloc[:,0].mean(),ppX_train.iloc[:,0].std()

(-1.8927265537610815e-16, 1.000001457346533)

## Prepping for models

In [8]:
N_VALIDATION = val_y.shape[0] #int(1e3)
N_TRAIN = train_y.shape[0] #int(1e4)
# BUFFER_SIZE = int(1e4)
BATCH_SIZE = 256 #512 #32
MAX_EPOCHS = 500

STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

N_REPEAT = int(N_TRAIN / ((STEPS_PER_EPOCH * MAX_EPOCHS) / BATCH_SIZE))
FEATURES = X.shape[1]

N_TRAIN, N_VALIDATION, N_TRAIN + N_VALIDATION, STEPS_PER_EPOCH, N_REPEAT, STEPS_PER_EPOCH * MAX_EPOCHS

(343090, 85800, 428890, 1340, 131, 670000)

## A Logistic Regression model in TF/Keras

In [55]:
METRICS = [
      #keras.metrics.TruePositives(name='tp'),
      #keras.metrics.FalsePositives(name='fp'),
      #keras.metrics.TrueNegatives(name='tn'),
      #keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      #keras.metrics.Precision(name='precision'),
      #keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

# def make_model(metrics = METRICS, output_bias=None):
#   if output_bias is not None:
#     output_bias = tf.keras.initializers.Constant(output_bias)
#   model = keras.Sequential([
#       keras.layers.Dense(
#           16, activation='relu',
#           input_shape=(train_features.shape[-1],)),
#       keras.layers.Dropout(0.5),
#       keras.layers.Dense(1, activation='sigmoid',
#                          bias_initializer=output_bias),
#   ])

#   model.compile(
#       optimizer=keras.optimizers.Adam(lr=1e-3),
#       loss=keras.losses.BinaryCrossentropy(),
#       metrics=metrics)

#   return model

# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,))
# })

model = keras.Sequential([
#     keras.layers.Flatten(input_shape=(28, 28)),
#     keras.layers.Dense(128, activation='relu'),
#     keras.layers.Dense(10)
    keras.layers.Dense(1,
                       input_shape=(FEATURES,),
                       activation='sigmoid',
                       kernel_regularizer=regularizers.l2(1))
])

model.summary()

# with final activation (Keras/TF tutorial advises against this practice, but they also use it later in the tutorial)
# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,), activation='sigmoid')
# })

#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy', ])
model.compile(
              optimizer=keras.optimizers.Adam(), #lr=1e-3
              loss=keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS)

early_stopping = tf.keras.callbacks.EarlyStopping(
                                                monitor='val_auc', 
                                                verbose=1,
                                                patience=100,
                                                mode='max',
                                                restore_best_weights=True)

def get_callbacks(run_id):
      return [
             tfdocs.modeling.EpochDots(),
             early_stopping,
             tf.keras.callbacks.TensorBoard(logdir), #/run_id),
      ]

baseline_history = model.fit(
                            train_ds, #train_features,
                            train_y, #train_labels,
                            batch_size=512, #BATCH_SIZE,
                            epochs=1000, #EPOCHS,
                            callbacks = get_callbacks(run_id = 'first'), #[early_stopping],
                            validation_data=(validate_ds, val_y),
                            verbose=0) #(val_features, val_labels))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 99        
Total params: 99
Trainable params: 99
Non-trainable params: 0
_________________________________________________________________





Epoch: 0, accuracy:0.5352,  auc:0.5034,  loss:0.8996,  val_accuracy:0.5456,  val_auc:0.5453,  val_loss:0.6876,  
....................................................................................................
Epoch: 100, accuracy:0.5480,  auc:0.5440,  loss:0.6873,  val_accuracy:0.5454,  val_auc:0.5459,  val_loss:0.6879,  
..................Restoring model weights from the end of the best epoch.
Epoch 00118: early stopping


In [56]:
model.evaluate(validate_ds,  val_y, verbose=2)

2682/2682 - 6s - loss: 0.6879 - accuracy: 0.5457 - auc: 0.5513


[0.6878659725189209, 0.5456876754760742, 0.5513222217559814]

In [11]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [12]:
import datetime
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%tensorboard --logdir logs

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 9296.