## Reading in packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy
import datetime
import ta
import pathlib
import shutil
import tempfile
#import vaex
from IPython import display
from IPython.display import clear_output
import pyodbc

# Tensorflow related
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import regularizers
import tensorflow.compat.v2.feature_column as fc

#!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

print(tf.__version__)
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
shutil.rmtree(logdir, ignore_errors=True)
print(logdir)

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, log_loss


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.exceptions import ConvergenceWarning 
from sklearn import ensemble
# ConvergenceWarning('ignore')
# Do you wanna see?
verbose = True

import sys
sys.path.append('../')
#sys.path.append('...../')

from utils.data_extraction import load_data_final,load_data_and_save
from utils.data_cleaning import HFDataCleaning
from utils.generate_features import candleCreateNP_vect_final,\
                                    generateFeatures_final,\
                                    generateFeatures_multi_final

from utils.preprocessing_features_and_labels import extract_labels,\
                                                    align_features_and_labels,\
                                                    pre_processing_initial,\
                                                    pre_processing_extended,\
                                                    pre_processing_final,\
                                                    extract_labels_multi_final,\
                                                    align_features_and_labels_multi_final,\
                                                    align_features_and_labels_multi_v5

from utils.models import make_input_fn
from utils.models import performanceTesting,scoreFunction
from utils.plotting import plot_confusion_matrix

2.2.0
C:\Users\fstri\AppData\Local\Temp\tmpk5l4j8wf\tensorboard_logs


## Extracting data

In [2]:
# Do we extract new data or read in?
readIn = True
# run load_data()
if readIn:
    
    # Listing the data files 
#     path = '../../../Google Drev/Thesis/Data/TAQ/AggregatedTAQ'
    path = 'F:/AggregatedTAQ'
    datafiles = os.listdir(path)
    content = np.concatenate([['\n\n'],[str(j)+': '+i+'\n' for j,i in enumerate(datafiles) if 'csv' in i],['\n\n']])
    
    # Asking for user input
    file = input('Which one do you want to load? %s'%''.join(content))
    if int(file) <= 2:
        data = pd.read_csv(path + '/' + datafiles[int(file)],
                           header = None,
                           names=['open','high','low','close',
                                  'spread_open','spread_high','spread_low','spread_close',
                                  'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                                  'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                                  'Ticker'])
        # Using the choice of the user to determine the correct market file
        key = re.split('[_.]',datafiles[int(file)])[-2]
        marketDataFile = [file for file in os.listdir(path+'/round5_market_tickers') if key in file]

        # Reading in the market data
        tempData = pd.read_csv(path+'/round5_market_tickers/'+marketDataFile[0]
                               ,header = None
                               ,names=['open','high','low','close',
                                      'spread_open','spread_high','spread_low','spread_close',
                                      'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
                                      'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
                                      'Ticker'])
        # Adding the market data to the ticker data
        data = pd.concat([data,tempData],axis=0)
    else:
        data = pd.read_csv(path + '/' + datafiles[int(file)],
                           header = 0,
                           index_col=[0,1]
#                            names=['open','high','low','close',
#                                   'spread_open','spread_high','spread_low','spread_close',
#                                   'bidsize_open','bidsize_high','bidsize_low','bidsize_close',
#                                   'ofrsize_open','ofrsize_high','ofrsize_low','ofrsize_close',
#                                   'Ticker']
                          )
    
    # Lower casing all column names
#     data.columns = data.columns.str.lower()
else:
    
    # print(os.listdir())
    try:
        path = 'a:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
    except:
        path = 't:/taqhdf5'  #'a:/taqhdf5'
        os.listdir(path)
        
    # Sample type
    data_sample = 'full' # or 'stable'
    # allFiles = os.listdir(path)
    # print(len(allFiles), allFiles[:5], allFiles[-5:])
    # print(allFiles[-10:])

    #dates = np.array(['2020040' + str(i) if i < 10 else '202004' + str(i) for i in np.arange(1,16)]).astype(int)
    dates = np.array(['20200501']).astype(int)#,'20200402','20200403','20200406','20200407'

    # Provide a list of tickers of interest
    
    tickers = sorted(['TSLA','FB'])#'MSFT'
    
    # Do we need data on trades, quotes or both?
    dataNeeded = 'quotes' # 'trades', 'quotes' or 'both'
    
    if dataNeeded == 'trades':
        tradeData = load_data_final(dates, tickers, dataNeeded, path, verbose)
    elif dataNeeded == 'quotes':
        quoteData = load_data_final(dates,
                                    tickers,
                                    dataNeeded,
                                    path,
                                    verbose,
                                    extract_candles = False,
                                    aggHorizon = 1,
                                    extra_features_from_quotes = None,
                                    data_sample = data_sample)
    elif dataNeeded == 'both':
        tradeData, quoteData = load_data_final(dates, tickers, dataNeeded, path, verbose)

# Reading in sector information
stockInfo = pd.read_csv('../utils/stockInfo_v1.csv',header=[0,1])
stockInfo.columns = ['ticker','sector','exchange','marketCap']

# Creating a table with stock information based on the tickers available in the data.
uniqueTickers = data.Ticker.unique()
stockTable = stockInfo[stockInfo.ticker.isin(uniqueTickers)]
stockTable.head(10)

Which one do you want to load? 

5: trueAggregateTAQ_60sec.csv


5


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ticker,sector,exchange,marketCap
12,AAPL,Technology,NMS,1578173000000.0
20,ABBV,Healthcare,NYQ,174261200000.0
34,ABT,Healthcare,NYQ,163141000000.0
126,AEP,Utilities,NYQ,40895510000.0
379,AMT,Real Estate,NYQ,117125900000.0
428,APD,Basic Materials,NYQ,54643950000.0
697,BA,Industrials,NYQ,102035600000.0
699,BABA,Consumer Cyclical,NYQ,593653600000.0
700,BAC,Financial Services,NYQ,202055000000.0
870,BHP,Basic Materials,NYQ,125819400000.0


# reading in the market data (done automatically atm)

In [3]:
file

'5'

In [4]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,91.500,91.755,91.485,91.740,0.42,0.93,0.39,0.50,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK,
20200529,386,91.740,91.740,91.740,91.740,0.50,0.50,0.50,0.50,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,XNTK,
20200529,387,91.580,91.830,91.580,91.715,0.18,0.68,0.18,0.45,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK,
20200529,388,91.595,91.880,91.595,91.750,0.21,0.78,0.21,0.52,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK,


In [5]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,91.500,91.755,91.485,91.740,0.42,0.93,0.39,0.50,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK,
20200529,386,91.740,91.740,91.740,91.740,0.50,0.50,0.50,0.50,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,XNTK,
20200529,387,91.580,91.830,91.580,91.715,0.18,0.68,0.18,0.45,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK,
20200529,388,91.595,91.880,91.595,91.750,0.21,0.78,0.21,0.52,5.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,XNTK,


In [6]:
data.columns

Index(['open', 'high', 'low', 'close', 'spread_open', 'spread_high',
       'spread_low', 'spread_close', 'bidsize_open', 'bidsize_high',
       'bidsize_low', 'bidsize_close', 'ofrsize_open', 'ofrsize_high',
       'ofrsize_low', 'ofrsize_close', 'Ticker', 'sector'],
      dtype='object')

### Dropping ETFS and market indices

In [7]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'IDU', 'INTC', 'IYC', 'IYE',
       'IYG', 'IYH', 'IYJ', 'IYK', 'IYM', 'IYR', 'IYW', 'IYZ', 'JNJ',
       'KO', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA',
       'NVS', 'PBR', 'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS',
       'V', 'WMT', 'DIA', 'QQQ', 'SPY', 'XNTK'], dtype=object)

In [8]:
# Removing the XNTK ticker
data = data[~data.Ticker.isin(['XNTK'])]

In [9]:
data.Ticker.unique()

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BA', 'BABA', 'BAC',
       'BHP', 'BP', 'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL',
       'ENB', 'EXC', 'FB', 'FMX', 'GOOG', 'IDU', 'INTC', 'IYC', 'IYE',
       'IYG', 'IYH', 'IYJ', 'IYK', 'IYM', 'IYR', 'IYW', 'IYZ', 'JNJ',
       'KO', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE', 'NVDA',
       'NVS', 'PBR', 'PEP', 'PFE', 'PLD', 'PSA', 'PTR', 'PYPL', 'RTX',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TM', 'TSLA', 'TSM', 'UNP', 'UPS',
       'V', 'WMT', 'DIA', 'QQQ', 'SPY'], dtype=object)

In [10]:
# Removing the XNTK ticker
data = data[~data.Ticker.isin(['XNTK'])]

etfs = ['IYH','IYM','IYK','IYJ','IYG','IYW','IYC','IYR','IDU','IYZ','IYE','IYF','SPY','DIA','QQQ']

# Extracting the sector ETFs to a separate variable
sectorETFS = data[data.Ticker.isin(etfs)]

# Removing the ETFs
data = data[~data.Ticker.isin(etfs)]

In [11]:
data.columns

Index(['open', 'high', 'low', 'close', 'spread_open', 'spread_high',
       'spread_low', 'spread_close', 'bidsize_open', 'bidsize_high',
       'bidsize_low', 'bidsize_close', 'ofrsize_open', 'ofrsize_high',
       'ofrsize_low', 'ofrsize_close', 'Ticker', 'sector'],
      dtype='object')

In [12]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,Consumer Defensive
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,Consumer Defensive
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,Consumer Defensive
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,Consumer Defensive


In [13]:
########### Generate Features ################

n_feature_lags = 1

# features = generateFeatures_multi_final(data = data, 
#                                   listOfFeatures = [
#                                                     'pastobs',
#                                                     'spread',
#                                                     'bidsize',
#                                                     'ofrsize',
# #                                                     'stok',
# #                                                     'stod',
# #                                                     'sstod',
# #                                                     'wilr',
# #                                                     'roc',
# #                                                     'rsi',
# #                                                     'atr',
# #                                                     'cci',
# #                                                     'dpo',
# #                                                     'sma',
# #                                                     'ema',
# #                                                     'macd',
# #                                                       'macd_diff',
# #                                                       'macd_signal',
# #                                                     'dis5',
# #                                                     'dis10',
#                                                       'sector'
#                                                    ], 
#                                    feature_lags = n_feature_lags
#                                      ,stockTable=stockTable)
features = generateFeatures_multi_final(data = data, 
                                  listOfFeatures = [
                                                    'pastobs',
                                                    'spread',
                                                    'bidsize',
                                                    'ofrsize',
#                                                     'stok',
#                                                     'stod',
#                                                     'sstod',
# #                                                     'wilr',
#                                                     'roc',
#                                                     'rsi',
#                                                     'atr',
#                                                     'cci',
#                                                     'dpo',
#                                                     'sma',
#                                                     'ema',
#                                                     'macd',
#                                                       'macd_diff',
#                                                       'macd_signal',
#                                                     'dis5',
#                                                     'dis10',
                                                      'sector'
                                                   ], 
                                   feature_lags = n_feature_lags
                                     ,sectorETFS=sectorETFS)

########### Generate Labels ################

n_classes = 3
# extract first 4 columns as the lag0 or raw OHLC prices (used for labelling)
price_candles = data[['open','high','low','close','Ticker']]

########### Align Data ################

# from imported function (see testing_preprocessing_features_and_labels.ipynb for thorough experimenting with all the cut-offs):    
X, y,indices = align_features_and_labels_multi_final(price_candles = price_candles, 
                                                 all_features = features,
                                                 prediction_horizon = 1, 
                                                 n_feature_lags = n_feature_lags, 
                                                 n_classes = n_classes, # 5,
                                                 safe_burn_in = False, 
                                                 data_sample = 'full',
                                                 splitType='global',
                                                 noise=False,
                                                 ticker_dummies=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


AAPL done
ABBV done
ABT done
AEP done
AMT done
APD done
BA done
BABA done
BAC done
BHP done
BP done
CCI done
CHL done
COST done
CSGP done
D done
DIS done
ECL done
ENB done
EXC done
FB done
FMX done
GOOG done
INTC done
JNJ done
KO done
LFC done
LIN done
LMT done
MA done
MCD done
MSFT done
NKE done
NVDA done
NVS done
Number of NaNs in label: 1. 1 is expected
Returns that lead to NaNs in label: [0.0907158]
PBR done
PEP done
PFE done
PLD done
PSA done
PTR done
PYPL done
RTX done
SHW done
SNP done
SO done
SRE done
T done
TM done
TSLA done
TSM done
UNP done
UPS done
V done
WMT done


## Splitting the data

## Adding ticker dummies

In [14]:
## Adding ticker dummies
tickers = X.pop('ticker')
X = pd.concat([X, pd.get_dummies(tickers, prefix='d_ticker', drop_first=False)], axis=1)

In [15]:
X.columns

Index(['open_lag0', 'high_lag0', 'low_lag0', 'close_lag0', 'spread_open_lag0',
       'spread_high_lag0', 'spread_low_lag0', 'spread_close_lag0',
       'bidsize_open_lag0', 'bidsize_high_lag0',
       ...
       'd_ticker_SO', 'd_ticker_SRE', 'd_ticker_T', 'd_ticker_TM',
       'd_ticker_TSLA', 'd_ticker_TSM', 'd_ticker_UNP', 'd_ticker_UPS',
       'd_ticker_V', 'd_ticker_WMT'],
      dtype='object', length=126)

## Constructing our final train/validation sets

In [16]:
# train_ds = pd.concat([X.iloc[start:end, :] for (start, end) in train_ranges]).reset_index(drop=True)
# train_y = pd.concat([y.iloc[start:end] for (start, end) in train_ranges]).reset_index(drop=True)

# validate_ds = pd.concat([X.iloc[start:end, :] for (start, end) in val_ranges]).reset_index(drop=True)
# val_y = pd.concat([y.iloc[start:end] for (start, end) in val_ranges]).reset_index(drop=True)

# train_ds.shape, train_y.shape, validate_ds.shape, val_y.shape, train_y.shape[0] + val_y.shape[0]

# Let's have a proper split (along tickers & dates)
train_size = 0.8

# Sort the indices
tempIndices = indices.sort_values(['days','timestamps','ticker'])

# Sorting the data
X = X.loc[tempIndices.index,:]#.head(66)
y = y.loc[tempIndices.index,:]

# extracting the first date for the validation data.
first_val_day = int(np.floor(indices.days.unique().shape[0]*0.8))

# Splitting the data
X_train = X[tempIndices.days<tempIndices.days.unique()[first_val_day]].reset_index(drop=True)
y_train = y[tempIndices.days<tempIndices.days.unique()[first_val_day]].reset_index(drop=True)

X_test = X[tempIndices.days>=tempIndices.days.unique()[first_val_day]].reset_index(drop=True)
y_test = y[tempIndices.days>=tempIndices.days.unique()[first_val_day]].reset_index(drop=True)

test_index = X[tempIndices.days>=tempIndices.days.unique()[first_val_day]].index
test_index = tempIndices.loc[test_index]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(343200, 126)
(343200, 1)
(85690, 126)
(85690, 1)


In [17]:
X_train

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,d_ticker_SO,d_ticker_SRE,d_ticker_T,d_ticker_TM,d_ticker_TSLA,d_ticker_TSM,d_ticker_UNP,d_ticker_UPS,d_ticker_V,d_ticker_WMT
0,0.240,0.330,-0.655,289.020,0.24,0.45,0.01,0.10,9.0,20.0,...,0,0,0,0,0,0,0,0,0,0
1,-0.750,0.310,-0.750,81.760,0.24,1.06,0.07,0.80,14.0,20.0,...,0,0,0,0,0,0,0,0,0,0
2,-0.030,0.160,-0.320,91.220,0.18,0.58,0.05,0.12,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0
3,0.105,0.105,-0.100,82.370,0.13,0.54,0.10,0.26,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4,-0.335,0.685,-0.380,233.540,2.39,3.99,1.42,2.10,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343195,0.035,0.165,-0.040,51.300,0.17,0.26,0.01,0.02,5.0,23.0,...,0,0,0,0,0,1,0,0,0,0
343196,0.890,0.965,-0.290,169.435,0.35,1.41,0.01,0.93,1.0,10.0,...,0,0,0,0,0,0,1,0,0,0
343197,0.105,0.235,-0.465,97.955,0.86,1.32,0.06,0.15,1.0,6.0,...,0,0,0,0,0,0,0,1,0,0
343198,-0.080,0.540,-0.245,195.705,0.75,1.08,0.01,0.07,1.0,4.0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
np.where((np.sum(np.isinf(X_train.values), axis=1) == 0) == False),
np.where((np.sum(np.isnan(X_train.values), axis=1) == 0) == False)#X_train


(array([], dtype=int64),)

## Pre-processing

In [19]:
{i:colname for i,colname in enumerate(X_train.columns)}

{0: 'open_lag0',
 1: 'high_lag0',
 2: 'low_lag0',
 3: 'close_lag0',
 4: 'spread_open_lag0',
 5: 'spread_high_lag0',
 6: 'spread_low_lag0',
 7: 'spread_close_lag0',
 8: 'bidsize_open_lag0',
 9: 'bidsize_high_lag0',
 10: 'bidsize_low_lag0',
 11: 'bidsize_close_lag0',
 12: 'ofrsize_open_lag0',
 13: 'ofrsize_high_lag0',
 14: 'ofrsize_low_lag0',
 15: 'ofrsize_close_lag0',
 16: 'relReturns_DIA_lag0',
 17: 'relReturns_IDU_lag0',
 18: 'relReturns_IYC_lag0',
 19: 'relReturns_IYE_lag0',
 20: 'relReturns_IYG_lag0',
 21: 'relReturns_IYH_lag0',
 22: 'relReturns_IYJ_lag0',
 23: 'relReturns_IYK_lag0',
 24: 'relReturns_IYM_lag0',
 25: 'relReturns_IYR_lag0',
 26: 'relReturns_IYW_lag0',
 27: 'relReturns_IYZ_lag0',
 28: 'relReturns_QQQ_lag0',
 29: 'relReturns_SPY_lag0',
 30: 'open_lag1',
 31: 'high_lag1',
 32: 'low_lag1',
 33: 'close_lag1',
 34: 'spread_open_lag1',
 35: 'spread_high_lag1',
 36: 'spread_low_lag1',
 37: 'spread_close_lag1',
 38: 'bidsize_open_lag1',
 39: 'bidsize_high_lag1',
 40: 'bidsize_

In [20]:
# Creating one ppdict for individual preprocessings
# ppdict1 = {'open':'minmax',
#           'high':'log',
#           'low':'log',
#           'close':'std'}
# splitpoint = 32

# # Standardize some features
# ppdict1 = {i:'std' for i in train_ds.columns[0:splitpoint]} 
# # Keep some in actual levels (Dummies in this case).
# ppdict2 = {i:'act' for i in train_ds.columns[splitpoint:]}

pre_procesing_applied = 'std'

# Merging the two
# ppdict = {**ppdict1,**ppdict2}

if  pre_procesing_applied == 'None':
    # do nothing here
    pass

elif  pre_procesing_applied == 'std':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'std' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'minmax':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'minmax' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(X_train,X_test,pp_dict)

elif pre_procesing_applied == 'pow':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'pow' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'quantgau':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    ppdict1 = {i:'quantgau' for i in X_train.columns if 'd_' != i[0:2]} 
    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'individual':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    
    # ppdict1 = {i:'power' for i in X_train.columns if 'd_' != i[0:2]}


    # Keep some in actual levels (Dummies in this case).
    ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

    # Merging the two
    ppdict = {**ppdict1,**ppdict2}

    # x_train,x_test = pre_processing(x_train,x_test,pp_dict)

elif pre_procesing_applied == 'stacked':

    # splitpoint = int((args.feature-lags+1)*16)#32
    # columns_to_pre_process = [col for col in X_train.columns if 'd_' != col[0:2]]

    # Standardize some features
    
    for j in ['pow','std','minmax']:

        ppdict1 = {i:j for i in X_train.columns if 'd_' != i[0:2]}

        # Keep some in actual levels (Dummies in this case).
        ppdict2 = {i:'act' for i in X_train.columns if 'd_' == i[0:2]} 

        # Merging the two
        ppdict = {**ppdict1,**ppdict2}

        X_train,X_test = pre_processing_final(X_train,X_test,ppdict)

if pre_procesing_applied not in ['None','stacked']:
    X_train,X_test = pre_processing_final(X_train,X_test,ppdict)

In [21]:
X_train.iloc[:,0].mean(),X_train.iloc[:,0].std()

(6.059080975878666e-16, 1.000001456879563)

In [22]:
X_test.iloc[:,0].mean(),X_test.iloc[:,0].std()

(-0.00026143140617190656, 0.918681214317273)

## Prepping for models

In [23]:
N_VALIDATION = y_test.shape[0] #int(1e3)
N_TRAIN = y_train.shape[0] #int(1e4)
# BUFFER_SIZE = int(1e4)
BATCH_SIZE = 256 #512 #32
MAX_EPOCHS = 50

STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

N_REPEAT = int(N_TRAIN / ((STEPS_PER_EPOCH * MAX_EPOCHS) / BATCH_SIZE))
FEATURES = X_train.shape[1]

N_TRAIN, N_VALIDATION, N_TRAIN + N_VALIDATION, STEPS_PER_EPOCH, N_REPEAT, STEPS_PER_EPOCH * MAX_EPOCHS

(343200, 85690, 428890, 1340, 1311, 67000)

In [32]:
np.unique(y_test, return_counts=True)

(array([0., 1., 2.]), array([28423, 28940, 28327], dtype=int64))

In [33]:
y_test

Unnamed: 0,0
0,0.0
1,2.0
2,0.0
3,0.0
4,0.0
...,...
85685,0.0
85686,0.0
85687,0.0
85688,0.0


## A Logistic Regression model in TF/Keras

In [35]:
METRICS = [
      #keras.metrics.TruePositives(name='tp'),
      #keras.metrics.FalsePositives(name='fp'),
      #keras.metrics.TrueNegatives(name='tn'),
      #keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      #keras.metrics.Precision(name='precision'),
      #keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

# def make_model(metrics = METRICS, output_bias=None):
#   if output_bias is not None:
#     output_bias = tf.keras.initializers.Constant(output_bias)
#   model = keras.Sequential([
#       keras.layers.Dense(
#           16, activation='relu',
#           input_shape=(train_features.shape[-1],)),
#       keras.layers.Dropout(0.5),
#       keras.layers.Dense(1, activation='sigmoid',
#                          bias_initializer=output_bias),
#   ])

#   model.compile(
#       optimizer=keras.optimizers.Adam(lr=1e-3),
#       loss=keras.losses.BinaryCrossentropy(),
#       metrics=metrics)

#   return model

# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,))
# })

model = keras.Sequential([
#     keras.layers.Flatten(input_shape=(28, 28)),
#     keras.layers.Dense(128, activation='relu'),
#     keras.layers.Dense(10)
    keras.layers.Dense(1,
                       input_shape=(FEATURES,),
                       activation='sigmoid',
                       kernel_regularizer=regularizers.l2(1))
])

model.summary()

# with final activation (Keras/TF tutorial advises against this practice, but they also use it later in the tutorial)
# model = keras.Sequential({
#   keras.layers.Dense(1, input_shape=(FEATURES,), activation='sigmoid')
# })

#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy', ])
model.compile(
              optimizer=keras.optimizers.Adam(), #lr=1e-3
              loss=keras.losses.BinaryCrossentropy(from_logits=False),
              #loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=METRICS)

early_stopping = tf.keras.callbacks.EarlyStopping(
                                                monitor='val_auc', 
                                                verbose=1,
                                                patience=100,
                                                mode='max',
                                                restore_best_weights=True)

def get_callbacks(run_id):
      return [
             tfdocs.modeling.EpochDots(),
             early_stopping
             #tf.keras.callbacks.TensorBoard(logdir), #/run_id),
      ]

baseline_history = model.fit(
                            X_train, #train_features,
                            y_train, #train_labels,
                            batch_size=3300, #BATCH_SIZE,
                            epochs=50, #EPOCHS,
                            callbacks = get_callbacks(run_id = 'first'), #[early_stopping],
                            validation_data=(X_test, y_test),
                            verbose=0) #(val_features, val_labels))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 1)                 127       
Total params: 127
Trainable params: 127
Non-trainable params: 0
_________________________________________________________________

Epoch: 0, accuracy:0.3486,  auc:0.5210,  loss:1.5929,  val_accuracy:0.3533,  val_auc:0.5197,  val_loss:0.9762,  
..................................................

In [36]:
model.evaluate(X_test,  y_test, verbose=2)

2678/2678 - 4s - loss: 0.0362 - accuracy: 0.3378 - auc: 0.5232


[0.03617377579212189, 0.3378223776817322, 0.5232062935829163]

In [37]:
# Load the TensorBoard notebook extension
# %load_ext tensorboard

In [38]:
# import datetime
# logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# %tensorboard --logdir logs

In [39]:
model.predict(X_test, verbose=1)



array([[0.9776743 ],
       [0.97897196],
       [0.97463065],
       ...,
       [0.96677274],
       [0.96306056],
       [0.97363055]], dtype=float32)

In [40]:
model.predict_classes(X_test, verbose=1)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [41]:
X_test

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,d_ticker_SO,d_ticker_SRE,d_ticker_T,d_ticker_TM,d_ticker_TSLA,d_ticker_TSM,d_ticker_UNP,d_ticker_UPS,d_ticker_V,d_ticker_WMT
0,2.065833,1.816550,0.009023,0.662205,-0.097519,-0.120272,-0.210994,-0.248567,-0.193662,-0.147184,...,0,0,0,0,0,0,0,0,0,0
1,0.709255,0.367430,0.388086,-0.371621,-0.204518,-0.208006,-0.210994,-0.216011,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
2,0.488074,0.164865,0.024817,-0.372497,-0.017269,-0.013041,-0.210994,-0.264844,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
3,0.075202,-0.131192,0.182760,-0.431954,0.022856,-0.130020,-0.210994,-0.183455,-0.137578,-0.224906,...,0,0,0,0,0,0,0,0,0,0
4,0.399601,1.458166,-2.960304,0.323818,0.116480,2.365538,-0.125019,1.297838,-0.193662,-0.240450,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85685,0.266893,-0.084446,0.388086,-0.562973,-0.244643,-0.305489,-0.210994,-0.297400,-0.006715,-0.061689,...,0,0,0,0,0,1,0,0,0,0
85686,0.045712,-0.162356,0.166966,-0.026092,-0.177768,-0.217755,-0.210994,-0.264844,-0.174967,-0.178273,...,0,0,0,0,0,0,1,0,0,0
85687,0.222657,-0.084446,0.293320,-0.340063,-0.204518,-0.246999,-0.210994,-0.297400,-0.193662,-0.178273,...,0,0,0,0,0,0,0,1,0,0
85688,0.001475,-0.006537,0.214349,0.090193,-0.151018,-0.217755,-0.210994,-0.248567,-0.193662,-0.131639,...,0,0,0,0,0,0,0,0,1,0


In [42]:
X_test.set_index(pd.MultiIndex.from_frame(test_index), inplace=True)
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,d_ticker_SO,d_ticker_SRE,d_ticker_T,d_ticker_TM,d_ticker_TSLA,d_ticker_TSM,d_ticker_UNP,d_ticker_UPS,d_ticker_V,d_ticker_WMT
days,timestamps,ticker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
20200526,0,AAPL,2.065833,1.816550,0.009023,0.662205,-0.097519,-0.120272,-0.210994,-0.248567,-0.193662,-0.147184,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABBV,0.709255,0.367430,0.388086,-0.371621,-0.204518,-0.208006,-0.210994,-0.216011,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABT,0.488074,0.164865,0.024817,-0.372497,-0.017269,-0.013041,-0.210994,-0.264844,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AEP,0.075202,-0.131192,0.182760,-0.431954,0.022856,-0.130020,-0.210994,-0.183455,-0.137578,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AMT,0.399601,1.458166,-2.960304,0.323818,0.116480,2.365538,-0.125019,1.297838,-0.193662,-0.240450,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,387,TSM,0.266893,-0.084446,0.388086,-0.562973,-0.244643,-0.305489,-0.210994,-0.297400,-0.006715,-0.061689,...,0,0,0,0,0,1,0,0,0,0
20200529,387,UNP,0.045712,-0.162356,0.166966,-0.026092,-0.177768,-0.217755,-0.210994,-0.264844,-0.174967,-0.178273,...,0,0,0,0,0,0,1,0,0,0
20200529,387,UPS,0.222657,-0.084446,0.293320,-0.340063,-0.204518,-0.246999,-0.210994,-0.297400,-0.193662,-0.178273,...,0,0,0,0,0,0,0,1,0,0
20200529,387,V,0.001475,-0.006537,0.214349,0.090193,-0.151018,-0.217755,-0.210994,-0.248567,-0.193662,-0.131639,...,0,0,0,0,0,0,0,0,1,0


In [43]:
test_index

Unnamed: 0,days,timestamps,ticker
6240,20200526,0,AAPL
14038,20200526,0,ABBV
21836,20200526,0,ABT
29634,20200526,0,AEP
37432,20200526,0,AMT
...,...,...,...
397697,20200529,387,TSM
405495,20200529,387,UNP
413293,20200529,387,UPS
421091,20200529,387,V


In [44]:
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,d_ticker_SO,d_ticker_SRE,d_ticker_T,d_ticker_TM,d_ticker_TSLA,d_ticker_TSM,d_ticker_UNP,d_ticker_UPS,d_ticker_V,d_ticker_WMT
days,timestamps,ticker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
20200526,0,AAPL,2.065833,1.816550,0.009023,0.662205,-0.097519,-0.120272,-0.210994,-0.248567,-0.193662,-0.147184,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABBV,0.709255,0.367430,0.388086,-0.371621,-0.204518,-0.208006,-0.210994,-0.216011,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABT,0.488074,0.164865,0.024817,-0.372497,-0.017269,-0.013041,-0.210994,-0.264844,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AEP,0.075202,-0.131192,0.182760,-0.431954,0.022856,-0.130020,-0.210994,-0.183455,-0.137578,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AMT,0.399601,1.458166,-2.960304,0.323818,0.116480,2.365538,-0.125019,1.297838,-0.193662,-0.240450,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,387,TSM,0.266893,-0.084446,0.388086,-0.562973,-0.244643,-0.305489,-0.210994,-0.297400,-0.006715,-0.061689,...,0,0,0,0,0,1,0,0,0,0
20200529,387,UNP,0.045712,-0.162356,0.166966,-0.026092,-0.177768,-0.217755,-0.210994,-0.264844,-0.174967,-0.178273,...,0,0,0,0,0,0,1,0,0,0
20200529,387,UPS,0.222657,-0.084446,0.293320,-0.340063,-0.204518,-0.246999,-0.210994,-0.297400,-0.193662,-0.178273,...,0,0,0,0,0,0,0,1,0,0
20200529,387,V,0.001475,-0.006537,0.214349,0.090193,-0.151018,-0.217755,-0.210994,-0.248567,-0.193662,-0.131639,...,0,0,0,0,0,0,0,0,1,0


In [45]:
preds = pd.DataFrame(model.predict(X_test, verbose=1), columns=[1])
preds[0] = 1 - preds[1]



In [57]:
preds = preds[[0,1]]
preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1
days,timestamps,ticker,Unnamed: 3_level_1,Unnamed: 4_level_1
20200526,0,AAPL,0.022326,0.977674
20200526,0,ABBV,0.021028,0.978972
20200526,0,ABT,0.025369,0.974631
20200526,0,AEP,0.031348,0.968652
20200526,0,AMT,0.026709,0.973291
...,...,...,...,...
20200529,387,TSM,0.027882,0.972118
20200529,387,UNP,0.036941,0.963059
20200529,387,UPS,0.033227,0.966773
20200529,387,V,0.036939,0.963061


In [59]:
np.random.seed(2020)
rand_numbers = np.random.rand(85690,3)
preds = rand_numbers / rand_numbers.sum(axis=1).reshape(-1,1)
preds = pd.DataFrame(preds)
preds

Unnamed: 0,0,1,2
0,0.416253,0.368611,0.215136
1,0.329215,0.408036,0.262749
2,0.186563,0.231664,0.581773
3,0.148577,0.133585,0.717838
4,0.513806,0.248181,0.238014
...,...,...,...
85685,0.129696,0.530026,0.340278
85686,0.395135,0.284087,0.320778
85687,0.222739,0.542764,0.234497
85688,0.276288,0.431778,0.291934


In [60]:
preds['class'] = preds.values.argmax(axis=1)


In [61]:
np.unique(preds['class'], return_counts=True)

(array([0, 1, 2], dtype=int64), array([28813, 28484, 28393], dtype=int64))

In [62]:
preds

Unnamed: 0,0,1,2,class
0,0.416253,0.368611,0.215136,0
1,0.329215,0.408036,0.262749,1
2,0.186563,0.231664,0.581773,2
3,0.148577,0.133585,0.717838,2
4,0.513806,0.248181,0.238014,0
...,...,...,...,...
85685,0.129696,0.530026,0.340278,1
85686,0.395135,0.284087,0.320778,0
85687,0.222739,0.542764,0.234497,1
85688,0.276288,0.431778,0.291934,1


In [71]:
'''
1) Each timestep 
    -check for new positions 
    -re-evaluate current positions
    -rebalance maybe
    -

'''


class backtest():
    def __init__(self, X_test, data, preds, max_steps, max_positions):
        self.open_long_positions = np.array(['AAPL','ABT'])
        self.open_short_positions = np.array(['BAC','KO'])
        self.pnl = []
        self.t = 0
        
        self.X_test = X_test
        self.data = data
        self.preds = preds
        self.max_steps = max_steps
        self.max_positions = max_positions
        
        
        
    def run(self):
        
        unique_timesteps = np.concatenate([[[i,j] for i in np.unique(self.X_test.index.get_level_values(1))] \
                                                  for j in np.unique(self.X_test.index.get_level_values(0))])
        
        while self.t < self.max_steps:
            
            ts = unique_timesteps[self.t]
            
            #print(i)
            try:
                ts_data = self.data.loc[(ts[1], ts[0])]
            except:
                pass
            if ts_data.shape == 0:
                pass
            
            #print(ts_data)

            close_info = ts_data[['close','spread_close','Ticker']]
            ts_preds = self.preds.loc[(ts[1], ts[0])]


            self.step(ts, close_info, ts_preds)
                
            self.t += 1
            
        print(f'run function finished at step {self.t}, time: {ts}')

        #for ts in unique_timesteps[:2]:
        
        # for ts in X_test.reset_index().groupby(['days','timestamps']).size().index[:2]:
        #     #print(i)
        #     ts_data = data.loc[(ts[0], ts[1])]
        #     if ts_data.shape == 0:
        #         pass
        #         #print(i, 'hovhov')     
        
        
    def step(self, ts, close_info, ts_preds):
        
        long_list = ts_preds[ts_preds['class'] == 1].index.values
        short_list = ts_preds[ts_preds['class'] == 0].index.values
        
        
        # if any open positions
        if (len(self.open_long_positions) > 0) or (len(self.open_short_positions) > 0):
            # check if any new positions are made
            new_buy = long_list[~np.isin(long_list, self.open_long_positions)]
            new_short = short_list[~np.isin(short_list, self.open_short_positions)] 
            #print(f'new_buy: {new_buy}')
            #print(f'new_short: {new_short}')
            
            print(self.open_long_positions,'\n') #[~np.isin(self.open_long_positions, long_list)])
            print(np.isin(self.open_long_positions, long_list),'\n')
            print(long_list,'\n')
        
            # check if any are closed
            close_buy = self.open_long_positions[~np.isin(self.open_long_positions, long_list)] 
            close_short = self.open_short_positions[~np.isin(self.open_short_positions, short_list)] 
            print(f'close_buy: {close_buy}')
            print(f'close_short: {close_short}')            
             
        
preds = preds.set_index(X_test.index) #.loc[(20200526,0)]

backtest_obj = backtest(X_test=X_test,
                        data=data, 
                        preds=preds, 
                        max_steps=1, 
                        max_positions = 50)
backtest_obj.run()


['AAPL' 'ABT'] 

[False False] 

['ABBV' 'BAC' 'BP' 'CCI' 'CHL' 'CSGP' 'D' 'ECL' 'KO' 'LIN' 'MSFT' 'PBR'
 'RTX' 'UNP' 'UPS' 'WMT'] 

close_buy: ['AAPL' 'ABT']
close_short: ['BAC' 'KO']
run function finished at step 1, time: [       0 20200526]




In [64]:
preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,class
days,timestamps,ticker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20200526,0,AAPL,0.416253,0.368611,0.215136,0
20200526,0,ABBV,0.329215,0.408036,0.262749,1
20200526,0,ABT,0.186563,0.231664,0.581773,2
20200526,0,AEP,0.148577,0.133585,0.717838,2
20200526,0,AMT,0.513806,0.248181,0.238014,0
...,...,...,...,...,...,...
20200529,387,TSM,0.129696,0.530026,0.340278,1
20200529,387,UNP,0.395135,0.284087,0.320778,0
20200529,387,UPS,0.222739,0.542764,0.234497,1
20200529,387,V,0.276288,0.431778,0.291934,1


In [52]:
data

Unnamed: 0,Unnamed: 1,open,high,low,close,spread_open,spread_high,spread_low,spread_close,bidsize_open,bidsize_high,bidsize_low,bidsize_close,ofrsize_open,ofrsize_high,ofrsize_low,ofrsize_close,Ticker,sector
20200501,0,286.250,289.260,285.870,289.260,0.50,0.50,0.01,0.24,6.0,95.0,1.0,10.0,1.0,85.0,1.0,4.0,AAPL,Technology
20200501,1,289.260,289.350,288.365,289.020,0.24,0.45,0.01,0.10,9.0,20.0,1.0,1.0,4.0,56.0,1.0,1.0,AAPL,Technology
20200501,2,289.035,289.705,288.280,288.580,0.07,0.49,0.01,0.30,1.0,50.0,1.0,1.0,1.0,13.0,1.0,1.0,AAPL,Technology
20200501,3,288.485,289.315,288.280,289.095,0.49,0.49,0.01,0.17,1.0,25.0,1.0,16.0,1.0,8.0,1.0,1.0,AAPL,Technology
20200501,4,289.100,290.435,288.940,290.320,0.16,0.33,0.01,0.10,13.0,71.0,1.0,1.0,1.0,236.0,1.0,1.0,AAPL,Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,385,123.950,124.110,123.910,124.100,0.02,0.07,0.01,0.04,1.0,11.0,1.0,1.0,5.0,9.0,1.0,1.0,WMT,Consumer Defensive
20200529,386,124.085,124.085,123.920,123.995,0.01,0.06,0.01,0.01,1.0,8.0,1.0,3.0,1.0,9.0,1.0,2.0,WMT,Consumer Defensive
20200529,387,123.995,124.355,123.985,124.335,0.01,0.07,0.01,0.05,4.0,16.0,1.0,2.0,2.0,10.0,1.0,2.0,WMT,Consumer Defensive
20200529,388,124.335,124.355,124.060,124.075,0.05,0.12,0.01,0.01,3.0,6.0,1.0,2.0,2.0,10.0,1.0,4.0,WMT,Consumer Defensive


In [176]:
ts_preds = preds.loc[(20200526, 0)]
ts_preds[ts_preds['class'] == 1].index.values

array(['AAPL', 'ABBV', 'ABT', 'AEP', 'AMT', 'APD', 'BABA', 'BHP', 'BP',
       'CCI', 'CHL', 'COST', 'CSGP', 'D', 'DIS', 'ECL', 'ENB', 'EXC',
       'FB', 'GOOG', 'INTC', 'JNJ', 'LFC', 'LIN', 'LMT', 'MA', 'MCD',
       'MSFT', 'NKE', 'NVDA', 'NVS', 'PEP', 'PFE', 'PLD', 'PTR', 'PYPL',
       'SHW', 'SNP', 'SO', 'SRE', 'T', 'TSLA', 'TSM', 'UNP', 'UPS', 'V',
       'WMT'], dtype=object)

In [208]:
long_list = ts_preds[ts_preds['class'] == 1].index.values
short_list = ts_preds[ts_preds['class'] == 0].index.values

print(long_list, short_list,'\n\n')

open_long_positions = ['AAPL','ABT']
open_short_positions = ['BAC','KO']

# check if any new positions are made
# if any open positions
if (len(open_long_positions) > 0) or (len(open_short_positions) > 0):
    new_buy = long_list[~np.isin(long_list, open_long_positions)]
    new_short = short_list[~np.isin(short_list, open_short_positions)] 
    
new_buy, new_short

['AAPL' 'ABBV' 'ABT' 'AEP' 'AMT' 'APD' 'BABA' 'BHP' 'BP' 'CCI' 'CHL'
 'COST' 'CSGP' 'D' 'DIS' 'ECL' 'ENB' 'EXC' 'FB' 'GOOG' 'INTC' 'JNJ' 'LFC'
 'LIN' 'LMT' 'MA' 'MCD' 'MSFT' 'NKE' 'NVDA' 'NVS' 'PEP' 'PFE' 'PLD' 'PTR'
 'PYPL' 'SHW' 'SNP' 'SO' 'SRE' 'T' 'TSLA' 'TSM' 'UNP' 'UPS' 'V' 'WMT'] ['BA' 'BAC' 'FMX' 'KO' 'PBR' 'PSA' 'RTX' 'TM'] 




(array(['ABBV', 'AEP', 'AMT', 'APD', 'BABA', 'BHP', 'BP', 'CCI', 'CHL',
        'COST', 'CSGP', 'D', 'DIS', 'ECL', 'ENB', 'EXC', 'FB', 'GOOG',
        'INTC', 'JNJ', 'LFC', 'LIN', 'LMT', 'MA', 'MCD', 'MSFT', 'NKE',
        'NVDA', 'NVS', 'PEP', 'PFE', 'PLD', 'PTR', 'PYPL', 'SHW', 'SNP',
        'SO', 'SRE', 'T', 'TSLA', 'TSM', 'UNP', 'UPS', 'V', 'WMT'],
       dtype=object),
 array(['BA', 'FMX', 'PBR', 'PSA', 'RTX', 'TM'], dtype=object))

In [209]:
open_long_positions[int(~np.isin(open_long_positions, long_list))]

TypeError: only size-1 arrays can be converted to Python scalars

In [211]:
~np.isin(open_long_positions, long_list)

array([False, False])

In [215]:
# check if any are closed
close_buy = open_long_positions[~np.isin(open_long_positions, long_list)] 
close_short = open_short_positions[~np.isin(open_short_positions, short_list)] 

TypeError: only integer scalar arrays can be converted to a scalar index

In [191]:
np.isin(long_list, open_long_positions, invert=True)

array([], dtype=bool)

In [158]:
for ts in X_test.reset_index().groupby(['days','timestamps']).size().index[:2]:
    #print(i)
    ts_data = data.loc[(ts[0], ts[1])]
    if ts_data.shape == 0:
        pass

  This is separate from the ipykernel package so we can avoid doing imports until


In [162]:
ts_data[['close','spread_close']]

20200526  1     323.220
          1      93.110
          1      92.885
          1      79.700
          1     248.845
          1     243.805
          1     145.270
          1     206.010
          1      23.835
          1      46.610
          1      23.385
          1     160.760
          1      35.665
          1     305.560
          1     675.770
          1      82.060
          1     121.575
          1     203.670
          1      32.245
          1      37.970
          1     238.925
          1      69.170
          1    1437.280
          1      63.520
          1     145.605
          1      46.810
          1       9.430
          1     194.000
          1     374.935
          1     303.180
          1     190.055
          1     185.360
          1      96.175
          1     364.760
          1      85.290
          1       7.725
          1     131.645
          1      37.665
          1      90.830
          1     193.840
          1      34.395
          1     

In [128]:
#tmp = [(i,j) for i,j in zip(X_test.index.get_level_values(0),X_test.index.get_level_values(1))]

X_test.reset_index().groupby(['days','timestamps']).size()


#np.unique(X_test.index.get_level_values(1))

days      timestamps
20200526  0             55
          1             55
          2             55
          3             55
          4             55
                        ..
20200529  383           55
          384           55
          385           55
          386           55
          387           55
Length: 1558, dtype: int64

In [155]:
# 20200526  0
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,d_ticker_SO,d_ticker_SRE,d_ticker_T,d_ticker_TM,d_ticker_TSLA,d_ticker_TSM,d_ticker_UNP,d_ticker_UPS,d_ticker_V,d_ticker_WMT
days,timestamps,ticker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
20200526,0,AAPL,2.065833,1.816550,0.009023,0.662205,-0.097519,-0.120272,-0.210994,-0.248567,-0.193662,-0.147184,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABBV,0.709255,0.367430,0.388086,-0.371621,-0.204518,-0.208006,-0.210994,-0.216011,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABT,0.488074,0.164865,0.024817,-0.372497,-0.017269,-0.013041,-0.210994,-0.264844,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AEP,0.075202,-0.131192,0.182760,-0.431954,0.022856,-0.130020,-0.210994,-0.183455,-0.137578,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AMT,0.399601,1.458166,-2.960304,0.323818,0.116480,2.365538,-0.125019,1.297838,-0.193662,-0.240450,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,387,TSM,0.266893,-0.084446,0.388086,-0.562973,-0.244643,-0.305489,-0.210994,-0.297400,-0.006715,-0.061689,...,0,0,0,0,0,1,0,0,0,0
20200529,387,UNP,0.045712,-0.162356,0.166966,-0.026092,-0.177768,-0.217755,-0.210994,-0.264844,-0.174967,-0.178273,...,0,0,0,0,0,0,1,0,0,0
20200529,387,UPS,0.222657,-0.084446,0.293320,-0.340063,-0.204518,-0.246999,-0.210994,-0.297400,-0.193662,-0.178273,...,0,0,0,0,0,0,0,1,0,0
20200529,387,V,0.001475,-0.006537,0.214349,0.090193,-0.151018,-0.217755,-0.210994,-0.248567,-0.193662,-0.131639,...,0,0,0,0,0,0,0,0,1,0


In [132]:
np.unique(X_test.index.get_level_values(1))

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [134]:
hej = np.concatenate([[[i,j] for i in np.unique(X_test.index.get_level_values(1))] \
                              for j in np.unique(X_test.index.get_level_values(0))])
hej

array([[       0, 20200526],
       [       1, 20200526],
       [       2, 20200526],
       ...,
       [     387, 20200529],
       [     388, 20200529],
       [     389, 20200529]], dtype=int64)

In [152]:
def t1():
    for i in hej[:]:
        #print(i)
        try:
            timestep_data = X_test.loc[(i[1], i[0])]
        except:
            pass
        if timestep_data.shape == 0:
            pass
            #print(i, 'hovhov')

In [153]:
def t2():
    for i in X_test.reset_index().groupby(['days','timestamps']).size().index[:]:
        #print(i)
        timestep_data = X_test.loc[(i[0], i[1])]
        if timestep_data.shape == 0:
            pass
            #print(i, 'hovhov')

In [154]:
%timeit t1()
%timeit t2()

470 ms ± 2.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
505 ms ± 6.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [137]:
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,open_lag0,high_lag0,low_lag0,close_lag0,spread_open_lag0,spread_high_lag0,spread_low_lag0,spread_close_lag0,bidsize_open_lag0,bidsize_high_lag0,...,d_ticker_SO,d_ticker_SRE,d_ticker_T,d_ticker_TM,d_ticker_TSLA,d_ticker_TSM,d_ticker_UNP,d_ticker_UPS,d_ticker_V,d_ticker_WMT
days,timestamps,ticker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
20200526,0,AAPL,2.065833,1.816550,0.009023,0.662205,-0.097519,-0.120272,-0.210994,-0.248567,-0.193662,-0.147184,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABBV,0.709255,0.367430,0.388086,-0.371621,-0.204518,-0.208006,-0.210994,-0.216011,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,ABT,0.488074,0.164865,0.024817,-0.372497,-0.017269,-0.013041,-0.210994,-0.264844,-0.193662,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AEP,0.075202,-0.131192,0.182760,-0.431954,0.022856,-0.130020,-0.210994,-0.183455,-0.137578,-0.224906,...,0,0,0,0,0,0,0,0,0,0
20200526,0,AMT,0.399601,1.458166,-2.960304,0.323818,0.116480,2.365538,-0.125019,1.297838,-0.193662,-0.240450,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200529,387,TSM,0.266893,-0.084446,0.388086,-0.562973,-0.244643,-0.305489,-0.210994,-0.297400,-0.006715,-0.061689,...,0,0,0,0,0,1,0,0,0,0
20200529,387,UNP,0.045712,-0.162356,0.166966,-0.026092,-0.177768,-0.217755,-0.210994,-0.264844,-0.174967,-0.178273,...,0,0,0,0,0,0,1,0,0,0
20200529,387,UPS,0.222657,-0.084446,0.293320,-0.340063,-0.204518,-0.246999,-0.210994,-0.297400,-0.193662,-0.178273,...,0,0,0,0,0,0,0,1,0,0
20200529,387,V,0.001475,-0.006537,0.214349,0.090193,-0.151018,-0.217755,-0.210994,-0.248567,-0.193662,-0.131639,...,0,0,0,0,0,0,0,0,1,0


In [106]:
np.unique(( X_test.index.get_level_values(0).values, X_test.index.get_level_values(1).values ))

array([       0,        1,        2,        3,        4,        5,
              6,        7,        8,        9,       10,       11,
             12,       13,       14,       15,       16,       17,
             18,       19,       20,       21,       22,       23,
             24,       25,       26,       27,       28,       29,
             30,       31,       32,       33,       34,       35,
             36,       37,       38,       39,       40,       41,
             42,       43,       44,       45,       46,       47,
             48,       49,       50,       51,       52,       53,
             54,       55,       56,       57,       58,       59,
             60,       61,       62,       63,       64,       65,
             66,       67,       68,       69,       70,       71,
             72,       73,       74,       75,       76,       77,
             78,       79,       80,       81,       82,       83,
             84,       85,       86,       87,       88,      