In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy
import datetime
import ta
import yfinance as yf
import tensorflow as tf
import tensorflow.compat.v2.feature_column as fc
from IPython.display import clear_output
import pyodbc

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, log_loss

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.exceptions import ConvergenceWarning 
from sklearn import ensemble
# ConvergenceWarning('ignore')
# Do you wanna see?
verbose = True

import sys
sys.path.append('../')

from utils.data_extraction import load_data,strList
from utils.data_cleaning import HFDataCleaning
from utils.generate_features import candleCreateNP_vect_final,\
                                    generateFeatures_final
from utils.preprocessing_features_and_labels import extract_labels,\
                                                    align_features_and_labels,\
                                                    pre_processing_initial,\
                                                    pre_processing_extended,\
                                                    pre_processing
from utils.models import make_input_fn
from utils.models import performanceTesting,scoreFunction

### Load data etc

In [4]:
stockInfo = pd.read_csv('../utils/stockInfo.csv',header=[0,1])

In [5]:
# How many are there contained in the file?
print('There are',
      stockInfo.shape[0],
      'tickers contained in the file, and there is data on',
      stockInfo.isnull().sum()[1:].min(),'of them.')

There are 8850 tickers contained in the file, and there is data on 5306 of them.


## Sneak peak on the 10 largest companies

In [None]:

stockInfo.sort_values(('2020-07-05','marketCap'),ascending=False).head(10)

## How is the tickers divided in sectors?

In [None]:
stockInfo.loc[:,('2020-07-05')].groupby(['sector']).count()#rename(None,axis=1)

## Lets get the X largest companies in each sector.

In [None]:
sectors = stockInfo.loc[:,('2020-07-05')].dropna(axis=0).sector.unique()

X = 5

topXsectors = pd.DataFrame(index = np.arange(X),columns = pd.MultiIndex.from_product([['Top {}'.format(X)],sectors]))

t1 = pd.DataFrame({'tickers':stockInfo.loc[:,'date'].ticker.values})
t1[stockInfo.loc[:,'2020-07-05'].columns] = stockInfo.loc[:,'2020-07-05']

for i,sector in enumerate(sectors):
    
    tempSec = t1[t1.sector==sector].sort_values('marketCap',
                                                      ascending=False).dropna(axis=0).values.T
    
    Y = len(tempSec[0][0:X]) 

    topXsectors.loc[0:(Y-1),('Top {}'.format(X),sector)] = tempSec[0][0:Y]

In [None]:
topXsectors

In [None]:
stockInfo.head()

In [None]:
'IYZ' in stockInfo.loc[:,('date','ticker')].values

In [None]:
sectors

In [None]:
## IShares Sector ETFS

# iShares Dow Jones U.S. Basic Materials Index:     IYM
# iShares Dow Jones U.S. Consumer Goods Index:      IYK
# iShares Dow Jones U.S. Consumer Services Index:   IYC
# iShares Dow Jones U.S. Energy Index:              IYE
# iShares Dow Jones U.S. Financial Sector Index:    IYF
# iShares Dow Jones U.S. Financial Services Index:  IYG
# iShares Dow Jones U.S. Healthcare Index:          IYH
# iShares Dow Jones U.S. Industrial Index:          IYJ
# iShares Dow Jones U.S. Real Estate Index:         IYR
# iShares Dow Jones U.S. Technology Index:          IYW
# iShares Dow Jones U.S. Telecommunications Index:  IYZ
# iShares Dow Jones Transportation Average Index:   IYT
# iShares Dow Jones U.S. Utilities Index:           IDU
# iShares Cohen & Steers Realty Majors Index:       ICF

etfs = ['IYH','IYM','IYK','IYJ','IYG','IYW','IYC','IYR','IDU','IYZ','IYE','IYF']
# {i:j for i,j in zip(sectors,etfs)}
pd.DataFrame({i:j for i,j in zip(sectors,etfs)},index=[0])

## Let's extract some data now!

In [2]:
def load_data_fast(dates, tickers, dataNeeded, path, verbose,aggHorizon = 1):
    # Measuring the exraction time
    start = time.time()
    
    allFiles = os.listdir(path)
    
    if verbose:
        print(len(allFiles), allFiles[:5], allFiles[-5:])
        print(allFiles[-10:])
    
    # Extracting just the dates of each file
    allDates = np.array([re.split("[._]",ele)[1] if ("." in ele ) & ("_" in ele) else 0 for ele in allFiles]).astype(int)

    minDate = np.min(dates)
    maxDate = np.max(dates)
    
    if verbose:
        print('##### Date range #####\n\nDate, Min: %i\nDate, Max: %i\n'%(minDate,maxDate))
        print('\n1 Lap time: %.3f\n' % ((time.time()-start)))

    # Locating what files we need.
    index = np.where((minDate <= allDates) & (allDates <= maxDate))
    
    relevantFiles = np.array(allFiles)[index[0]]
    
    # Separating the files into trade and quote files.
    trade = [ele for ele in relevantFiles if 'trade' in ele]
    quote = [ele for ele in relevantFiles if 'quote' in ele]
    
    if verbose:
        print('##### Data Extraction begins #####\n')

        if dataNeeded.lower() == 'both':
            print('Both trade and quote data is being extracted..\n')
        else:
            print('%s data is being extracted..\n' % dataNeeded[0:5])
        
        print('\n2 Lap time: %.3f\n' % ((time.time()-start)))
    
    if (dataNeeded == 'both') | (dataNeeded == 'quotes'):

        # Now to the quote data
        for i,file in enumerate(quote):

#             if (verbose) & (i == 0):
#                 print('### Quote Data ###\n')

            # Reading one file at a time
            raw_data = h5py.File(path+'/'+file,'r')
            dt2 = raw_data['Quotes'].dtype
            if verbose:
                print('3 Lap time: %.3f' % ((time.time()-start)))
            
            
            # Store the trade indecies
            QI = raw_data['QuoteIndex']
            
            if verbose:
                print('4 Lap time: %.3f' % ((time.time()-start)))
#             if (verbose) & (i==0):
#                 print('The raw H5 quote file contains: ',list(raw_data.keys()),'\n')

            # Extracting just the tickers
            QIC = np.array([ele[0].astype(str).strip() for ele in QI])
            
            if verbose:
                print('5 Lap time: %.3f' % ((time.time()-start)))
            
            pos_start = []
            pos_range = []
            # Lets get data on each ticker for the file processed at the moment
            for j,ticker in enumerate(tickers):
                
                tickerInfo = QI[QIC==ticker][0]
                pos_start.append(tickerInfo[1])
                pos_range.append(tickerInfo[2])
            
            if verbose:
                print('6 Lap time: %.3f' % ((time.time()-start)))

            # use boolean mask to slice all at once
            selector = zip(pos_start, pos_range) 
            mask = np.zeros(raw_data['Quotes'].shape[0], dtype=bool)
            for (pos_start, pos_range) in selector:
                mask[pos_start : pos_start + pos_range] = True
            tempData = raw_data['Quotes'][mask]
            
            if verbose:
                print('7 Lap time: %.3f' % ((time.time()-start)))

            # For first file and first ticker.
            if (i == 0):

                quoteData = pd.DataFrame(tempData, columns= dt2.names)
                # We remove all unnecessary variables
                unnecessaryVariables = ['NationalBBOInd',
                                        'FinraBBOInd',
                                        'FinraQuoteIndicator',
                                        'SequenceNumber',
                                        'FinraAdfMpidIndicator',
                                        'QuoteCancelCorrection',
                                        'SourceQuote',
                                        'RPI',
                                        'ShortSaleRestrictionIndicator',
                                        'LuldBBOIndicator',
                                        'SIPGeneratedMessageIdent',
                                        'NationalBBOLuldIndicator',
                                        'ParticipantTimestamp',
                                        'FinraTimestamp',
                                        'FinraQuoteIndicator',
                                        'SecurityStatusIndicator']

                quoteData = quoteData.drop(columns=unnecessaryVariables)

                if verbose:
                    print('8 Lap time: %.3f' % ((time.time()-start)))


                quoteData.loc[:,'ex'] = strList(quoteData.ex)
                
                if verbose:
                    print('9 Lap time: %.3f' % ((time.time()-start)))

                quoteData.loc[:,'mode'] = strList(quoteData['mode'])
                
                if verbose:
                    print('10 Lap time: %.3f' % ((time.time()-start)))

                # Adding the date of the file to the dataframe.
                quoteData['Date'] = re.split('[._]',
                                             file)[1]
                if verbose:
                    print('11 Lap time: %.3f' % ((time.time()-start)))

                # Adding a more readable timestamp - TEST IT
                dates = pd.to_datetime(quoteData.loc[:,'Date'], format='%Y%m%d', errors='ignore')
                times = pd.to_timedelta(quoteData.loc[:,'utcsec'])
                quoteData['Timestamp'] = dates + times

                if verbose:
                    print('12 Lap time: %.3f' % ((time.time()-start)))

                quoteData['Hour'] = quoteData.Timestamp.dt.hour
                quoteData['Minute'] = quoteData.Timestamp.dt.minute
                # Adding the ticker
                quoteData['Ticker'] = ticker
                
            else:

                # Storing the data on the following tickers in a temporary variable.

                temp = pd.DataFrame(tempData, columns= dt2.names)
                # Removing all unnecessary variables
                temp = temp.drop(columns=unnecessaryVariables)
                
                if verbose:
                    print('8 Lap time: %.3f' % ((time.time()-start)))

                temp.loc[:,'ex'] = strList(temp.ex)
                
                if verbose:
                    print('9 Lap time: %.3f' % ((time.time()-start)))
                
                temp.loc[:,'mode'] = strList(temp['mode'])
                
                if verbose:
                    print('10 Lap time: %.3f' % ((time.time()-start)))
                
                # Adding the date of the file to the dataframe.
                temp['Date'] = re.split('[._]',file)[1]
                
                if verbose:
                    print('11 Lap time: %.3f' % ((time.time()-start)))
                
                # Adding a more readable timestamp - TEST IT
#                 temp['Timestamp'] = pd.to_datetime(formatDate(re.split('[._]',file)[1],temp.utcsec))
                dates = pd.to_datetime(temp.loc[:,'Date'], format='%Y%m%d', errors='ignore')
                times = pd.to_timedelta(temp.loc[:,'utcsec'])
                temp['Timestamp'] = dates + times


                if verbose:
                    print('12 Lap time: %.3f' % ((time.time()-start)))
                
#                 temp['TSRemainder'] = list(map(lambda x: str(x)[11:], temp.utcsec))
                temp['Hour'] = temp.Timestamp.dt.hour
                temp['Minute'] = temp.Timestamp.dt.minute

                # Adding the ticker
                temp['Ticker'] = ticker

                # Adding the new data
                quoteData = pd.concat([quoteData,temp])
            
            # Closing the file after having used it.
            raw_data.close()
    end = time.time()
    
    quoteData = quoteData.reset_index(drop=True)
    print('The extraction time was %.3f seconds.' % (end-start))

    quoteData.loc[:,'price'] = (quoteData.bid + quoteData.ofr) / 2
    quoteData.loc[:,'spread'] = quoteData.ofr - quoteData.bid
    
    # Cleaning the data
    cleanedData = HFDataCleaning(['P1_2','p2', 'q2', 'p3'],quoteData,'quote',['q'])
    
    # Creating candles
    candles = candleCreateNP_vect_final(data = cleanedData,
                                       step = aggHorizon,
                                        verbose=False,
                                        fillHoles=True,
                                        sample='full',
                                        numpied=True
                                       ,return_spreads=True)
    
    return candles#quoteData

In [3]:
# print(os.listdir())
try:
    path = 'a:/taqhdf5'  #'a:/taqhdf5'
    os.listdir(path)
except:
    path = 't:/taqhdf5'  #'a:/taqhdf5'
    os.listdir(path)    
# allFiles = os.listdir(path)
# print(len(allFiles), allFiles[:5], allFiles[-5:])
# print(allFiles[-10:])

#dates = np.array(['2020040' + str(i) if i < 10 else '202004' + str(i) for i in np.arange(1,16)]).astype(int)
dates = np.array(['20200401']).astype(int)#,'20200402','20200403','20200406','20200407'

# Provide a list of tickers of interest
tickers = ['GOOG']#'MSFT'

# Do we need data on trades, quotes or both?
dataNeeded = 'quotes' # 'trades', 'quotes' or 'both'



# run load_data()
candles = load_data_fast(dates, tickers, dataNeeded, path, verbose)
# if dataNeeded == 'trades':
#     tradeData = load_data(dates, tickers, dataNeeded, path, verbose)
# elif dataNeeded == 'quotes':
#     quoteData = load_data(dates, tickers, dataNeeded, path, verbose)
# elif dataNeeded == 'both':
#     tradeData, quoteData = load_data(dates, tickers, dataNeeded, path, verbose)

8558 ['taq_19930315.h5', 'taq_19930104.h5', 'taq_19930317.h5', 'taq_19930105.h5', 'taq_19930316.h5'] ['taqquote_20200623.h5', 'taqquote_20200624.h5', 'taqquote_20200625.h5', 'taqquote_20200626.h5', 'taqquote_20200629.h5']
['taqquote_20200605.h5', 'taqquote_20200610.h5', 'taqquote_20200615.h5', 'taqquote_20200619.h5', 'taqquote_20200622.h5', 'taqquote_20200623.h5', 'taqquote_20200624.h5', 'taqquote_20200625.h5', 'taqquote_20200626.h5', 'taqquote_20200629.h5']
##### Date range #####

Date, Min: 20200401
Date, Max: 20200401


1 Lap time: 2.082

##### Data Extraction begins #####

quote data is being extracted..


2 Lap time: 2.090

3 Lap time: 2.545
4 Lap time: 2.559
5 Lap time: 11.931
6 Lap time: 11.935
7 Lap time: 79.992
8 Lap time: 81.187
9 Lap time: 82.729
10 Lap time: 84.175
11 Lap time: 84.189
12 Lap time: 87.372
The extraction time was 87.771 seconds.


In [8]:
candles = copy.deepcopy(quoteData)

In [9]:
quoteData.shape,candles.shape

((450, 4), (450, 4))

## Data Cleaning, Feature Engineering & Pre Processing

# Implemented technical features

A library: https://technical-analysis-library-in-python.readthedocs.io/en/latest/

### Features used in the literature:

* Stochastic K
* Stochastic D
* Slow Stochastic D
* Momentum/difference
* ROC
* Williams % R
* A/D Oscillator
* Disparity 5
* Disparity 10
* Price Oscillator - (detrended)
* Commodity Channel Index
* RSI

Formulas: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=876544

* Moving Average
* Exponential Moving Average
* True Range - (Average)

Formulas: https://www.sciencedirect.com/science/article/pii/S0957417407001819?via%3Dihub

#### Other Technical Features
* Moving Average Convergence Divergence (MACD)

**Non-classical technical features** - **NOT IMPLEMENTED**

* Bid/Ask prices of top of book
* Spread and mid price based on top og book
* Price derivatives

Formulas: https://www.tandfonline.com/doi/full/10.1080/14697688.2015.1032546?instName=UCL+%28University+College+London%29

In [92]:
def generateFeatures_final_test(data,listOfFeatures=[],feature_lags=1):
    # The input data is build up as follows:
    # Open, high, low and close.
    dataPD = pd.DataFrame({'open':data[:,0],
                             'high':data[:,1],
                             'low':data[:,2],
                             'close':data[:,3]})
    featuresPD = pd.DataFrame()

    for feature in listOfFeatures:

        # Past observations
        if feature.lower() == 'pastobs':
            featuresPD['open'] = dataPD.open
            featuresPD['high'] = dataPD.high
            featuresPD['low'] = dataPD.low
            featuresPD['close'] = dataPD.close

        # Stochastic K
        elif feature.lower() == 'stok':

            tempFeatures= ta.momentum.stoch(dataPD.high,
                                            dataPD.low,
                                            dataPD.close)
            # The below is implemented as Stochastic D at the moment.
            # tempFeatures= ta.momentum.stoch_signal(dataPD.high,
            #                                 dataPD.low,
            #                                 dataPD.close)
            # Adding the feature
            featuresPD['stok'] = tempFeatures

        # Stochastic D
        elif feature.lower() == 'stod':

            tempFeatures= ta.momentum.stoch_signal(dataPD.high,
                                                   dataPD.low,
                                                   dataPD.close)
            # Adding the feature
            featuresPD['stod'] = tempFeatures

        # Slow Stochastic D
        elif feature.lower() == 'sstod':

            tempFeatures= ta.trend.sma_indicator(ta.momentum.stoch_signal(dataPD.high,
                                                                          dataPD.low,
                                                                          dataPD.close))
            # Adding the feature
            featuresPD['sstod'] = tempFeatures

        # Williams %R
        elif feature.lower() == 'wilr':

            tempFeatures= ta.momentum.wr(dataPD.high,
                                         dataPD.low,
                                         dataPD.close)
            # Adding the feature
            featuresPD['wilr'] = tempFeatures

        # Rate Of Change
        elif feature.lower() == 'roc':

            tempFeatures= ta.momentum.roc(dataPD.close)

            # Adding the feature
            featuresPD['roc'] = tempFeatures

        # Relative Strength Index
        elif feature.lower() == 'rsi':

            tempFeatures= ta.momentum.rsi(dataPD.close)

            # Adding the feature
            featuresPD['rsi'] = tempFeatures

        # Average True Range
        elif feature.lower() == 'atr':

            tempFeatures= ta.volatility.average_true_range(dataPD.high,
                                                           dataPD.low,
                                                           dataPD.close)
            # Adding the feature
            featuresPD['atr'] = tempFeatures

        # Commodity Channel Index
        elif feature.lower() == 'cci':

            tempFeatures= ta.trend.cci(dataPD.high,
                                       dataPD.low,
                                       dataPD.close)
            # Adding the feature
            featuresPD['cci'] = tempFeatures

         # Detrended Price Ocillator
        elif feature.lower() == 'dpo':

            tempFeatures= ta.trend.dpo(dataPD.close)

            # Adding the feature
            featuresPD['dpo'] = tempFeatures

        # Simple Moving Average
        elif feature.lower() == 'sma':

            tempFeatures= ta.trend.sma_indicator(dataPD.close)

            # Adding the feature
            featuresPD['sma'] = tempFeatures

        # Exponential Moving Average
        elif feature.lower() == 'ema':

            tempFeatures= ta.trend.ema_indicator(dataPD.close)

            # Adding the feature
            featuresPD['ema'] = tempFeatures

        # Moving Average Convergence Divergence
        elif feature.lower() == 'macd':

            # note: having all 3 causes multicollinearity. Maybe not a problem in ML, let's see :-)
            # macd is the difference between two EMAs
            # macd_signal is an EMA of the above macd line
            # macd_diff is the so-called histogram (just bars really) of the time-wise difference between macd and macd_signal

            # Adding the features
            featuresPD['macd'] = ta.trend.macd(dataPD.close)
            featuresPD['macd_diff'] = ta.trend.macd_diff(dataPD.close)
            featuresPD['macd_signal'] = ta.trend.macd_signal(dataPD.close)

         # Disparity 5
        elif feature.lower() == 'dis5':

            tempFeatures= (dataPD.close/ta.trend.sma_indicator(dataPD.close,5))*100

            # Adding the feature
            featuresPD['dis5'] = tempFeatures

        # Disparity 10
        elif feature.lower() == 'dis10':

            tempFeatures= (dataPD.close/ta.trend.sma_indicator(dataPD.close,10))*100

            # Adding the feature
            featuresPD['dis10'] = tempFeatures

        # Bollinger Bands
        elif feature.lower() == 'bb':

            # Define Bollinger Bands function to extract from
            bb_function = ta.volatility.BollingerBands(close=dataPD.close, n=20, ndev=2)

            # Adding the features
            featuresPD['bb_mavg'] = bb_function.bollinger_mavg()
            featuresPD['bb_hband'] = bb_function.bollinger_hband()
            featuresPD['bb_lband'] = bb_function.bollinger_lband()
            featuresPD['bb_pband'] = bb_function.bollinger_pband()
            featuresPD['bb_wband'] = bb_function.bollinger_wband()

    # if we want any lags:
    if feature_lags > 0:

        # collect names of all raw features (before any lagging)
        all_raw_features = featuresPD.columns

        # loop through each lag and shift all features at once
        for roll_i in np.arange(feature_lags + 1): # + 1 as we treat feature_lags = 1 as having both lag0 and lag1

            # define new column name (feature_name_ + lagX) where X = roll_i is the shifting parameter
            new_col_names = [feature_name + '_lag' + str(roll_i) for feature_name in all_raw_features]

            # Shift/roll all raw features with the shifting parameter roll_i and save as new columns.
            # The shift parameter must be negative (we want lag0 to be the 'newest'/'latest')
            featuresPD[new_col_names] = featuresPD[all_raw_features].shift( - (feature_lags - roll_i))

        # remove all raw features
        featuresPD = featuresPD.loc[:, ~featuresPD.columns.isin(all_raw_features)]

        # Adjust price feature
    if 'pastobs' in listOfFeatures:
        if feature_lags > 0:
            priceCols = np.concatenate([[c for c in featuresPD.columns if t in c] for t in ['open','high','low','close']])
            print(priceCols)
            tempClose = featuresPD.close_lag0.copy(deep=True)
#             print('\n')
            
#             featuresPD.loc[:,priceCols] = featuresPD.loc[:,priceCols] - featuresPD.close_lag0
            featuresPD.loc[:,priceCols] = featuresPD.loc[:,priceCols].subtract(featuresPD.close_lag0,axis=0)
#             print('\n')
#             print([featuresPD.loc[:,priceCols] - featuresPD.close_lag0][0:5])
#             print(tempClose)
            featuresPD.loc[:,'close_lag0'] = tempClose
        else:
#             tempClose = copy.deepcopy(featuresPD.close.values)
            tempClose = featuresPD.close.copy(deep=True)
            
#             print(tempClose)
#             featuresPD.loc[:,['open','high','low','close']] = featuresPD.loc[:,['open','high','low','close']] - featuresPD.close
            featuresPD.loc[:,['open','high','low','close']] = featuresPD.loc[:,['open','high','low','close']].subtract(featuresPD.close,axis=0)
#             print('\n')
#             print(featuresPD.loc[:,['open','high','low','close']])
#             print(featuresPD.close)
#             print([featuresPD.loc[:,['open','high','low','close']] - featuresPD.close][0:5])
            featuresPD.loc[:,'close'] = tempClose

    return featuresPD

In [93]:
########### Clean data ###########
    
DATA_SAMPLE = 'full' # or 'stable'

# if DATA_SAMPLE == 'stable':
#     # P1 is used for keeping data within [9.5, 16]
#     cleanedData = HFDataCleaning(['P1','p2','t1','p3'],tradeData,'trade',['q'])
# elif DATA_SAMPLE == 'full':
#     # P1_2 is used for keeping data within [9, 16.5]
#     cleanedData = HFDataCleaning(['P1_2','p2', 'q2', 'p3'],quoteData,'quote',['q'])#'t1',tradeData # q2, quotedate
    
# ########### Construct Candles ################
# # candles = candleCreateNP_vect_final(cleanedData
# #                          ,1)

# candles = candleCreateNP_vect_final(data = cleanedData,
#                                        step = 1,
#                                         verbose=False,
#                                         fillHoles=True,
#                                         sample='stable',
#                                         numpied=True)

########### Generate Features ################

n_feature_lags = 1
features = generateFeatures_final_test(data = candles, 
                                  listOfFeatures = [
                                                    'pastobs',
                                                    'stok',
                                                    'stod',
                                                    'sstod',
                                                    'wilr',
                                                    'roc',
                                                    'rsi',
                                                    'atr',
                                                    'cci',
                                                    'dpo',
                                                    'sma',
                                                    'ema',
                                                    'macd',
                                                    'dis5',
                                                    'dis10',
                                                   ], 
                                   feature_lags = n_feature_lags)

########### Generate Labels ################

n_classes = 3

labels = extract_labels(data = candles, classes = n_classes, group_style = 'equal')

########### Align Data ################

# from imported function (see testing_preprocessing_features_and_labels.ipynb for thorough experimenting with all the cut-offs):    
X, y = align_features_and_labels(candles = candles, 
                                 prediction_horizon = 1, 
                                 features = features, 
                                 n_feature_lags = n_feature_lags, 
                                 n_classes = n_classes, # 5,
                                 safe_burn_in = False, 
                                 data_sample = 'full')

['open_lag0' 'open_lag1' 'high_lag0' 'high_lag1' 'low_lag0' 'low_lag1'
 'close_lag0' 'close_lag1']


In [91]:
features

Unnamed: 0,open_lag0,high_lag0,low_lag0,close_lag0,stok_lag0,stod_lag0,sstod_lag0,wilr_lag0,roc_lag0,rsi_lag0,...,atr_lag1,cci_lag1,dpo_lag1,sma_lag1,ema_lag1,macd_lag1,macd_diff_lag1,macd_signal_lag1,dis5_lag1,dis10_lag1
0,-3.110,0.300,-4.465,1131.650,,,,,,,...,0.000000,,,,,,,,,
1,-4.105,0.000,-4.105,1131.650,,,,,,,...,0.000000,,,,,,,,,
2,-0.165,2.905,-0.745,1128.745,,,,,,,...,0.000000,,,,,,,,,
3,0.000,0.000,0.000,1128.745,,,,,,,...,0.000000,,,,,,,,,
4,0.000,0.000,0.000,1128.745,,,,,,,...,0.000000,,,,,,,,99.84052,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.000,0.000,0.000,1105.750,42.611191,42.611191,45.292140,-57.388809,-0.494938,45.86519,...,1.823268,-50.315516,3.99175,1106.437500,1106.402517,-0.312364,-0.387984,0.075620,100.00000,100.0
446,0.000,0.000,0.000,1105.750,42.611191,42.611191,42.403679,-57.388809,-0.248083,45.86519,...,1.693035,-46.698794,1.39175,1105.979167,1106.302130,-0.341279,-0.333519,-0.007760,100.00000,100.0
447,0.000,0.000,0.000,1105.750,44.751381,43.324588,40.979548,-55.248619,0.000000,45.86519,...,1.572103,-44.776841,-1.38900,1105.750000,1106.217187,-0.360043,-0.281827,-0.078216,100.00000,100.0
448,0.000,0.000,0.000,1105.750,44.751381,44.037984,40.893485,-55.248619,0.000000,45.86519,...,1.459810,-44.588720,-1.55150,1105.750000,1106.145312,-0.370642,-0.233940,-0.136701,100.00000,100.0


In [86]:
X[['open_lag0','open_lag1']]

Unnamed: 0,open_lag0,open_lag1
0,2.475,4.055
1,3.980,6.450
2,-7.925,-3.480
3,-1.870,-9.795
4,0.320,-1.355
...,...,...
410,0.250,-1.000
411,0.000,0.250
412,0.000,0.000
413,0.000,0.000


In [68]:
quoteData

array([[1129.235, 1131.95 , 1129.16 , 1131.95 ],
       [1128.54 , 1131.95 , 1127.185, 1131.65 ],
       [1127.545, 1131.65 , 1127.545, 1131.65 ],
       ...,
       [1105.75 , 1105.75 , 1105.75 , 1105.75 ],
       [1105.75 , 1105.75 , 1105.75 , 1105.75 ],
       [1105.75 , 1105.75 , 1105.75 , 1105.75 ]])

In [67]:
candles[-10:]

array([[1105.75, 1105.75, 1105.75, 1105.75],
       [1105.75, 1105.75, 1105.75, 1105.75],
       [1105.75, 1105.75, 1105.75, 1105.75],
       [1104.75, 1105.75, 1104.75, 1105.75],
       [1106.  , 1106.  , 1105.75, 1105.75],
       [1105.75, 1105.75, 1105.75, 1105.75],
       [1105.75, 1105.75, 1105.75, 1105.75],
       [1105.75, 1105.75, 1105.75, 1105.75],
       [1105.75, 1105.75, 1105.75, 1105.75],
       [1105.75, 1105.75, 1105.75, 1105.75]])

In [53]:
t1 = pd.DataFrame({'0':np.random.randint(0,10,1000000),
                   '1':np.random.randint(0,10,1000000),
                   '2':np.random.randint(0,10,1000000)})

# t2 = pd.Series({'0':np.random.randint(0,10,10)})

In [28]:
t1.loc[:,['0','1','2']] = t1.loc[:,['0','1','2']] - t1.loc[:,'2']

In [30]:
t1.loc[:,['0','1','2']] - t1.loc[:,'2']

Unnamed: 0,0,1,2,0.1,1.1,2.1,3,4,5,6,7,8,9
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
5,,,,,,,,,,,,,
6,,,,,,,,,,,,,
7,,,,,,,,,,,,,
8,,,,,,,,,,,,,
9,,,,,,,,,,,,,


In [40]:
type(t1.loc[:,'2'])

pandas.core.series.Series

In [47]:
t1[['0','1','2']]

Unnamed: 0,0,1,2
0,6,6,9
1,7,0,7
2,7,7,2
3,7,5,5
4,0,8,8
5,5,5,7
6,3,0,1
7,4,2,6
8,2,0,3
9,4,0,9


In [54]:
%timeit t1[['0','1','2']].values-t1[['2']].values

63 ms ± 9.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [55]:
%timeit t1[['0','1','2']].subtract(t1.loc[:,'2'],axis=0)#tract

50 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
t1

Unnamed: 0,0,1,2
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,
7,,,
8,,,
9,,,


## Let's investigate the features a little bit

In [14]:
X.describe()

Unnamed: 0,open,high,low,close,stok,stod,sstod,wilr,roc,rsi,atr,cci,dpo,sma,ema,macd,macd_diff,macd_signal,dis5,dis10
count,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0
mean,1113.773258,1114.526629,1112.868174,1113.757767,45.965678,45.866528,45.018269,-54.034322,-0.033712,46.5108,1.667143,-29.389592,0.021178,1113.916745,1113.940554,-0.35279,0.034606,-0.387397,99.99509,99.988762
std,8.044471,8.083798,8.043112,8.063764,30.592797,28.17147,20.774258,30.592797,0.312338,9.771436,0.377615,104.655551,1.444503,7.880745,7.826475,0.898852,0.349058,0.80861,0.108266,0.158964
min,1097.575,1099.175,1097.375,1097.59,0.0,1.742517,10.260569,-100.0,-0.698726,22.065871,1.116435,-257.657044,-4.806,1099.746667,1099.899247,-2.605586,-0.615902,-2.200649,99.720445,99.63208
25%,1108.2075,1108.96875,1107.28625,1108.15875,16.986568,19.694642,28.553813,-83.013432,-0.259309,38.838962,1.416167,-106.727739,-0.875187,1108.346771,1108.331645,-1.08454,-0.195433,-1.020117,99.92199,99.885761
50%,1111.6325,1112.2125,1110.6025,1111.6125,43.191829,42.603679,42.059838,-56.808171,-0.094255,46.184344,1.593985,-51.517723,0.22075,1111.189167,1111.078969,-0.368386,-0.027618,-0.374518,99.994655,99.966343
75%,1121.4575,1122.06,1120.4925,1121.5675,73.043017,71.13108,61.557961,-26.956983,0.125682,53.717224,1.833109,52.922946,1.030063,1120.916979,1121.278897,0.259694,0.219287,0.211788,100.064057,100.075917
max,1129.185,1129.885,1128.145,1129.09,100.0,99.180868,90.629735,-0.0,1.041685,69.924603,2.97088,313.669065,3.248,1127.830833,1127.38895,1.816809,1.152648,1.328417,100.418115,100.571381


## Standardization, Normalization (MinMax), Norm-Scaling, Quantile and Power Transformation

**Inspiration:**

* [ScikitLearn Overview](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py)
* [A Note on Feature Scaling and Normalization](http://sebastianraschka.com/Articles/2014_about_feature_scaling.html)

In [16]:
# Setting up the Scalers!
mm_scaler = MinMaxScaler()
scaler = StandardScaler()
norm_scaler = Normalizer()
pt = PowerTransformer()
ptNst = PowerTransformer(standardize=False)
qtUni = QuantileTransformer(n_quantiles=100)
qtGau = QuantileTransformer(n_quantiles=100,output_distribution='normal')
robo = RobustScaler()


## Split data into train and test set

In [15]:
#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train = X_train.reset_index(drop=True) 
X_test = X_test.reset_index(drop=True)
# y_train = y_train.reset_index(drop=True)
# y_test = y_test.reset_index(drop=True)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((320, 20), (36, 20), (320,), (36,))

In [16]:
# featurePreProcessing = {'open':'std',
#                         'close':'std',
#                         'high':'std',
#                         'low':'std',
                        
#                         'stok':'std',
#                         'stod':'std',
#                         'sstod':'std',
#                         'wilr':'std',
#                         'ema':'std',
#                         'sma':'std',
#                         'dis5':'sub',
#                         'dis10':'sub',
#                         'macd_diff':'act',
#                         'roc':'actde',
#                         'atr':'actde',
#                         'rsi':'std',
#                         'cci':'quantgau',
#                         'dpo':'quantgau',
#                         'macd':'quantgau',
#                         'macd_signal':'quantgau'}
featurePreProcessing = {col:'quantgau' for col in X.columns}
ppX_train,ppX_test = pre_processing(X_train,
                                       X_test,
                                       featurePreProcessing,
                                       100)

In [19]:
corr = ppX_train.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,open,high,low,close,stok,stod,sstod,wilr,roc,rsi,atr,cci,dpo,sma,ema,macd,macd_diff,macd_signal,dis5,dis10
open,1.0,0.94,0.94,0.93,0.2,0.3,0.35,0.2,0.22,0.36,0.43,0.23,-0.1,0.9,0.93,0.43,0.1,0.41,0.04,0.13
high,0.94,1.0,0.95,0.96,0.25,0.33,0.36,0.25,0.31,0.44,0.46,0.29,-0.17,0.87,0.9,0.45,0.15,0.42,0.14,0.21
low,0.94,0.95,1.0,0.96,0.24,0.3,0.34,0.24,0.23,0.39,0.39,0.26,-0.11,0.89,0.92,0.4,0.1,0.38,0.14,0.19
close,0.93,0.96,0.96,1.0,0.29,0.33,0.33,0.29,0.28,0.44,0.43,0.29,-0.12,0.87,0.91,0.42,0.14,0.39,0.19,0.25
stok,0.2,0.25,0.24,0.29,1.0,0.78,0.32,1.0,0.67,0.74,0.25,0.76,-0.34,0.04,0.1,0.41,0.63,0.17,0.69,0.8
stod,0.3,0.33,0.3,0.33,0.78,1.0,0.53,0.78,0.76,0.79,0.31,0.8,-0.43,0.1,0.16,0.57,0.75,0.3,0.43,0.71
sstod,0.35,0.36,0.34,0.33,0.32,0.53,1.0,0.32,0.54,0.59,0.36,0.42,-0.15,0.31,0.32,0.8,0.33,0.75,-0.05,0.13
wilr,0.2,0.25,0.24,0.29,1.0,0.78,0.32,1.0,0.67,0.74,0.25,0.76,-0.34,0.04,0.1,0.41,0.63,0.17,0.69,0.8
roc,0.22,0.31,0.23,0.28,0.67,0.76,0.54,0.67,1.0,0.84,0.29,0.77,-0.63,0.02,0.08,0.69,0.83,0.37,0.43,0.69
rsi,0.36,0.44,0.39,0.44,0.74,0.79,0.59,0.74,0.84,1.0,0.45,0.83,-0.38,0.18,0.24,0.78,0.7,0.53,0.56,0.77


In [24]:
X_train[['open','high']].corr()

Unnamed: 0,open,high
open,1.0,0.999868
high,0.999868,1.0


## Feature Selection

In [20]:
start = time.time()

## Setting up the model and corresponding parameters

rf = ensemble.RandomForestClassifier()
param_grid = {'knn': {'n_neighbors': [1,3,5,7,9,11,13,15,17,19]},
                  'rf': {'n_estimators': [50,100,200], 'max_features': ['auto', None], 'min_samples_leaf': [1, 5, 10]}}

## Setting parameters for grid search
cv_folds = 5
n_jobs = 1

## Performing grid search
grid_search = GridSearchCV(rf, param_grid['rf'], cv = cv_folds, n_jobs = n_jobs)
grid_search.fit(ppX_train,y_train)

# store the best hyperparameters and initialize a separate random forest with those parameters
rf_params = grid_search.best_params_
rf = ensemble.RandomForestClassifier(**rf_params)

# refit the random forest using only the correctly accessible training data
# and return feature importances. This is not the perfect solution but it is 
# better than extracting feature importances from the grid search above which 
# is refit on all data (also test set) of the inner folds

rf.fit(ppX_train, y_train)
rf_features =  rf.feature_importances_




In [21]:
# compute mean of importances across folds and select those above the mean
# rf_features = np.mean(rf_features, axis=0)            
threshold_value = np.mean(rf_features) 
rf_features_w = np.where(rf_features > threshold_value)[0]

end = time.time()

if verbose:
    print("RF Feature selection final best features: " + str(rf_features_w))
    print('The model selection took %.3f seconds.' % (end-start))

RF Feature selection final best features: [ 6  8  9 10 11 12 16 17 18 19]
The model selection took 74.003 seconds.


In [23]:
# reduce data sets using selected features X_train = X_train[:, rf_features]
msX_test,msX_train = ppX_test.iloc[:, rf_features_w],ppX_train.iloc[:, rf_features_w]

# Refitting

rf.fit(msX_train,y_train)

# Evaluating
rf.score(msX_test,y_test)

# store selected features
# total_features[counter, :] = str(rf_features)

0.3888888888888889

In [24]:
corr = msX_train.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,sstod,roc,rsi,atr,cci,dpo,macd_diff,macd_signal,dis5,dis10
sstod,1.0,0.54,0.59,0.36,0.42,-0.15,0.33,0.75,-0.05,0.13
roc,0.54,1.0,0.84,0.29,0.77,-0.63,0.83,0.37,0.43,0.69
rsi,0.59,0.84,1.0,0.45,0.83,-0.38,0.7,0.53,0.56,0.77
atr,0.36,0.29,0.45,1.0,0.32,-0.09,0.18,0.44,0.09,0.2
cci,0.42,0.77,0.83,0.32,1.0,-0.38,0.77,0.25,0.61,0.82
dpo,-0.15,-0.63,-0.38,-0.09,-0.38,1.0,-0.55,-0.08,-0.1,-0.31
macd_diff,0.33,0.83,0.7,0.18,0.77,-0.55,1.0,0.06,0.42,0.74
macd_signal,0.75,0.37,0.53,0.44,0.25,-0.08,0.06,1.0,-0.13,0.01
dis5,-0.05,0.43,0.56,0.09,0.61,-0.1,0.42,-0.13,1.0,0.83
dis10,0.13,0.69,0.77,0.2,0.82,-0.31,0.74,0.01,0.83,1.0


## Logistic Regression in Tensorflow

In [25]:
#### Building a Logistic Regression Model in Tensorflow
msX_train = ppX_train.copy(deep=True)
msX_test = ppX_test.copy(deep=True)
## Setting up data
NUMERIC_COLUMNS = msX_train.columns

feature_columns = []

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))
    
## Arange data correctly
train_input_fn = make_input_fn(msX_train, y_train.astype(int))
eval_input_fn = make_input_fn(msX_test, y_test.astype(int), num_epochs=1, shuffle=False)

## Inspecting the data
ds = make_input_fn(msX_train, y_train.astype(int), batch_size=10)()
for feature_batch, label_batch in ds.take(1):
    print('Some feature keys:', list(feature_batch.keys()))
    print()
    print('A batch of class:', feature_batch['sstod'].numpy())
    print()
    print('A batch of Labels:', label_batch.numpy())
    
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns,n_classes=3)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result)

{'accuracy': 0.4722222, 'average_loss': 1.0821623, 'loss': 1.0469733, 'global_step': 100}


In [26]:
pred_dicts = list(linear_est.predict(eval_input_fn))
print(len(pred_dicts))
pred_dicts[0]
# probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

# probs.plot(kind='hist', bins=20, title='predicted probabilities')

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\PC\AppData\Local\Temp\tmpyrbgw1ar\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
36


{'logits': array([-0.17374697,  0.4323543 , -0.18306282], dtype=float32),
 'probabilities': array([0.26150647, 0.47941193, 0.2590816 ], dtype=float32),
 'class_ids': array([1], dtype=int64),
 'classes': array([b'1'], dtype=object),
 'all_class_ids': array([0, 1, 2]),
 'all_classes': array([b'0', b'1', b'2'], dtype=object)}

## Lets test some performance

### Lets test different preprocessing methods

In [81]:
for ppm in ['std','quantgau','quantuni','pow','minmax']:#'act','actde',
    print('The preprocessing method tested is: %s\n' % ppm)
    testDict = {col:ppm for col in X.columns}
    performanceTesting(X,y,5,2020,testDict,verbose=0)
    print('\n')

The preprocessing method tested is: std

Scores:  [0.3903186274509804, 0.3948497854077253, 0.3905579399141631, 0.39607602697731453, 0.38381361128142244]
Average Score: 0.391 (0.004)


The preprocessing method tested is: quantgau

Scores:  [0.3860294117647059, 0.4064990803188228, 0.39914163090128757, 0.38994481912936846, 0.37216431637032493]
Average Score: 0.391 (0.012)


The preprocessing method tested is: quantuni

Scores:  [0.39950980392156865, 0.39914163090128757, 0.3905579399141631, 0.39362354383813614, 0.3727774371551196]
Average Score: 0.391 (0.010)


The preprocessing method tested is: pow

Scores:  [0.39276960784313725, 0.39546290619251995, 0.3746167995095034, 0.3917841814837523, 0.38136112814224404]
Average Score: 0.387 (0.008)


The preprocessing method tested is: minmax

Scores:  [0.39644607843137253, 0.3825873697118332, 0.37155119558553035, 0.3911710606989577, 0.4009809932556714]
Average Score: 0.389 (0.010)




## Pre-Process features individually

In [19]:
featurePreProcessing = {'open':'std',
                        'close':'std',
                        'high':'std',
                        'low':'std',
                        'stok':'std',
                        'stod':'std',
                        'sstod':'std',
                        'wilr':'std',
                        'ema':'std',
                        'sma':'std',
                        'dis5':'sub',
                        'dis10':'sub',
                        'macd_diff':'act',
                        'roc':'actde',
                        'atr':'actde',
                        'rsi':'std',
                        'cci':'quantgau',
                        'dpo':'quantgau',
                        'macd':'quantgau',
                        'macd_signal':'quantgau'}
performanceTesting(X,y,5,2020,featurePreProcessing,verbose=0)

Scores:  [0.18309859154929578, 0.15714285714285714, 0.21428571428571427, 0.14285714285714285, 0.17142857142857143]
Average Score: 0.174 (0.024)
