In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy
import datetime
import ta
# Set to True to print progress and diagnostic output while the notebook runs.
verbose = True

In [3]:
# Scratch cell: evaluates 6 ** 2 (the recorded output is 36).
a = 6
b = 2
a**b

36

# Kristian test 2000

In [2]:
# FMNS testing pull request

In [3]:
def transformData(dataset, datainfo):
    """Restore the column names and datatypes of an unpacked dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are labelled by position (0, 1, 2, ...).
    datainfo : sequence
        One entry per column, in column order. ``entry[0]`` is the column
        name and ``entry[1]`` the datatype name; ``'object'`` marks a
        string column.

    Returns
    -------
    pandas.DataFrame
        A new frame with renamed, re-typed columns. String columns keep only
        the text that follows their first single quote (up to the next one),
        and string columns whose name contains "date" (any case) are parsed
        into datetimes. The input frame is not modified.

    Raises
    ------
    IndexError
        If a value in a string column contains no single quote.
    """
    # Use the column-name information to rename the columns.
    renameCol = {i: col[0] for i, col in enumerate(datainfo)}
    dataset = dataset.rename(columns=renameCol)

    # Use the datatype information to convert the arrays back to the right datatype.
    dt = {col[0]: str if col[1] == 'object' else col[1] for col in datainfo}
    dataset = dataset.astype(dt)

    # Strip the string-type arrays for the unintended characters.
    for ele in datainfo:
        if ele[1] != 'object':
            continue

        name = ele[0]

        # Same operation as the module-level helper ``f`` (split on the single
        # quote, keep the second piece), inlined so this function does not
        # depend on a name that is defined after it.
        dataset[name] = [text.split("'")[1] for text in dataset[name]]

        if 'date' in name.lower():
            # ``astype(np.datetime64)`` is a unit-less cast, which current
            # pandas rejects; ``pd.to_datetime`` parses the strings instead.
            dataset[name] = pd.to_datetime(dataset[name])

    return dataset

# Helper that cleans the entries of the string-type arrays.
def f(a):
    """Split ``a`` on single quotes and return the second piece.

    Raises IndexError when ``a`` contains no single quote.
    """
    pieces = re.split('[\']', a)
    return pieces[1]

# Decodes the byte strings unpacked from the compressed files.
def strList(ls):
    """Return a list with every item of ``ls`` decoded as UTF-8."""
    return [item.decode('utf-8') for item in ls]

# The following function is based on the research of (Lunde, 2016), summarized in the slides found here:
# https://econ.au.dk/fileadmin/site_files/filer_oekonomi/subsites/creates/Diverse_2016/PhD_High-Frequency/HF_TrQuData_v01.pdf

def formatDate(date,timestamps):
    """Build 'yyyy/mm/dd <time>' strings from a date and raw timestamps.

    ``date`` is sliced as yyyymmdd. For each timestamp, the first five
    characters of its string form are read as seconds and characters 5-10 as
    microseconds; the resulting ``datetime.timedelta`` is rendered with
    ``str`` and appended to the date prefix.
    """
    prefix = date[0:4] + '/' + date[4:6] + '/' + date[6:] + ' '

    formatted = []
    for stamp in timestamps:
        digits = str(stamp)
        offset = datetime.timedelta(seconds=int(digits[0:5]),
                                    microseconds=int(digits[5:11]))
        formatted.append(prefix + str(offset))

    return formatted
def _medianPerTimestamp(data, priceColumns):
    """Keep the first row of every Timestamp and overwrite ``priceColumns`` with the per-Timestamp median."""
    # groupby sorts its keys, and np.unique returns the sorted unique values
    # together with the position of their first occurrence, so the two align.
    medians = data.groupby('Timestamp')[priceColumns].median().to_numpy()
    firstPositions = np.unique(data['Timestamp'].to_numpy(), return_index=True)[1]

    collapsed = data.iloc[firstPositions, :].reset_index(drop=True)
    collapsed[priceColumns] = medians

    # Note that all other variables now hold the first entry for each timestamp!
    return collapsed


def HFDataCleaning(cleaningProcedures,dataToClean,dataType,p3Exchanges = []):
    """Apply high-frequency cleaning procedures to trade or quote data.

    The procedures follow (Lunde, 2016) and are applied in the order given.

    Applicable for both trade and quote data
        P1. Delete entries with a time stamp outside the 9:30 am to 4 pm
            window when the exchange is open.
        P2. Delete entries with a bid, ask or transaction price equal to zero.
        P3. Retain entries originating from the given exchange(s). Delete
            other entries.

    Applicable for just trade data
        T1. Delete entries with corrected trades (keeps ``corr == '00'``).
        T2. Delete entries with abnormal Sale Condition. Entries whose COND
            starts with '@' or equals 'E' or 'F' are kept.
        T3. If multiple transactions have the same time stamp: use the
            median price.
        T4. Delete entries with prices above the ask plus the bid-ask spread
            (or below the bid minus the spread). On hold: raises ValueError.

    Applicable for just quote data
        Q1. When multiple quotes have the same timestamp, replace them with a
            single entry with the median bid and median ask price.
        Q2. Delete entries for which the spread is negative.
        Q3. Delete entries for which the spread is more than 50 times the
            median spread on that day.
        Q4. Delete entries for which the mid-quote deviates by more than 5
            median absolute deviations from a centered median of 50
            observations. On hold: raises ValueError.

    Comments by (Lunde, 2016) on the relative importance of the procedures
        ➤ By far the most important rules here are P3, T3 and Q1.
        ➤ P3 is used to reduce the impact of time-delays in the reporting of
          trades and quote updates.
        ➤ Some form of T3 and Q1 rule seems inevitable, and these rules lead
          to the largest deletion of data.
        ➤ T4 disciplines the trade data using quotes, but cannot be applied
          when quote data is not available; Q4 can then be applied to the
          transaction prices in place of T4.

    Parameters
    ----------
    cleaningProcedures : iterable of str
        Procedure codes ('p1' ... 'q4'); case and surrounding whitespace are
        ignored. Codes that match no procedure are skipped.
    dataToClean : pandas.DataFrame
        Columns read per procedure: P1 ``Hour``/``Minute``; P2 ``price``
        (trade) or ``bid``/``ofr`` (quote); P3 ``ex``; T1 ``corr``; T2
        ``cond``; T3 ``Timestamp``/``price``; Q1 ``Timestamp``/``bid``/
        ``ofr``; Q2 ``bid``/``ofr``; Q3 ``Date``/``bid``/``ofr``.
    dataType : str
        'trade' or 'quote' (case and surrounding whitespace are ignored).
    p3Exchanges : list of str
        Exchange codes to retain; required by P3. The list is never mutated.

    Returns
    -------
    pandas.DataFrame
        The cleaned data. Every procedure except P1 resets the index.

    Raises
    ------
    ValueError
        If a T-procedure is requested for non-trade data, a Q-procedure for
        non-quote data, P3 is requested without exchanges, or T4/Q4 is
        requested.
    """
    dataType = dataType.lower().strip()

    for cp in cleaningProcedures:

        cp = cp.lower().strip()

        # Check that the procedure is sensible for this kind of data. This
        # also guarantees the q-branches below only ever see quote data.
        if cp.startswith('t') and dataType != 'trade':
            raise ValueError(f'Cleaning procedure {cp} is not compatible with dataType {dataType}')

        if cp.startswith('q') and dataType != 'quote':
            raise ValueError(f'Cleaning procedure {cp} is not compatible with dataType {dataType}')

        # P1: keep 09:30 (inclusive) to 16:00 (exclusive).
        if cp == 'p1':
            hourOfDay = dataToClean['Hour'] + dataToClean['Minute']/60
            inSession = (hourOfDay >= 9.5) & (hourOfDay < 16)
            dataToClean = dataToClean[inSession.to_numpy()]

        # P2: drop zero prices.
        elif cp == 'p2':

            if dataType == 'trade':
                dataToClean = dataToClean[dataToClean.price != 0].reset_index(drop=True)

            elif dataType == 'quote':
                # A row is kept only when BOTH sides are non-zero; the rule
                # deletes entries where either the bid or the ask is zero.
                bothNonZero = (dataToClean.bid != 0) & (dataToClean.ofr != 0)
                dataToClean = dataToClean[bothNonZero].reset_index(drop=True)

        # P3: retain only the requested exchanges.
        elif cp == 'p3':

            if len(p3Exchanges) == 0:
                raise ValueError('No exchanges, to filter on, has been provided.\nPlease provide a list with minimum one exchanges to filter on.')

            # Ensuring correct format on both sides of the comparison.
            wantedExchanges = [ele.lower().strip() for ele in p3Exchanges]
            exchangeCodes = dataToClean['ex'].str.lower().str.strip()

            dataToClean = dataToClean[exchangeCodes.isin(wantedExchanges).to_numpy()].reset_index(drop=True)

        # T1: keep only uncorrected trades.
        elif cp == 't1':

            dataToClean = dataToClean[dataToClean['corr'] == '00'].reset_index(drop=True)

        # T2: keep regular sale conditions.
        # FMNS: Most are COND = '@ XX' such as '@ TI'; startswith('@') is assumed to be acceptable.
        elif cp == 't2':

            cond = dataToClean['cond']
            regularSale = cond.str.startswith('@', na=False) | cond.isin(['E', 'F'])
            dataToClean = dataToClean[regularSale.to_numpy()].reset_index(drop=True)

        # T3: one row per timestamp, carrying the median price.
        # FMNS: Let's consider if these median prices are cheating in relation to OHLC bars
        elif cp == 't3':

            dataToClean = _medianPerTimestamp(dataToClean, ['price'])

        # T4 needs bid/ask/spread, which the trades table does not hold; it
        # would require cross-matching timestamps between trades and quotes.
        elif cp == 't4':

            raise ValueError(f'Cleaning procedure {cp} is on hold')

        # Q1: one row per timestamp, carrying the median bid and ask.
        # FMNS: Let's consider if these median prices are cheating in relation to OHLC bars
        elif cp == 'q1':

            dataToClean = _medianPerTimestamp(dataToClean, ['bid', 'ofr'])

        # Q2: drop negative spreads.
        elif cp == 'q2':

            dataToClean = dataToClean[dataToClean.ofr - dataToClean.bid >= 0].reset_index(drop=True)

        # Q3: drop spreads above 50 times that day's median spread.
        elif cp == 'q3':

            # Work on a fresh index so the per-day medians align row by row.
            quotes = dataToClean.reset_index(drop=True)
            spread = quotes['ofr'] - quotes['bid']
            dailyMedian = spread.groupby(quotes['Date']).transform('median')

            dataToClean = quotes[(spread <= 50*dailyMedian).to_numpy()].reset_index(drop=True)

        # Q4 is not implemented yet.
        elif cp == 'q4':

            raise ValueError(f'Cleaning procedure {cp} is on hold')

    return dataToClean

def candleCreateNP():
    """Fill the global ``candleNP`` with one open/high/low/close row per slot.

    Walks every unique date of ``cleanedData``, every hour in
    ``aggregateHour`` and every start minute in ``aggregateMinute`` (skipping
    hour 9 before minute 30), selects the prices of that date and hour whose
    minute lies in ``[minute, minute + step)`` and stores first, max, min and
    last price. A slot without prices copies the previous row.

    Reads the globals ``cleanedData``, ``aggregateHour``, ``aggregateMinute``,
    ``numpiedData``, ``numpiedPrice`` and ``step``; writes into and returns
    the global ``candleNP``.
    """
    row = 0
    for day in cleanedData.Date.unique():
        for hour in aggregateHour:
            for minute in aggregateMinute:
                # Slots in hour 9 that start before minute 30 are skipped.
                if hour == 9 and minute < 30:
                    continue

                inSlot = ((numpiedData[0] == day)
                          & (numpiedData[1] == hour)
                          & (numpiedData[2] >= minute)
                          & (numpiedData[2] < minute + step))
                slotPrices = numpiedPrice[inSlot]

                if len(slotPrices) == 0:
                    # If no new prices in the interval considered, use the previous one.
                    # NOTE(review): for the very first slot, row - 1 is -1, i.e. the last row of candleNP.
                    candleNP[row] = candleNP[row - 1]
                else:
                    candleNP[row] = np.array([slotPrices[0],
                                              slotPrices.max(),
                                              slotPrices.min(),
                                              slotPrices[-1]])
                row += 1

    return candleNP


def candleCreateNP_vect(verbose=True):
    """Build open/high/low/close candles with a pandas groupby.

    Adds the columns ``hour_min_col`` (Hour + Minute/60) and ``time_group``
    (bin number from ``pd.cut``) to the global ``cleanedData``, then groups by
    ``Date`` and ``time_group`` and returns first, max, min and last ``price``
    of every group as a numpy array. The bin width is the global ``step``
    (minutes) expressed in hours; bins are closed on the right.

    When ``verbose`` is true, the range of ``hour_min_col`` is printed.
    """
    hourFraction = cleanedData['Hour'] + cleanedData['Minute']/60
    cleanedData['hour_min_col'] = hourFraction

    if verbose:
        print(f"min and max of new hour_min_col: \
              {cleanedData['hour_min_col'].min()}, {cleanedData['hour_min_col'].max()}")

    # Bin edges start one bin before 9.5 so that 9.5 itself falls into a bin.
    binWidth = step/60
    binEdges = np.arange(9.5-binWidth, 16+binWidth, binWidth)

    # Put each timestamp into a bucket according to the bin edges.
    cleanedData['time_group'] = pd.cut(cleanedData['hour_min_col'], bins=binEdges, right=True, labels=False)

    # Open, high, low, close per date and bucket.
    pricePerSlot = cleanedData.groupby(['Date','time_group'])[['price']]
    candles = pricePerSlot.agg(['first', 'max', 'min', 'last'])

    # Return as numpy.
    return candles.values

In [4]:
# def candleCreate():
#     ii = 0
#     for l in cleanedData.Date.unique():
#         for i in aggregateHour:
#             for j in aggregateMinute:

#                 temp = cleanedData[((cleanedData.Date == l)&\
#                                     (cleanedData.Hour==i)&\
#                                     (cleanedData.Minute<j+step))&((cleanedData.Date == l)&\
#                                                                   (cleanedData.Hour==i)&\
#                                                                   (cleanedData.Minute>=j))]
#                 if temp.shape[0] > 0:
#                     candle[ii] = np.array([temp.price.iloc[0],temp.price.max(),temp.price.min(),temp.price.iloc[-1]])

#                 ii += 1

# %timeit candleCreate()

# Reading in data, LOBSTER as well as TAQ

## TAQ

In [5]:
# Show the contents of the current working directory.
print(os.listdir())
# Directory whose entries are listed below (an alternative location is kept in the trailing comment).
path = 'T:/taqhdf5' #'a:/taqhdf5'
# Names of all entries in that directory; os.listdir raises FileNotFoundError when the path does not exist.
allFiles = os.listdir(path)

['.git', '.gitignore', '.ipynb_checkpoints', 'CrunchTAQ.ipynb', 'drafts', 'FMNS_draft', 'hello.py', 'README.md', 'Speciale to-do.docx', 'Speciale to-do.txt', 'test', 'utils']


FileNotFoundError: [WinError 3] Den angivne sti blev ikke fundet: 'T:/taqhdf5'

In [None]:
#allFiles
# Number of entries found, followed by the first five and the last five names.
len(allFiles), allFiles[:5], allFiles[-5:]

In [None]:
# Last ten names of the listing.
allFiles[-10:]

In [None]:
# Date strings '20200401' ... '20200431' (day zero-padded to two digits).
dates = np.array([f'202004{day:02d}' for day in range(1, 32)])

In [None]:
# Display the generated date strings.
dates

In [None]:
def _tickerFrame(rawRows, byteColumns, fileDate, ticker, dropColumns=None):
    """Turn one ticker's raw H5 rows into a decoded, annotated DataFrame."""
    frame = pd.DataFrame(rawRows, columns=rawRows.dtype.names)

    # Remove the variables that are not needed, before any decoding.
    if dropColumns:
        frame = frame.drop(columns=dropColumns)

    # The byte-string columns are decoded to regular strings.
    for column in byteColumns:
        frame[column] = strList(frame[column])

    # Adding the date of the file to the dataframe.
    frame['Date'] = fileDate

    # Adding a more readable timestamp - TEST IT
    frame['Timestamp'] = pd.to_datetime(formatDate(fileDate, frame.utcsec))
    frame['TSRemainder'] = list(map(lambda x: str(x)[11:], frame.utcsec))
    frame['Hour'] = frame.Timestamp.dt.hour
    frame['Minute'] = frame.Timestamp.dt.minute

    # Adding the ticker
    frame['Ticker'] = ticker

    return frame


def _extractTable(files, label, byteColumns, dropColumns=None):
    """Read every ticker in ``tickers`` from every file and return the stacked frames.

    ``label`` is 'Trade' or 'Quote'; the H5 index dataset is named
    ``label + 'Index'`` and the data table ``label + 's'``.
    """
    indexKey = label + 'Index'
    tableKey = label + 's'
    frames = []

    for i, file in enumerate(files):

        if verbose and i == 0:
            print('### %s Data ###\n' % label)

        fileDate = re.split('[._]', file)[1]

        # Reading one file at a time; the context manager closes the file
        # handle again once its rows have been copied into memory.
        with h5py.File(path+'/'+file, 'r') as raw_data:

            # Store the index of where each ticker's rows are located.
            tickerIndex = raw_data[indexKey]

            if verbose and i == 0:
                print('The raw H5 %s file contains: ' % label.lower(), list(raw_data.keys()), '\n')

            # Extracting just the tickers
            tickerNames = np.array([ele[0].astype(str).strip() for ele in tickerIndex])

            # Lets get data on each ticker for the file processed at the moment
            for j, ticker in enumerate(tickers):

                # Getting the specific ticker information
                tickerInfo = tickerIndex[tickerNames == ticker][0]

                if verbose and i == 0:
                    print('Ticker Information: ', tickerInfo, '\n')

                # Raw data: tickerInfo[1] is used as the first row and
                # tickerInfo[2] as the number of rows. A slice reads the same
                # contiguous rows as a list of consecutive indices would.
                firstRow = int(tickerInfo[1])
                rowCount = int(tickerInfo[2])
                tempData = raw_data[tableKey][firstRow:firstRow + rowCount]

                frame = _tickerFrame(tempData, byteColumns, fileDate, ticker, dropColumns)

                if verbose and i == 0 and j == 0:
                    print('Sneak peak of the data\n\n', frame.head())

                frames.append(frame)

    # One concat at the end instead of one per ticker and file. The original
    # row labels are kept, as with repeated pairwise concatenation.
    return pd.concat(frames) if frames else pd.DataFrame()


# Measuring the extraction time
start = time.time()

# Provide a list of dates of interest (format: yyyymmdd)
dates = np.array([f'202004{day:02d}' for day in range(1, 32)]).astype(int)
# dates = np.array(['20200401']).astype(int)#,'20200402'

# Provide a list of tickers of interest
tickers = ['GOOG']#'MSFT'

# Do we need data on trades, quotes or both?
dataNeeded = 'trades' # 'trades', 'quotes' or 'both'

# Extracting just the dates of each file (0 for names without both '.' and '_')
allDates = np.array([re.split("[._]",ele)[1] if ("." in ele ) & ("_" in ele) else 0 for ele in allFiles]).astype(int)

minDate = np.min(dates)
maxDate = np.max(dates)

if verbose:
    print('##### Date range #####\n\nDate, Min: %i\nDate, Max: %i\n'%(minDate,maxDate))

# Locating what files we need.
index = np.where((minDate <= allDates) & (allDates <= maxDate))

relevantFiles = np.array(allFiles)[index[0]]

# Separating the files into trade and quote files.
trade = [ele for ele in relevantFiles if 'trade' in ele]
quote = [ele for ele in relevantFiles if 'quote' in ele]

if verbose:
    print('##### Data Extraction begins #####\n')

    if dataNeeded.lower() == 'both':
        print('Both trade and quote data is being extracted..\n')
    else:
        print('%s data is being extracted..\n' % dataNeeded[0:5])

if (dataNeeded == 'both') | (dataNeeded == 'trades'):

    # Lets start out by extracting the trade data
    tradeData = _extractTable(trade,
                              'Trade',
                              ['ex',
                               'cond',
                               'TradeStopStockIndicator',
                               'corr',
                               'TradeID',
                               'TTE',
                               'TradeReportingFacility',
                               'SourceOfTrade'])

if (dataNeeded == 'both') | (dataNeeded == 'quotes'):

    # We remove all unnecessary variables from the quote data
    unnecessaryVariables = ['NationalBBOInd',
                            'FinraBBOInd',
                            'FinraQuoteIndicator',
                            'SequenceNumber',
                            'FinraAdfMpidIndicator',
                            'QuoteCancelCorrection',
                            'SourceQuote',
                            'RPI',
                            'ShortSaleRestrictionIndicator',
                            'LuldBBOIndicator',
                            'SIPGeneratedMessageIdent',
                            'NationalBBOLuldIndicator',
                            'ParticipantTimestamp',
                            'FinraTimestamp',
                            'SecurityStatusIndicator']

    # Now to the quote data
    quoteData = _extractTable(quote, 'Quote', ['ex', 'mode'], unnecessaryVariables)

end = time.time()

if verbose:
    print('The extraction time was %.3f seconds.' % (end-start))

In [None]:
# quoteData.head()
# First rows of the extracted trade data.
tradeData.head()

In [None]:
# Number of non-null utcsec values per (Date, Ticker) combination.
tradeData[['Date','Ticker','utcsec']].groupby(['Date','Ticker']).count()
# quoteData[['Date','Ticker','utcsec']].groupby(['Date','Ticker']).count()

In [None]:
# Distinct values of the cond column.
tradeData.cond.unique()

In [None]:
# Number of non-null utcsec values per cond value.
tradeData[['cond','utcsec']].groupby('cond').count()

In [None]:
# Rows whose utcsec value already occurred in an earlier row.
tradeData[tradeData.duplicated(['utcsec'])]
# quoteData[quoteData.duplicated(['utcsec'])]

In [None]:
# Number of non-null utcsec values per ex value.
tradeData[['ex','utcsec']].groupby('ex').count()

# Implementing technical features

A library: https://technical-analysis-library-in-python.readthedocs.io/en/latest/

### Features used in the literature:

* Stochastic K - Implemented
* Stochastic D - Implemented
* Slow Stochastic D - Implemented
* Momentum - Same as difference
* ROC - Implemented
* Williams % R - Implemented
* A/D Oscillator
* Disparity 5 - Implemented
* Disparity 10 - Implemented
* Price Oscillator - (detrended) - Implemented
* Commodity Channel Index - Implemented
* RSI - Implemented

Formulas: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=876544

* Moving Average - Implemented
* Bias
* Exponential Moving Average - Implemented
* Difference - Same as Momentum
* True Range - (Average) - Implemented
* 

Formulas: https://www.sciencedirect.com/science/article/pii/S0957417407001819?via%3Dihub

#### Other Technical Features
* Moving Average Convergence Divergence (MACD) - Implemented

**Non-classical technical features**

* Bid/Ask prices of top of book
* Spread and mid price based on top of book
* Price derivatives

Formulas: https://www.tandfonline.com/doi/full/10.1080/14697688.2015.1032546?instName=UCL+%28University+College+London%29

# Aggregation - going from irregular spaced data to regular spaced data.

Financial econometric analysis at ultra-high frequency: Data handling concerns

Paper: https://www.sciencedirect.com/science/article/pii/S0167947306003458

In [None]:
# First rows of the trade data, before cleaning.
tradeData.head()

In [None]:
# (rows, columns) of the trade data, before cleaning.
tradeData.shape

In [None]:
# Number of non-null utcsec values per corr value.
tradeData[['corr','utcsec']].groupby('corr').count()

In [None]:
# There are 11 cleaning procedures, with 3 relevant for both trade and quote data and 4 for either trade or quote data.
    # The cleaning procedures are listed below for simplicity
    
    # Applicable for both trade and quote data
    
    # P1. Delete entries with a time stamp outside the 9:30 am to 4 pm window when the exchange is open.
    # P2. Delete entries with a bid, ask or transaction price equal to zero.
    # P3. Retain entries originating from a single exchange. Delete other entries.
    
    # Applicable for just trade data
    
    # T1. Delete entries with corrected trades. (Trades with a Correction Indicator, CORR != 0).
    # T2. Delete entries with abnormal Sale Condition. (Trades where COND has a letter code, except for “E” and “F”).
    # T3. If multiple transactions have the same time stamp: use the median price.
    # T4. Delete entries with prices that are above the ask plus the bid-ask spread. 
    # Similar for entries with prices below the bid minus the bid-ask spread.
    
    # Applicable for just quote data
    
    # Q1. When multiple quotes have the same timestamp, we replace all these with a single entry 
    # with the median bid and median ask price.
    # Q2. Delete entries for which the spread is negative.
    # Q3. Delete entries for which the spread is more that 50 times the median spread on that day.
    # Q4. Delete entries for which the mid-quote deviated by more than 5 median absolute deviations from 
    # a centered median (excluding the observation under consideration) of 50 observations.

    # Some comments, by (Lunde,2016), on the relative importance of the individual cleaning procedures
    
    # ➤ By far the most important rules here are P3, T3 and Q1.
    # ➤ In our empirical work we will see the impact of suspending P3. It is used to reduce the impact
    # of time-delays in the reporting of trades and quote updates.
    # ➤ Some form of T3 and Q1 rule seems inevitable here, and it is these rules which lead to the largest deletion of data.
    # ➤ T4 is an attractive rule, as it disciplines the trade data using quotes. However, it has the disadvantage 
    # that it cannot be applied when quote data is not available.
    # ➤ In situations where quote data is not available, Q4 can be applied to the transaction prices in place of T4.

# def HFDataCleaning(cleaningProcedures,dataToClean,dataType,p3Exchanges = []):

# Clean the trade data with P1, P2, T1 and P3, in that order; 'q' is the only exchange code passed to P3.
cleanedData = HFDataCleaning(['P1','p2','t1','p3'],tradeData,'trade',['q'])

In [None]:
# (rows, columns) of the data after cleaning.
cleanedData.shape

# Aggregate data in candle sticks

In [None]:
# Width of one candle.
step = 10 # in minutes

# Start minutes of the candles within an hour (0, step, 2*step, ...).
aggregateMinute = np.arange(0,60,step)
# Hours covered by the candles: 9 up to and including 15.
aggregateHour = np.arange(9,16,1)
# One position per unique date in the cleaned data.
aggregateDate = np.arange(len(cleanedData.Date.unique()))

# Number of step-wide slots in the first 30 minutes of an hour; candleCreateNP skips these for hour 9.
remove = 30//step

# candle = np.zeros(((len(aggregateDate)*len(aggregateMinute)*len(aggregateHour)),4))
# Zero-initialised array with one row per remaining slot and four columns (filled with open, high, low, close by candleCreateNP).
candleNP = np.zeros((((len(aggregateDate)*len(aggregateMinute)*len(aggregateHour))-remove*len(aggregateDate)),4))

In [None]:
# Date, Hour and Minute as a numpy array; after the transpose, row 0 holds Date, row 1 Hour and row 2 Minute.
numpiedData = cleanedData[['Date','Hour','Minute']].to_numpy()
numpiedData = numpiedData.T
# Prices as a numpy array, in the same row order as numpiedData's columns.
numpiedPrice = cleanedData['price'].to_numpy()

In [None]:
# Aggregate data in candle sticks
# The loop-based builder is kept for reference; the groupby-based builder is the one used.
# Its return value replaces the zero-initialised candleNP array.
#candleNP = candleCreateNP()
candleNP = candleCreateNP_vect(verbose=True)

In [None]:
# First five candles.
candleNP[0:5]

In [None]:
# Last five candles.
candleNP[-5:]

In [None]:
###### List of possible features to include: ######
###################################################

##### pastObs: Full name: Past Observations - Includes the current plus X-1 past observations as features
##### StoOsc: Full name: Stochastic Oscillator - From the library: 
# The stochastic oscillator presents the location of 
# the closing price of a stock in relation to the high and low range of the price of a stock over a period of time, 
# typically a 14-day period.
####

def generateFeatures(data,listOfFeatures=None,featureWindow=1):
    """Build a feature table from open/high/low/close candles.

    All features are computed from ``data`` itself; no notebook globals are
    read.

    Parameters
    ----------
    data : array-like of shape (n_candles, 4)
        Candles with the columns open, high, low and close, in that order.
    listOfFeatures : list of str, optional
        Feature names, matched case-insensitively. Supported names:
        'pastobs' (flattened window of candles), 'stok', 'stod', 'sstod',
        'wilr', 'roc', 'rsi', 'atr', 'cci', 'dpo', 'sma', 'ema', 'macd',
        'dis5' and 'dis10'. ``None`` (the default) requests no features.
    featureWindow : int, default 1
        Number of consecutive candles flattened into one 'pastobs' row.
        Only used when 'pastobs' is requested.

    Returns
    -------
    pandas.DataFrame
        Columns appear in the order the features were requested. 'pastobs'
        contributes ``4 * featureWindow`` columns named ``open_0, high_0,
        low_0, close_0, open_1, ...`` where index 0 is the oldest candle of
        the window.

        When 'pastobs' is requested the table has
        ``n_candles - featureWindow + 1`` rows and row ``i`` describes the
        candles ``i .. i + featureWindow - 1``; every indicator is then taken
        at the newest candle of that window, so all columns of a row refer to
        the same point in time regardless of the order of ``listOfFeatures``.
        Without 'pastobs' the table has one row per candle.

    Raises
    ------
    ValueError
        If ``data`` is not a two-dimensional array with four columns, if a
        feature name is not supported, or if 'pastobs' is requested with a
        ``featureWindow`` outside ``1 .. n_candles``.
    """
    ohlc = ('open', 'high', 'low', 'close')

    candles = np.asarray(data, dtype=float)
    if candles.ndim != 2:
        # Made ready if we at some point moved to the data being a scalar series.
        raise ValueError('Im not ready to take on a scalar series.')
    if candles.shape[1] != len(ohlc):
        raise ValueError('data must have exactly four columns: open, high, low and close.')

    requested = [] if listOfFeatures is None else [name.lower() for name in listOfFeatures]

    # The indicators work on pandas Series, so wrap the candles once.
    candlePD = pd.DataFrame(candles, columns=list(ohlc))
    high, low, close = candlePD['high'], candlePD['low'], candlePD['close']

    # One zero-argument builder per indicator. They are only called for the
    # requested features, so the ta library is not needed for 'pastobs' alone.
    indicators = {
        # Stochastic K
        'stok': lambda: ta.momentum.stoch(high, low, close),
        # Stochastic D
        'stod': lambda: ta.momentum.stoch_signal(high, low, close),
        # Slow Stochastic D
        'sstod': lambda: ta.trend.sma_indicator(ta.momentum.stoch_signal(high, low, close)),
        # Williams %R
        'wilr': lambda: ta.momentum.wr(high, low, close),
        # Rate Of Change
        'roc': lambda: ta.momentum.roc(close),
        # Relative Strength Index
        'rsi': lambda: ta.momentum.rsi(close),
        # Average True Range
        'atr': lambda: ta.volatility.average_true_range(high, low, close),
        # Commodity Channel Index
        'cci': lambda: ta.trend.cci(high, low, close),
        # Detrended Price Oscillator
        'dpo': lambda: ta.trend.dpo(close),
        # Simple Moving Average
        'sma': lambda: ta.trend.sma_indicator(close),
        # Exponential Moving Average
        'ema': lambda: ta.trend.ema_indicator(close),
        # Moving Average Convergence Divergence
        'macd': lambda: ta.trend.macd(close),
        # Disparity 5
        'dis5': lambda: (close/ta.trend.sma_indicator(close,5))*100,
        # Disparity 10
        'dis10': lambda: (close/ta.trend.sma_indicator(close,10))*100,
    }

    # Fail loudly on typos instead of silently returning fewer columns.
    unknown = [name for name in requested if name != 'pastobs' and name not in indicators]
    if unknown:
        raise ValueError('Unknown feature(s): ' + ', '.join(unknown))

    windowed = 'pastobs' in requested
    if windowed and not 1 <= featureWindow <= len(candles):
        raise ValueError('featureWindow must be between 1 and the number of candles.')

    # With windows, row i ends at candle i + featureWindow - 1, so the first
    # featureWindow - 1 indicator values have no matching row and are dropped.
    offset = featureWindow - 1 if windowed else 0

    columns = {}
    for name in requested:
        if name == 'pastobs':
            nRows = len(candles) - featureWindow + 1
            # Placing the k-shifted copies side by side gives, in row i, the
            # same values as candles[i:i + featureWindow].flatten().
            windows = np.hstack([candles[k:k + nRows] for k in range(featureWindow)])
            names = [field + '_' + str(k) for k in range(featureWindow) for field in ohlc]
            columns.update(zip(names, windows.T))
        else:
            columns[name] = np.asarray(indicators[name]())[offset:]

    # Build the frame in one go; inserting column by column is slower and
    # aligns on the index, which is what made the result order dependent.
    return pd.DataFrame(columns)
# featureWindow = 5

# Preview: the first five candles flattened into a single 1-D row.
candleNP[0:5].flatten()

In [None]:
# Generate every available feature from the candles, using a window of five
# candles for the past observations.
test = generateFeatures(
    data=candleNP,
    listOfFeatures=[
        'pastobs',
        'stok', 'stod', 'sstod',
        'wilr', 'roc', 'rsi',
        'atr', 'cci', 'dpo',
        'sma', 'ema', 'macd',
        'dis5', 'dis10',
    ],
    featureWindow=5,
)

In [None]:
# Display the generated feature table.
test

In [None]:
# Check the type of a single candle row.
type(candleNP[0])

In [None]:
# Transposed view of the candles: one row each for open, high, low and close.
candleNP.T

In [None]:
# Wrap the four candle columns in a DataFrame so the ta indicators can be
# called on named Series.
candlePD = pd.DataFrame({
    column: candleNP.T[position]
    for position, column in enumerate(('open', 'high', 'low', 'close'))
})

# Preview the first 50 values of the stochastic oscillator.
stochastic = ta.momentum.stoch(candlePD['high'], candlePD['low'], candlePD['close'])
stochastic.iloc[:50]

In [None]:
# Scratch cell: two plain lists used to check list concatenation.
# NOTE: this rebinds `test`, which held the feature table until now.
test = [1,2,3]
test1 = [1,2,3]

In [None]:
# `+` on lists concatenates them rather than adding element-wise.
test+test1

In [None]:
# Display `test` again after the concatenation above.
test

In [None]:
# Fill each row of npFeatures with one flattened window of featureWindow
# consecutive candles; row k covers the candles k .. k + featureWindow - 1.
# NOTE(review): `featureWindow` and `npFeatures` must already exist in the
# session; their only visible definitions are commented out - confirm.
for row in range(len(npFeatures)):
    npFeatures[row] = candleNP[row:row + featureWindow].flatten()

In [None]:
# Shape of the windowed feature matrix.
npFeatures.shape

In [None]:
# Select every entry that is exactly zero.
# NOTE(review): presumably a check that no row was left unfilled - confirm.
npFeatures[npFeatures==0]

In [None]:
# Preview the first 20 candles.
candleNP[0:20]

In [None]:
# Percentage change of the last candle column (close) from each candle to the
# next one.
returns = (candleNP[1:, -1] / candleNP[:-1, -1] - 1) * 100

# Preview the first 20 returns.
returns[:20]

In [None]:
# Histogram of the returns, using 20 bins.
plt.hist(returns,bins=20)
plt.show()

In [None]:
# Display the full return series.
returns

In [None]:

# Returns in ascending order.
(np.sort(returns))

In [None]:
# Sizes of the five chunks np.array_split produces for the returns.
[len(chunk) for chunk in np.array_split(returns, 5)]

In [None]:
# Number of return classes.
classes = 5

# One label slot per return, initialised to class 0.
# NOTE(review): the original carried a "-featureWindow" remark here, so the
# labels may still need trimming to match the feature rows - confirm.
labels = np.zeros(len(returns))

# Sort the returns once, split them into `classes` chunks and record the
# smallest and largest return of every chunk.
sortedChunks = np.array_split(np.sort(returns), classes)
thresholdsMin = [chunk.min() for chunk in sortedChunks]
thresholdsMax = [chunk.max() for chunk in sortedChunks]

In [None]:
# Assign every return the index of the chunk whose [min, max] range contains
# it. The first class is open towards the bottom and the last class open
# towards the top. Classes are written in ascending order, so a return that
# sits exactly on a shared boundary ends up in the higher class.
lastClass = classes - 1
for label, (lower, upper) in enumerate(zip(thresholdsMin, thresholdsMax)):
    if label == 0:
        inClass = returns <= upper
    elif label == lastClass:
        inClass = returns >= lower
    else:
        inClass = (returns >= lower) & (returns <= upper)
    labels[inClass] = label

In [None]:
# Display the class label assigned to each return.
labels

In [None]:
# Distinct labels together with the number of observations in each.
np.unique(labels,return_counts=True)

In [None]:
# Shape of the feature matrix ...
npFeatures.shape

In [None]:
# ... compared with the shape of the label vector.
labels.shape

In [None]:
# Number of labels left after dropping the first five.
labels[5:].shape

In [None]:
# First two feature rows.
npFeatures[0:2]

In [None]:
# Number of feature rows left after dropping the first two.
npFeatures[2:].shape

In [None]:
# First seven candles.
candleNP[0:7]

In [None]:
# Label at index 4.
labels[4]

In [None]:
# Returns at index 4 and 5.
returns[4:6]

In [None]:
# Number of returns left after dropping the first four.
returns[4:].shape

In [None]:
# First four returns.
returns[0:4]

In [None]:
# Shape of the label vector.
labels.shape

In [None]:
# Price array extracted from the cleaned data.
numpiedPrice