In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import time
import h5py
import copy

# Toggle for the progress messages and data previews printed by the cells below.
verbose = True

In [2]:
def _stripBytesRepr(text):
    """Return the text between the first pair of single quotes in ``text``.

    "b'GOOG'" becomes "GOOG". Text without any single quote is returned
    unchanged instead of raising an IndexError.
    """
    pieces = text.split("'")
    return pieces[1] if len(pieces) > 1 else text


def transformData(dataset, datainfo):
    """Rename, re-type and clean a raw data frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose column labels are the positions 0, 1, 2, ... matching the
        order of ``datainfo``.
    datainfo : sequence
        One entry per column; entry[0] is the column name and entry[1] is the
        target dtype as a string ('object' marks a string column).

    Returns
    -------
    pandas.DataFrame
        A new frame with named columns, converted dtypes, cleaned string
        columns and - for string columns whose name contains 'date' -
        datetime values. The input frame is not modified.
    """
    # Use the column-name information to rename the columns.
    renameCol = {i: col[0] for i, col in enumerate(datainfo)}
    dataset = dataset.rename(columns=renameCol)

    # Use the datatype information to convert the columns back to the right datatype.
    dt = {col[0]: str if col[1] == 'object' else col[1] for col in datainfo}
    dataset = dataset.astype(dt)

    # Strip the string-type columns of the unintended characters.
    for ele in datainfo:
        colName, colType = ele[0], ele[1]

        if colType != 'object':
            continue

        # Presumably the values look like "b'...'" (a bytes repr cast to str);
        # values that carry no quotes are kept as they are.
        dataset[colName] = [_stripBytesRepr(value) for value in dataset[colName]]

        if 'date' in colName.lower():
            # pd.to_datetime replaces the unit-less ``astype(np.datetime64)``
            # cast, which recent pandas versions reject.
            dataset[colName] = pd.to_datetime(dataset[colName])

    return dataset

# Helper used to clean the string-type columns.
def f(a):
    """Return the text between the first pair of single quotes in ``a``.

    "b'GOOG'" becomes "GOOG". A string that contains no single quote is
    returned unchanged rather than raising an IndexError.
    """
    pieces = re.split('[\']', a)
    return pieces[1] if len(pieces) > 1 else a

# Reading in data, LOBSTER as well as TAQ

## TAQ

In [3]:
# Show the contents of the current working directory.
print(os.listdir())
# Directory that is listed for the TAQ HDF5 files.
# NOTE(review): absolute, machine-specific location - adjust when running elsewhere.
path = 'a:/taqhdf5'
# Names of all entries found in that directory (os.listdir gives no ordering guarantee).
allFiles = os.listdir(path)

['.git', '.ipynb_checkpoints', 'CrunchTAQ.ipynb', 'README.md']


In [4]:
# Display the entry names collected from `path`.
allFiles

['taq_19930315.h5',
 'taq_19930104.h5',
 'taq_19930317.h5',
 'taq_19930105.h5',
 'taq_19930316.h5',
 'taq_19930106.h5',
 'taq_19930318.h5',
 'taq_19930107.h5',
 'taq_19930319.h5',
 'taq_19930108.h5',
 'taq_19930322.h5',
 'taq_19930111.h5',
 'taq_19930331.h5',
 'taq_19930112.h5',
 'taq_19930401.h5',
 'taq_19930113.h5',
 'taq_19930402.h5',
 'taq_19930114.h5',
 'taq_19930405.h5',
 'taq_19930115.h5',
 'taq_19930406.h5',
 'taq_19930118.h5',
 'taq_19930407.h5',
 'taq_19930119.h5',
 'taq_19930408.h5',
 'taq_19930120.h5',
 'taq_19930412.h5',
 'taq_19930121.h5',
 'taq_19930413.h5',
 'taq_19930122.h5',
 'taq_19930414.h5',
 'taq_19930125.h5',
 'taq_19930415.h5',
 'taq_19930126.h5',
 'taq_19930416.h5',
 'taq_19930127.h5',
 'taq_19930419.h5',
 'taq_19930128.h5',
 'taq_19930420.h5',
 'taq_19930129.h5',
 'taq_19930421.h5',
 'taq_19930201.h5',
 'taq_19930422.h5',
 'taq_19930202.h5',
 'taq_19930423.h5',
 'taq_19930203.h5',
 'taq_19930426.h5',
 'taq_19930204.h5',
 'taq_19930427.h5',
 'taq_19930205.h5',


In [6]:
# Display the last ten entries of the listing.
allFiles[-10:]

['taqquote_20200417.h5',
 'taqquote_20200420.h5',
 'taqquote_20200421.h5',
 'taqquote_20200422.h5',
 'taqquote_20200423.h5',
 'taqquote_20200424.h5',
 'taqquote_20200427.h5',
 'taqquote_20200428.h5',
 'taqquote_20200429.h5',
 'taqquote_20200430.h5']

In [91]:
# Measuring the extraction time
start = time.time()

# Provide a list of dates of interest (format: yyyymmdd).
# NOTE: the selection below is a range - every file dated between the smallest
# and the largest entry (inclusive) is loaded, not only the listed dates.
dates = np.array(['20200401']).astype(int)#,'20200402'

# Provide a list of tickers of interest
tickers = ['GOOG','MSFT']

# Extracting just the date of each file. Names that do not contain an
# "_<8 digits>." part get the date 0, so they never fall inside the range.
fileDate = re.compile(r'_(\d{8})\.')
allDates = np.array([int(hit.group(1)) if hit else 0
                     for hit in map(fileDate.search, allFiles)]).astype(int)

minDate = np.min(dates)
maxDate = np.max(dates)

if verbose:
    print('##### Date range #####\n\nDate, Min: %i\nDate, Max:%i\n'%(minDate,maxDate))

# Locating what files we need.
index = np.where((minDate <= allDates) & (allDates <= maxDate))

relevantFiles = np.array(allFiles)[index[0]]

# Separating the files into trade and quote files.
trade = [ele for ele in relevantFiles if 'trade' in ele]
quote = [ele for ele in relevantFiles if 'quote' in ele]

if verbose:
    print('##### Data Extraction begins #####\n')

# Let's set out by extracting the trade data. One frame per (file, ticker) is
# collected here and concatenated once at the end, instead of growing `data`
# with a concat on every iteration.
frames = []

for i,file in enumerate(trade):

    # Details are only printed for the first file.
    showDetails = verbose and (i == 0)

    if showDetails:
        print('### Trade Data ###\n')

    # Date encoded in the file name (kept as a string, as before).
    fileDay = fileDate.search(file).group(1)

    # Reading one file at a time; the context manager closes the file handle
    # even if an error occurs while processing it.
    with h5py.File(path+'/'+file,'r') as raw_data:

        if showDetails:
            print('The raw H5 trade file contains: ',list(raw_data.keys()),'\n')

        # Load the trade index into memory so it stays usable after the file
        # is closed.
        TI = raw_data['TradeIndex'][...]

        # Extracting just the tickers (first field of every index record)
        TIC = np.array([ele[0].astype(str).strip() for ele in TI], dtype=str)

        # Let's get data on each ticker for the file processed at the moment
        for ticker in tickers:

            # Getting the specific ticker information
            found = TI[TIC == ticker]

            if len(found) == 0:
                # Previously this case raised an IndexError and aborted the run.
                print('Ticker %s was not found in %s - skipping.\n' % (ticker, file))
                continue

            tickerInfo = found[0]

            if showDetails:
                print('Ticker Information: ',tickerInfo,'\n')

            # Fields 1 and 2 of the index record are used as the first row and
            # the row count of the ticker inside 'Trades'. A plain slice reads
            # the same contiguous rows as the former np.arange index array.
            firstRow = int(tickerInfo[1])
            rowCount = int(tickerInfo[2])
            tempData = raw_data['Trades'][firstRow:firstRow+rowCount]

            # Note: The way we currently transform the data from being bytes to strings is very inefficient but
            # ".astype({colName:str})" does not seem to do the trick!
            temp = pd.DataFrame([[ele.astype(str) if type(ele) is np.bytes_ else ele for ele in row]
                                 for row in tempData],
                                columns=tempData.dtype.names)

            # Adding the date of the file to the dataframe.
            temp['Date'] = fileDay

            # Adding the ticker
            temp['Ticker'] = ticker

            # Preview the very first frame that was extracted.
            if verbose and not frames:
                print('Sneak peak of the data\n\n',temp.head())

            frames.append(temp)

# Combine everything in one go; `data` is always (re)defined by this cell so
# that later cells never pick up a stale frame from an earlier run.
if frames:
    data = pd.concat(frames)
else:
    data = pd.DataFrame()
    print('No trade data was found for the requested dates and tickers.\n')

end = time.time()

if verbose:
    print('The extraction time was %.3f seconds.' % (end-start))

##### Date range #####

Date, Min: 20200401
Date, Max:20200401

##### Data Extraction begins #####

### Trade Data ###

The raw H5 trade file contains:  ['TradeIndex', 'Trades'] 

Ticker Information:  (b'GOOG            ', 26900500, 71427) 

Sneak peak of the data

            utcsec ex  cond  volume    price TradeStopStockIndicator corr  \
0  14400048517953  P  @ TI      67  1139.44                           00   
1  14422296771981  P  @ TI      20  1138.55                           00   
2  14429472894282  Q  @FTI       1  1138.54                           00   
3  14506997225243  P  @ TI      31  1143.65                           00   
4  14516526073882  P  @ TI       1  1143.59                           00   

   TradeSequenceNumber TradeID SourceOfTrade TradeReportingFacility  \
0                 1507       1             N                          
1                 1552       2             N                          
2                 1554       1             N                   

In [92]:
# Display the first rows of the extracted trade data.
data.head()

Unnamed: 0,utcsec,ex,cond,volume,price,TradeStopStockIndicator,corr,TradeSequenceNumber,TradeID,SourceOfTrade,TradeReportingFacility,ParticipantTime,TRFTime,TTE,Date,Ticker
0,14400048517953,P,@ TI,67,1139.44,,0,1507,1,N,,14400048141056,99,0,20200401,GOOG
1,14422296771981,P,@ TI,20,1138.55,,0,1552,2,N,,14422296394240,99,0,20200401,GOOG
2,14429472894282,Q,@FTI,1,1138.54,,0,1554,1,N,,14429472872353,99,1,20200401,GOOG
3,14506997225243,P,@ TI,31,1143.65,,0,1581,3,N,,14506996848640,99,0,20200401,GOOG
4,14516526073882,P,@ TI,1,1143.59,,0,1587,4,N,,14516525699840,99,0,20200401,GOOG


In [95]:
# Number of non-null utcsec entries for every (Date, Ticker) pair.
data.groupby(['Date','Ticker'])[['utcsec']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,utcsec
Date,Ticker,Unnamed: 2_level_1
20200401,GOOG,71427
20200401,MSFT,531829
