# 1.6 Historical Tick Data

In order to download tick level data, we had to subscribe to Polygon.io Professional API. 

Each stock is queried individually from the API, and the response is limited to 50,000 trades. 

We will need to collect for all 505 SP500 stocks over a 2 month period for training data. 

In [5]:
import config
import pandas as pd 
from polygon import RESTClient
import datetime

In [79]:
!pip install pandas_market_calendars
import pandas_market_calendars as pmc

Collecting pandas_market_calendars
  Downloading pandas_market_calendars-1.6.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 4.2 MB/s eta 0:00:011
[?25hCollecting trading-calendars
  Downloading trading_calendars-1.11.11.tar.gz (101 kB)
[K     |████████████████████████████████| 101 kB 19.5 MB/s ta 0:00:01
Collecting toolz
  Downloading toolz-0.10.0-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 12.6 MB/s eta 0:00:01
Using legacy 'setup.py install' for trading-calendars, since package 'wheel' is not installed.
Installing collected packages: toolz, trading-calendars, pandas-market-calendars
    Running setup.py install for trading-calendars ... [?25ldone
[?25hSuccessfully installed pandas-market-calendars-1.6.0 toolz-0.10.0 trading-calendars-1.11.11


In [3]:
table=pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
df = table[0]
df['Symbol'] = df['Symbol'].str.replace('.','')
df.to_csv('../data/sp500/S&P500-Symbols.csv', columns=['Symbol'])
print(df.shape)
stocks = df['Symbol']
df.head()

(505, 9)


Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,ABIOMED Inc,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [9]:
with RESTClient(config.POLYGON) as client:
    resp = client.historic_trades_v2("AAPL", "2018-03-02")
    
resp.results[-1]

{'t': 1520003048206060889,
 'y': 1520003048205660160,
 'q': 361297,
 'i': '7900',
 'x': 11,
 's': 1675,
 'p': 173.76,
 'z': 3}

In [10]:
resp.results[-1]

{'t': 1520003048206060889,
 'y': 1520003048205660160,
 'q': 361297,
 'i': '7900',
 'x': 11,
 's': 1675,
 'p': 173.76,
 'z': 3}

In [11]:
last = resp.results[-1]['t']
    
last    

1520003048206060889

In [12]:
last

1520003048206060889

In [13]:
with RESTClient(config.POLYGON) as client:
    resp.results.extend(client.historic_trades_v2("AAPL", "2018-03-02", timestamp=last).results)
    
resp

<polygon.rest.models.definitions.HistoricTradesV2ApiResponse at 0x11dc92460>

In [14]:
resp.results[-1]

{'t': 1520007143025260959,
 'y': 1520007143025231956,
 'q': 830286,
 'i': '28542',
 'x': 12,
 's': 100,
 'c': [14, 41],
 'p': 174.67,
 'z': 3}

In [15]:
len(resp.results)

100000

In [16]:
key_map = {key:resp.map[key]['name'] for key in resp.map}

In [17]:
df = pd.DataFrame(resp.results)
df.rename(key_map, axis=1, inplace=True)
df['sip_timestamp'] = pd.to_datetime(df['sip_timestamp'])
df['participant_timestamp'] = pd.to_datetime(df['participant_timestamp'])
df.head()

Unnamed: 0,sip_timestamp,participant_timestamp,sequence_number,id,exchange,size,conditions,price,tape,trf_id,trf_timestamp,correction
0,2018-03-02 09:00:00.014196645,2018-03-02 09:00:00.013260032,1069,1,11,6,"[12, 37]",175.1,3,,,
1,2018-03-02 09:00:00.014200363,2018-03-02 09:00:00.013260544,1070,2,11,54,"[12, 37]",175.1,3,,,
2,2018-03-02 09:00:00.069460073,2018-03-02 09:00:00.069078784,1077,3,11,3,"[12, 37]",175.1,3,,,
3,2018-03-02 09:01:41.734929827,2018-03-02 09:01:41.734551552,1082,4,11,1,"[12, 37]",175.2,3,,,
4,2018-03-02 09:03:46.888187959,2018-03-02 09:03:46.887811328,1087,5,11,28,"[12, 37]",174.97,3,,,


In [18]:
df['participant_timestamp'].max() - df['participant_timestamp'].min() 

Timedelta('0 days 07:12:23.011980809')

In [19]:
client.historic_trades_v2("AAPL", "2018-03-02", timestamp=1520038795615409664).results

[{'t': 1520038795615784911,
  'y': 1520038795615409664,
  'q': 2730013,
  'i': '28077',
  'x': 11,
  's': 20,
  'c': [12, 37],
  'p': 176.05,
  'z': 3}]

In [20]:
df.shape

(100000, 12)

In [21]:
pd.DatetimeIndex(df['sip_timestamp']).tz_localize('UTC').tz_convert('US/EASTERN')

DatetimeIndex(['2018-03-02 04:00:00.014196645-05:00',
               '2018-03-02 04:00:00.014200363-05:00',
               '2018-03-02 04:00:00.069460073-05:00',
               '2018-03-02 04:01:41.734929827-05:00',
               '2018-03-02 04:03:46.888187959-05:00',
               '2018-03-02 04:04:37.980812370-05:00',
               '2018-03-02 04:06:01.876769632-05:00',
               '2018-03-02 04:06:01.876778402-05:00',
               '2018-03-02 04:08:20.631003136-05:00',
               '2018-03-02 04:12:27.130563808-05:00',
               ...
               '2018-03-02 11:12:22.830696816-05:00',
               '2018-03-02 11:12:22.912030316-05:00',
               '2018-03-02 11:12:23.023966460-05:00',
               '2018-03-02 11:12:23.024604269-05:00',
               '2018-03-02 11:12:23.024771116-05:00',
               '2018-03-02 11:12:23.024786675-05:00',
               '2018-03-02 11:12:23.025252267-05:00',
               '2018-03-02 11:12:23.025253691-05:00',
         

50,000 rows comprises less than a full day of data from the market. 



In [22]:
first = resp.results[0]['t']
first

1519981200014196645

In [23]:


epoch = datetime.datetime.utcfromtimestamp(0)

def unix_time_millis(dt):
    return (dt - epoch).total_seconds() * 1000.0

begin = unix_time_millis(pd.to_datetime(first) - pd.Timedelta(8, 'hours'))

In [24]:
with RESTClient(config.POLYGON) as client:
    first_res = client.historic_trades_v2("AAPL", "2018-03-02", timestamp=first).results
    
first_res

[{'t': 1519981200014196645,
  'y': 1519981200013260032,
  'q': 1069,
  'i': '1',
  'x': 11,
  's': 6,
  'c': [12, 37],
  'p': 175.1,
  'z': 3},
 {'t': 1519981200014200363,
  'y': 1519981200013260544,
  'q': 1070,
  'i': '2',
  'x': 11,
  's': 54,
  'c': [12, 37],
  'p': 175.1,
  'z': 3},
 {'t': 1519981200069460073,
  'y': 1519981200069078784,
  'q': 1077,
  'i': '3',
  'x': 11,
  's': 3,
  'c': [12, 37],
  'p': 175.1,
  'z': 3},
 {'t': 1519981301734929827,
  'y': 1519981301734551552,
  'q': 1082,
  'i': '4',
  'x': 11,
  's': 1,
  'c': [12, 37],
  'p': 175.2,
  'z': 3},
 {'t': 1519981426888187959,
  'y': 1519981426887811328,
  'q': 1087,
  'i': '5',
  'x': 11,
  's': 28,
  'c': [12, 37],
  'p': 174.97,
  'z': 3},
 {'t': 1519981477980812370,
  'y': 1519981477980435968,
  'q': 1088,
  'i': '6',
  'x': 11,
  's': 1,
  'c': [12, 37],
  'p': 174.89,
  'z': 3},
 {'t': 1519981561876769632,
  'y': 1519981561876389632,
  'q': 1092,
  'i': '7',
  'x': 11,
  's': 56,
  'c': [12, 37],
  'p': 174.5

In [25]:
df = pd.DataFrame(first_res)
df.rename(key_map, axis=1, inplace=True)
df['sip_timestamp'] = pd.to_datetime(df['sip_timestamp'])
df['participant_timestamp'] = pd.to_datetime(df['participant_timestamp'])
df.head()

Unnamed: 0,sip_timestamp,participant_timestamp,sequence_number,id,exchange,size,conditions,price,tape,trf_id,trf_timestamp,correction
0,2018-03-02 09:00:00.014196645,2018-03-02 09:00:00.013260032,1069,1,11,6,"[12, 37]",175.1,3,,,
1,2018-03-02 09:00:00.014200363,2018-03-02 09:00:00.013260544,1070,2,11,54,"[12, 37]",175.1,3,,,
2,2018-03-02 09:00:00.069460073,2018-03-02 09:00:00.069078784,1077,3,11,3,"[12, 37]",175.1,3,,,
3,2018-03-02 09:01:41.734929827,2018-03-02 09:01:41.734551552,1082,4,11,1,"[12, 37]",175.2,3,,,
4,2018-03-02 09:03:46.888187959,2018-03-02 09:03:46.887811328,1087,5,11,28,"[12, 37]",174.97,3,,,


In [26]:
df['participant_timestamp'].min(), df['participant_timestamp'].max()

(Timestamp('2018-03-02 09:00:00.013260032'),
 Timestamp('2018-03-02 15:04:08.205660160'))

In [27]:
df['sip_timestamp'].iloc[0].date().strftime("%Y-%m-%d")

'2018-03-02'

In [91]:
startDate = (df['sip_timestamp'].iloc[0] - pd.Timedelta(3, 'd')).date()
start_date = startDate.strftime("%Y-%m-%d")

In [92]:
endDate = datetime.datetime.now().date()
end_date = endDate.strftime("%Y-%m-%d")

In [93]:
start_date, end_date

('2018-02-27', '2020-09-22')

In [94]:
startDate, endDate

(datetime.date(2018, 2, 27), datetime.date(2020, 9, 22))

In [95]:
pd.date_range(startDate, endDate)

DatetimeIndex(['2018-02-27', '2018-02-28', '2018-03-01', '2018-03-02',
               '2018-03-03', '2018-03-04', '2018-03-05', '2018-03-06',
               '2018-03-07', '2018-03-08',
               ...
               '2020-09-13', '2020-09-14', '2020-09-15', '2020-09-16',
               '2020-09-17', '2020-09-18', '2020-09-19', '2020-09-20',
               '2020-09-21', '2020-09-22'],
              dtype='datetime64[ns]', length=939, freq='D')

In [96]:
nyse = pmc.get_calendar('NYSE')

# Show available calendars
print(pmc.get_calendar_names())

['ASX', 'BMF', 'CFE', 'NYSE', 'stock', 'NASDAQ', 'BATS', 'CME', 'CBOT', 'COMEX', 'NYMEX', 'CME_Equity', 'CBOT_Equity', 'CME_Agriculture', 'CBOT_Agriculture', 'COMEX_Agriculture', 'NYMEX_Agriculture', 'CME_Rate', 'CBOT_Rate', 'CME_InterestRate', 'CBOT_InterestRate', 'CME_Bond', 'CBOT_Bond', 'EUREX', 'HKEX', 'ICE', 'ICEUS', 'NYFE', 'JPX', 'LSE', 'OSE', 'SIX', 'SSE', 'TSX', 'TSXV', 'XBOM', 'ASEX', 'BVMF', 'CMES', 'IEPA', 'XAMS', 'XASX', 'XBKK', 'XBOG', 'XBRU', 'XBUD', 'XBUE', 'XCBF', 'XCSE', 'XDUB', 'XFRA', 'XHEL', 'XHKG', 'XICE', 'XIDX', 'XIST', 'XJSE', 'XKAR', 'XKLS', 'XKRX', 'XLIM', 'XLIS', 'XLON', 'XMAD', 'XMEX', 'XMIL', 'XMOS', 'XNYS', 'XNZE', 'XOSL', 'XPAR', 'XPHS', 'XPRA', 'XSES', 'XSGO', 'XSHG', 'XSTO', 'XSWX', 'XTAI', 'XTKS', 'XTSE', 'XWAR', 'XWBO', 'us_futures', '24/7', '24/5']


In [97]:
days = nyse.schedule(start_date=startDate, end_date=endDate).index

In [87]:
# initialize an aggregation dataframe
agg_df = pd.DataFrame()
ticks_df = pd.DataFrame()
increment = 100000000
sp500 = ['AAPL']

for date in days: 
    laststamps = {stock:None for stock in sp500}

    for stock in sp500:
        # download a batch of data and add it to the list 
        with RESTClient(config.POLYGON) as client:
            current_ticks = client.historic_trades_v2(stock, date, timestamp=laststamps[stock]).results
            laststamps[stock] = current_ticks[-1][t]
            # add stock symbol to each row, but do it in the best way possible 
            current_df = pd.DataFrame(current_ticks)
            current_df.rename(key_map, axis=1, inplace=True)
            current_df['sip_timestamp'] = pd.to_datetime(current_df['sip_timestamp'])
            current_df['participant_timestamp'] = pd.to_datetime(current_df['participant_timestamp'])
            current_df['SYMBOL'] = stock
            ticks_df = pd.concat([ticks_df, currrent_df], axis=0)
            
    ticks_df.sort_values(by='t', ascending=True, inplace=True)
    ticks_df.drop_duplicates(inplace=True)
    ticks_df['dollar_volume'] = ticks_df['size'] * ticks_df['price']
    
    if 'dv_cumsum' in ticks_df.columns:
        cumsum_start = ticks_df['dv_cumsum'].iloc[0] 
    else:
        cumsum_start = 0
        
    ticks_df['dv_cumsum'] = ticks_df['dollar_volume'].cum() +  cumsum_start
    
    start_increment = ticks_df['dv_cumsum'].min()// increment * increment
    end_increment = ticks_df['dv_cumsum'].max()// increment * increment + increment
    ticks_df['interval_range'] = pd.cut(df['dv_cumsum'], 
                                        pd.interval_range(start_increment, 
                                                          end_increment, 
                                                          increment))
    
    last_interval_ticks = ticks_df['interval_range'].iloc[-1]
    if 'interval_range' in agg_df.columns:
        last_interval_agg = agg_df['interval_range'].iloc[-1]
    else:
        last_interval_agg = None
    
    if last_interval_ticks > last_interval_agg and last_interval_agg is not None: 
        ## aggregate the data, insert it into agg_df, and then drop what has been aggregated from ticks_df
        mask = ticks_df['interval_range']
        agged = reduce(lambda left,right: pd.merge(left,right, how='outer', left_index=True, right_index=True), [
                                    ticks_df.groupby(['interval_range','SYMBOL'])['sip_timestamp'].first().rename('open_timestamp'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['sip_timestamp'].last().rename('close_timestamp'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['size'].sum(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].first(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].min(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].max(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].last(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['dollar_volume'].sum()
                                                                                                                ])
        agged = agged.i # drop the unfinished interval range .... 
        agg_df = pd.concat([agg_df, agged], axis=0)
        ticks_df = ticks_df[ticks_df['interval_range'] == last_interval_ticks]
        
    
    
    sort the downloaded batch 
    compute the cumulative volume column 
    if the cum value column is a new interval:
        compute the interval's aggregates and at them to the interval table
        delete all data prior to the new interval row

SyntaxError: invalid syntax (<ipython-input-87-cd80f01c2e3b>, line 66)

In [155]:
225000000 // 100000000 * 100000000

200000000

In [158]:
rang = pd.interval_range(0,10,10)

In [161]:
rang[2] < rang[1]

False

In [59]:
df.groupby(['size','price'])['id'].sum()

size    price   
1       172.2800           1274
        172.4000           1253
        172.4300            165
        172.4400       19391941
        172.4600    99463286329
                       ...     
9307    172.5500           8310
10000   173.2299            687
14597   173.7500           2047
24194   172.8000           1255
433903  172.6700         899900
Name: id, Length: 14950, dtype: object

In [112]:
days

DatetimeIndex(['2018-02-27', '2018-02-28', '2018-03-01', '2018-03-02',
               '2018-03-05', '2018-03-06', '2018-03-07', '2018-03-08',
               '2018-03-09', '2018-03-12',
               ...
               '2020-09-09', '2020-09-10', '2020-09-11', '2020-09-14',
               '2020-09-15', '2020-09-16', '2020-09-17', '2020-09-18',
               '2020-09-21', '2020-09-22'],
              dtype='datetime64[ns]', length=648, freq=None)

In [268]:
# initialize an aggregation dataframe
agg_df = pd.DataFrame()
ticks_df = pd.DataFrame()
increment = 100000000
sp500 = ['AAPL','GOOG','MSFT']

for date in days[:3]: 
    ## TO DO 
    ## Ensure that the loop gathers all the data for each day before moving on to the next day 
    
    strdate = date.date().strftime('%Y-%m-%d')
    laststamps = {stock:None for stock in sp500}
    print(strdate)
    for stock in sp500:
        # download a batch of data and add it to the list 
        print(stock)
        with RESTClient(config.POLYGON) as client:
            current_ticks = client.historic_trades_v2(stock, strdate, limit=10000).results
            print(current_ticks[-1])
            laststamps[stock] = current_ticks[-1]['t']
            # add stock symbol to each row, but do it in the best way possible 
            
            current_df = pd.DataFrame(current_ticks)
            current_df.rename(key_map, axis=1, inplace=True)
            current_df['sip_timestamp'] = pd.to_datetime(current_df['sip_timestamp'])
            current_df['participant_timestamp'] = pd.to_datetime(current_df['participant_timestamp'])
            current_df['SYMBOL'] = stock
            
            ticks_df = pd.concat([ticks_df, current_df], axis=0)

    ticks_df.sort_values(by='sip_timestamp', ascending=True, inplace=True)
    ticks_df.drop_duplicates(subset=list(ticks_df.columns.drop('conditions')), inplace=True)
    ticks_df['dollar_volume'] = ticks_df['size'] * ticks_df['price']
    
    if 'dv_cumsum' in ticks_df.columns:
        cumsum_start = ticks_df['dv_cumsum'].iloc[0] 
    else:
        cumsum_start = 0
        
    ticks_df['dv_cumsum'] = ticks_df['dollar_volume'].cumsum() +  cumsum_start
    
    start_increment = ticks_df['dv_cumsum'].min()// increment * increment
    end_increment = ticks_df['dv_cumsum'].max()// increment * increment + increment
    int_val = pd.interval_range(start_increment, end_increment, freq=increment)
    ticks_df['interval_range'] = pd.cut(ticks_df['dv_cumsum'], int_val)

    last_interval_ticks = ticks_df['interval_range'].max()
    if 'interval_range' in agg_df.columns:
        last_interval_agg = agg_df['interval_range'].iloc[-1] + 2 * increment
    else:
        last_interval_agg = ticks_df['interval_range'].min() + 2 * increment
    
    if last_interval_ticks > last_interval_agg: 
        ## aggregate the data, insert it into agg_df, and then drop what has been aggregated from ticks_df
        mask = ticks_df['interval_range'] < ticks_df['interval_range'].max()
        agged = reduce(lambda left,right: pd.merge(left,right, how='outer', left_index=True, right_index=True), [
                                    ticks_df.groupby(['interval_range','SYMBOL'])['sip_timestamp'].first().rename('open_timestamp'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['sip_timestamp'].last().rename('close_timestamp'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['size'].sum(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].first().rename('open'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].min().rename('low'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].max().rename('high'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].last().rename('close'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['dollar_volume'].sum()
                                                                                                                ])
        agged = agged[agged.index.get_level_values(0) != agged.index.get_level_values(0)[-1]]
        agged = agged.reset_index()
        agg_df = pd.concat([agg_df, agged], axis=0)
        ticks_df = ticks_df[~mask]
        
    
    
#     sort the downloaded batch 
#     compute the cumulative volume column 
#     if the cum value column is a new interval:
#         compute the interval's aggregates and at them to the interval table
#         delete all data prior to the new interval row

2018-02-27
AAPL
{'t': 1519742049509851944, 'y': 1519742049509614000, 'q': 71019, 'i': '1224', 'x': 8, 's': 100, 'c': [14, 41], 'p': 179.59, 'z': 3}
GOOG
{'t': 1519744696870355903, 'y': 1519744696870319078, 'q': 394053, 'i': '3696', 'x': 12, 's': 20, 'c': [14, 37, 41], 'p': 1139.51, 'z': 3}
MSFT
{'t': 1519742373470237244, 'y': 1519742373469849600, 'q': 116436, 'i': '1442', 'x': 11, 's': 99, 'c': [37], 'p': 95.73, 'z': 3}
2018-02-28
AAPL
{'t': 1519828816545653645, 'y': 1519828816544000000, 'f': 1519828816545592766, 'q': 119306, 'i': '2425', 'x': 4, 'r': 12, 's': 173, 'p': 179.57, 'z': 3}
GOOG
{'t': 1519831085914450404, 'y': 1519831085914431220, 'q': 370540, 'i': '3532', 'x': 12, 's': 4, 'c': [37], 'p': 1123.85, 'z': 3}
MSFT
{'t': 1519829292880309441, 'y': 1519829292880298346, 'q': 161972, 'i': '138', 'x': 17, 's': 69, 'c': [37], 'p': 94.99, 'z': 3}
2018-03-01
AAPL
{'t': 1519914948714399994, 'y': 1519914948711000000, 'f': 1519914948714363831, 'q': 73319, 'i': '1860', 'x': 4, 'r': 12, 's':

In [269]:
ticks_df

Unnamed: 0,sip_timestamp,participant_timestamp,sequence_number,id,exchange,size,conditions,price,tape,trf_id,trf_timestamp,SYMBOL,dollar_volume,dv_cumsum,interval_range
9318,2018-03-01 15:20:16.163983648,2018-03-01 15:20:16.163724530,454417,141,15,100,,1100.2100,3,,,GOOG,110021.0000,3.200098e+09,"(3200000000.0, 3300000000.0]"
9319,2018-03-01 15:20:19.964098823,2018-03-01 15:20:19.956000000,454785,1532,4,2,[37],1100.6382,3,12.0,1.519918e+18,GOOG,2201.2764,3.200100e+09,"(3200000000.0, 3300000000.0]"
9320,2018-03-01 15:20:27.384161848,2018-03-01 15:20:27.381000000,455350,1533,4,500,,1099.6602,3,12.0,1.519918e+18,GOOG,549830.1000,3.200650e+09,"(3200000000.0, 3300000000.0]"
9321,2018-03-01 15:20:27.898581970,2018-03-01 15:20:27.890000000,455392,317,4,200,,1099.8720,3,10.0,,GOOG,219974.4000,3.200870e+09,"(3200000000.0, 3300000000.0]"
9322,2018-03-01 15:20:32.556076234,2018-03-01 15:20:32.556053566,456454,3489,12,8,"[14, 37, 41]",1099.7600,3,,,GOOG,8798.0800,3.200878e+09,"(3200000000.0, 3300000000.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2018-03-01 15:27:26.212807788,2018-03-01 15:27:26.212786271,505819,3811,12,20,"[14, 37, 41]",1103.6200,3,,,GOOG,22072.4000,3.225113e+09,"(3200000000.0, 3300000000.0]"
9996,2018-03-01 15:27:26.225716773,2018-03-01 15:27:26.225465000,505845,1190,8,10,"[14, 37, 41]",1103.7900,3,,,GOOG,11037.9000,3.225124e+09,"(3200000000.0, 3300000000.0]"
9997,2018-03-01 15:27:27.510772151,2018-03-01 15:27:27.509000000,505962,1634,4,13,[37],1103.8438,3,12.0,1.519918e+18,GOOG,14349.9694,3.225139e+09,"(3200000000.0, 3300000000.0]"
9998,2018-03-01 15:27:27.944766561,2018-03-01 15:27:27.944527000,505987,1191,8,10,"[14, 37, 41]",1103.5800,3,,,GOOG,11035.8000,3.225150e+09,"(3200000000.0, 3300000000.0]"


In [270]:
agg_df

Unnamed: 0,interval_range,SYMBOL,open_timestamp,close_timestamp,size,open,low,high,close,dollar_volume
0,"(0.0, 100000000.0]",AAPL,2018-02-27 09:00:00.017746820,2018-02-27 14:30:00.321054557,348233.0,179.05,178.2600,179.1700,179.13,6.224873e+07
1,"(0.0, 100000000.0]",GOOG,2018-02-27 09:16:53.485852356,2018-02-27 14:30:00.231620614,9408.0,1136.00,1135.9100,1145.7900,1141.94,1.074457e+07
2,"(0.0, 100000000.0]",MSFT,2018-02-27 09:22:02.429403583,2018-02-27 14:30:00.303879885,39594.0,95.13,94.9300,95.8400,95.80,3.781872e+06
3,"(100000000.0, 200000000.0]",AAPL,2018-02-27 14:30:00.327751363,2018-02-27 14:30:00.363324273,498.0,179.18,179.1700,179.2300,179.23,8.923735e+04
4,"(100000000.0, 200000000.0]",GOOG,2018-02-27 14:30:00.383065713,2018-02-27 14:30:00.383065713,32789.0,1140.39,1140.3900,1140.3900,1140.39,3.739225e+07
...,...,...,...,...,...,...,...,...,...,...
28,"(3000000000.0, 3100000000.0]",GOOG,2018-03-01 14:47:05.129982032,2018-03-01 14:59:10.576343098,91318.0,1100.84,1090.2650,1101.5565,1092.42,1.000141e+08
29,"(3000000000.0, 3100000000.0]",MSFT,NaT,NaT,,,,,,
30,"(3100000000.0, 3200000000.0]",AAPL,NaT,NaT,,,,,,
31,"(3100000000.0, 3200000000.0]",GOOG,2018-03-01 14:59:10.576353039,2018-03-01 15:20:15.238730963,91198.0,1092.42,1090.1901,1100.9900,1100.16,1.000748e+08


In [170]:
agg_df.index[-1][0]

Interval(400000000.0, 500000000.0, closed='right')

In [160]:
agg_df['interval_range'].iloc[-1] + 2 * increment

KeyError: 'interval_range'

In [132]:
ints = pd.interval_range(start_increment, 
                                                          end_increment+increment, 
                                                          freq=increment)

In [127]:
from functools import reduce

agged = reduce(lambda left,right: pd.merge(left,right, how='outer', left_index=True, right_index=True), [
                                    ticks_df.groupby(['interval_range','SYMBOL'])['sip_timestamp'].first().rename('open_timestamp'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['sip_timestamp'].last().rename('close_timestamp'),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['size'].sum(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].first(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].min(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].max(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['price'].last(),
                                    ticks_df.groupby(['interval_range','SYMBOL'])['dollar_volume'].sum()
                                                                                                                ])

In [128]:
agged

Unnamed: 0_level_0,Unnamed: 1_level_0,open_timestamp,close_timestamp,size,price_x,price_y,price_x,price_y,dollar_volume
interval_range,SYMBOL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(0.0, 100000000.0]",AAPL,2018-02-27 09:00:00.017746820,2018-03-01 09:00:03.108088398,1461,179.05,176.69,179.15,178.77,261250.57
"(0.0, 100000000.0]",GOOG,2018-02-27 09:16:53.485852356,2018-03-01 09:00:02.972611340,539,1136.0,1103.68,1145.0,1117.16,607449.34
"(0.0, 100000000.0]",MSFT,2018-02-27 09:22:02.429403583,2018-03-01 09:00:03.272244407,707,95.13,93.41,95.46,94.44,66937.51


In [133]:
last = ticks_df['interval_range'].max()

In [136]:
ints

IntervalIndex([(0.0, 100000000.0], (100000000.0, 200000000.0]],
              closed='right',
              dtype='interval[float64]')

In [138]:
last < ints[-1]

True

In [139]:
last

Interval(0.0, 100000000.0, closed='right')

In [147]:
ints[0]+100000000 

Interval(100000000.0, 200000000.0, closed='right')

In [154]:
ticks_df

Unnamed: 0,sip_timestamp,participant_timestamp,sequence_number,id,exchange,size,conditions,price,tape,trf_id,trf_timestamp,SYMBOL,dollar_volume,dv_cumsum,interval_range
0,2018-02-27 09:00:00.017746820,2018-02-27 09:00:00.017360384,1074,1,11,10,"[12, 37]",179.05,3,,,AAPL,1790.50,3.581000e+03,"(0.0, 100000000.0]"
1,2018-02-27 09:00:00.017750369,2018-02-27 09:00:00.017361152,1075,2,11,5,"[12, 37]",179.05,3,,,AAPL,895.25,4.476250e+03,"(0.0, 100000000.0]"
2,2018-02-27 09:01:26.483949291,2018-02-27 09:01:26.483924880,1102,1,12,50,"[14, 12, 37, 41]",178.60,3,,,AAPL,8930.00,1.340625e+04,"(0.0, 100000000.0]"
3,2018-02-27 09:01:50.209147098,2018-02-27 09:01:50.208771328,1103,3,11,10,"[12, 37]",179.04,3,,,AAPL,1790.40,1.519665e+04,"(0.0, 100000000.0]"
4,2018-02-27 09:06:15.080563660,2018-02-27 09:06:15.080186112,1107,4,11,400,[12],179.02,3,,,AAPL,71608.00,8.680465e+04,"(0.0, 100000000.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2018-02-28 14:30:28.128631352,2018-02-28 14:30:28.128609671,15852,349,12,100,"[14, 41]",94.95,3,,,MSFT,9495.00,4.420780e+08,"(400000000.0, 500000000.0]"
996,2018-02-28 14:30:28.130594398,2018-02-28 14:30:28.130216960,15853,220,11,55,"[14, 37, 41]",94.96,3,,,MSFT,5222.80,4.420832e+08,"(400000000.0, 500000000.0]"
997,2018-02-28 14:30:28.130601454,2018-02-28 14:30:28.130216960,15854,221,11,100,"[14, 41]",94.94,3,,,MSFT,9494.00,4.420927e+08,"(400000000.0, 500000000.0]"
998,2018-02-28 14:30:28.139430801,2018-02-28 14:30:28.139057152,15855,222,11,200,"[14, 41]",94.93,3,,,MSFT,18986.00,4.421117e+08,"(400000000.0, 500000000.0]"


In [155]:
mask

[0       True
 1       True
 2       True
 3       True
 4       True
        ...  
 995    False
 996    False
 997    False
 998    False
 999    False
 Name: interval_range, Length: 6000, dtype: bool]

In [202]:
agged.reset_index()['interval_range'].astype('Interval') < ticks_df['interval_range'].max()

TypeError: '<' not supported between instances of 'IntervalArray' and 'pandas._libs.interval.Interval'

In [203]:
ticks_df['interval_range'].max()

Interval(600000000.0, 700000000.0, closed='right')

In [217]:
agged.reset_index(inplace=True)

In [228]:
agged['interval_range'] = pd.IntervalIndex(agged['interval_range'])

In [235]:
agged['interval_range'] < agged['interval_range'].iloc[-1]

TypeError: '<' not supported between instances of 'IntervalArray' and 'pandas._libs.interval.Interval'

In [248]:
agged.loc[~agged.index[-1][0], :]

TypeError: bad operand type for unary ~: 'pandas._libs.interval.Interval'

In [256]:
agged.index.get_level_values(0) != agged.index.get_level_values(0)[-1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False])

In [267]:
agged

Unnamed: 0_level_0,Unnamed: 1_level_0,open_timestamp,close_timestamp,size,open,low,high,close,dollar_volume
interval_range,SYMBOL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(1000000000.0, 1100000000.0]",AAPL,2018-02-28 09:00:00.077538685,2018-02-28 10:51:53.133542627,14552.0,178.75,178.25,178.98,178.7,2601554.0
"(1000000000.0, 1100000000.0]",GOOG,2018-02-27 15:03:08.331565905,2018-02-28 10:29:39.214420986,85471.0,1134.325,1123.24,1141.73,1124.6,97248280.0
"(1000000000.0, 1100000000.0]",MSFT,2018-02-28 09:11:41.413178569,2018-02-28 10:53:02.897739446,515.0,94.62,94.51,94.82,94.52,48719.65
"(1100000000.0, 1200000000.0]",AAPL,2018-02-28 10:55:08.003881272,2018-02-28 14:30:00.345015436,243553.0,178.57,178.01,179.5,179.27,43626820.0
"(1100000000.0, 1200000000.0]",GOOG,2018-02-28 11:15:13.611714156,2018-02-28 14:30:00.310116160,36502.0,1124.3,1118.29,1126.59,1123.37,40885450.0
"(1100000000.0, 1200000000.0]",MSFT,2018-02-28 10:53:02.898555058,2018-02-28 14:30:00.324254666,30994.0,94.51,94.2,94.87,94.8,2932497.0
"(1200000000.0, 1300000000.0]",AAPL,2018-02-28 14:30:00.380433549,2018-02-28 14:30:00.460421439,925.0,179.26,179.26,179.3478,179.29,165854.8
"(1200000000.0, 1300000000.0]",GOOG,2018-02-28 14:30:00.412165961,2018-02-28 14:30:00.483442984,38563.0,1122.07,1121.0,1122.79,1121.0,43229190.0
"(1200000000.0, 1300000000.0]",MSFT,2018-02-28 14:30:00.367498203,2018-02-28 14:30:00.476411062,478037.0,94.8,94.8,94.94,94.825,45318010.0
"(1300000000.0, 1400000000.0]",AAPL,2018-02-28 14:30:00.492554835,2018-02-28 14:30:00.647495593,373866.0,179.29,179.2,179.29,179.2,66996900.0


In [265]:
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,open_timestamp,close_timestamp,size,open,low,high,close,dollar_volume
interval_range,SYMBOL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(0.0, 100000000.0]",AAPL,2018-02-27 09:00:00.017746820,2018-02-27 14:30:00.321054557,348233.0,179.05,178.26,179.17,179.13,62248730.0
"(0.0, 100000000.0]",GOOG,2018-02-27 09:16:53.485852356,2018-02-27 14:30:00.231620614,9408.0,1136.0,1135.91,1145.79,1141.94,10744570.0
"(0.0, 100000000.0]",MSFT,2018-02-27 09:22:02.429403583,2018-02-27 14:30:00.303879885,39594.0,95.13,94.93,95.84,95.8,3781872.0
"(100000000.0, 200000000.0]",AAPL,2018-02-27 14:30:00.327751363,2018-02-27 14:30:00.363324273,498.0,179.18,179.17,179.23,179.23,89237.35
"(100000000.0, 200000000.0]",GOOG,2018-02-27 14:30:00.383065713,2018-02-27 14:30:00.383065713,32789.0,1140.39,1140.39,1140.39,1140.39,37392250.0
"(100000000.0, 200000000.0]",MSFT,2018-02-27 14:30:00.324491210,2018-02-27 14:30:00.381137224,708837.0,95.66,95.66,95.84,95.7,67807910.0
"(200000000.0, 300000000.0]",AAPL,2018-02-27 14:30:00.403137653,2018-02-27 14:30:00.524262924,365527.0,179.15,179.0,179.23,179.0,65429560.0
"(200000000.0, 300000000.0]",GOOG,2018-02-27 14:30:00.383171676,2018-02-27 14:30:00.479865501,32979.0,1140.39,1139.1,1144.4,1141.3,37609450.0
"(200000000.0, 300000000.0]",MSFT,2018-02-27 14:30:00.386273300,2018-02-27 14:30:00.515652702,2803.0,95.7,95.7,95.75,95.75,268287.7
"(300000000.0, 400000000.0]",AAPL,2018-02-27 14:30:00.524303295,2018-02-27 14:30:20.773753651,519594.0,179.0,178.37,179.44,179.44,93031690.0
