# 1.6.1 Historical Tick Data

In this notebook we will write a script to pull historical tick data directly from Polygon and inject it immediately without any transformation into BigQuery. This script will manage and track pulling a years worth of historical S&P500 data, including tracking dates pulled and changes in ticker symbol by date. 

Many tickers have changing symbols over time while representing the same underlying data, so this takes some investigation in order to ensure we have the correct set of symbols. 

In [176]:
import pandas_market_calendars as pmc
import datetime
from polygon import RESTClient
import os
from IPython.display import clear_output

In [3]:
key = os.environ.get('POLYGON')

# download a list of S&P500 companies
sp500_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# fill missing added date as 1950-01-01
# fix encoding for T and cast as datetime
sp500_df['Date first added'] = sp500_df['Date first added'].fillna('1950-01-01')
sp500_df.loc[53, 'Date first added'] = '1983-11-30'
sp500_df['Date first added'] = pd.to_datetime(sp500_df['Date first added'])
sp500_df[sp500_df['Symbol'] == 'LUMN']

nyse = pmc.get_calendar('NASDAQ')
enddate = datetime.datetime.now().date()
startdate = enddate - pd.Timedelta('1Y')
days = nyse.schedule(start_date=startdate, end_date=enddate).index
days = [day.date().strftime('%Y-%m-%d') for i, day in enumerate(days)]
day = days[0]
day

'2019-09-30'

In [8]:
for i in range(1000):
    print(i)
    clear_output(wait=True)
#     if i % 100:
#         clear_output(wait=True)

999


The current date is September 28, 2020. Since I know that the pull I intend to process will require several days to complete, I would like to download data for one year ending September 30. I hope that the data for the 28-30th exists by the time my script gets around to asking for it. 

In [46]:
days[-1]

'2020-09-28'

In [45]:
call_historical('AAPL', days[-1])

{'results': [{'t': 1601280031258342876,
   'y': 1601280031257950464,
   'q': 1473,
   'i': '92',
   'x': 11,
   's': 2,
   'c': [12, 37],
   'p': 113.94,
   'z': 3},
  {'t': 1601280038418975513,
   'y': 1601280038418559744,
   'q': 1489,
   'i': '93',
   'x': 11,
   's': 2,
   'c': [12, 37],
   'p': 113.94,
   'z': 3},
  {'t': 1601280041896803571,
   'y': 1601280041896775150,
   'q': 1496,
   'i': '6',
   'x': 12,
   's': 1,
   'c': [12, 37],
   'p': 114.13,
   'z': 3},
  {'t': 1601280043149056027,
   'y': 1601280043148676352,
   'q': 1500,
   'i': '94',
   'x': 11,
   's': 2,
   'c': [12, 37],
   'p': 113.96,
   'z': 3},
  {'t': 1601280045829272559,
   'y': 1601280045828891904,
   'q': 1502,
   'i': '95',
   'x': 11,
   's': 2,
   'c': [12, 37],
   'p': 113.96,
   'z': 3},
  {'t': 1601280046161905429,
   'y': 1601280046161876154,
   'q': 1504,
   'i': '7',
   'x': 12,
   's': 5,
   'c': [12, 14, 37, 41],
   'p': 114.01,
   'z': 3},
  {'t': 1601280048420311888,
   'y': 1601280048419934

In [47]:
missing = {}
day = days[-1]
for stock in sp500_df['Symbol']:
    print(stock)
    print(missing)
    clear_output(wait=True)
    with RESTClient(key) as client:
        results = client.historic_trades_v2(stock, day).results
    try:
        results[-1]
    except:
        added = sp500_df[sp500_df['Symbol'] == stock]['Date first added'].iloc[0].date().strftime('%Y-%m-%d')
        missing[stock] = {'Missing on': day,
                         'Added on': added}

ZTS
{}


In [48]:
missing

{}

In [11]:
missing

{'BKR': {'Missing on': '2019-09-30', 'Added on': '2017-07-07'},
 'CARR': {'Missing on': '2019-09-30', 'Added on': '2020-04-03'},
 'PEAK': {'Missing on': '2019-09-30', 'Added on': '2008-03-31'},
 'HWM': {'Missing on': '2019-09-30', 'Added on': '1964-03-31'},
 'J': {'Missing on': '2019-09-30', 'Added on': '2007-10-26'},
 'LUMN': {'Missing on': '2019-09-30', 'Added on': '1999-03-25'},
 'NLOK': {'Missing on': '2019-09-30', 'Added on': '2003-03-25'},
 'OTIS': {'Missing on': '2019-09-30', 'Added on': '2020-04-03'},
 'RTX': {'Missing on': '2019-09-30', 'Added on': '1950-01-01'},
 'TT': {'Missing on': '2019-09-30', 'Added on': '2010-11-17'},
 'TFC': {'Missing on': '2019-09-30', 'Added on': '1997-12-04'},
 'VIAC': {'Missing on': '2019-09-30', 'Added on': '1950-01-01'}}

In [13]:
len(missing)

12

Given that data could be missing at any moment in time, and I won't know until I check and try to figure out personally what the correct symbol is, what should my protocol be? 

I want to ensure that the data pull:
1) Continues  
2) Keeps a saved record of where the data went wrong  
3) Probably abandon trying to check that symbol from that point forward, but still pull everything else  
4) Does not give an error and die at that point  

Furthermore I will be directly inserting into bigtable, so the error will probably respond to me from the bigtable query when the results are null. 


In [14]:
import requests

ticker = 'BKR'
date = days[-1]
url = f'https://api.polygon.io/v2/ticks/stocks/trades/{ticker}/{date}'
print(url)
params = {
        'apiKey': os.environ.get('POLYGON'),
        'limit':10
        }


res = requests.get(url, params)

https://api.polygon.io/v2/ticks/stocks/trades/BKR/2020-09-28


In [177]:
def call_historical(ticker, date):
    url = f'https://api.polygon.io/v2/ticks/stocks/trades/{ticker}/{date}'
    params = {
            'apiKey': os.environ.get('POLYGON'),
            'limit':10
            }


    res = requests.get(url, params)
    return res.json()

call_historical('T', '2019-09-30')

{'results': [{'t': 1569830414630659408,
   'y': 1569830414630370304,
   'q': 24801,
   'i': '52983525027894',
   'x': 11,
   's': 100,
   'c': [12, 41],
   'p': 37.73,
   'z': 1},
  {'t': 1569830414646095979,
   'y': 1569830414645365883,
   'q': 24901,
   'i': '3472338191832597814',
   'x': 12,
   's': 100,
   'c': [14, 12, 41],
   'p': 37.8,
   'z': 1},
  {'t': 1569830677773465082,
   'y': 1569830677772621210,
   'q': 25301,
   'i': '3472338191832598872',
   'x': 12,
   's': 100,
   'c': [14, 12, 41],
   'p': 37.69,
   'z': 1},
  {'t': 1569830678920929180,
   'y': 1569830678920715008,
   'q': 25401,
   'i': '52983525027908',
   'x': 11,
   's': 100,
   'c': [12],
   'p': 37.69,
   'z': 1},
  {'t': 1569830749440563563,
   'y': 1569830749440281344,
   'q': 25501,
   'i': '52983525027909',
   'x': 11,
   's': 100,
   'c': [14, 12, 41],
   'p': 37.66,
   'z': 1},
  {'t': 1569830749440587163,
   'y': 1569830749439826705,
   'q': 25601,
   'i': '3472338191832599100',
   'x': 12,
   's': 100

In [20]:
days[0]

'2019-09-30'

In [24]:
# Oct. 18, 2019

print(call_historical('BHGE', '2019-10-17'))

{'results': [{'t': 1571317142756572908, 'y': 1571317142754000000, 'f': 1571317142756312611, 'q': 352901, 'i': '79371804170315', 'x': 4, 'r': 10, 's': 3, 'c': [12, 37], 'p': 21.95, 'z': 1}, {'t': 1571318811489103208, 'y': 1571318811488880640, 'q': 479901, 'i': '52983525033265', 'x': 11, 's': 200, 'c': [12], 'p': 22.21, 'z': 1}, {'t': 1571318880656102583, 'y': 1571318880655923712, 'q': 487301, 'i': '52983525033526', 'x': 11, 's': 300, 'c': [12], 'p': 22.22, 'z': 1}, {'t': 1571318880656636978, 'y': 1571318880656128000, 'q': 487401, 'i': '52983525028719', 'x': 8, 's': 300, 'c': [12], 'p': 22.21, 'z': 1}, {'t': 1571318880656658738, 'y': 1571318880656120000, 'q': 487501, 'i': '52983525027938', 'x': 19, 's': 300, 'c': [12], 'p': 22.21, 'z': 1}, {'t': 1571318880656806662, 'y': 1571318880656221759, 'q': 488001, 'i': '3472338191833328198', 'x': 12, 's': 300, 'c': [12], 'p': 22.21, 'z': 1}, {'t': 1571318880656827457, 'y': 1571318880656240781, 'q': 488101, 'i': '3472338191833328199', 'x': 12, 's':

In [27]:
print(call_historical('BKR', '2019-10-18'))

{'results': [{'t': 1571405428719774105, 'y': 1571405428719573760, 'q': 885001, 'i': '52983525029962', 'x': 10, 's': 228842, 'c': [17, 41], 'p': 22.19, 'z': 1}, {'t': 1571405428719796611, 'y': 1571405428719573760, 'q': 885101, 'i': '52983525029963', 'x': 10, 's': 228842, 'c': [16], 'p': 22.19, 'z': 1}, {'t': 1571405428722069307, 'y': 1571405428719000000, 'f': 1571405428721562973, 'q': 885301, 'i': '71675223299890', 'x': 4, 'r': 12, 's': 10, 'c': [10, 37, 41], 'p': 22.19, 'z': 1}, {'t': 1571405428722266051, 'y': 1571405428719000000, 'f': 1571405428721745817, 'q': 885401, 'i': '71675223299891', 'x': 4, 'r': 12, 's': 3230, 'c': [10, 41], 'p': 22.19, 'z': 1}, {'t': 1571405428722669433, 'y': 1571405428719000000, 'f': 1571405428722153450, 'q': 885501, 'i': '71675223299892', 'x': 4, 'r': 12, 's': 1170, 'c': [10, 41], 'p': 22.19, 'z': 1}, {'t': 1571405428723214281, 'y': 1571405428719000000, 'f': 1571405428722693973, 'q': 885601, 'i': '71675223299893', 'x': 4, 'r': 12, 's': 3081, 'c': [10, 41], 

On october 18, 2019 ('2019-10-18') the symbol 'BHGE' ceases to exist, and the symbol 'BKR' appears in its place. 

In [31]:
sp500_df['Symbol']

0       MMM
1       ABT
2      ABBV
3      ABMD
4       ACN
       ... 
500     YUM
501    ZBRA
502     ZBH
503    ZION
504     ZTS
Name: Symbol, Length: 505, dtype: object

In [32]:
missing

{'BKR': {'Missing on': '2019-09-30', 'Added on': '2017-07-07'},
 'CARR': {'Missing on': '2019-09-30', 'Added on': '2020-04-03'},
 'PEAK': {'Missing on': '2019-09-30', 'Added on': '2008-03-31'},
 'HWM': {'Missing on': '2019-09-30', 'Added on': '1964-03-31'},
 'J': {'Missing on': '2019-09-30', 'Added on': '2007-10-26'},
 'LUMN': {'Missing on': '2019-09-30', 'Added on': '1999-03-25'},
 'NLOK': {'Missing on': '2019-09-30', 'Added on': '2003-03-25'},
 'OTIS': {'Missing on': '2019-09-30', 'Added on': '2020-04-03'},
 'RTX': {'Missing on': '2019-09-30', 'Added on': '1950-01-01'},
 'TT': {'Missing on': '2019-09-30', 'Added on': '2010-11-17'},
 'TFC': {'Missing on': '2019-09-30', 'Added on': '1997-12-04'},
 'VIAC': {'Missing on': '2019-09-30', 'Added on': '1950-01-01'}}

In [73]:
from datetime import datetime
# UTX acquires RTN and becomes four companies, UTX, RTX, CARR, and OTIS
# RTX, CARR, and OTIS are all listed
# UTX is not
# RTN was previously listed, now just changing to RTX
# where UTX was previously listed and now split into CARR and OTIS
# The determination here is we will have to drop CARR and OTIS, ignore UTX, but keep RTX -> RTN
drop_stocks = ['CARR','OTIS']

namechanges = {
    'BKR': {'date': '2019-10-18',
            'as_of_name': 'BKR',
            'beforename': 'BHGE'
           },
    'PEAK': {'date': '2019-11-05',
            'as_of_name': 'PEAK',
            'beforename': 'HPC'
           },
    'HWM': {'date': '2019-10-18',
            'as_of_name': 'HWM',
            'beforename': 'ARNC'
           },
    'J': {'date': '2019-12-10',
            'as_of_name': 'J',
            'beforename': 'JEC'
           },
    'LUMN': {'date': '2020-09-19',
            'as_of_name': 'LUMN',
            'beforename': 'CTL'
           },
    'NLOK': {'date': '2019-11-04',
            'as_of_name': 'NLOK',
            'beforename': 'SYMC'
           },
    'RTX': {'date': '2020-04-03',
            'as_of_name': 'RTX',
            'beforename': 'RTN'
           },
    'IR': {'date': '2020-03-03',
           'as_of_name':'IR',
           'beforename':'GDI'
        
           },
    'TT': {'date': '2019-10-18',
            'as_of_name': 'TT',
            'beforename': 'IR'
           },
    'TFC': {'date': '2019-12-06',
            'as_of_name': 'TFC',
            'beforename': 'BBT'
           },
    'VIAC':  {'date': '2019-12-05',
            'as_of_name': 'VIAC',
            'beforename': 'CBS'
           },
}

for sym in namechanges:
    namechanges[sym]['date'] = datetime.strptime(namechanges[sym]['date'], '%Y-%m-%d')
    print(namechanges[sym]['date'])
    
stocks_list = list(sp500_df['Symbol'])

for dropstock in drop_stocks:
    stocks_list.remove(dropstock)
    
stocks = {}

for stock in stocks_list:
    stocks[stock] = {'timestamp_tracker':{
                                         'start_timestamp':None,
                                         'end_timestamp':None
                                        }
                    }
    if stock in namechanges:
        stocks[stock]['namechange'] = namechanges[stock]
        
stocks

2019-10-18 00:00:00
2019-11-05 00:00:00
2019-10-18 00:00:00
2019-12-10 00:00:00
2020-09-19 00:00:00
2019-11-04 00:00:00
2020-04-03 00:00:00
2020-03-03 00:00:00
2019-10-18 00:00:00
2019-12-06 00:00:00
2019-12-05 00:00:00


{'MMM': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABT': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABBV': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABMD': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ACN': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ATVI': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ADBE': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AMD': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AAP': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AES': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AFL': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'A': {'timestamp_tracker': {'start_timestamp': None, 'end_timestamp': N

In [59]:
len(stocks)

503

In [88]:
import pickle 
with open('stocks_tracker.pickle', 'wb') as f:
    pickle.dump(stocks, f)


In [89]:
with open('stocks_tracker.pickle', 'rb') as f:
    load_test = pickle.load(f)
    
load_test

{'MMM': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABT': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABBV': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABMD': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ACN': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ATVI': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ADBE': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AMD': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AAP': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AES': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AFL': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'A': {'timestamp_tracker': {'start_timestamp': None, 'end_timestamp': N

In [50]:
#missing_2019_09_30 = missing
missing_2019_09_30

{'BKR': {'Missing on': '2019-09-30', 'Added on': '2017-07-07'},
 'CARR': {'Missing on': '2019-09-30', 'Added on': '2020-04-03'},
 'PEAK': {'Missing on': '2019-09-30', 'Added on': '2008-03-31'},
 'HWM': {'Missing on': '2019-09-30', 'Added on': '1964-03-31'},
 'J': {'Missing on': '2019-09-30', 'Added on': '2007-10-26'},
 'LUMN': {'Missing on': '2019-09-30', 'Added on': '1999-03-25'},
 'NLOK': {'Missing on': '2019-09-30', 'Added on': '2003-03-25'},
 'OTIS': {'Missing on': '2019-09-30', 'Added on': '2020-04-03'},
 'RTX': {'Missing on': '2019-09-30', 'Added on': '1950-01-01'},
 'TT': {'Missing on': '2019-09-30', 'Added on': '2010-11-17'},
 'TFC': {'Missing on': '2019-09-30', 'Added on': '1997-12-04'},
 'VIAC': {'Missing on': '2019-09-30', 'Added on': '1950-01-01'}}

In [53]:

stocks

In [36]:
json = call_historical('BKR', '2019-10-18')

In [38]:
json['results'][0]['t']

1571405428719774105

In [39]:
json['results'][-1]['t']

1571405468118011428

In [90]:
len(json['results'])

10

In [91]:
json

{'results': [{'t': 1571405428719774105,
   'y': 1571405428719573760,
   'q': 885001,
   'i': '52983525029962',
   'x': 10,
   's': 228842,
   'c': [17, 41],
   'p': 22.19,
   'z': 1},
  {'t': 1571405428719796611,
   'y': 1571405428719573760,
   'q': 885101,
   'i': '52983525029963',
   'x': 10,
   's': 228842,
   'c': [16],
   'p': 22.19,
   'z': 1},
  {'t': 1571405428722069307,
   'y': 1571405428719000000,
   'f': 1571405428721562973,
   'q': 885301,
   'i': '71675223299890',
   'x': 4,
   'r': 12,
   's': 10,
   'c': [10, 37, 41],
   'p': 22.19,
   'z': 1},
  {'t': 1571405428722266051,
   'y': 1571405428719000000,
   'f': 1571405428721745817,
   'q': 885401,
   'i': '71675223299891',
   'x': 4,
   'r': 12,
   's': 3230,
   'c': [10, 41],
   'p': 22.19,
   'z': 1},
  {'t': 1571405428722669433,
   'y': 1571405428719000000,
   'f': 1571405428722153450,
   'q': 885501,
   'i': '71675223299892',
   'x': 4,
   'r': 12,
   's': 1170,
   'c': [10, 41],
   'p': 22.19,
   'z': 1},
  {'t': 1571

Create a dictionary which houses all of the stocks in the S&P500 today. 

For each stock, the value should encode what the symbol was... on every given date? 



In [66]:
stocks['MMM']

{'timestamp_tracker': {'start_timestamp': None, 'end_timestamp': None}}

In [76]:
day

'2020-09-28'

In [117]:
stocks

{'MMM': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABT': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABBV': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ABMD': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ACN': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ATVI': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'ADBE': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AMD': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AAP': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AES': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'AFL': {'timestamp_tracker': {'start_timestamp': None,
   'end_timestamp': None}},
 'A': {'timestamp_tracker': {'start_timestamp': None, 'end_timestamp': N

In [109]:
#pd.DataFrame(json['results'])
cols = ['sip_timestamp','participant_timestamp','sequence_number','id','exchange','size','price','tape','symbol']
key_map = {key:json['map'][key]['name'] for key in json['map']}
df = pd.DataFrame(json['results'])
df = df.rename(key_map, axis=1)
df['symbol'] = stock
df = df[cols]
df

Unnamed: 0,sip_timestamp,participant_timestamp,sequence_number,id,exchange,size,price,tape,symbol
0,1571405428719774105,1571405428719573760,885001,52983525029962,10,228842,22.19,1,ZTS
1,1571405428719796611,1571405428719573760,885101,52983525029963,10,228842,22.19,1,ZTS
2,1571405428722069307,1571405428719000000,885301,71675223299890,4,10,22.19,1,ZTS
3,1571405428722266051,1571405428719000000,885401,71675223299891,4,3230,22.19,1,ZTS
4,1571405428722669433,1571405428719000000,885501,71675223299892,4,1170,22.19,1,ZTS
5,1571405428723214281,1571405428719000000,885601,71675223299893,4,3081,22.19,1,ZTS
6,1571405429755862156,1571405429000000000,897101,71675223300692,4,6,22.19,1,ZTS
7,1571405445059278442,1571405445059128064,1017901,52983525028422,3,2,22.5,1,ZTS
8,1571405449728761279,1571405449728623872,1042201,52983525028435,3,7,22.5,1,ZTS
9,1571405468118011428,1571405468117824000,1170501,52983525028692,3,38,22.11,1,ZTS


In [113]:
import pickle
with open('key_map.pickle', 'wb') as f:
    pickle.dump(key_map, f)

In [174]:
from google.cloud import bigquery

client = bigquery.Client()
project = client.project
dataset_ref = bigquery.DatasetReference(project, 'sp500historical')

#   't': {'name': 'sip_timestamp', 'type': 'int64'},
#   'e': {'name': 'correction', 'type': 'int'},
#   'x': {'name': 'exchange', 'type': 'int'},
#   'p': {'name': 'price', 'type': 'float64'},
#   'y': {'name': 'participant_timestamp', 'type': 'int64'},
#   'f': {'name': 'trf_timestamp', 'type': 'int64'},
#   'q': {'name': 'sequence_number', 'type': 'int'},

#   'i': {'name': 'id', 'type': 'string'},
#   'I': {'name': 'orig_id', 'type': 'string'},
#   'r': {'name': 'trf_id', 'type': 'int'},
#   's': {'name': 'size', 'type': 'int'},
#   'z': {'name': 'tape', 'type': 'int'}},

schema = [
    bigquery.SchemaField("sip_timestamp", "TIMESTAMP", mode="NULLABLE"),
    bigquery.SchemaField("participant_timestamp", "TIMESTAMP", mode="NULLABLE"),
    bigquery.SchemaField("sequence_number", "INT64", mode="NULLABLE"),
    bigquery.SchemaField("id", "INT64", mode="NULLABLE"),
    bigquery.SchemaField("exchange", "INT64", mode="NULLABLE"),
    bigquery.SchemaField("size", "INT64", mode="NULLABLE"),
    bigquery.SchemaField("price", "FLOAT64", mode="NULLABLE"),
    bigquery.SchemaField("tape", "INT64", mode="NULLABLE"),
    bigquery.SchemaField("symbol", "STRING", mode="NULLABLE"),
    ]

table_ref = dataset_ref.table("csv_historic")
table = bigquery.Table(table_ref, schema=schema)
table = client.create_table(table)  # API request

print("Created table {}".format(table.full_table_id))

Created table tpu-training-289520:sp500historical.csv_historic


In [132]:
dataset_ref

DatasetReference('tpu-training-289520', 'sp500historical')

In [129]:
table.full_table_id

'tpu-training-289520:sp500historical.csv_historic'

In [127]:
table.

'csv_historic'

In [165]:
def call_historical(ticker, date, time=None, limit=50000):
    url = f'https://api.polygon.io/v2/ticks/stocks/trades/{ticker}/{date}'
    params = {
            'apiKey': '0URbzTqnNwTsHIe4UPz8AjazYT9vYFNq',
            'limit':limit,
            'timestamp':time
            }
    print(params)

    res = requests.get(url, params)
    return res.json()

call_historical('MMM','2020-08-28',time=1569848473567922350, limit=1)

{'apiKey': '0URbzTqnNwTsHIe4UPz8AjazYT9vYFNq', 'limit': 1, 'timestamp': 1569848473567922350}


{'results': [{'t': 1598601805556116480,
   'y': 1598601805555770477,
   'q': 618901,
   'i': '62879129950000',
   'x': 12,
   's': 11,
   'c': [14, 12, 37, 41],
   'p': 164.08,
   'z': 1}],
 'success': True,
 'map': {'p': {'name': 'price', 'type': 'float64'},
  't': {'name': 'sip_timestamp', 'type': 'int64'},
  'f': {'name': 'trf_timestamp', 'type': 'int64'},
  'i': {'name': 'id', 'type': 'string'},
  'e': {'name': 'correction', 'type': 'int'},
  's': {'name': 'size', 'type': 'int'},
  'r': {'name': 'trf_id', 'type': 'int'},
  'z': {'name': 'tape', 'type': 'int'},
  'y': {'name': 'participant_timestamp', 'type': 'int64'},
  'q': {'name': 'sequence_number', 'type': 'int'},
  'c': {'name': 'conditions', 'type': '[]int'},
  'I': {'name': 'orig_id', 'type': 'string'},
  'x': {'name': 'exchange', 'type': 'int'}},
 'ticker': 'MMM',
 'results_count': 1,
 'db_latency': 22}

In [175]:
from google.cloud import bigquery

client = bigquery.Client()


def call_historical(ticker, date, time=None, limit=50000):
    url = f'https://api.polygon.io/v2/ticks/stocks/trades/{ticker}/{date}'
    params = {
            'apiKey': '0URbzTqnNwTsHIe4UPz8AjazYT9vYFNq',
            'limit':limit,
            'timestamp':time
            }


    res = requests.get(url, params)
    return res.json()

def load_from_csv(file_path, table_id):
    '''
    table_id = "your-project.your_dataset.your_table_name"
    '''
    print(table_id)
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV, skip_leading_rows=1, autodetect=True,
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
    
json = call_historical('MMM', days[0], time=None, limit=10)

df = pd.DataFrame(json['results'])
df = df.rename(key_map, axis=1)
df['sip_timestamp'] = pd.to_datetime(df['sip_timestamp'])
df['participant_timestamp'] = pd.to_datetime(df['participant_timestamp'])
df['symbol'] = stock
df = df[cols]
df.to_csv('historic_csv.csv', index=False)

load_from_csv('historic_csv.csv', 'sp500historical.csv_historic')

stocks['MMM']['timestamp_tracker']['end_timestamp'] = json['results'][-1]['t']
stocks['MMM']['timestamp_tracker']['start_timestamp'] = json['results'][0]['t']

with open('stocks_tracker.pickle', 'wb') as f:
    pickle.dump(stocks, f)

    

sp500historical.csv_historic
Loaded 10 rows and 9 columns to sp500historical.csv_historic


In [155]:
os.environ.get('POLYGON')

'0URbzTqnNwTsHIe4UPz8AjazYT9vYFNq'

In [172]:
df

Unnamed: 0,sip_timestamp,participant_timestamp,sequence_number,id,exchange,size,price,tape,symbol
0,2019-09-30 09:44:23.666110360,2019-09-30 09:44:23.665284738,57201,3472338191832606545,12,7,165.54,1,ZTS
1,2019-09-30 12:00:09.052732450,2019-09-30 12:00:09.048000000,234101,71675222707767,4,1,165.9,1,ZTS
2,2019-09-30 12:33:44.429397278,2019-09-30 12:33:44.428780000,374901,52983525028672,8,20,164.99,1,ZTS
3,2019-09-30 12:59:00.610281518,2019-09-30 12:59:00.606000000,405001,79371804105011,4,1,164.8,1,ZTS
4,2019-09-30 12:59:03.096315111,2019-09-30 12:59:03.094000000,408701,79371804161332,4,1,164.53,1,ZTS
5,2019-09-30 12:59:05.606335309,2019-09-30 12:59:05.603000000,413201,79371804165173,4,8,164.53,1,ZTS
6,2019-09-30 13:00:01.263384213,2019-09-30 13:00:01.259000000,416801,79371804166733,4,1,165.0,1,ZTS
7,2019-09-30 13:01:12.515786309,2019-09-30 13:01:08.607000000,424001,71675222836301,4,1,164.53,1,ZTS
8,2019-09-30 13:01:12.829321752,2019-09-30 13:01:08.983000000,424201,71675222836534,4,1,164.53,1,ZTS
9,2019-09-30 13:01:13.567922350,2019-09-30 13:01:09.841000000,424601,71675222837041,4,1,164.53,1,ZTS


In [170]:
df

Unnamed: 0,sip_timestamp,participant_timestamp,sequence_number,id,exchange,size,price,tape,symbol
0,2019-09-30 09:44:23.666110360,2019-09-30 09:44:23.665284738,57201,3472338191832606545,12,7,165.54,1,ZTS
1,2019-09-30 12:00:09.052732450,2019-09-30 12:00:09.048000000,234101,71675222707767,4,1,165.9,1,ZTS
2,2019-09-30 12:33:44.429397278,2019-09-30 12:33:44.428780000,374901,52983525028672,8,20,164.99,1,ZTS
3,2019-09-30 12:59:00.610281518,2019-09-30 12:59:00.606000000,405001,79371804105011,4,1,164.8,1,ZTS
4,2019-09-30 12:59:03.096315111,2019-09-30 12:59:03.094000000,408701,79371804161332,4,1,164.53,1,ZTS
5,2019-09-30 12:59:05.606335309,2019-09-30 12:59:05.603000000,413201,79371804165173,4,8,164.53,1,ZTS
6,2019-09-30 13:00:01.263384213,2019-09-30 13:00:01.259000000,416801,79371804166733,4,1,165.0,1,ZTS
7,2019-09-30 13:01:12.515786309,2019-09-30 13:01:08.607000000,424001,71675222836301,4,1,164.53,1,ZTS
8,2019-09-30 13:01:12.829321752,2019-09-30 13:01:08.983000000,424201,71675222836534,4,1,164.53,1,ZTS
9,2019-09-30 13:01:13.567922350,2019-09-30 13:01:09.841000000,424601,71675222837041,4,1,164.53,1,ZTS


In [169]:
stocks['MMM']

{'timestamp_tracker': {'start_timestamp': 1569836663666110360,
  'end_timestamp': 1569848473567922350}}

In [152]:
with open('days.pickle', 'wb') as f:
    pickle.dump(days, f)

In [None]:
from google.cloud import bigquery
from urllib.error import HTTPError
import pickle
import pandas_market_calendars as pmc
from datetime import datetime
from polygon import RESTClient
import os
from IPython.display import clear_output

client = bigquery.Client()


def call_historical(ticker, date, time=None, limit=50000):
    url = f'https://api.polygon.io/v2/ticks/stocks/trades/{ticker}/{date}'
    params = {
            'apiKey': os.environ.get('POLYGON'),
            'limit':limit,
            'timestamp':time
            }

    try:
        res = requests.get(url, params)
    except HTTPError:
        return call_historical(ticker, data, time, limit)
    
    return res.json()

def load_from_csv(file_path, table_id):
    '''
    table_id = "your-project.your_dataset.your_table_name"
    '''
    print(table_id)
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV, skip_leading_rows=1, autodetect=True,
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )

def format_and_save(json, file_path='historic_csv.csv'):
    df = pd.DataFrame(json['results'])
    df = df.rename(key_map, axis=1)
    df['sip_timestamp'] = pd.to_datetime(df['sip_timestamp'])
    df['participant_timestamp'] = pd.to_datetime(df['participant_timestamp'])
    df['symbol'] = stock
    df = df[cols]
    df.to_csv(file_path, index=False)

limit = 50000
cols = ['sip_timestamp','participant_timestamp','sequence_number','id','exchange','size','price','tape','symbol']

with open('stocks_tracker.pickle', 'rb') as f:
    stocks = pickle.load(f)

with open('key_map.pickle', 'rb') as f:
    key_map = pickle.load(f)
    
with open('days.pickle', 'rb') as f:
    days = pickle.load(f)
    
for i, day in enumerate(days):
    today = datetime.strptime(day, '%Y-%m-%d')
    for stock in stocks:
        ## complete the next block iteratively until all the data for that day is gathered
        complete = False
        
        while complete == False:
            if 'namechange' in stocks[stock]:
                if today < stocks[stock]['namechange']['date']:
                    stock_name = stocks[stock]['namechange']['beforename']
                elif today >= stocks[stock]['namechange']['date']:
                    stock_name = stocks[stock]['namechange']['as_of_name']
            # grab the starttime if an end_timestamp exists, else set the starttime as None
            if stocks[stock]['timestamp_tracker']['end_timestamp'] is None:
                starttime = None
            elif stocks[stock]['timestamp_tracker']['end_timestamp']:
                starttime = stocks[stock]['timestamp_tracker']['end_timestamp']

            # get the json for the stock on the day with the correct symbol and starttime 
            json = call_historical('MMM', days[0], time=None, limit=10)
            
            # format the json and save as csv
            format_and_save(json, file_path='historic_csv.csv')

            # insert the json in the bigquery table
            load_from_csv('historic_csv.csv', 'sp500historical.csv_historic')
            
            # set the end_timestamp as the last tick 
            stocks[stock]['timestamp_tracker']['end_timestamp'] = json.results[-1]['t']

            # if the start timestamp is None, create an entry for it 
            if stocks[stock]['timestamp_tracker']['start_timestamp'] is None:
                stocks[stock]['timestamp_tracker']['start_timestamp'] = json.results[0]['t']
            
            with open('stocks_tracker.pickle', 'wb') as f:
                pickle.dump(stocks, f)
                
            if len(json.results) < limit:
                complete = True
    
    with open('current_day.pickle', 'wb') as f:
        pickle.dump((i, day), f)