# This document shows how to query and navigate the reference files. When you train a model or develop a signal, these files will be joined with job records, and any reference rows that have no matching job records will be dropped.

# Import reference files

1. PIT Ticker
2. PIT Company Reference
3. Scrape Log

In [43]:
import pandas as pd

# Lookup parameters for the company examined throughout this notebook.
Ticker = "MSFT"                         # ticker symbol to search for
Company_name = "Microsoft Corporation"  # name to search for in the company reference file
Exchange = "NAS"                        # stock exchange name used to filter ticker rows
Country = "US"                          # stock exchange country used to filter ticker rows

In [44]:
# Load the three reference extracts into DataFrames:
#   company_ticker - ticker rows per company
#   PIT_Company    - company reference rows
#   ScrapeLog      - company scrape log
company_ticker = pd.read_csv(
    "company_ticker_2021-01-31.csv.gz"
)
PIT_Company = pd.read_csv(
    "raw_pit_company_reference_full_2020-12-31.csv.gz"
)
ScrapeLog = pd.read_csv(
    "raw_company_scrape_log_full_2020-12-31.csv.gz"
)

# Look up Company ID and other information based on company name

In [45]:
# Case-insensitive substring search on the company name.
# regex=False: treat Company_name as literal text, so punctuation in a name
#   (e.g. the "." in "Inc." or parentheses) is not interpreted as a regex pattern.
# na=False: rows with a missing company_name yield False in the mask and are
#   excluded, which replaces the previous `== True` comparison.
PIT_Company[PIT_Company['company_name'].str.contains(Company_name, case=False, regex=False, na=False)]

Unnamed: 0,company_id,start_date,end_date,company_name,company_url,lei,open_perm_id,naics_code
520,370,2006-07-17,2016-02-23,Microsoft Corporation,http://www.microsoft.com,,,
521,370,2016-02-24,,Microsoft Corporation,http://www.microsoft.com,INR2EJN1ERAN0W5ZP974,4295907168.0,511210.0


# Look up Company IDs based on Ticker

In [46]:
# Step 1: distinct company_id values of the rows whose ticker_symbol equals Ticker.
ids_with_ticker = company_ticker.loc[
    company_ticker['ticker_symbol'] == Ticker, 'company_id'
].unique()

# Step 2: select every ticker row belonging to those companies and collect
# their distinct company_id values.
rows_for_ids = company_ticker['company_id'].isin(ids_with_ticker)
Company_IDs = company_ticker.loc[rows_for_ids, 'company_id'].unique()
Company_IDs

array([  370,  1247,  1270,  3023,  7478,  9454, 16855, 19549, 19602,
       21515, 21688, 21951, 22218, 23877, 24841, 24931, 25074, 26380,
       26403, 28153, 29069, 29146, 29291, 31099, 31198, 31551, 32318,
       33485, 35904, 36492, 42909])

Filtering on Country, Exchange, and Primary Flag can be useful to eliminate duplicate stock ticker names.

In [47]:
# Step 1: rows that match the ticker symbol on the chosen country and exchange.
listing_match = (
    (company_ticker['ticker_symbol'] == Ticker)
    & (company_ticker['stock_exchange_country'] == Country)
    & (company_ticker['stock_exchange_name'] == Exchange)
)
listed_ids = company_ticker.loc[listing_match, 'company_id'].unique()

# Step 2: select every ticker row belonging to those companies and collect
# their distinct company_id values.
rows_for_ids = company_ticker['company_id'].isin(listed_ids)
Company_IDs = company_ticker.loc[rows_for_ids, 'company_id'].unique()
Company_IDs

array([  370,  1247,  1270,  3023,  7478,  9454, 16855, 19549, 19602,
       21515, 21688, 21951, 22218, 23877, 24841, 24931, 25074, 26380,
       26403, 28153, 29069, 29146, 29291, 31099, 31198, 31551, 32318,
       33485, 35904, 36492, 42909])

# Look for all scrape changes based on Company IDs

In [48]:
# Keep scrape-log rows for the selected companies where scrape_changed is True,
# and show the first 10.
in_selected_companies = ScrapeLog['company_id'].isin(Company_IDs)
scrape_did_change = ScrapeLog['scrape_changed'].eq(True)
ScrapeLog[in_selected_companies & scrape_did_change].head(10)

Unnamed: 0,company_id,date,scrape_run_complete,scrape_changed
251495,370,2008-11-12,False,True
251496,370,2008-11-13,False,True
251497,370,2008-11-17,False,True
251498,370,2008-12-09,False,True
251499,370,2009-01-16,False,True
251500,370,2009-02-18,False,True
251501,370,2009-03-05,False,True
251502,370,2009-04-02,False,True
251503,370,2009-06-10,False,True
251504,370,2009-07-14,False,True


# Look up company reference file with Company IDs

In [49]:
# Company reference rows whose company_id is one of Company_IDs (first 10 shown).
in_selected_companies = PIT_Company['company_id'].isin(Company_IDs)
PIT_Company.loc[in_selected_companies].head(10)

Unnamed: 0,company_id,start_date,end_date,company_name,company_url,lei,open_perm_id,naics_code
520,370,2006-07-17,2016-02-23,Microsoft Corporation,http://www.microsoft.com,,,
521,370,2016-02-24,,Microsoft Corporation,http://www.microsoft.com,INR2EJN1ERAN0W5ZP974,4295907168.0,511210.0
2100,1247,2007-03-23,2016-02-23,"Canesta, Inc.",http://www.canesta.com,,,
2101,1247,2016-02-24,,"Canesta, Inc.",http://www.canesta.com,,4296013117.0,
2141,1270,2007-03-24,2016-02-23,Ensemble Studios,http://www.ensemblestudios.com,,,
2142,1270,2016-02-24,,Ensemble Studios,http://www.ensemblestudios.com,,4298001272.0,
5255,3023,2007-05-01,2016-02-23,LinkedIn,http://www.linkedin.com,,,
5256,3023,2016-02-24,,LinkedIn,http://www.linkedin.com,,5001440114.0,561110.0
12282,7478,2007-09-17,2016-02-23,lynda.com,http://www.lynda.com,,,
12283,7478,2016-02-24,,lynda.com,http://www.lynda.com,,4297174126.0,


# Look up all history for companies that have ever had a specified ticker

In [50]:
# Ticker rows that carry the chosen symbol and have primary_flag set to True
# (first 10 shown).
has_ticker = company_ticker['ticker_symbol'].eq(Ticker)
is_primary = company_ticker['primary_flag'].eq(True)
company_ticker[has_ticker & is_primary].head(10)

Unnamed: 0,company_id,start_date,end_date,ticker_symbol,stock_exchange_country,stock_exchange_name,primary_flag
690,370,1986-03-13,,MSFT,US,NAS,True
3479,1247,2010-10-29,,MSFT,US,NAS,True
3527,1270,2001-05-07,,MSFT,US,NAS,True
8791,3023,2016-12-08,,MSFT,US,NAS,True
14781,7478,2016-12-08,,MSFT,US,NAS,True
20213,9454,2020-04-17,,MSFT,US,NAS,True
33955,16855,2008-04-25,,MSFT,US,NAS,True
39188,19549,2007-08-13,2009-10-13,MSFT,US,NAS,True
39372,19602,2007-08-13,,MSFT,US,NAS,True
42494,21515,2012-06-25,,MSFT,US,NAS,True


# Look up Ticker history based on Company ID

In [51]:
# Company ID whose ticker history is inspected next; 370 is the company_id
# shown for "Microsoft Corporation" in the company reference output earlier
# in this notebook.
company_id = 370

In [52]:
# All ticker rows recorded for the chosen company_id (first 10 shown).
rows_for_company = company_ticker['company_id'].eq(company_id)
company_ticker.loc[rows_for_company].head(10)

Unnamed: 0,company_id,start_date,end_date,ticker_symbol,stock_exchange_country,stock_exchange_name,primary_flag
679,370,1986-03-13,,0QYP,GB,LON,False
680,370,1986-03-13,,4338,HK,HKG,False
681,370,1986-03-13,,MICRSOFT,HU,BUD,False
682,370,1986-03-13,,MSF,NL,AMS,False
683,370,1986-03-13,,MSF,BE,BRU,False
684,370,1986-03-13,,MSF,DE,ETR,False
685,370,1986-03-13,,MSFT,CO,BOG,False
686,370,1986-03-13,,MSFT,RO,BSE,False
687,370,1986-03-13,,MSFT,PE,LIM,False
688,370,1986-03-13,,MSFT,MX,MEX,False
