In [34]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
import sys
from sec_edgar_downloader import Downloader
from tqdm import tqdm

In [35]:
import wrds

In [36]:
# ---------------------------------------------------------------
# Open the WRDS session; every raw_sql query below runs through it
# ---------------------------------------------------------------
conn = wrds.Connection()

Enter your WRDS username [juntao]:jz3587
Enter your password:········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
Loading library list...
Done


In [37]:
### Get S&P500 Index Membership from CRSP
### I opt for the monthly frequency of the data,
### but one can choose to work with crsp.dsp500list
### if more precise date range is needed.
###
### Each membership row (a) is joined to the crsp.msf rows (b) of the same
### PERMNO whose date lies inside that membership's [start, ending] window,
### restricted to observations dated 2020-01-01 through 2021-12-31.
### The date bounds are written as ISO 8601 literals (YYYY-MM-DD) so the
### server parses them the same way whatever the session DateStyle is.

sp500 = conn.raw_sql("""
                        select a.*, b.date, b.ret
                        from crsp.msp500list as a
                        join crsp.msf as b
                        on a.permno = b.permno
                        and b.date >= a.start and b.date <= a.ending
                        where b.date >= '2020-01-01'
                        and b.date <= '2021-12-31'
                        order by date;
                        """, date_cols=['start', 'ending', 'date'])


get quarterly data

In [38]:
### Add Other Company Identifiers from CRSP.MSENAMES
### - You don't need this step if only PERMNO is required
### - This step aims to add TICKER, SHRCD, EXCHCD, etc.
### The query, merge and filter are run exactly once; repeating them only
### re-downloads the same table and rebuilds the same frame.

mse = conn.raw_sql("""
                        select comnam, ncusip, namedt, nameendt,
                        permno, shrcd, exchcd, hsiccd, ticker
                        from crsp.msenames
                        """, date_cols=['namedt', 'nameendt'])

# A missing nameendt is replaced by today's date so the range test below
# can still be evaluated for that name record.
mse['nameendt'] = mse['nameendt'].fillna(pd.to_datetime('today'))

# Attach every name record of a PERMNO to each of its S&P 500 rows ...
sp500_full = pd.merge(sp500, mse, how='left', on='permno')

# ... then keep only the name record whose [namedt, nameendt] range
# contains the observation date.
sp500_full = sp500_full.loc[(sp500_full['date'] >= sp500_full['namedt'])
                            & (sp500_full['date'] <= sp500_full['nameendt'])]


### Add Compustat Identifiers
### - GVKEY and IID are needed to work with Compustat fundamental data
### - The PERMNO -> GVKEY mapping comes from crsp.ccmxpf_linktable

ccm = conn.raw_sql("""
                  select gvkey, liid as iid, lpermno as permno,
                  linktype, linkprim, linkdt, linkenddt
                  from crsp.ccmxpf_linktable
                  where substr(linktype,1,1)='L'
                  and (linkprim ='C' or linkprim='P')
                  """, date_cols=['linkdt', 'linkenddt'])

# A missing linkenddt is replaced by today's date
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))

# One pass: match on PERMNO, keep the rows whose observation date falls
# inside the link's [linkdt, linkenddt] range, and project the final
# column layout (the name/link bookkeeping columns are left out).
sp500ccm = sp500_full.merge(ccm, how='left', on=['permno']).loc[
    lambda df: (df['date'] >= df['linkdt']) & (df['date'] <= df['linkenddt']),
    [
        'date', 'permno', 'comnam', 'ncusip',
        'shrcd', 'exchcd', 'hsiccd', 'ticker',
        'gvkey', 'iid', 'start', 'ending', 'ret',
    ],
]


### Add CIKs (used to link with the SEC index files)
### comp.names supplies cik plus the sic, naics, gind and gsubind columns.

names = conn.raw_sql(""" select gvkey, cik, sic, naics, gind, gsubind from comp.names """)

# Left join on GVKEY: constituent rows are kept even when comp.names
# has no matching entry.
sp500cik = sp500ccm.merge(names, how='left', on='gvkey')
sp500cik.head()

Unnamed: 0,date,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,ret,cik,sic,naics,gind,gsubind
0,2020-01-31,88837.0,GARMIN LTD,H2906T10,12.0,3.0,6722.0,GRMN,141459,1,2012-12-12,2022-03-31,-0.006253,1121788,3812,334511,252010,25201010
1,2020-01-31,11403.0,CADENCE DESIGN SYSTEMS INC,12738710,11.0,3.0,7372.0,CDNS,13421,1,2017-09-18,2022-03-31,0.039648,813672,7372,511210,451030,45103010
2,2020-01-31,88860.0,ALIGN TECHNOLOGY INC,01625510,11.0,3.0,3843.0,ALGN,141384,1,2017-06-19,2022-03-31,-0.078627,1097149,3843,339114,351010,35101020
3,2020-01-31,25953.0,DOVER CORP,26000310,11.0,1.0,3585.0,DOV,4058,1,1985-10-24,2022-03-31,-0.012233,29905,3585,333415,201060,20106020
4,2020-01-31,75100.0,TIFFANY & CO NEW,88654710,11.0,1.0,5944.0,TIF,13646,1,2000-06-21,2021-01-06,0.002769,98246,5944,448310,255040,25504040


In [39]:
# Display the merged constituents table with CIKs attached
# (the notebook rendering elides the middle rows).
sp500cik

Unnamed: 0,date,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,ret,cik,sic,naics,gind,gsubind
0,2020-01-31,88837.0,GARMIN LTD,H2906T10,12.0,3.0,6722.0,GRMN,141459,01,2012-12-12,2022-03-31,-0.006253,0001121788,3812,334511,252010,25201010
1,2020-01-31,11403.0,CADENCE DESIGN SYSTEMS INC,12738710,11.0,3.0,7372.0,CDNS,013421,01,2017-09-18,2022-03-31,0.039648,0000813672,7372,511210,451030,45103010
2,2020-01-31,88860.0,ALIGN TECHNOLOGY INC,01625510,11.0,3.0,3843.0,ALGN,141384,01,2017-06-19,2022-03-31,-0.078627,0001097149,3843,339114,351010,35101020
3,2020-01-31,25953.0,DOVER CORP,26000310,11.0,1.0,3585.0,DOV,004058,01,1985-10-24,2022-03-31,-0.012233,0000029905,3585,333415,201060,20106020
4,2020-01-31,75100.0,TIFFANY & CO NEW,88654710,11.0,1.0,5944.0,TIF,013646,01,2000-06-21,2021-01-06,0.002769,0000098246,5944,448310,255040,25504040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12007,2021-12-31,82307.0,DAVITA INC,23918K10,11.0,1.0,8092.0,DVA,061483,01,2008-07-31,2022-03-31,0.203810,0000927066,8090,621492,351020,35102015
12008,2021-12-31,64186.0,CIGNA CORP NEW,12552310,11.0,1.0,6324.0,CI,002547,01,1982-04-08,2022-03-31,0.201824,0001739940,6324,524114,351020,35102015
12009,2021-12-31,61621.0,PAYCHEX INC,70432610,11.0,3.0,8700.0,PAYX,008402,01,1998-10-01,2022-03-31,0.145134,0000723531,8721,541214,451020,45102020
12010,2021-12-31,11955.0,WASTE MANAGEMENT INC DEL,94106L10,11.0,1.0,4953.0,WM,014477,01,1998-07-17,2022-03-31,0.042354,0000823768,4953,562111,202010,20201050


In [40]:
# Map ticker -> CIK (as a string). A ticker that appears on several rows
# ends up with the CIK of its last row.
tickers = {
    ticker: str(cik)
    for ticker, cik in zip(sp500cik['ticker'], sp500cik['cik'])
}

In [41]:
# Ticker symbols in sorted order (iterating a dict yields its keys)
t = sorted(tickers)

In [42]:
# Find where 'UAA' sits in the sorted ticker list; the download loop
# further down starts from this ticker.
t.index('UAA')

485

In [43]:
# Download the 10-K and 10-Q filings for each ticker from RESUME_FROM
# onwards in the sorted list `t`, using the after/before bounds below.
# The start offset is looked up by symbol instead of being a hard-coded
# position, so it keeps pointing at the intended ticker when the ticker
# universe changes; a symbol that is not in `t` raises ValueError rather
# than silently starting somewhere else.
RESUME_FROM = 'UAA'  # set to None to download every ticker in `t`
start_at = t.index(RESUME_FROM) if RESUME_FROM is not None else 0

dl = Downloader("/Users/juntao/project_5/documents")
for ticker in tqdm(t[start_at:], desc='Downloading filings'):
    dl.get('10-K', ticker, after="2020-01-01", before="2021-12-31")
    dl.get('10-Q', ticker, after="2020-01-01", before="2021-12-31")

In [None]:
annual = {}
quarterly = {}

for ticker, cik in tickers.items():
    path = '/Users/juntao/project_5/documents/sec-edgar-filings/' + ticker +'/10-K/'+cik + 0001673379-21-000007/full-submission.txt'