In [62]:
from datetime import date, datetime
from urllib.parse import urlencode

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [64]:
def build_feed_url(cik, form_type='SD', start=0, count=40):
    """Build the EDGAR company-browse URL that returns a company's filings as an Atom feed.

    Parameters
    ----------
    cik : int or str
        Company identifier placed in the ``CIK`` query parameter.
    form_type : str, optional
        Value of the ``type`` query parameter (defaults to ``'SD'``).
    start : int, optional
        Value of the ``start`` query parameter (defaults to 0).
    count : int, optional
        Value of the ``count`` query parameter (defaults to 40).

    Returns
    -------
    str
        The full URL, query string included. No network request is made.
    """
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
    params = {
        'action': 'getcompany',
        'CIK': cik,
        'type': form_type,
        'owner': 'exclude',
        'start': start,
        'count': count,
        'output': 'atom',
    }
    # urlencode is enough to build the query string; there is no need to
    # construct and prepare a requests.Request only to read back its URL.
    return '%s?%s' % (base_url, urlencode(params))

In [182]:
# First list of companies; the 'Unnamed: 0' column read from the file is discarded.
cik_codes_1 = (
    pd.read_csv('clean_ciks_good.csv')
    .drop('Unnamed: 0', axis=1)
)
cik_codes_1.head()

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription
0,706688,AARON'S INC,2480,NYSE,AAN,7359,"Equipment Rental and Leasing, Not Elsewhere Cl..."
1,824142,AAON INC,938,Nasdaq Global Market,AAON,3585,Air-Conditioning and Warm Air Heating Equipmen...
2,881890,ABAXIS INC,11929,Nasdaq Global Market,ABAX,3829,"Measuring and Controlling Devices, Not Elsewhe..."
3,1551152,ABBVIE INC.,883230,NYSE,ABBV,2834,Pharmaceutical Preparations
4,1800,ABBOTT LABORATORIES,5054,NYSE,ABT,2834,Pharmaceutical Preparations


In [183]:
# Second list of companies; the 'Unnamed: 0' column read from the file is discarded.
cik_codes_2 = (
    pd.read_csv('clean_ciks_good_2.csv')
    .drop('Unnamed: 0', axis=1)
)
cik_codes_2.head()

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription
0,1433660,JOHN BEAN TECHNOLOGIES CORP,777176,NYSE,JBT,3550,"Special Industry Machinery, Except Metalworking"
1,72903,XCEL ENERGY INC,1763,NYSE,XEL,4931,Electric and Other Services Combined
2,789933,NACCO INDUSTRIES INC,1939,NYSE,NC,3630,Household Appliances
3,200406,JOHNSON & JOHNSON,10313,NYSE,JNJ,2834,Pharmaceutical Preparations
4,1442145,"VERISK ANALYTICS, INC.",785278,Nasdaq Global Market,VRSK,7374,Computer Processing and Data Preparation and P...


In [189]:
# Stack both company lists into one table, then add the feed URL for each
# company, one empty SD_url_<year> column per year, and today's date.
cik_codes = pd.concat([cik_codes_1, cik_codes_2], ignore_index=True).assign(
    feed_url=lambda frame: frame.cik.apply(build_feed_url),
    SD_url_2014='',
    SD_url_2015='',
    SD_url_2016='',
    # Note: should do it with the next step, but ok for now
    feed_retrieved_date=date.today(),
)
cik_codes.to_csv('all_cik_codes.csv')
cik_codes.head()

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription,feed_url,SD_url_2014,SD_url_2015,SD_url_2016,feed_retrieved_date
0,706688,AARON'S INC,2480,NYSE,AAN,7359,"Equipment Rental and Leasing, Not Elsewhere Cl...",https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,,2016-04-30
1,824142,AAON INC,938,Nasdaq Global Market,AAON,3585,Air-Conditioning and Warm Air Heating Equipmen...,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,,2016-04-30
2,881890,ABAXIS INC,11929,Nasdaq Global Market,ABAX,3829,"Measuring and Controlling Devices, Not Elsewhe...",https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,,2016-04-30
3,1551152,ABBVIE INC.,883230,NYSE,ABBV,2834,Pharmaceutical Preparations,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,,2016-04-30
4,1800,ABBOTT LABORATORIES,5054,NYSE,ABT,2834,Pharmaceutical Preparations,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,,2016-04-30


## Get the EDGAR Atom feed of Form SD (Specialized Disclosure) filings for each company

In [190]:
def get_feed_and_parse(row):
    """Fetch one company's feed and record a filing link per year on the row.

    Parameters
    ----------
    row : pandas.Series
        A row carrying ``feed_url`` and the ``SD_url_2014`` / ``SD_url_2015`` /
        ``SD_url_2016`` fields. It is modified in place.

    Returns
    -------
    pandas.Series
        The same row. For every feed entry whose ``filing-date`` falls in
        2014-2016, ``SD_url_<year>`` is set to the ``href`` of the entry's
        first link. If several entries share a year, the one that appears
        last in the feed wins.

    Notes
    -----
    Performs a network request through ``feedparser.parse``. Entries from
    other years, entries with more than one link and entries with no link are
    printed for manual inspection.
    """
    feed = feedparser.parse(row.feed_url)
    for entry in feed.entries:
        # Uses the datetime class: the module-level `date` import cannot parse
        # strings, and the result is named so that it does not shadow `date`.
        filing_year = datetime.strptime(entry['filing-date'], '%Y-%m-%d').year
        links = entry['links']
        if len(links) > 1:
            print('More than one link')
            print(entry)

        if filing_year not in [2014, 2015, 2016]:
            print('Other Year')
            print(entry)
        elif links:
            row.loc['SD_url_%s' % filing_year] = links[0]['href']
        else:
            # Nothing to record; report it instead of failing on links[0].
            print('No link')
            print(entry)
    return row

In [191]:
# EXPENSIVE - run carefully
# get_feed_and_parse is applied to every row of cik_codes, and each call
# downloads that row's feed_url. The line is kept commented out so that it is
# not re-run by accident; uncomment it to (re)build cik_codes_with_feeds.
#cik_codes_with_feeds = cik_codes.apply(get_feed_and_parse, axis=1)

In [210]:
# Save the per-company feed results, writing the index as a column named 'id'.
# cik_codes_with_feeds is only assigned by the apply() call that is commented
# out in the preceding cell, so that call must have been run in this kernel.
cik_codes_with_feeds.to_csv('cik_codes_with_feed_urls.csv', index_label='id')

## Get the EDGAR filing page for each year and scrape out all the disclosure documents

In [213]:
# Reload the saved feed results; the 'id' column written at save time is dropped again.
cik_codes_with_feeds = (
    pd.read_csv('cik_codes_with_feed_urls.csv')
    .drop('id', axis=1)
)
cik_codes_with_feeds.head()

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription,feed_url,SD_url_2014,SD_url_2015,SD_url_2016,feed_retrieved_date
0,706688,AARON'S INC,2480,NYSE,AAN,7359,"Equipment Rental and Leasing, Not Elsewhere Cl...",https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/706688/...,http://www.sec.gov/Archives/edgar/data/706688/...,,2016-04-30
1,824142,AAON INC,938,Nasdaq Global Market,AAON,3585,Air-Conditioning and Warm Air Heating Equipmen...,https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/824142/...,http://www.sec.gov/Archives/edgar/data/824142/...,,2016-04-30
2,881890,ABAXIS INC,11929,Nasdaq Global Market,ABAX,3829,"Measuring and Controlling Devices, Not Elsewhe...",https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/881890/...,http://www.sec.gov/Archives/edgar/data/881890/...,,2016-04-30
3,1551152,ABBVIE INC.,883230,NYSE,ABBV,2834,Pharmaceutical Preparations,https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/1551152...,http://www.sec.gov/Archives/edgar/data/1551152...,,2016-04-30
4,1800,ABBOTT LABORATORIES,5054,NYSE,ABT,2834,Pharmaceutical Preparations,https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/1800/00...,http://www.sec.gov/Archives/edgar/data/1800/00...,,2016-04-30


In [291]:
# Some test data: the first row of an independent copy of the full table
subset = cik_codes_with_feeds.copy()[0:1]
subset

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription,feed_url,SD_url_2014,SD_url_2015,SD_url_2016,feed_retrieved_date
0,706688,AARON'S INC,2480,NYSE,AAN,7359,"Equipment Rental and Leasing, Not Elsewhere Cl...",https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/706688/...,http://www.sec.gov/Archives/edgar/data/706688/...,,2016-04-30


In [302]:
# Data Processing methods

def get_soup(url, i=-1, timeout=30):
    """Download a page and parse it with BeautifulSoup's 'html.parser'.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    i : int, optional
        Label printed next to the HTTP status code, for progress tracking.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or ``None`` when the request fails or the response
        status is not 200.
    """
    # NOTE(review): SEC EDGAR reportedly expects automated clients to send a
    # descriptive User-Agent header - confirm and add one if required.
    try:
        # Without a timeout requests waits indefinitely on a stalled connection.
        page_response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Treat a failed request like a non-200 answer so a long scrape keeps going.
        print(i, error)
        return None
    print(i, page_response.status_code)
    if page_response.status_code != 200:
        return None
    return BeautifulSoup(page_response.content, 'html.parser')

def get_meta_dict_from_soup(soup):
    """Extract filing-level metadata from a parsed filing page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to read from.

    Returns
    -------
    dict
        Keys ``ret_company_name``, ``ret_cik``, ``ret_sec_accession_number``
        and ``ret_filing_date``. Any field that cannot be located is set to
        ``'XXX: could not parse 1'`` rather than raising, so problem pages can
        be found afterwards by searching the results for that marker.
    """
    unparsed = 'XXX: could not parse 1'

    # Text of the 'companyName' element inside 'companyInfo'; it holds both
    # the company name and the CIK. A missing element yields None, not a crash.
    try:
        company_name_cik = soup.find(class_='companyInfo').find(class_='companyName').text
    except AttributeError:
        company_name_cik = None

    if company_name_cik is None:
        company_name = unparsed
    else:
        company_name = company_name_cik.split(' (Filer)')[0]

    try:
        # The CIK sits between 'CIK:' and the next opening parenthesis.
        cik = company_name_cik.split('CIK:')[1].split('(')[0].strip()
    except (AttributeError, IndexError):
        cik = unparsed

    try:
        # The last descendant of the 'secNum' element is expected to be the
        # text node carrying the accession number.
        last_node = list(soup.find(id='secNum').descendants)[-1]
        sec_accession_number = last_node.split('\n')[0].lstrip()
    except Exception:
        sec_accession_number = unparsed

    try:
        accepted_head = None
        for info_head in soup.findAll(class_='infoHead'):
            if info_head.text == 'Accepted':
                # No break: the last matching header wins.
                accepted_head = info_head
        # The value is read from the second sibling after the 'Accepted' header.
        filing_date = accepted_head.next_sibling.next_sibling.text
    except Exception:
        filing_date = unparsed

    return {
        'ret_company_name': company_name,
        'ret_cik': cik,
        'ret_sec_accession_number': sec_accession_number,
        'ret_filing_date': filing_date
    }

# Host prefix prepended to the hrefs found in the document table.
base_url = "https://www.sec.gov"
def get_docs_from_soup(soup, meta_dict):
    """List the documents in a filing page's 'tableFile' table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed filing page.
    meta_dict : dict
        Filing-level fields copied into every returned record. It is not
        modified.

    Returns
    -------
    list of dict
        One record per table row that has ``td`` cells. Each record holds the
        entries of ``meta_dict`` plus ``ret_title`` (text of the second cell),
        ``ret_description`` (text of the link in the third cell) and
        ``ret_url`` (``base_url`` + that link's href). A missing link or href
        is flagged with ``'XXX: could not parse 2'``. A row with only one or
        two cells, or a page without the table at all, yields a record flagged
        with ``'XXX: could not parse 3'``.
    """
    unparsed_field = 'XXX: could not parse 2'
    unparsed_row = 'XXX: could not parse 3'

    def _record(title, url, description):
        # Fresh copy per record so the caller's meta_dict is left untouched.
        record = meta_dict.copy()
        record.update({
            'ret_title': title,
            'ret_url': url,
            'ret_description': description,
        })
        return record

    disclosure_components_table = soup.find(class_='tableFile')
    if disclosure_components_table is None:
        # Keep the filing visible in the results instead of crashing the scrape.
        return [_record(unparsed_row, unparsed_row, unparsed_row)]

    docs = []
    for row in disclosure_components_table.findAll('tr'):
        cols = row.findAll('td')
        if len(cols) == 0:
            # This is the header row
            continue
        if len(cols) <= 2:
            # Probably not the header row, but something else weird is happening
            docs.append(_record(unparsed_row, unparsed_row, unparsed_row))
            continue

        title = cols[1].text
        link = cols[2].find('a')
        if link is None:
            description = unparsed_field
            url = unparsed_field
        else:
            description = link.text
            href = link.attrs.get('href')
            url = unparsed_field if href is None else '%s%s' % (base_url, href)
        docs.append(_record(title, url, description))
    return docs


In [303]:
# Visit the 2014 and 2015 filing page of every company and gather one record
# per listed document. Records are accumulated in a plain list and turned
# into a DataFrame once at the end; concatenating inside the loop would
# re-copy the growing frame on every page.
doc_records = []
for i, row in cik_codes_with_feeds.iterrows():
    for col in ['SD_url_2014', 'SD_url_2015']:
        url = row.loc[col]
        if pd.isnull(url):
            # Nothing to do if there's no url
            continue
        soup = get_soup(url, i)
        if soup is None:
            # Nothing to do if there's no soup
            continue
        meta_dict = get_meta_dict_from_soup(soup)
        # Keep the requested values next to the scraped ones so that they can
        # be compared afterwards.
        meta_dict.update({
                'req_company_name': row.loc['companyname'],
                'req_cik': row.loc['cik'],
                'req_url': url
            })
        docs = get_docs_from_soup(soup, meta_dict)
        doc_records.extend(docs)
disclosure_docs = pd.DataFrame(doc_records)
disclosure_docs.head()

0 200
0 200
1 200
1 200
2 200
2 200
3 200
3 200
4 200
4 200
5 200
5 200
6 200
7 200
7 200
8 200
8 200
9 200
9 200
10 200
10 200
11 200
11 200
12 200
12 200
13 200
13 200
14 200
14 200
15 200
15 200
16 200
16 200
17 200
17 200
18 200
18 200
19 200
19 200
20 200
20 200
21 200
21 200
22 200
22 200
23 200
23 200
24 200
24 200
25 200
25 200
26 200
26 200
27 200
27 200
28 200
28 200
29 200
30 200
30 200
31 200
31 200
32 200
32 200
33 200
33 200
34 200
34 200
35 200
35 200
36 200
36 200
37 200
38 200
38 200
39 200
39 200
40 200
40 200
41 200
41 200
42 200
42 200
43 200
43 200
44 200
44 200
45 200
45 200
46 200
46 200
47 200
47 200
48 200
48 200
49 200
49 200
50 200
50 200
51 200
51 200
52 200
52 200
53 200
53 200
54 200
54 200
55 200
55 200
56 200
56 200
57 200
57 200
58 200
58 200
59 200
59 200
60 200
60 200
61 200
61 200
62 200
62 200
63 200
63 200
64 200
64 200
65 200
65 200
66 200
66 200
67 200
67 200
68 200
68 200
69 200
69 200
70 200
70 200
71 200
71 200
72 200
72 200
73 200
73 200
74 2

Unnamed: 0,req_cik,req_company_name,req_url,ret_cik,ret_company_name,ret_description,ret_filing_date,ret_sec_accession_number,ret_title,ret_url
0,AARON'S INC,706688,http://www.sec.gov/Archives/edgar/data/706688/...,706688,AARON'S INC,formsdconflictminerals2014.htm,2014-05-28 12:50:08,0000706688-14-000029,SD,https://www.sec.gov/Archives/edgar/data/706688...
1,AARON'S INC,706688,http://www.sec.gov/Archives/edgar/data/706688/...,706688,AARON'S INC,0000706688-14-000029.txt,2014-05-28 12:50:08,0000706688-14-000029,Complete submission text file,https://www.sec.gov/Archives/edgar/data/706688...
2,AARON'S INC,706688,http://www.sec.gov/Archives/edgar/data/706688/...,706688,AARON'S INC,sdconflictminerals2014.htm,2015-05-22 12:06:05,0000706688-15-000148,SD_CONFLICT_MINERALS_2014,https://www.sec.gov/Archives/edgar/data/706688...
3,AARON'S INC,706688,http://www.sec.gov/Archives/edgar/data/706688/...,706688,AARON'S INC,0000706688-15-000148.txt,2015-05-22 12:06:05,0000706688-15-000148,Complete submission text file,https://www.sec.gov/Archives/edgar/data/706688...
4,AAON INC,824142,http://www.sec.gov/Archives/edgar/data/824142/...,824142,AAON INC,a2013formsd.htm,2014-06-02 13:49:18,0000824142-14-000080,AAON2013FORMSD,https://www.sec.gov/Archives/edgar/data/824142...


In [310]:
# Persist the scraped document list (the DataFrame index is written as well).
disclosure_docs.to_csv('disclosure_docs_list.csv')

In [311]:
# Total number of document records collected by the scraping loop.
len(disclosure_docs)

5571