In [62]:
import pandas as pd
import feedparser
import requests
from bs4 import BeautifulSoup

In [64]:
def build_feed_url(cik):
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
    params = {
        'action': 'getcompany',
        'CIK': cik,
        'type': 'SD',
        'owner': 'exclude',
        'start': 0,
        'count': 40,
        'output': 'atom',
    }
    req = requests.Request('GET', base_url, params=params)
    prepped = req.prepare()
    return prepped.url

In [68]:
cik_codes = pd.read_csv('clean_ciks_good.csv')
cik_codes = cik_codes.drop('Unnamed: 0', axis=1)
cik_codes['feed_url'] = cik_codes.cik.apply(build_feed_url)
cik_codes['SD_url_2014'] = ''
cik_codes['SD_url_2015'] = ''
cik_codes['SD_url_2016'] = ''
cik_codes.head()

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription,feed_url,SD_url_2014,SD_url_2015,SD_url_2016
0,706688,AARON'S INC,2480,NYSE,AAN,7359,"Equipment Rental and Leasing, Not Elsewhere Cl...",https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,
1,824142,AAON INC,938,Nasdaq Global Market,AAON,3585,Air-Conditioning and Warm Air Heating Equipmen...,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,
2,881890,ABAXIS INC,11929,Nasdaq Global Market,ABAX,3829,"Measuring and Controlling Devices, Not Elsewhe...",https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,
3,1551152,ABBVIE INC.,883230,NYSE,ABBV,2834,Pharmaceutical Preparations,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,
4,1800,ABBOTT LABORATORIES,5054,NYSE,ABT,2834,Pharmaceutical Preparations,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,


In [86]:
subset = cik_codes.copy()
subset = subset[0:10]
subset.tail(3)

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription,feed_url,SD_url_2014,SD_url_2015,SD_url_2016
7,910638,3D SYSTEMS CORP,5580,NYSE,DDD,7372,Prepackaged Software,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,
8,66740,3M CO,5248,NYSE,MMM,3841,Surgical and Medical Instruments and Apparatus,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,
9,815094,ABIOMED INC,4093,Nasdaq Global Market,ABMD,3841,Surgical and Medical Instruments and Apparatus,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,,


## Get EDGAR RSS Feed of Special Disclosures by Company

In [87]:
def get_feed_and_parse(row):
    feed_url = row.feed_url
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        str_date = entry['filing-date']
        date = datetime.strptime(str_date, '%Y-%m-%d')
        year = date.year
        links = entry['links']
        if len(links) > 1:
            print('More than one link')
            print(entry)
        
        if year not in [2014, 2015, 2016]:
            print('Other Year')
            print(entry)
        else:
            column = 'SD_url_%s' % year
            row.loc[column] = links[0]['href']
    return row

In [None]:
# EXPENSIVE - run carefully
added = subset.apply(get_feed_and_parse, axis=1)

In [92]:
added[5:10]

Unnamed: 0,cik,companyname,entityid,primaryexchange,primarysymbol,siccode,sicdescription,feed_url,SD_url_2014,SD_url_2015,SD_url_2016
5,1750,AAR CORP,8804,NYSE,AIR,3720,Aircraft And Parts,https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/1750/00...,http://www.sec.gov/Archives/edgar/data/1750/00...,
6,1580808,"A10 NETWORKS, INC.",910463,NYSE,ATEN,3576,Computer Communications Equipment,https://www.sec.gov/cgi-bin/browse-edgar?owner...,,http://www.sec.gov/Archives/edgar/data/1580808...,
7,910638,3D SYSTEMS CORP,5580,NYSE,DDD,7372,Prepackaged Software,https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/910638/...,http://www.sec.gov/Archives/edgar/data/910638/...,
8,66740,3M CO,5248,NYSE,MMM,3841,Surgical and Medical Instruments and Apparatus,https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/66740/0...,http://www.sec.gov/Archives/edgar/data/66740/0...,
9,815094,ABIOMED INC,4093,Nasdaq Global Market,ABMD,3841,Surgical and Medical Instruments and Apparatus,https://www.sec.gov/cgi-bin/browse-edgar?owner...,http://www.sec.gov/Archives/edgar/data/815094/...,http://www.sec.gov/Archives/edgar/data/815094/...,


In [93]:
print(added.SD_url_2014[8])
print(added.SD_url_2015[8])

http://www.sec.gov/Archives/edgar/data/66740/000110465914043157/0001104659-14-043157-index.htm
http://www.sec.gov/Archives/edgar/data/66740/000110465915042396/0001104659-15-042396-index.htm


## Get EDGAR Page for a year and scrape out all the disclosure documents

In [95]:
disclosure_docs = pd.DataFrame()


In [104]:
url = added.SD_url_2014[9]
summary_page_response = requests.get(url)
assert summary_page_response.status_code == 200

In [105]:
soup = BeautifulSoup(summary_page_response.content, 'html.parser')

In [99]:
# Store the requested CIK
# Store the requested Company Name
# Store the returned CIK
# Store the returned Company Name
# Store the returned Accession Number
# Store the returned Filing Date
# Get the rows, store the data

In [179]:
# Requested
#new_row.loc['requested_cik'] = row.loc['cik']
#new_row.loc['companyname'] = row.loc['companyname']
# Returned
company_info = soup.find(class_='companyInfo')
company_name_cik = company_info.find(class_='companyName').text

try:
    company_name = company_name_cik.split(' (Filer)')[0]
except IndexError:
    company_name = 'XXX: could not parse'
try:
    cik = company_name_cik.split('CIK:')[1].split('(')[0].lstrip().rstrip()
except IndexError:
    cik = 'XXX: could not parse'
try:
    sec_accession_number = list(soup.find(id='secNum').descendants)[-1].split('\n')[0].lstrip()
except IndexError:
    sec_accession_number = 'XXX: could not parse'
try:
    required = None
    for infoHead in soup.findAll(class_='infoHead'):
        if infoHead.text == 'Accepted':
            required = infoHead
    filing_date = required.next_sibling.nextSibling.text
except:
    filing_date = ''
filing_date

'2014-06-02 13:43:40'

In [181]:
base_url = "https://www.sec.gov"
disclosure_components_table = soup.find(class_='tableFile')
for row in disclosure_components_table.findAll('tr'):
    cols = row.findAll('td')
    if len(cols) > 1:
        title = cols[1].text
        description = cols[2].find('a').text
        url_part = cols[2].find('a').attrs['href']
        url = '%s/%s' % (base_url, url_part)
        print(title)
        print(description)
        print(url)

FORM SD
d738342dsd.htm
https://www.sec.gov//Archives/edgar/data/815094/000119312514221767/d738342dsd.htm
EX-1.02
d738342dex102.htm
https://www.sec.gov//Archives/edgar/data/815094/000119312514221767/d738342dex102.htm
Complete submission text file
0001193125-14-221767.txt
https://www.sec.gov//Archives/edgar/data/815094/000119312514221767/0001193125-14-221767.txt


## title, description, url |  filing date, type, download_date

'2014-06-02 13:43:40'

In [None]:
    disclosure_components_table = soup.find(class_='tableFile')
    for row in disclosure_components_table.findAll('tr'):
        cols = row.findAll('td')
        if len(cols) > 1:
            print(cols[1].text)
            print(cols[2].find('a'))

## Sandpit

In [18]:
requests.Request(url=base_url, params=params).url

'https://www.sec.gov/cgi-bin/browse-edgar'

In [14]:
response = requests.get(url=base_url, params=params)
assert response.status_code == 200

In [15]:
response.url

'https://www.sec.gov/cgi-bin/browse-edgar?owner=exclude&count=40&output=atom&action=getcompany&CIK=0000320193&start=0&type=SD'

In [8]:
print(fifteen['updated'])
print(fifteen['links'][0]['href'])

2015-02-12T06:07:38-05:00
http://www.sec.gov/Archives/edgar/data/320193/000119312515045292/0001193125-15-045292-index.htm


In [46]:
for entry in feed.entries:
    assert len(entry['links']) == 1
    summary_page = requests.get(entry['links'][0]['href'])
    assert summary_page.status_code == 200
    soup = BeautifulSoup(summary_page.content, 'html.parser')
    disclosure_components_table = soup.find(class_='tableFile')
    for row in disclosure_components_table.findAll('tr'):
        cols = row.findAll('td')
        if len(cols) > 1:
            print(cols[1].text)
            print(cols[2].find('a'))

FORM SD
<a href="/Archives/edgar/data/320193/000119312516523320/d168894dsd.htm">d168894dsd.htm</a>
EX-1.01
<a href="/Archives/edgar/data/320193/000119312516523320/d168894dex101.htm">d168894dex101.htm</a>
GRAPHIC
<a href="/Archives/edgar/data/320193/000119312516523320/g168894ex1_01pg005.jpg">g168894ex1_01pg005.jpg</a>
GRAPHIC
<a href="/Archives/edgar/data/320193/000119312516523320/g168894ex991_pg001.jpg">g168894ex991_pg001.jpg</a>
GRAPHIC
<a href="/Archives/edgar/data/320193/000119312516523320/g168894tx_pg001.jpg">g168894tx_pg001.jpg</a>
Complete submission text file
<a href="/Archives/edgar/data/320193/000119312516523320/0001193125-16-523320.txt">0001193125-16-523320.txt</a>
SD
<a href="/Archives/edgar/data/320193/000119312515045292/d864750dsd.htm">d864750dsd.htm</a>
EX-1.01
<a href="/Archives/edgar/data/320193/000119312515045292/d864750dex101.htm">d864750dex101.htm</a>
GRAPHIC
<a href="/Archives/edgar/data/320193/000119312515045292/g864750ex991_pg001.jpg">g864750ex991_pg001.jpg</a>
GR

In [50]:
from os import path
from os import makedirs

In [51]:
path.exists('_0000320193')

False

In [52]:
makedirs('_0000320193')