In [1]:
import os
import json
import pandas as pd
from bs4 import BeautifulSoup

PATH = 'data/edgar_results/'

In [3]:
# Build one DataFrame of SD filing metadata from every EDGAR result file
# (*.json) found in PATH. Files with no entries are skipped.
dfs = []
for file in os.listdir(PATH):
    if not file.endswith('.json'):
        continue
    # Use a context manager so each handle is closed as soon as it is parsed;
    # a bare open(...).read() leaves one open file per JSON document.
    with open(os.path.join(PATH, file)) as f:
        data = json.load(f)
    if len(data['entries']) > 0:
        filings = pd.DataFrame.from_dict(data['entries'])[['link', 'filing-date', 'size', 'accession-nunber']]
        filings['filing_date'] = pd.to_datetime(filings['filing-date'])
        filings['year'] = filings.filing_date.dt.year
        # Feed-level attributes are the same for every entry in the file.
        filings['cik'] = data['feed']['cik']
        filings['conformed_name'] = data['feed']['conformed-name']
        # The source key really is misspelled ('nunber'); normalise it here.
        filings = filings.rename(columns={'accession-nunber': 'accession_number'})  # sec typo
        # The raw string date is superseded by the parsed filing_date column.
        filings = filings.drop(['filing-date'], axis=1)
        dfs.append(filings)
df = pd.concat(dfs).reset_index(drop=True)

In [4]:
# Peek at the last two rows of the combined frame.
df.tail(2)

Unnamed: 0,link,size,accession_number,filing_date,year,cik,conformed_name
6728,https://www.sec.gov/Archives/edgar/data/135544...,64 KB,0001292814-15-001370,2015-05-29,2015,1355444,EMBRAER S.A.
6729,https://www.sec.gov/Archives/edgar/data/135544...,60 KB,0001292814-14-001394,2014-05-30,2014,1355444,EMBRAER S.A.


In [5]:
# Distinct filing years present in the combined frame.
df.year.unique()

array([2019, 2018, 2017, 2016, 2015, 2014])

In [6]:
# Write one CSV of SD filings per filing year and report the per-year counts.
# to_csv does not create missing parent directories, so make sure the output
# folder exists before the first write.
os.makedirs('summary_data', exist_ok=True)
for year in df.year.unique():
    by_year = df[df.year == year]
    print(f'Year {year} has {len(by_year)} SD filings that we were able to gather.')
    by_year.to_csv(f'summary_data/sd-filing-info-{year}.csv')

Year 2019 has 1078 SD filings that we were able to gather.
Year 2018 has 1121 SD filings that we were able to gather.
Year 2017 has 1156 SD filings that we were able to gather.
Year 2016 has 1160 SD filings that we were able to gather.
Year 2015 has 1117 SD filings that we were able to gather.
Year 2014 has 1098 SD filings that we were able to gather.


In [7]:
# Save every gathered filing (all years) to a single CSV.
df.to_csv('summary_data/sd-filing-info-all.csv')

In [8]:
# Note there are some duplicate accession numbers. Not too many though.
# NOTE(review): presumably the same filing is listed under more than one
# filer (CIK) -- confirm against the source files.
# duplicated() flags every occurrence after the first, so the first copy of
# each duplicated accession number is not shown here.
df[df.accession_number.duplicated()]

Unnamed: 0,link,size,accession_number,filing_date,year,cik,conformed_name
1450,https://www.sec.gov/Archives/edgar/data/104264...,654 KB,0001558370-19-005313,2019-05-24,2019,1042642,DISH DBS CORP
1451,https://www.sec.gov/Archives/edgar/data/104264...,745 KB,0001558370-18-005068,2018-05-31,2018,1042642,DISH DBS CORP
3419,https://www.sec.gov/Archives/edgar/data/153450...,28 KB,0001534504-15-000043,2015-06-01,2015,1534504,PBF Energy Inc.
3420,https://www.sec.gov/Archives/edgar/data/153450...,27 KB,0001534504-14-000044,2014-06-02,2014,1534504,PBF Energy Inc.
4625,https://www.sec.gov/Archives/edgar/data/140504...,95 KB,0001624826-18-000029,2018-05-31,2018,1405041,Momentive Performance Materials Inc.
4626,https://www.sec.gov/Archives/edgar/data/140504...,100 KB,0001624826-17-000030,2017-05-31,2017,1405041,Momentive Performance Materials Inc.
4627,https://www.sec.gov/Archives/edgar/data/140504...,70 KB,0001624826-16-000055,2016-05-27,2016,1405041,Momentive Performance Materials Inc.
5946,https://www.sec.gov/Archives/edgar/data/158564...,351 KB,0001282266-19-000036,2019-05-30,2019,1585644,"WINDSTREAM SERVICES, LLC"
5947,https://www.sec.gov/Archives/edgar/data/158564...,279 KB,0001282266-18-000030,2018-05-29,2018,1585644,"WINDSTREAM SERVICES, LLC"
5948,https://www.sec.gov/Archives/edgar/data/158564...,287 KB,0001282266-17-000034,2017-05-25,2017,1585644,"WINDSTREAM SERVICES, LLC"


----

Pull out individual document data

In [9]:
def get_document_soup_from_page(page):
    """
    Return the data rows of the 'Document Format Files' table of a filing page.

    page is the HTML markup of the page, e.g. requests.Response.content or
    the text of a saved copy of it.

    Returns the table's <tr> elements, excluding the first (header) row.

    Raises AssertionError if the page does not contain exactly one table with
    summary='Document Format Files', or if that table's header cells are not
    exactly Seq, Description, Document, Type, Size in that order.
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Get only one table. Search once and reuse the result instead of calling
    # find_all and then find with the same arguments.
    table_attrs = dict(summary='Document Format Files')
    tables = soup.find_all('table', attrs=table_attrs)
    # Raise explicitly rather than via `assert`, so the check still runs when
    # Python is started with -O.
    if len(tables) != 1:
        raise AssertionError('Wrong number of Tables')
    table = tables[0]
    # Check the header row (note order is important). Comparing the whole list
    # at once also rejects missing or extra header cells, which an index-based
    # loop would either skip silently or fail on with an IndexError.
    header_values = ['Seq', 'Description', 'Document', 'Type', 'Size']
    headers = [header.text for header in table.find_all('th')]
    if headers != header_values:
        raise AssertionError(f'Unexpected table headers: {headers}')
    # Return Table Rows (but not header row)
    return table.find_all('tr')[1:]

In [11]:
def get_docs_in_filing(row):
    """
    Count and name the documents listed on a filing's saved page.

    row is a filing record exposing accession_number; the page is read from
    data/edgar_pages/<accession_number>.html.

    Returns a tuple of (number of table rows, list with the text of the
    second cell of each row).
    """
    page_path = f'data/edgar_pages/{row.accession_number}.html'
    with open(page_path, 'r') as page_file:
        page_html = page_file.read()
    table_rows = get_document_soup_from_page(page_html)
    # Cell index 1 lines up with the 'Description' header that
    # get_document_soup_from_page checks for.
    descriptions = []
    for table_row in table_rows:
        descriptions.append(table_row.findChildren('td')[1].get_text())
    return len(table_rows), descriptions

# Expand the (count, names) tuple returned for each filing into two columns.
doc_info = df.apply(get_docs_in_filing, axis=1, result_type='expand')
df[['n_documents_in_filing', 'documents_in_filing_names']] = doc_info

In [12]:
# Preview the first two rows, now including the per-filing document columns.
df.head(2)

Unnamed: 0,link,size,accession_number,filing_date,year,cik,conformed_name,n_documents_in_filing,documents_in_filing_names
0,https://www.sec.gov/Archives/edgar/data/147473...,45 KB,0001437749-19-011154,2019-05-31,2019,1474735,GENERAC HOLDINGS INC.,5,"[FORM SD, CONFLICT MINERALS REPORT, , , Comple..."
1,https://www.sec.gov/Archives/edgar/data/147473...,39 KB,0001437749-18-010977,2018-05-31,2018,1474735,GENERAC HOLDINGS INC.,3,"[FORM SD, EXHIBIT 1.01, Complete submission te..."


In [13]:
# Overwrite the earlier exports so the CSVs carry the document-count and
# document-name columns as well.
df.to_csv('summary_data/sd-filing-info-all.csv')
for filing_year in df.year.unique():
    in_year = df.year == filing_year
    df[in_year].to_csv(f'summary_data/sd-filing-info-{filing_year}.csv')

In [14]:
# Number of rows per n documents in filing, i.e. how many filings list
# 2, 3, 4, ... rows in their 'Document Format Files' table.
df.n_documents_in_filing.value_counts()

3     4488
2     1342
4      691
5      108
6       44
7       20
8       13
9        6
12       4
16       3
14       2
11       2
22       1
18       1
10       1
30       1
49       1
17       1
15       1
Name: n_documents_in_filing, dtype: int64

In [15]:
# Keep only filings whose document table lists three or more rows; these are
# the ones exported below as "likely CMR" filings.
has_3_or_more_docs = df.n_documents_in_filing >= 3
only_3_or_more_docs = df[has_3_or_more_docs]
only_3_or_more_docs.head(2)

Unnamed: 0,link,size,accession_number,filing_date,year,cik,conformed_name,n_documents_in_filing,documents_in_filing_names
0,https://www.sec.gov/Archives/edgar/data/147473...,45 KB,0001437749-19-011154,2019-05-31,2019,1474735,GENERAC HOLDINGS INC.,5,"[FORM SD, CONFLICT MINERALS REPORT, , , Comple..."
1,https://www.sec.gov/Archives/edgar/data/147473...,39 KB,0001437749-18-010977,2018-05-31,2018,1474735,GENERAC HOLDINGS INC.,3,"[FORM SD, EXHIBIT 1.01, Complete submission te..."


In [16]:
# Save the likely-CMR subset as a whole and split by filing year, printing
# the number of filings in each year.
only_3_or_more_docs.to_csv('summary_data/likely-cmr-filing-info-all.csv')
likely_cmr_years = only_3_or_more_docs.year
for filing_year in likely_cmr_years.unique():
    yearly_subset = only_3_or_more_docs[likely_cmr_years == filing_year]
    print(f'Year {filing_year} has {len(yearly_subset)} likely CMR filings')
    yearly_subset.to_csv(f'summary_data/likely-cmr-filing-info-{filing_year}.csv')

Year 2019 has 864 likely CMR filings
Year 2018 has 896 likely CMR filings
Year 2017 has 927 likely CMR filings
Year 2016 has 945 likely CMR filings
Year 2015 has 899 likely CMR filings
Year 2014 has 857 likely CMR filings
