In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import io

In [37]:
base_url = r"https://www.sec.gov/"
fullIndex_url = r"https://www.sec.gov/Archives/edgar/full-index/"


normal_url = r"https://www.sec.gov/Archives/edgar/data/1265107/0001265107-19-000004.txt"
json_url = normal_url.replace('-','').replace('.txt','/index.json')

documents_url = r"https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/index.json"

content = requests.get(documents_url).json()

for file in content['directory']['item']:
    
    # Grab the filing summary and create a new url leading to the file so we can download it.
    if file['name'] == 'FilingSummary.xml':

        xml_summary = base_url + content['directory']['name'] + "/" + file['name']
        
        print('-' * 100)
        print('File Name: ' + file['name'])
        print('File Path: ' + xml_summary)
        
        
# define a new base url that represents the filing folder. This will come in handy when we need to download the reports.
base_url = xml_summary.replace('FilingSummary.xml', '')

# request and parse the content
content = requests.get(xml_summary).content
soup = BeautifulSoup(content, 'lxml')

# find the 'myreports' tag because this contains all the individual reports submitted.
reports = soup.find('myreports')

# I want a list to store all the individual components of the report, so create the master list.
master_reports = []

# loop through each report in the 'myreports' tag but avoid the last one as this will cause an error.
for report in reports.find_all('report')[:-1]:

    # let's create a dictionary to store all the different parts we need.
    report_dict = {}
    report_dict['name_short'] = report.shortname.text
    report_dict['name_long'] = report.longname.text
    report_dict['position'] = report.position.text
    report_dict['category'] = report.menucategory.text
    report_dict['url'] = base_url + report.htmlfilename.text
    report_dict['xml'] = base_url + report['instance']

    # append the dictionary to the master list.
    master_reports.append(report_dict)
    
# create the list to hold the statement urls
statements_url = []
xml_url = []

# define the statements we want to look for.
item1 = r"Consolidated Balance Sheets"
item2 = r"Consolidated Statements of Operations and Comprehensive Income (Loss)"
item3 = r"Consolidated Statements of Cash Flows"
item4 = r"Consolidated Statements of Stockholder's (Deficit) Equity"

# store them in a list.
report_list = [item1, item2, item3, item4]

for report_dict in master_reports:
    
    # if the short name can be found in the report list.
    if report_dict['name_short'] in report_list:
        
#         # print some info and store it in the statements url.
#         print('-'*100)
#         print(report_dict['name_short'])
#         print(report_dict['url'])
#         print(report_dict['xml'])
        
        statements_url.append(report_dict['url'])
        xml_url.append(report_dict['xml'])
        
for statement in [statements_url[0]]: # Should do all statements, but focus on balance sheet for now

    # request the statement file content
    content = requests.get(statement).content
    report_soup = BeautifulSoup(content, 'html')

    data = {}

    for row in report_soup.table.find_all('tr'):
        
        if row.th:
            cols = row.find_all('th')
            data['name'] = cols[0].text.strip()
            data['years'] = [element.text.strip() for element in cols[1:]]
            continue
            

        if row.find_all('strong'): # Means a section head
            cols = row.find_all('td')
            key = cols[0].text.strip()
            data[key] = {}

        if row.find_all('strong').__len__() == 0:
            cols = [element.text.strip() for element in row.find_all('td')]
            subcategory = cols[0]
            data[key][subcategory] = cols[1:]
            
with open('./data/bs/balance.txt', 'w') as file:
    file.write(json.dumps(data))

In [None]:
reference_df = pd.DataFrame()
rawdata = {}

fullIndex_url = r"https://www.sec.gov/Archives/edgar/full-index/"

fullIndex = json.loads(requests.get(fullIndex_url + r"index.json").content)

for year in fullIndex['directory']['item']:
    
    year_url = fullIndex_url + year['href']
    yearIndex = json.loads(requests.get(year_url + r"index.json").content)
    print("Parsing data for " + year['name'] )
    
    for quarter in yearIndex['directory']['item']:
        
#         print(year['name']+quarter['name'])
        
        quarter_url = year_url + quarter['href']
        
        
        
        
        try:
            # Request by URL, decode, and read into pandas DF
            raw_df = pd.read_csv(io.StringIO(requests.get(quarter_url + r"master.idx").content.decode('utf-8')),
                    delimiter = '|',
                    skiprows = [0,1,2,3,4,5,6,7,8,10], # Standard format for .idx file
                    index_col=0
                   )
            
            # Insert raw data into dictionary for later use
            rawdata[ str(year['name'] + quarter['name']) ] = raw_df
            
            # Pull only 10-Q & 10-K forms
            df = raw_df[raw_df['Form Type'].str.match(r"10-[QK]$")]["Form Type"].str.extract(r"([QK])")
            df = df.groupby("CIK").aggregate({0: ', '.join})
            df.columns = [ year['name']+ quarter['name']]

            reference_df = df.join(reference_df)

        except:
            print(year['name']+quarter['name'] + " did not contain data.")
            df = pd.DataFrame(index=['CIK'],columns=[ year['name']+ quarter['name']])
        
        try:
            reference_df = df.join(reference_df, how='left')
        except:
            print('Could not join ' + year['name'] + quarter['name'])
#         print(reference_df)


        

        
        
    
    
# year = json.loads(requests.get(fullIndex_url + fullIndex['directory']['item'][0]['href'] + r"index.json").content)
# quarter = json.loads(requests.get(fullIndex + year['directory']['item'][0]['href'] +  + r"index.json").content)

In [None]:
reference_df

In [19]:
pd.read_csv(io.StringIO(requests.get(r"https://www.sec.gov/Archives/edgar/full-index/2011/QTR4/" + r"master.idx").content.decode('utf-8')),
                    delimiter = '|',
                    skiprows = [0,1,2,3,4,5,6,7,8,10],
                    index_col=0
                   )

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 13013584: invalid continuation byte

In [36]:
df

Unnamed: 0_level_0,1993QTR3
CIK,Unnamed: 1_level_1
60512,Q
66740,Q
