In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import os, re, io

In [2]:
# Load the cached reference table if it has already been written to disk.
# The first CSV column becomes the index and every value is read as text.
# When the file is missing nothing is loaded and `reference_df` stays undefined.
reference_csv = './outputs/reference_df.csv'
if os.path.isfile(reference_csv):
    reference_df = pd.read_csv(reference_csv, dtype=str, index_col=0)


In [213]:
# Build a quarter x company matrix of 0/1 flags marking which quarterly columns
# hold a value for each row of `reference_df`.

# Columns whose label contains "200" (labels look like "2000QTR1", per the
# pattern rewritten below).
quarter_cols = reference_df.filter(regex="200")

# Rows with more than 24 non-missing quarterly values.
temp = quarter_cols.count(axis=1) > 24
index_24 = temp[temp].index
data = quarter_cols[temp]

# Put quarters on the rows and convert the labels to quarter-start timestamps.
data = data.transpose()
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would leave "2000QTR1" unchanged and make the
# PeriodIndex construction below fail.
quarter_labels = data.index.str.replace(r'([0-9]{4})QTR(\d)', r'\1-Q\2', regex=True)
data = data.set_index(pd.PeriodIndex(quarter_labels.values, freq='Q').to_timestamp())

# 1 where a value is present, 0 where it is missing.
data = data.notna() * 1


In [215]:
# Display the quarter-by-CIK presence matrix built in the previous cell
# (1 = a value is present for that quarter, 0 = missing).
data

CIK,20,1750,1800,1923,2034,2098,2135,2178,2186,2488,...,1235091,1241199,1243800,1254595,1257640,1260968,1261734,1262279,1263400,1263401
2000-01-01,0,1,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000-04-01,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2000-07-01,1,0,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2000-10-01,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2001-01-01,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2001-04-01,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2001-07-01,1,0,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2001-10-01,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2002-01-01,0,1,0,1,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2002-04-01,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [218]:
# Sum each column over a trailing window of 10 rows (quarters), then keep only
# the cells where that sum exceeds 9; every other cell becomes NaN.
# With 0/1 inputs this means all 10 rows in the window must be 1.
trailing_sum = data.rolling(window=10, min_periods=1).sum()
data.where(trailing_sum > 9)

CIK,20,1750,1800,1923,2034,2098,2135,2178,2186,2488,...,1235091,1241199,1243800,1254595,1257640,1260968,1261734,1262279,1263400,1263401
2000-01-01,,,,,,,,,,,...,,,,,,,,,,
2000-04-01,,,,,,,,,,,...,,,,,,,,,,
2000-07-01,,,,,,,,,,,...,,,,,,,,,,
2000-10-01,,,,,,,,,,,...,,,,,,,,,,
2001-01-01,,,,,,,,,,,...,,,,,,,,,,
2001-04-01,,,,,,,,,,,...,,,,,,,,,,
2001-07-01,,,,,,,,,,,...,,,,,,,,,,
2001-10-01,,,,,,,,,,,...,,,,,,,,,,
2002-01-01,,,,,,,,,,,...,,,,,,,,,,
2002-04-01,,,,,1.0,1.0,,,,,...,,,,,,,,,,


In [None]:
# ---------------------------------------------------------------------------
# Download one EDGAR filing, locate its FilingSummary.xml, pick out the main
# financial statements and save the first matching statement as JSON.
#
# NOTE: `data` is also the name of the filing-presence DataFrame built in an
# earlier cell; running this cell replaces it with a plain dict.
# ---------------------------------------------------------------------------
base_url = r"https://www.sec.gov/"
fullIndex_url = r"https://www.sec.gov/Archives/edgar/full-index/"

# SEC's fair-access guidance asks automated clients to declare a User-Agent
# identifying the requester. Set SEC_USER_AGENT to "<name> <contact email>";
# the fallback below is only a placeholder.
sec_headers = {
    'User-Agent': os.environ.get('SEC_USER_AGENT', 'research-notebook admin@example.com'),
}

# Section name used for line items that appear before any section header row.
unsectioned_key = 'Unsectioned'


def _fetch(url):
    """GET `url` with the SEC headers and raise on an HTTP error status."""
    response = requests.get(url, headers=sec_headers, timeout=30)
    response.raise_for_status()
    return response


def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag is absent."""
    return tag.text if tag is not None else None


def _parse_statement_table(table):
    """Convert a statement <table> into a nested dict.

    Rows containing <th> cells set 'name' (first header cell) and 'years'
    (remaining header cells). A row containing <strong> starts a new section
    keyed by its first cell; any other row is stored inside the current section
    as {first cell: [remaining cells]}.
    """
    parsed = {}
    section = None

    for row in table.find_all('tr'):

        if row.th:
            header_cells = row.find_all('th')
            parsed['name'] = header_cells[0].text.strip()
            parsed['years'] = [cell.text.strip() for cell in header_cells[1:]]
            continue

        cells = [cell.text.strip() for cell in row.find_all('td')]
        if not cells:
            # Nothing to record for a row without data cells.
            continue

        if row.find_all('strong'):  # Means a section head
            section = cells[0]
            parsed[section] = {}
        else:
            if section is None:
                # A line item before the first section head has nowhere to go;
                # collect it under a fallback section instead of failing.
                section = unsectioned_key
                parsed.setdefault(section, {})
            parsed[section][cells[0]] = cells[1:]

    return parsed


# --- 1. Locate FilingSummary.xml in the filing's directory listing ----------
normal_url = r"https://www.sec.gov/Archives/edgar/data/1265107/0001265107-19-000004.txt"
json_url = normal_url.replace('-','').replace('.txt','/index.json')

# The directory listing URL is exactly the one derived from the filing URL.
documents_url = json_url

content = _fetch(documents_url).json()

xml_summary = None
for item in content['directory']['item']:

    # Grab the filing summary and create a new url leading to the file so we can download it.
    if item['name'] == 'FilingSummary.xml':

        # Join with exactly one slash between the parts, whether or not the
        # directory name carries its own leading/trailing slash.
        xml_summary = (
            base_url.rstrip('/') + '/'
            + content['directory']['name'].strip('/') + '/'
            + item['name']
        )

        print('-' * 100)
        print('File Name: ' + item['name'])
        print('File Path: ' + xml_summary)
        break

if xml_summary is None:
    raise LookupError('FilingSummary.xml not found in ' + documents_url)

# --- 2. Parse the filing summary into a list of report descriptions ---------
# define a new base url that represents the filing folder. This will come in handy when we need to download the reports.
base_url = xml_summary.replace('FilingSummary.xml', '')

# request and parse the content
content = _fetch(xml_summary).content
soup = BeautifulSoup(content, 'lxml')

# find the 'myreports' tag because this contains all the individual reports submitted.
reports = soup.find('myreports')
if reports is None:
    raise ValueError('No <MyReports> element in ' + xml_summary)

# master list holding one dictionary per usable report.
master_reports = []

for report in reports.find_all('report'):

    # Skip entries that lack a short name or an HTML file; they cannot be
    # matched or downloaded below. This replaces blindly dropping the last entry.
    if report.shortname is None or report.htmlfilename is None:
        continue

    instance = report.get('instance')

    master_reports.append({
        'name_short': report.shortname.text,
        'name_long': _text_or_none(report.longname),
        'position': _text_or_none(report.position),
        'category': _text_or_none(report.menucategory),
        'url': base_url + report.htmlfilename.text,
        # None when the report carries no 'instance' attribute.
        'xml': base_url + instance if instance else None,
    })

# --- 3. Select the statements of interest ------------------------------------
# define the statements we want to look for.
item1 = r"Consolidated Balance Sheets"
item2 = r"Consolidated Statements of Operations and Comprehensive Income (Loss)"
item3 = r"Consolidated Statements of Cash Flows"
item4 = r"Consolidated Statements of Stockholder's (Deficit) Equity"

# store them in a list.
report_list = [item1, item2, item3, item4]

# keep the reports whose short name is one of the statements above, in filing order.
selected_reports = [entry for entry in master_reports if entry['name_short'] in report_list]
statements_url = [entry['url'] for entry in selected_reports]
xml_url = [entry['xml'] for entry in selected_reports]

if not statements_url:
    raise LookupError('None of the expected statements were found in ' + xml_summary)

# --- 4. Download and parse the first matching statement ----------------------
# Should do all statements, but focus on balance sheet for now
statement = statements_url[0]

# request the statement file content
content = _fetch(statement).content
# 'lxml' is named explicitly to match the parser used for the filing summary.
report_soup = BeautifulSoup(content, 'lxml')

if report_soup.table is None:
    raise ValueError('No <table> found in ' + statement)

data = _parse_statement_table(report_soup.table)

# --- 5. Save the result ------------------------------------------------------
output_path = './data/bs/balance.txt'
# Create the target folder on first run so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as out_file:
    json.dump(data, out_file)