In [1]:
# First prototype for an SEC filings scraper

In [None]:
import time
import requests
import typing

In [3]:
last_req = time.time() # time at which the last request was sent (epoch seconds)

def request(url: str, type: typing.Literal["json", "raw"]) -> typing.Union[dict, str]:
    """GET ``url``, spacing calls so that at most 10 requests are sent per second.

    Args:
        url: Address to fetch.
        type: ``"json"`` returns the parsed JSON body; any other value
            (normally ``"raw"``) returns the body decoded to ``str``.

    Returns:
        The parsed JSON payload when ``type == "json"``, otherwise the
        response body as text.

    Raises:
        RuntimeError: If the response status code is not 200. The response
            body is printed first to help with debugging.
    """
    global last_req

    min_interval = 0.1 # max request rate is 10/sec
    elapsed = time.time() - last_req
    if elapsed < min_interval:
        # Sleep off whatever is left of the window since the previous request.
        time.sleep(min_interval - elapsed)
    # Record the send time. Without this update the throttle would only ever
    # compare against the moment this cell was run and would stop delaying
    # requests after the first 0.1 s.
    last_req = time.time()

    req = requests.get(
        url,
        headers = {
            # NOTE(review): this presents the scraper as a mobile browser. SEC's
            # access guidance reportedly asks automated clients to send a
            # User-Agent that identifies the requester -- confirm and replace.
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
        },
        # Bound the connect/read wait so a stalled server cannot hang the notebook.
        timeout = 30,
    )
    if req.status_code != 200:
        print(req.content.decode())
        raise RuntimeError(f"Failed request, code {req.status_code}")

    if type == "json":
        return req.json()
    else:
        return req.content.decode()

In [None]:
# A filing is made up of multiple documents.
# The most useful one is the main document, whose type equals the filing's form_num.
class Document:
    """Placeholder for one ``<DOCUMENT>`` section of a filing.

    Tag layout of a section, as sketched by the author:

        <DOCUMENT>
        <TYPE>
        <SEQUENCE>
        <FILENAME>
        <DESCRIPTION>
        <TEXT>
        </TEXT>
        </DOCUMENT>

    Parsing is not implemented yet.
    """

    def __init__(self, raw_text: str):
        """Accept ``raw_text`` but do nothing with it yet (stub)."""

class Filing:
    def __init__(self, cik: str, accession_num: int | str, form_num: str, date: str):
        self.cik = cik
        self.accession_num = str(accession_num)
        self.form_num = form_num
        self.date = date

    def get():
        # remember to get header data
        pass

    def extract():
        pass

In [5]:
class Company:
    def __init__(self, name: str, cik: str | int, ticker: str, exchange: str):
        self.name = name
        self.cik = str(cik).zfill(10)
        self.ticker = ticker
        self.exchange = exchange

    def get_filings(self):
        filings = []

        def add_to_filings(filings_json):
            num_filings = len(filings_json["accessionNumber"])
            for i in range(num_filings):
                filings.append(Filing(
                    self.cik,
                    filings_json["accessionNumber"][i],
                    filings_json["form"][i],
                    filings_json["reportDate"][i],
                ))

        filings_info = request(f"https://data.sec.gov/submissions/CIK{self.cik}.json", "json")
        extra_filenames = [file["name"] for file in filings_info["filings"]["files"]]

        filings_json = filings_info["filings"]["recent"]
        add_to_filings(filings_json)
        for file_name in extra_filenames:
            filings_json = request(f"https://data.sec.gov/submissions/{file_name}", "json")
            add_to_filings(filings_json)

        return filings

    def get_facts(self):
        return request(f"https://data.sec.gov/api/xbrl/companyfacts/CIK{self.cik}.json", "json")

In [6]:
# Build one Company per row of the ticker/exchange listing.
companies = []
companies_info = request("https://www.sec.gov/files/company_tickers_exchange.json", "json")
# Rows are unpacked as (cik, name, ticker, exchange); Company takes name first.
companies.extend(
    Company(name, cik, ticker, exchange)
    for cik, name, ticker, exchange in companies_info["data"]
)

In [7]:
############# Testing zone ###############

In [None]:
# Hand-built filing for experimenting; form and date are left unset.
f = Filing(
    cik="0000320193",
    accession_num="000032019325000079",
    form_num=None,
    date=None,
)

In [9]:
# Fetch one filing's .txt file as decoded text (not parsed as JSON).
filing_txt = request(
    "https://www.sec.gov/Archives/edgar/data/320193/000032019325000079/0000320193-25-000079.txt",
    "raw",
)

In [None]:
# TODO:
    # Extract the filing txt into header and documents
        # Check for <SEC-DOCUMENT> and <DOCUMENT> tags
    # Put each <DOCUMENT> </DOCUMENT> section in its own Document() object
    # Put header data in the Filing() class