In [1]:
# Install the listed packages (with Python pinned to 3.8) into the active conda
# environment, updating everything else (`--update-all`), then print the
# environment details for reference.
%conda install --yes -c defaults -c conda-forge --update-all python=3.8 aiohttp aiodns bs4 cchardet Faker lxml openpyxl pandas PyPDF2 python-dateutil regex tenacity tqdm requests
%conda info

^C

Note: you may need to restart the kernel to use updated packages.

     active environment : base
    active env location : C:\Users\drche\Software\anaconda3
            shell level : 1
       user config file : C:\Users\drche\.condarc
 populated config files : 
          conda version : 23.11.0
    conda-build version : 3.28.2
         python version : 3.8.18.final.0
                 solver : libmamba (default)
       virtual packages : __archspec=1=x86_64
                          __conda=23.11.0=0
                          __cuda=12.0=0
                          __win=0=0
       base environment : C:\Users\drche\Software\anaconda3  (writable)
      conda av data dir : C:\Users\drche\Software\anaconda3\etc\conda
  conda av metadata url : None
           channel URLs : https://repo.anaconda.com/pkgs/main/win-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/win-64
                          https://rep

# SEC Constants

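This section regenerates `constants.py` (the `_FORMS` and `_LOCATIONS` lookup tables) from the SEC's full-text-search script. It does not need to be run frequently, so skip this cell unless the constants have to be refreshed.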

In [2]:
import json
import re

from aiohttp import ClientSession

async with ClientSession(raise_for_status=True) as c:
    async with c.get("https://www.sec.gov/edgar/search/js/edgar_full_text_search.js") as res:
        _script = await res.text()

    with open("constants.py", "w", encoding="utf-8") as f:
        f.write("_FORMS = ")
        json.dump({
            form.pop("form"): form
            for form in eval(re.search(
                R"^const forms = (\[\r?\n(?: {4}\{.*?\},*\r?\n)*(?: {4}\{.*?\})\r?\n\])\.sort",
                _script,
                re.MULTILINE
            )[1])
        }, f, indent=4)

        f.write('\n')

        f.write("_LOCATIONS = ")
        json.dump(dict(eval(re.search(
            R"^const locationsArray = (\[\r?\n(?: {4}\[.*?\],\r?\n)*(?: {4}\[.*?\])\r?\n\]);",
            _script,
            re.MULTILINE
        )[1])), f, indent=4)

        f.write('\n')

ClientResponseError: 403, message='Forbidden', url=URL('https://www.sec.gov/edgar/search/js/edgar_full_text_search.js')

## Common Functions

The `chop_periods` function may be replaced by `pd.interval_range` once that functionality has been enhanced in a future pandas release.

In [None]:
from datetime import date
from typing import Any, Callable, Dict, Generator, Optional, Tuple

from dateutil.relativedelta import relativedelta
from tqdm import tqdm

def chop_periods(
    start_date: date,
    end_date: date,
    interval: Optional[Dict[str, Any]],
    _format: Callable[[date], str]
) -> Generator[Tuple[str, str], None, None]:
    """Split ``[start_date, end_date]`` into consecutive, non-overlapping periods.

    Args:
        start_date: First day of the whole range.
        end_date: Last day of the whole range (inclusive).
        interval: Keyword arguments for ``relativedelta`` giving the length of
            each period, or ``None`` to yield the whole range as one period.
        _format: Converts each boundary date into the string that is yielded.

    Yields:
        ``(formatted_start, formatted_end)`` for every period. The last period
        always ends on ``end_date``.

    Raises:
        ValueError: If ``interval`` does not move the date forward (e.g. an
            empty dict or all-zero values), which would otherwise loop forever.
            Being a generator, the error surfaces on the first iteration.
    """
    if interval is None:
        yield _format(start_date), _format(end_date)
        return

    delta = relativedelta(**interval)
    if start_date + delta <= start_date:
        raise ValueError(f"interval {interval!r} must advance the date by a positive amount")

    # the end date of each period must have an offset of -1 day since the RESTful API is inclusive in both sides
    offset = relativedelta(days=1)
    curr_date = start_date
    while (next_date := curr_date + delta) < end_date:
        yield _format(curr_date), _format(next_date - offset)
        curr_date = next_date
    # whatever remains (possibly shorter than one interval) closes the range
    yield _format(curr_date), _format(end_date)

## Primary Section

In [8]:
import asyncio
import json
import logging
import requests
from datetime import date
from itertools import zip_longest
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import pandas as pd
from aiohttp import ClientSession
from faker import Faker
import regex
from tenacity import retry

from constants import _FORMS, _LOCATIONS
import parsers

# Matches runs of Unicode control (Cc) and format (Cf) characters; `crawl`
# substitutes them with "" in every extracted paragraph.
_CC_REGEX = regex.compile(R"[\p{Cc}\p{Cf}]+", regex.V1)

# parse the name and CIK from the result (same as the official JavaScript)
# group 1 = entity name, group 2 = 10-digit CIK
_DISPLAY_NAME_REGEX = regex.compile(R"(.*) \(CIK (\d{10})\)", regex.V1)

# NOTE(review): not referenced by any other code visible in this notebook.
_FAKER = Faker()

# Output format name -> how `crawl` writes the result:
#   suffix          : file extension appended to the output name
#   function_suffix : the DataFrame writer used is `to_<function_suffix>`
#   extra_args      : extra keyword arguments forwarded to that writer
_FORMAT_SPEC = {
    "csv": {
        "suffix": "csv",
        "function_suffix": "csv",
        "extra_args": {
            "encoding": "utf-8"
        }
    },
    "excel": {
        "suffix": "xlsx",
        "function_suffix": "excel",
        "extra_args": {
            "sheet_name": "output"
        }
    }
}

_LOGGER = logging.getLogger(__name__)

# HTTP status codes on which `_download` / `_fetch` call `raise_for_status()`,
# so that the `@retry` decorator re-runs the request; any other non-OK status
# is logged and the item is skipped.
_RETRY_SC = {403, 500, 502, 503, 504}

# Matches runs of whitespace; `crawl` replaces each run by a single space in text
_WHITESPACE_REGEX = regex.compile(R"\s+", regex.V1)


def _concat_to_url(cik: str, adsh: str, filename: str) -> str:
    return f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh}/{filename}"


def _decode(b: bytes, e: str) -> str:
    return b.decode(e)


@retry
async def _download(client: ClientSession, semaphore: asyncio.Semaphore, url: str, user_agent: str) -> Tuple[bytes, str]:
    """Download one file, returning its raw body and detected encoding.

    The request runs while holding ``semaphore`` and pauses for one second
    after the response arrives. A status listed in ``_RETRY_SC`` raises, so the
    ``@retry`` decorator re-runs the call; any other non-OK status is logged
    and an empty payload ``(b'', "ascii")`` is returned instead.
    """
    request_headers = {
        'User-Agent': user_agent,
        'Accept-Encoding': 'text',
    }
    async with semaphore:
        async with client.get(url, headers=request_headers) as res:
            await asyncio.sleep(1)
            if res.ok:
                body = await res.read()
                return body, res.get_encoding()
            if res.status in _RETRY_SC:
                # raises ClientResponseError, which triggers another attempt
                res.raise_for_status()
    # non-retryable failure: report it and hand back an empty document
    _LOGGER.warning(f"{url} file will be skipped: ({res.status}) {res.reason}")
    return b'', "ascii"


@retry
async def _fetch(
    client: ClientSession,
    semaphore: asyncio.Semaphore,
    fixed_query: Dict[str, Any],
    ciks: Optional[List[str]],
    start_date: str,
    end_date: str,
    page_iterator: Dict[str, Any]
) -> Tuple[bytes, str]:
    """POST one search query to the EDGAR search index.

    The JSON body is ``fixed_query`` overlaid with ``page_iterator`` and then
    with the date range and CIK list. The request runs while holding
    ``semaphore`` and pauses for one second after the response arrives.

    Returns:
        ``(raw_body, encoding)`` on success. A status listed in ``_RETRY_SC``
        raises so that ``@retry`` re-runs the call; any other non-OK status is
        logged and ``(b'', "ascii")`` is returned.
    """
    req_body = dict(fixed_query)
    req_body.update(page_iterator)
    req_body.update(startdt=start_date, enddt=end_date, ciks=ciks)

    async with semaphore:
        async with client.post("https://efts.sec.gov/LATEST/search-index", json=req_body) as res:
            await asyncio.sleep(1)
            if not res.ok:
                if res.status in _RETRY_SC:
                    # raises ClientResponseError, which triggers another attempt
                    res.raise_for_status()
                _LOGGER.warning(f"{ciks}-{start_date}-{end_date} query will be skipped: ({res.status}) {res.reason}")
                return b'', "ascii"
            return await res.read(), res.get_encoding()


def _iso(d: date):
    return d.isoformat()


def _parse_display_name(s: str, cik: str):
    if s is not None and (m := _DISPLAY_NAME_REGEX.fullmatch(s)):
        if (scik := m[2]) != cik:
            _LOGGER.warning(f"mismatched CIK: {scik} (parsed from \"{s}\") v.s. {cik}")
        return m[1], scik
    return s, cik


def _parse_hit(hit: Dict[str, Any]):
    """Turn one search hit into per-entity rows plus per-file information.

    Args:
        hit: One element of the search response's hit list; ``_id`` has the
            form ``"<accession>:<filename>"`` and ``_source`` carries the
            filing metadata read below.

    Returns:
        ``(rows, info)`` where ``rows`` is a ``DataFrame`` with one row per
        entity attached to the filing and ``info`` is a ``Series`` describing
        the file (form, dates, extension, download URL and the parser function
        looked up in ``parsers`` by extension, or ``None``).
    """
    def _as_sequence(value):
        # a list is used as-is, a truthy scalar is wrapped, anything falsy becomes empty
        if isinstance(value, list):
            return value
        return [value] if value else ()

    _id, source = hit["_id"], hit["_source"]

    adsh, filename = _id.split(':')
    filename_main, filename_ext = filename.rsplit('.', 1)
    xsl = source["xsl"]
    if xsl and filename_ext.lower() == "xml":
        # XML documents with a stylesheet are addressed under that stylesheet's directory
        filename_main = f"{xsl}/{filename_main}"
    filename = f"{filename_main}.{filename_ext}"

    file_nums = source["file_num"]
    film_nums = source["film_num"]
    # shorter lists are padded with None so every entity still gets a row
    per_entity = zip_longest(
        source["display_names"],
        source["ciks"],
        source["biz_locations"],
        source["inc_states"],
        _as_sequence(file_nums),
        _as_sequence(film_nums)
    )
    rows = pd.DataFrame(
        (
            [_id, *_parse_display_name(name, entity_cik), located, _LOCATIONS.get(state, state), file_no, film_no]
            for name, entity_cik, located, state, file_no, film_no in per_entity
        ),
        columns=["id", "entity_name", "cik", "located", "incorporated", "file_num", "film_num"],
        dtype=str,
        copy=False
    )

    form = source["form"]
    root_form = source["root_form"]
    form_title = f" ({_FORMS[root_form]['title']})" if root_form in _FORMS else ""
    # fall back from the file type to its description, and finally to the file name
    file_type = source["file_type"] or source["file_description"] or filename
    ciks = rows["cik"]

    form_file = f"{form}{form_title}{'' if form == file_type else f' {file_type}'}"
    file_date = source["file_date"]
    period_ending = source.get("period_ending", None)
    # the URL is built from the last non-null CIK of the filing
    url = _concat_to_url(ciks[ciks.notnull()].iloc[-1], adsh.replace('-', ''), filename)
    parser = getattr(parsers, f"_parse_{filename_ext.lower()}", None)

    info = pd.Series({
        "id": _id,
        "form_file": form_file,
        "file_date": file_date,
        "period_ending": period_ending,
        "file_ext": filename_ext,
        "url": url,
        "parser": parser
    }, dtype=object, copy=False)
    return rows, info


def _rename(index):
    return f"paragraph{index + 1}"


def _unwrap(hits: Dict[str, Any], no_ciks):
    total_hits = hits["total"]
    if not no_ciks:
        if total_hits["relation"] == "gte":
            _LOGGER.warning(f"The query returns a result exceeding the 10k limit")
    return hits["hits"]

def _check_limit(hits: Dict[str, Any]):
    total_hits = hits["hits"]["total"]
    print("Total fetchable results: " + str(total_hits["value"]))
    return total_hits["value"]

def _split_to_page_and_from(count):
    number_of_pages = count / _EDGAR_RESULTS_PER_PAGE
    from_count = 0
    split_list = []
    print("---Fetching results over a range of pages---")
    print("Total number of pages: " + str(int(number_of_pages)))
    print("Number of results per page: " + str(_EDGAR_RESULTS_PER_PAGE))
    for i in range(1, int(number_of_pages + 1)):
        split_list.append({
            "page": i,
            "from": from_count
        })
        from_count = from_count + _EDGAR_RESULTS_PER_PAGE
        
    return split_list
        

def _mock_request(fixed_query, start_date, end_date):
    """Send one blocking, CIK-less search over the whole date range.

    `crawl` uses the decoded response only to learn the total number of
    results before paginating.

    Args:
        fixed_query: The query fields shared by every search request.
        start_date: First day of the range (formatted with ``_iso``).
        end_date: Last day of the range (formatted with ``_iso``).

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    data = {
        **fixed_query,
        "startdt": _iso(start_date),
        "enddt": _iso(end_date),
        "ciks": None,
    }
    response = requests.post(
        "https://efts.sec.gov/LATEST/search-index",
        data=json.dumps(data),
        # requests never times out by default; do not let the notebook hang
        timeout=60
    )
    # fail here with the HTTP status rather than later with an obscure
    # JSONDecodeError / KeyError on an error page
    response.raise_for_status()
    return json.loads(response.text)


def chop_ciks(
    ciks: Optional[Union[Path, int, str, List[Any]]],
    ciks_per_query: int
) -> Generator[Optional[List[str]], None, None]:
    """Normalise the CIK input and yield it in chunks of ``ciks_per_query``.

    Args:
        ciks: A ``Path`` to a text file with one CIK per line (blank lines are
            ignored), a list or tuple of CIKs, a single CIK as ``str`` or
            ``int``, or ``None``.
        ciks_per_query: Maximum number of CIKs per yielded chunk; only
            consulted when at least one CIK was provided.

    Yields:
        Lists of CIKs zero-padded to 10 digits, or a single ``None`` when no
        CIK was provided (including an empty list or an empty file).

    Raises:
        ValueError: If the file cannot be read, a CIK is not a valid integer,
            or ``ciks_per_query`` is smaller than 1 while CIKs were provided.
    """
    def _pad(value: Any) -> str:
        return f"{int(value):010}"

    # defaults to None
    _ciks: Optional[List[str]] = None
    # if the provided parameter is a Path, read the CIKs from the file
    if isinstance(ciks, Path):
        try:
            with open(ciks, "r", encoding="UTF-8") as f:
                try:
                    # blank / whitespace-only lines are not CIKs; skip them
                    # instead of rejecting the whole file
                    _ciks = [_pad(line) for line in f.read().splitlines() if line.strip()]
                except ValueError as e:
                    raise ValueError(f"{ciks} contains invalid CIKs") from e
        except IOError as e:
            raise ValueError(f"{ciks} is not a valid file") from e
    # if it's a list or tuple of values, treat all values as CIKs
    elif isinstance(ciks, (list, tuple)):
        try:
            _ciks = [_pad(cik) for cik in ciks]
        except (TypeError, ValueError) as e:
            raise ValueError(f"{ciks} is not a valid CIK list") from e
    # if it's a single string, consider it as a single CIK
    elif isinstance(ciks, str):
        try:
            _ciks = [_pad(ciks)]
        except ValueError as e:
            raise ValueError(f"{ciks} is not a valid CIK") from e
    # same as previous with the preferred (int) type
    elif isinstance(ciks, int):
        _ciks = [f"{ciks:010}"]

    if _ciks:
        if ciks_per_query < 1:
            # a negative step would silently yield nothing at all
            raise ValueError(f"ciks_per_query must be at least 1, got {ciks_per_query}")
        for i in range(0, len(_ciks), ciks_per_query):
            yield _ciks[i:i + ciks_per_query]
    else:
        yield None


async def crawl(
    phrases: List[str],
    filing_types: List[str],
    start_date: date,
    end_date: date,
    interval: Optional[Dict[str, int]],
    ciks: Optional[Union[Path, int, str, List[Any]]],
    ciks_per_query: int,
    buffer_chunk_size: int,
    output_name: str,
    output_format: str,
    user_agent: str
):
    """Search EDGAR for ``phrases``, download the matching files and export the matching paragraphs.

    Steps, as implemented below:
      1. Query the search index: one request per (CIK chunk, period) when
         ``ciks`` is truthy, otherwise one per (result page, period).
      2. Convert every hit with ``_parse_hit`` into entity rows and file info.
      3. Download each file for which ``parsers`` provides a parser, split the
         parsed text on ``"\\n\\n"`` and keep the pieces matching any phrase
         (case-insensitive, literal match).
      4. Join entity rows, file info and paragraphs, and write them to
         ``<output_name>.<suffix>`` in the requested format.

    Args:
        phrases: Phrases to search for; each one is quoted in the query.
        filing_types: Form types sent as the ``forms`` field of the query.
        start_date: First day of the search range.
        end_date: Last day of the search range.
        interval: ``relativedelta`` keyword arguments used by ``chop_periods``
            to split the range, or ``None`` to search it in one piece.
        ciks: CIK input accepted by ``chop_ciks``; falsy means no CIK filter.
        ciks_per_query: Number of CIKs per search request.
        buffer_chunk_size: Number of downloads scheduled per batch.
        output_name: Output file name without suffix.
        output_format: A key of ``_FORMAT_SPEC`` (``"csv"`` or ``"excel"``).
        user_agent: ``User-Agent`` header value sent with every download.
    """
    fixed_query: Dict[str, Any] = {
        "q": " ".join(f"\"{phrase}\"" for phrase in phrases),
        "category": "custom",
        "forms": filing_types,
        "dateRange": "custom"
    }

    # literal (escaped) alternation of all phrases, used to filter paragraphs
    phrases_regex = regex.compile(
        "|".join(f"(?:{phrase})" for phrase in map(regex.escape, phrases)),
        regex.V1 | regex.IGNORECASE
    )

    # shared by search requests and downloads: at most 10 hold it at a time
    semaphore = asyncio.Semaphore(10)
    async with ClientSession() as c:
        print("Fetching search query results...")
        dfs = None
        infos = None
        print("\n")
        # NOTE(review): in both branches `zip(*[...])` over an empty list cannot be
        # unpacked into two names, so a search without any hit raises ValueError here.
        if ciks:
            # all search tasks are created up front, then awaited in creation order
            dfs, infos = zip(*[
                _parse_hit(hit)
                for task in tqdm([
                    asyncio.create_task(_fetch(c, semaphore, fixed_query, ciks, *period, {}))
                    for ciks in chop_ciks(ciks, ciks_per_query)
                    for period in chop_periods(start_date, end_date, interval, _iso)
                ])
                for hit in _unwrap(json.loads(_decode(*await task))["hits"], False)
            ])
        else:
            # NOTE(review): `_mock_request` uses the blocking `requests` library, so the
            # event loop is stalled while this probe request is in flight.
            mock_hit = _mock_request(fixed_query, start_date, end_date)
            total = _check_limit(mock_hit)
            pages = _split_to_page_and_from(total)
            print("\n")
            # the page list is derived from the total over the whole range and
            # is requested for every period
            dfs, infos = zip(*[
                _parse_hit(hit)
                for task in tqdm([
                    asyncio.create_task(_fetch(c, semaphore, fixed_query, ciks, *period, page))
                    for page in pages
                    for period in chop_periods(start_date, end_date, interval, _iso)
                ])
                for hit in _unwrap(json.loads(_decode(*await task))["hits"], True)
            ])
        print("Fetching search query results complete")

        # entity rows: one row per (file id, entity), indexed by file id
        df = pd.concat(dfs, ignore_index=True, copy=False)
        df.drop_duplicates(inplace=True, ignore_index=True)
        df.set_index(keys="id", inplace=True, verify_integrity=False)
        del dfs

        # file info: one row per file id; files without a parser are dropped
        info = pd.DataFrame(infos, dtype=object, copy=False)
        info.dropna(subset=["parser"], inplace=True)
        info.drop_duplicates(subset="id", inplace=True, ignore_index=True)
        info.set_index(keys="id", inplace=True)
        del infos

        # keep url/parser for the download step; the parser column is not exported
        dl_info = info[["url", "parser"]]
        del info["parser"]

        print("Queuing tasks for download of filings...")
        print("\n")
        # One row per file (same order as `info`), one column per matching paragraph.
        # Files are processed in batches of `buffer_chunk_size`: the download tasks of
        # a batch are all created first and then awaited one by one.
        downloaded = pd.DataFrame([
            pd.Series(filter(phrases_regex.search, (
                _CC_REGEX.sub("", _WHITESPACE_REGEX.sub(" ", s).strip())
                for s in parser(*await task).split("\n\n")
            )), copy=False)
            for div_info in (
                dl_info.iloc[s:s + buffer_chunk_size]
                for s in range(0, info.shape[0], buffer_chunk_size)
            )
            for task, parser in tqdm(zip([
                asyncio.create_task(_download(c, semaphore, url, user_agent))
#                 await _download(c, semaphore, url, user_agent)
                for url in div_info["url"]
            ], div_info["parser"]))
        ], index=info.index, dtype=str, copy=False)
        downloaded.dropna(how="all", inplace=True)
        downloaded.rename(columns=_rename, copy=False, inplace=True)
        del dl_info

    # resolve the DataFrame writer (`to_csv` / `to_excel`) from the format spec
    format_spec = _FORMAT_SPEC[output_format]
    getattr(
        df.join(info, how="left").join(downloaded, how="left"),
        f"to_{format_spec['function_suffix']}"
    )(
        Path(f"{output_name}.{format_spec['suffix']}"),
        header=True,
        index=False,
        # index=True,
        **format_spec["extra_args"]
    )
    # NOTE(review): every task has already been awaited and the file written by the
    # time this message is printed.
    print("Queuing complete, please wait for the tasks to complete")

## Parameters

This section defines all customisable parameters.

- **PHRASES** (`List[str]`): A list of keywords or phrases to search for. Can be an empty list.

- **DATE_START** & **DATE_END** (both `date`): As indicated by the name. But it should conform to the ISO time format, i.e., YYYY-MM-DD as shown in the example.

- **INTERVAL** (`Optional[Dict[str, int]]`): The interval of each period; `None` implies the whole period will be searched at once. Reducing the interval means more queries have to be made, but it is useful if the number of results returned exceeds the maximum capacity (10000) of one query.

- **FILING_TYPES** (`List[str]`): A list of filing types. A pre-check could be added for this variable, but since all inputs are assumed to be valid, no check is performed.

- **CIKS** (`Optional[Union[Path, int, str, List[Union[int, str]]]]`): A list of CIKs in no more than 10 digits, or it can be a path to the file containing all CIKs for the query.

- **CIKS_PER_QUERY** (`int`): Controls the number of CIKs included in one query. The recommended value is 5, but it can be reduced in case the number of results returned exceeds the maximum capacity (10000) of one query.

- **BUFFER_CHUNK_SIZE**: The maximum number of files allowed to be cached in the memory.

- **OUTPUT_NAME**: The file name without the suffix of the output file.

- **OUTPUT_FORMAT**: The file format of the output file.

In [9]:
from datetime import date
from pathlib import Path

# Phrases to search for; each one is sent quoted, i.e. as an exact phrase
_PHRASES = ["keyword one", "keyword two"]

_FILING_TYPES = ["10-K", "10-Q"]

_DATE_START = date.fromisoformat("2001-01-01")
_DATE_END = date.fromisoformat("2020-12-31")

# Keyword arguments for `relativedelta`, e.g. one month per period:
# _INTERVAL = {
#     "years": 0,
#     "months": 1,
#     "weeks": 0,
#     "days": 0
# }
_INTERVAL = None # can be optional

# _CIKS = [1961, "0000003116"] # accept a plain list of the CIKs
# _CIKS = Path("ciks.txt") # accept a file path
# _CIKS = 1961 # accept a single CIK as an integer
# _CIKS = "0000003116" # accept a single CIK as a string
_CIKS = Path('sample_input_file.txt') # can be optional

_CIKS_PER_QUERY = 5 # will be ignored if no CIKs is provided

# Number of downloads scheduled per batch
_BUFFER_CHUNK_SIZE = 100

# Page size used when paginating CIK-less searches; read as a module-level
# global by `_split_to_page_and_from`, so this cell must run before `crawl`.
_EDGAR_RESULTS_PER_PAGE = 100

_OUTPUT_NAME = "sample_output_file"

# _OUTPUT_FORMAT = "csv"
_OUTPUT_FORMAT = "excel"

# User Agent Metadata
# Replace both values with your own organisation and contact address.
_COMPANY_NAME = "The University of Adelaide"
_COMPANY_DOMAIN_ADMIN_EMAIL = "firstname.lastname@adelaide.edu.au"

# Sent as the `User-Agent` header with every file download (see `_download`)
_USER_AGENT = _COMPANY_NAME + " " + _COMPANY_DOMAIN_ADMIN_EMAIL

# Run the crawl with the parameters above (top-level `await` inside a notebook cell)
await crawl(
    _PHRASES,
    _FILING_TYPES,
    _DATE_START,
    _DATE_END,
    _INTERVAL,
    _CIKS,
    _CIKS_PER_QUERY,
    _BUFFER_CHUNK_SIZE,
    _OUTPUT_NAME,
    _OUTPUT_FORMAT,
    _USER_AGENT
)

Fetching search query results...




100%|██████████████████████████████████████████████████████████████████████████████| 1933/1933 [05:13<00:00,  6.16it/s]


Fetching search query results complete
Queuing tasks for download of filings...




100it [01:12,  1.38it/s]
100it [01:49,  1.09s/it]
100it [01:38,  1.02it/s]
100it [01:00,  1.65it/s]
100it [01:31,  1.09it/s]
93it [01:33,  1.03it/s]https://www.sec.gov/Archives/edgar/data/0000723612/000072361213000018/f10q_agricon33113.htm file will be skipped: (404) Not Found
100it [01:36,  1.04it/s]
100it [01:43,  1.04s/it]
100it [01:22,  1.21it/s]
100it [01:21,  1.22it/s]
100it [02:05,  1.25s/it]
100it [01:29,  1.11it/s]
100it [01:18,  1.27it/s]
100it [01:31,  1.09it/s]
100it [02:00,  1.21s/it]
100it [00:47,  2.09it/s]
100it [01:16,  1.31it/s]
100it [01:18,  1.27it/s]
100it [01:36,  1.04it/s]
100it [01:29,  1.12it/s]
100it [01:56,  1.17s/it]
100it [02:03,  1.23s/it]
100it [01:10,  1.41it/s]
100it [01:58,  1.19s/it]
100it [01:24,  1.19it/s]
100it [01:24,  1.19it/s]
100it [00:54,  1.83it/s]
100it [01:22,  1.21it/s]
100it [01:13,  1.36it/s]
100it [01:38,  1.01it/s]
100it [00:44,  2.25it/s]
100it [01:01,  1.64it/s]
100it [01:33,  1.07it/s]
100it [01:19,  1.25it/s]
100it [01:11,  1.40it/

Queuing complete, please wait for the tasks to complete
