In [1]:
import os
import re
from concurrent.futures import ProcessPoolExecutor

import polars as pl
from bs4 import BeautifulSoup, SoupStrainer

# A paragraph made up only of digits (optionally padded with whitespace) is
# treated as a page number and dropped before section detection.
PAGE_NUMBER_PATTERN = re.compile(r"^\s*\d+\s*$")
# "Item 1" heading. The \b after the digit keeps this from matching
# "Item 1A", "Item 1B" or "Item 10".
BUSINESS_PATTERN = re.compile(r"^\s*Item\s*1\b[\.:]?\s*", re.IGNORECASE)
# "Item 1A" heading: first paragraph of the Risk Factors section.
RISK_START_PATTERN = re.compile(r"^\s*Item\s*1A\b[\.:]?\s*", re.IGNORECASE)
# First heading that follows Risk Factors. "1C" (Cybersecurity) is accepted in
# addition to "1B" and "2" so that a filing without an Item 1B heading still
# terminates the section at the next item instead of running on to Item 2.
RISK_END_PATTERN = re.compile(r"^\s*Item\s*(1B|1C|2)\b[\.:]?\s*", re.IGNORECASE)

In [2]:
def _paragraph_text(paragraph) -> str:
    """Return a paragraph's text with every run of whitespace collapsed to one space."""
    # get_text(strip=True) strips each inline fragment and joins the pieces with
    # "", which fuses words across <b>/<span>/<font> boundaries. Splitting the
    # raw text instead keeps exactly the spacing present in the document.
    return " ".join(paragraph.get_text().split())


def parse_sec_filing(file_path: str) -> dict[str, str]:
    """
    Parse an SEC 10-K filing and extract the company name and two sections.

    The filing is scanned once, paragraph by paragraph: the first paragraph
    matching BUSINESS_PATTERN opens the Business section, the first later
    paragraph matching RISK_START_PATTERN closes it and opens Risk Factors, and
    the first paragraph after that matching RISK_END_PATTERN closes Risk Factors.
    A section whose end heading is never found runs to the last paragraph.
    Heading paragraphs themselves are not included in the extracted text.

    Args:
        file_path: Path to the SEC filing text file

    Returns:
        Dictionary with the keys "Company Name", "Business" and "Risk Factors".
        A value is "" when the corresponding header line or section heading
        was not found.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Decode best-effort: a single stray non-UTF-8 byte should not abort the
    # parse (and, when run through a process pool, the whole batch).
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()

    result = {"Company Name": "", "Business": "", "Risk Factors": ""}

    # Company name comes from the "COMPANY CONFORMED NAME:" header line.
    company_match = re.search(
        r"COMPANY CONFORMED NAME:\s*(.+?)\s*$", content, re.MULTILINE | re.IGNORECASE
    )
    if company_match:
        result["Company Name"] = company_match.group(1).strip()

    # Extract each paragraph's text exactly once. <p> elements nested in table
    # cells are skipped by the selector; empty paragraphs and bare page numbers
    # are dropped here because they can never be headings or section content.
    soup = BeautifulSoup(content, "lxml")
    texts = [
        text
        for text in map(_paragraph_text, soup.select("p:not(td p)"))
        if text and not PAGE_NUMBER_PATTERN.match(text)
    ]

    # NOTE(review): the first "Item 1" paragraph outside a table is taken as the
    # section start; a table of contents laid out with plain <p> tags would
    # presumably be matched first -- confirm against real filings.
    business_start = risk_start = risk_end = None
    for index, text in enumerate(texts):
        if business_start is None:
            if BUSINESS_PATTERN.match(text):
                business_start = index
        elif risk_start is None:
            if RISK_START_PATTERN.match(text):
                risk_start = index
        elif RISK_END_PATTERN.match(text):
            risk_end = index
            break  # all three boundaries are known

    if business_start is not None:
        # Slicing with None as the stop runs to the end of the document.
        result["Business"] = " ".join(texts[business_start + 1 : risk_start])
    if risk_start is not None:
        result["Risk Factors"] = " ".join(texts[risk_start + 1 : risk_end])

    return result


def save_to_parquet(data: list[dict[str, str]], output_file: str):
    """
    Write parsed filing records to a Parquet file.

    Args:
        data: One dictionary per filing, as produced by parse_sec_filing
        output_file: Destination path of the Parquet file
    """
    # Build the frame from the record list and write it in a single chain.
    pl.DataFrame(data).write_parquet(output_file)


# Parse every ".txt" filing in a directory (e.g. "documents/10-K") concurrently
def parse_directory(directory: str) -> list[dict[str, str]]:
    """
    Parse all ".txt" files directly inside `directory` with a process pool.

    Args:
        directory: Directory containing the SEC filing text files

    Returns:
        One parse_sec_filing result per file, ordered by file path so repeated
        runs produce rows in the same order. Empty list when the directory
        holds no ".txt" files.

    Raises:
        OSError: If the directory cannot be listed.
        Exception: Any exception raised by parse_sec_filing in a worker is
            re-raised here when the results are collected.
    """
    # Only regular files qualify: a sub-directory that happens to end in ".txt"
    # would otherwise be handed to open(). Sorting makes the order deterministic
    # (directory listing order is arbitrary).
    with os.scandir(directory) as entries:
        file_paths = sorted(
            entry.path
            for entry in entries
            if entry.is_file() and entry.name.endswith(".txt")
        )

    # Nothing to do: skip starting worker processes altogether.
    if not file_paths:
        return []

    # Parsing is CPU-bound, so use processes. Leave one core free, never start
    # more workers than there are files, and fall back to a single worker when
    # the CPU count cannot be determined (os.cpu_count() returns None).
    # NOTE(review): under the "spawn" start method, workers may be unable to
    # import a function defined in a notebook cell -- confirm on the target OS.
    cpu_count = os.cpu_count() or 1
    worker_count = max(1, min(len(file_paths), cpu_count - 1))
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        # map() yields results in input order, so the output follows file_paths.
        return list(executor.map(parse_sec_filing, file_paths))

In [3]:
# Parse every ".txt" filing under documents/10-K; each list entry is the
# dictionary returned by parse_sec_filing for one file.
filings = parse_directory("documents/10-K")

In [8]:
# Save the parsed filings to a Parquet file. The path is relative, so the file
# is created in the kernel's current working directory.
save_to_parquet(filings, "parsed_filings.parquet")

In [9]:
# Lazily scan the Parquet file this notebook writes ("parsed_filings.parquet")
# rather than a user-specific absolute path, so a fresh Restart & Run All reads
# the data it just produced and works on any machine.
# Keep only filings for which at least one of the two sections was extracted.
df = pl.scan_parquet("parsed_filings.parquet").filter(
    (pl.col("Business") != "") | (pl.col("Risk Factors") != "")
)

In [11]:
# Execute the lazy scan + filter and show the resulting frame as the cell output.
df.collect()

Company Name,Business,Risk Factors
str,str,str
"""PREVENTION INSURANCE COM INC""","""Business Development & Busines…","""Our plan of operation is to ob…"
"""Qiansui International Group Co…","""Business Development & Busines…","""Our plan of operation is to ob…"
"""SMSA CRANE ACQUISITION CORP.""","""Our business plan is to pursue…","""A smaller reporting company is…"
"""Laredo Oil, Inc.""","""The permits that are required …",""""""
"""XTREME FIGHTING CHAMPIONSHIPS,…","""Throughout this Form 10-K, Xtr…","""We are a smaller reporting com…"
…,…,…
"""PROVIDENT FINANCIAL HOLDINGS I…","""General Provident Financial Ho…","""​ We assume and manage a certa…"
"""PHIBRO ANIMAL HEALTH CORP""","""Overview Phibro Animal Health …","""Risk Factors Summary For a sum…"
"""RPM INTERNATIONAL INC/DE/""","""THE COMPANY RPM International …","""You should carefully consider …"
"""NUTRA PHARMA CORP""","""Introduction Nutra Pharma is a…","""You should carefully consider …"
