In [1]:
import re
from bs4 import BeautifulSoup
import polars as pl

In [None]:
# Section-header patterns. They are applied with ``.match`` so they only fire
# when a paragraph *starts* with the header. IGNORECASE because filings print
# these headers in all caps ("ITEM 1. BUSINESS") as well as in title case.
_BUSINESS_HEADER = re.compile(r"Item\s*1\.\s*Business", re.IGNORECASE)
_RISK_HEADER = re.compile(r"Item\s*1A\.\s*Risk\s*Factors", re.IGNORECASE)
# Either Item 1B or Item 2 marks the end of the Risk Factors section.
_NEXT_SECTION_HEADER = re.compile(r"Item\s*(1B|2)\.", re.IGNORECASE)


def _store_section(result: dict[str, str], section: str, parts: list[str]) -> None:
    """Store the joined ``parts`` under ``section`` only if longer than the text already kept."""
    text = " ".join(parts)
    if len(text) > len(result[section]):
        result[section] = text


def parse_sec_filing(file_path: str) -> dict[str, str]:
    """
    Parse SEC 10-K filing and extract key information.

    The company name is taken from the "COMPANY CONFORMED NAME:" header line.
    The section texts are collected from ``<p>`` elements: paragraphs that
    follow an "Item 1. Business" header go to "Business", paragraphs that
    follow an "Item 1A. Risk Factors" header go to "Risk Factors", and an
    "Item 1B." or "Item 2." header stops the collection.

    The same headers can occur several times in one file (table of contents,
    trailing index). For each section the longest capture is kept, so a
    repeated header followed by little or no text never replaces a section
    that has already been extracted.

    Args:
        file_path: Path to the SEC filing text file

    Returns:
        Dictionary with the keys "Company Name", "Business" and
        "Risk Factors"; a value is an empty string when nothing was found
    """
    # Undecodable bytes are replaced instead of raising, so a stray
    # non-UTF-8 character does not abort the whole filing.
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()

    result = {"Company Name": "", "Business": "", "Risk Factors": ""}

    # Extract company name from SEC header
    company_match = re.search(r"COMPANY CONFORMED NAME:\s*(.+)(?:\r\n|\r|\n)", content)
    if company_match:
        result["Company Name"] = company_match.group(1).strip()

    soup = BeautifulSoup(content, "lxml")

    current_section = None  # key of the section being collected, None outside of one
    current_content = []    # paragraph texts collected for current_section

    for p in soup.find_all("p"):
        text = p.get_text(strip=True)

        # Classify the paragraph: one of the three headers, or body text.
        if _BUSINESS_HEADER.match(text):
            next_section = "Business"
        elif _RISK_HEADER.match(text):
            next_section = "Risk Factors"
        elif _NEXT_SECTION_HEADER.match(text):
            next_section = None
        else:
            # Body text: keep it only while inside a section, skip empty paragraphs.
            if current_section and text:
                current_content.append(text)
            continue

        # Any header closes the section that was open, then opens the next one.
        if current_section is not None:
            _store_section(result, current_section, current_content)
        current_section = next_section
        current_content = []

    # Flush the section that was still open when the paragraphs ran out.
    if current_section is not None:
        _store_section(result, current_section, current_content)

    return result


def save_to_parquet(data: list[dict[str, str]], output_file: str):
    """
    Write parsed filing records to a Parquet file.

    Args:
        data: One dictionary per filing; each dictionary becomes one row
        output_file: Path of the Parquet file to write
    """
    pl.DataFrame(data).write_parquet(output_file)

In [3]:
# Parse a single 10-K filing. The result is wrapped in a list because
# save_to_parquet expects a list of row dictionaries.
file_path = "documents/10-K/0000100378_2021Q3_0001437749-21-021350.txt"
results = [parse_sec_filing(file_path)]

In [None]:
# Persist the parsed rows as a Parquet file.
# NOTE(review): this is an absolute, user-specific path, and the cell that
# reads the file back repeats the same literal — keep the two in sync.
output_file = "/home/jupyter-hangenyuu/EDGARConnect/sec_filings.parquet"
save_to_parquet(results, output_file)

In [1]:
import polars as pl

In [13]:
# Scan the saved Parquet file and keep only the rows where at least one of
# the two extracted sections is non-empty. Nothing is materialised here;
# the query runs when .collect() is called.
df = pl.scan_parquet(
    "/home/jupyter-hangenyuu/EDGARConnect/sec_filings.parquet"
).filter(
    (pl.col("Business") != "") | (pl.col("Risk Factors") != "")
)

In [14]:
# Run the query built above and show the resulting frame as the cell output.
# NOTE(review): the output below lists many companies, while the parsing cell
# in this notebook handles a single file — presumably the Parquet file was
# written by a different run; confirm before relying on these rows.
df.collect()

Company Name,Business,Risk Factors
str,str,str
"""PREVENTION INSURANCE COM INC""","""Business Development & Busines…","""Our plan of operation is to ob…"
"""Qiansui International Group Co…","""Business Development & Busines…","""Our plan of operation is to ob…"
"""SMSA CRANE ACQUISITION CORP.""","""Our business plan is to pursue…","""A smaller reporting company is…"
"""Laredo Oil, Inc.""","""The permits that are required …",""""""
"""XTREME FIGHTING CHAMPIONSHIPS,…","""Throughout this Form 10-K, Xtr…","""We are a smaller reporting com…"
…,…,…
"""PROVIDENT FINANCIAL HOLDINGS I…","""General Provident Financial Ho…","""​ We assume and manage a certa…"
"""PHIBRO ANIMAL HEALTH CORP""","""Overview Phibro Animal Health …","""Risk Factors Summary For a sum…"
"""RPM INTERNATIONAL INC/DE/""","""THE COMPANY RPM International …","""You should carefully consider …"
"""NUTRA PHARMA CORP""","""Introduction Nutra Pharma is a…","""You should carefully consider …"
