In [12]:
import pandas as pd
import time
import requests
from pathlib import Path

## Load Data

In [3]:
# Request headers passed to the requests.get calls in this notebook.
HEADERS = dict(
    (
        ("User-Agent", "neda.bahaghighat@leuphana.stud.de (Master thesis research)"),
        ("Accept-Encoding", "gzip, deflate"),
    )
)

In [5]:
def _pick_filing_document(items, primary_doc):
    """Choose which entry of a filing's directory listing to download.

    Returns the listed name that equals ``primary_doc`` (case-insensitive) if
    there is one; otherwise the first ``.htm``/``.html`` entry whose name looks
    like a 10-K, falling back to the first HTML entry. Returns ``None`` when the
    listing contains no HTML entry at all.
    """
    wanted = (primary_doc or "").lower()
    if wanted:
        for it in items:
            if it["name"].lower() == wanted:
                return it["name"]

    candidates = [it["name"] for it in items
                  if it["name"].lower().endswith((".htm", ".html"))]
    if not candidates:
        return None

    def _looks_like_10k(name):
        # Ignore separators so "10k", "10-k" and "10_k" are all recognised.
        return "10k" in name.lower().replace("-", "").replace("_", "")

    tenk_candidates = [c for c in candidates if _looks_like_10k(c)]
    return tenk_candidates[0] if tenk_candidates else candidates[0]


def download_filing_html(
    cik: str,
    accession: str,
    primary_doc: str,
    out_dir: str = "data/raw_filings",
    timeout: float = 30.0,
):
    """Download one filing document from the SEC archive and save it locally.

    The filing's ``index.json`` directory listing is fetched first so that the
    document name can be resolved against the names actually listed there.

    Args:
        cik: Company CIK; leading zeros are dropped for the archive URL.
        accession: Accession number; dashes are removed for the archive URL.
        primary_doc: Preferred document name (matched case-insensitively).
            May be empty or None, in which case an HTML document is chosen
            from the listing.
        out_dir: Directory the file is written to (created if missing).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        Tuple ``(local_path, html_text)``: the path of the saved file as a
        string and the downloaded text.

    Raises:
        requests.HTTPError: If either request returns an error status.
        ValueError: If the listing has no matching or HTML document.
    """
    cik_int = str(int(cik))
    accession_nodashes = accession.replace("-", "")

    base_dir = f"https://www.sec.gov/Archives/edgar/data/{cik_int}/{accession_nodashes}/"

    index_url = base_dir + "index.json"
    # Without a timeout a stalled connection would block the notebook forever.
    idx_resp = requests.get(index_url, headers=HEADERS, timeout=timeout)
    idx_resp.raise_for_status()
    items = idx_resp.json()["directory"]["item"]

    html_name = _pick_filing_document(items, primary_doc)
    if html_name is None:
        raise ValueError(f"No HTML file found in {index_url}")

    file_url = base_dir + html_name
    print("   → Requesting:", file_url)

    resp = requests.get(file_url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    html_text = resp.text

    out_root = Path(out_dir)
    out_root.mkdir(parents=True, exist_ok=True)
    # Keep only the final path component: the name comes from a remote listing
    # and must never be able to point outside out_dir.
    safe_name = Path(html_name).name
    out_path = out_root / f"{cik_int}_{accession_nodashes}_{safe_name}"
    out_path.write_text(html_text, encoding="utf-8")

    return str(out_path), html_text

In [None]:
def get_company_filings(
    cik: str,
    form_type: str = "10-K",
    limit: int = 10,
    timeout: float = 30.0,
) -> pd.DataFrame:
    """Fetch filing metadata for one company from the SEC submissions endpoint.

    Args:
        cik: Company CIK, with or without leading zeros (str or int); it is
            left-padded with zeros to 10 characters for the URL.
        form_type: Only rows whose form equals this value exactly are kept.
        limit: Maximum number of rows to return.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        DataFrame with columns ``accession``, ``form``, ``filing_date`` and
        ``primary_doc``, in the order the endpoint lists them, re-indexed
        from 0. Only the ``filings.recent`` section of the response is read.

    Raises:
        requests.HTTPError: If the endpoint returns an error status.
    """
    # str() lets callers pass an integer CIK, as download_filing_html allows.
    cik_padded = str(cik).strip().zfill(10)
    url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"

    # Without a timeout a stalled connection would block the notebook forever.
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    recent = resp.json()["filings"]["recent"]

    filings = pd.DataFrame({
        "accession": recent["accessionNumber"],
        "form": recent["form"],
        "filing_date": recent["filingDate"],
        "primary_doc": recent["primaryDocument"],
    })

    is_wanted_form = filings["form"] == form_type
    return filings.loc[is_wanted_form].head(limit).reset_index(drop=True)

In [7]:
def get_cik_from_ticker(ticker: str, timeout: float = 30.0) -> str:
    """Look up the zero-padded 10-character CIK for a stock ticker.

    Downloads the SEC ``company_tickers.json`` file and returns the CIK of the
    first entry whose ``ticker`` equals the requested one (compared in upper
    case, surrounding whitespace ignored).

    Args:
        ticker: Ticker symbol, e.g. ``"AAPL"``; case-insensitive.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The CIK as a string left-padded with zeros to 10 characters.

    Raises:
        requests.HTTPError: If the download returns an error status.
        ValueError: If no entry with that ticker exists.
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    # Check the status before parsing so an error page is not read as JSON,
    # and bound the wait so a stalled connection cannot hang the notebook.
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    wanted = ticker.strip().upper()
    # The payload is a mapping of entries, each carrying "ticker" and "cik_str".
    for entry in data.values():
        if entry["ticker"] == wanted:
            return str(entry["cik_str"]).zfill(10)

    raise ValueError(f"Ticker {wanted!r} not found in {url}")

In [9]:
# Selected companies: map each ticker symbol to the CIK returned by the lookup.
tickers = ["AAPL", "MSFT", "AMZN", "NVDA", "META"]
companies = dict(zip(tickers, map(get_cik_from_ticker, tickers)))

print(companies)

{'AAPL': '0000320193', 'MSFT': '0000789019', 'AMZN': '0001018724', 'NVDA': '0001045810', 'META': '0001326801'}


In [10]:
# Fetch 10-K metadata (limit=1, i.e. the first matching row) per company.
all_filings = {}

for name, cik in companies.items():
    print(f"Fetching filings for {name}...")
    all_filings[name] = get_company_filings(cik=cik, form_type="10-K", limit=1)
    time.sleep(0.2)  # brief pause between consecutive requests
print("\n=== Filings metadata ===")

# Show what was collected, one company at a time.
for name in all_filings:
    print(f"\n{name}:")
    print(all_filings[name])

Fetching filings for AAPL...
Fetching filings for MSFT...
Fetching filings for AMZN...
Fetching filings for NVDA...
Fetching filings for META...

=== Filings metadata ===

AAPL:
              accession  form filing_date        primary_doc
0  0000320193-25-000079  10-K  2025-10-31  aapl-20250927.htm

MSFT:
              accession  form filing_date        primary_doc
0  0000950170-25-100235  10-K  2025-07-30  msft-20250630.htm

AMZN:
              accession  form filing_date        primary_doc
0  0001018724-25-000004  10-K  2025-02-07  amzn-20241231.htm

NVDA:
              accession  form filing_date        primary_doc
0  0001045810-25-000023  10-K  2025-02-26  nvda-20250126.htm

META:
              accession  form filing_date        primary_doc
0  0001326801-25-000017  10-K  2025-01-30  meta-20241231.htm


In [11]:
# Download the first listed 10-K document for every selected company.
# Failures are reported per company so one bad filing does not stop the rest.
for name, cik in companies.items():
    print(f"\nDownloading 10-K for {name}...")

    filings_for_company = all_filings.get(name)
    # Guard against companies without any matching filing: .iloc[0] on an
    # empty frame would raise and abort the whole loop.
    if filings_for_company is None or filings_for_company.empty:
        print(f"   ✖ No 10-K metadata available for {name}; skipping.")
        continue

    filing = filings_for_company.iloc[0]
    accession = filing["accession"]
    primary_doc = filing["primary_doc"]

    try:
        local_path, html_text = download_filing_html(
            cik=cik,
            accession=accession,
            primary_doc=primary_doc,
            out_dir="data/raw_filings",
        )
        print(f"   ✔ Saved to: {local_path}")
    except requests.HTTPError as e:
        print(f"   ✖ HTTP error for {name} {accession}: {e}")
    except Exception as e:
        print(f"   ✖ Other error for {name} {accession}: {e}")

    # Same pause as the metadata loop, so requests are not sent back-to-back.
    time.sleep(0.2)

print("\nDone.")


Downloading 10-K for AAPL...
   → Requesting: https://www.sec.gov/Archives/edgar/data/320193/000032019325000079/aapl-20250927.htm
   ✖ Other error for AAPL 0000320193-25-000079: name 'Path' is not defined

Downloading 10-K for MSFT...
   → Requesting: https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/msft-20250630.htm
   ✖ Other error for MSFT 0000950170-25-100235: name 'Path' is not defined

Downloading 10-K for AMZN...
   → Requesting: https://www.sec.gov/Archives/edgar/data/1018724/000101872425000004/amzn-20241231.htm
   ✖ Other error for AMZN 0001018724-25-000004: name 'Path' is not defined

Downloading 10-K for NVDA...
   → Requesting: https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126.htm
   ✖ Other error for NVDA 0001045810-25-000023: name 'Path' is not defined

Downloading 10-K for META...
   → Requesting: https://www.sec.gov/Archives/edgar/data/1326801/000132680125000017/meta-20241231.htm
   ✖ Other error for META 0001326801-25-00

In [13]:
# Collect every path in the download folder whose name matches "*.htm*".
files = [*Path("data/raw_filings").glob("*.htm*")]
files

[PosixPath('data/raw_filings/789019_000095017025100235_msft-20250630.htm'),
 PosixPath('data/raw_filings/1045810_000104581025000023_nvda-20250126.htm'),
 PosixPath('data/raw_filings/1018724_000101872425000004_amzn-20241231.htm'),
 PosixPath('data/raw_filings/1326801_000132680125000017_meta-20241231.htm'),
 PosixPath('data/raw_filings/320193_000032019325000079_aapl-20250927.htm')]