# 01_historical

Runner notebook for **historical ingestion** of Item 1A (Risk Factors) from SEC 10-K and 10-K/A (amended) filings.

This notebook:
- fetches submissions per CIK
- selects the most recent filings
- extracts Item 1A
- chunks + embeds
- stores in Chroma


In [1]:
!pip -q install -e ..


In [2]:
import os
from pathlib import Path

from sec_risk import SecClient, init_chroma, load_seen_accessions
from sec_risk.pipeline import ingest_many_ciks, IngestConfig

# Contact string sent as the User-Agent of the SEC client.
# Set the SEC_USER_AGENT environment variable to use your own contact details
# without editing the notebook; an unset or empty variable falls back to the
# original value.
USER_AGENT = os.environ.get("SEC_USER_AGENT") or "Khadija khadijaadnani2000@gmail.com"
sec = SecClient(user_agent=USER_AGENT)

# All on-disk state lives under ./data, relative to the notebook's working directory.
BASE = Path("./data")
PERSIST_DIR = str(BASE / "chroma_sec")  # passed to init_chroma as a plain string
ARTIFACT_DIR = BASE / "retrieved"
MANIFEST_PATH = ARTIFACT_DIR / "manifest.jsonl"

# Open the Chroma collection that the ingestion run writes into.
vectordb = init_chroma(PERSIST_DIR, collection_name="sec_10k_risk_factors")

# Accession data loaded from the manifest; handed to ingest_many_ciks as
# `seen_accessions` (presumably so already-ingested filings are skipped -- confirm).
seen = load_seen_accessions(MANIFEST_PATH)


In [3]:
# Environment sanity check: report the installed posthog / chromadb versions
# and confirm that langchain_chroma can be imported.
import posthog
import chromadb

for label, module in (("posthog:", posthog), ("chromadb:", chromadb)):
    print(label, module.__version__)

# Imported only to prove the import works; the name itself is not used here.
from langchain_chroma import Chroma
print("langchain_chroma import OK")


posthog: 5.4.0
chromadb: 1.4.0
langchain_chroma import OK


In [5]:
# Ingestion run over every CIK returned by the SEC client.
ciks = sec.get_all_ciks()

# Ingestion settings.
# NOTE(review): units of chunk_size / chunk_overlap (characters vs. tokens) and
# the exact meaning of top_n_per_cik / batch_size are defined in
# sec_risk.pipeline -- confirm there.
cfg = IngestConfig(top_n_per_cik=3, chunk_size=1500, chunk_overlap=200, batch_size=10_000)

# Returns a DataFrame (`df`) and a list of skip records (`skips`); see the
# cell output for their shape.
# A `limit=50` argument was left commented out by the author (presumably a cap
# for a quick trial run -- confirm against ingest_many_ciks before enabling):
#     limit=50,
df, skips = ingest_many_ciks(
    sec=sec, vectordb=vectordb, ciks=ciks,
    seen_accessions=seen,
    artifact_dir=ARTIFACT_DIR, manifest_path=MANIFEST_PATH,
    cfg=cfg,
)

# Cell result: first rows of the frame, the first 20 skip records, and the
# collection's item count (read through the wrapper's private `_collection`).
(df.head(), skips[:20], vectordb._collection.count())


Upserted 142/142 chunks
Upserted 100/100 chunks
Upserted 173/173 chunks
Upserted 178/178 chunks
Upserted 156/156 chunks
Upserted 25/25 chunks
Upserted 130/130 chunks
Upserted 121/121 chunks
Upserted 116/116 chunks
Upserted 85/85 chunks
Upserted 80/80 chunks
Upserted 82/82 chunks
Upserted 154/154 chunks
Upserted 142/142 chunks
Upserted 42/42 chunks
Upserted 40/40 chunks
Upserted 42/42 chunks
Upserted 65/65 chunks
Upserted 60/60 chunks
Upserted 61/61 chunks
Upserted 73/73 chunks
Upserted 68/68 chunks
Upserted 227/227 chunks
Upserted 224/224 chunks
Upserted 216/216 chunks
Upserted 184/184 chunks
Upserted 173/173 chunks
Upserted 113/113 chunks
Upserted 94/94 chunks
Upserted 93/93 chunks
Upserted 94/94 chunks
Upserted 25/25 chunks
Upserted 23/23 chunks
Upserted 78/78 chunks
Upserted 78/78 chunks
Upserted 1/1 chunks
Upserted 4/4 chunks
Upserted 1/1 chunks
Upserted 81/81 chunks
Upserted 87/87 chunks
Upserted 110/110 chunks
Upserted 103/103 chunks
Upserted 100/100 chunks
Upserted 89/89 chunks


(                  company      cik  filingDate  form       accessionNumber  \
 0            SHOPIFY INC.  1594805  2025-02-11  10-K  0001594805-25-000012   
 1       LAM RESEARCH CORP   707549  2024-08-29  10-K  0000707549-24-000106   
 2  INTUITIVE SURGICAL INC  1035267  2025-01-31  10-K  0001035267-25-000017   
 3  INTUITIVE SURGICAL INC  1035267  2024-01-31  10-K  0001035267-24-000021   
 4  INTUITIVE SURGICAL INC  1035267  2023-02-10  10-K  0001035267-23-000019   
 
                                                  url  chunks  
 0  https://www.sec.gov/Archives/edgar/data/159480...     142  
 1  https://www.sec.gov/Archives/edgar/data/707549...     100  
 2  https://www.sec.gov/Archives/edgar/data/103526...     173  
 3  https://www.sec.gov/Archives/edgar/data/103526...     178  
 4  https://www.sec.gov/Archives/edgar/data/103526...     156  ,
 [('1045810', '0001045810-25-000023:no_item_1a'),
  ('1045810', '0001045810-24-000029:no_item_1a'),
  ('1045810', '0001045810-23-000017:no_