# 02_recent

Runner notebook for ingesting *provided* recent filings.


In [8]:
!pip -q install -e ..


In [9]:
import datetime as dt
import logging
from pathlib import Path

from sec_risk import SecClient, init_chroma, load_seen_accessions, check_date_in_vectordb
from sec_risk.pipeline import ingest_provided_filings, ProvidedFiling, IngestConfig

# --- Setup logging ---
# Every record at INFO level or above goes to two places: a log file in the
# current working directory and the default stream handler (so messages also
# show up in the notebook output). Both share one timestamped format.
_log_format = '%(asctime)s - %(levelname)s - %(message)s'
_file_handler = logging.FileHandler('sec_daily_ingestion.log')
_stream_handler = logging.StreamHandler()

logging.basicConfig(
    handlers=[_file_handler, _stream_handler],
    format=_log_format,
    level=logging.INFO,
)

# Logger used by the rest of this notebook cell.
logger = logging.getLogger(__name__)

# --- config ---
# Client used for all SEC requests in this notebook; it is given the
# identifying User-Agent string below.
USER_AGENT = "Khadija khadijaadnani2000@gmail.com"
sec = SecClient(user_agent=USER_AGENT)

# On-disk layout, all rooted at ./data relative to the working directory:
#   chroma_sec/               -> persistence directory passed to init_chroma
#   retrieved/                -> artifact directory passed to the ingest step
#   retrieved/manifest.jsonl  -> manifest read by load_seen_accessions
BASE = Path("./data")
PERSIST_DIR = str(BASE.joinpath("chroma_sec"))
ARTIFACT_DIR = BASE.joinpath("retrieved")
MANIFEST_PATH = ARTIFACT_DIR.joinpath("manifest.jsonl")

# Vector store handle, the accessions loaded from the manifest (used further
# down to skip filings that were already ingested), and ingestion parameters.
vectordb = init_chroma(PERSIST_DIR, collection_name="sec_10k_risk_factors")
seen = load_seen_accessions(MANIFEST_PATH)
cfg = IngestConfig(
    chunk_size=1500,
    chunk_overlap=200,
    batch_size=10_000,
)

# --- run ---
# Daily ingestion flow:
#   1. Pick the target date and fetch that day's 10-K / 10-K/A index rows.
#   2. If the index has no rows, report whether the vector DB already holds
#      data for that date and stop.
#   3. Otherwise resolve each index row to an accession number through the
#      filer's submissions feed, set aside accessions that are already in
#      `seen`, and hand the remaining filings to `ingest_provided_filings`.
logger.info("="*60)
logger.info("Starting SEC daily ingestion")
logger.info("="*60)

# Number of days before today whose daily index is ingested.
LOOKBACK_DAYS = 4
target_date = dt.date.today() - dt.timedelta(days=LOOKBACK_DAYS)
# Alias kept for backward compatibility: this was the original name of the
# target date (despite not being "yesterday") and other cells may read it.
yesterday = target_date
logger.info(f"Target date: {target_date}")

idx_url, rows = sec.fetch_daily_10k_rows(target_date)

if not rows:
    logger.info("No 10-K filings found in the index.")

    # Distinguish "already ingested earlier" from "nothing was filed".
    has_data = check_date_in_vectordb(vectordb, target_date)

    if has_data:
        logger.info(f"✓ Filings from {target_date} already exist in the vector database.")
        logger.info("Reason: Already ingested previously")
    else:
        logger.info("ℹ No filings from this date in the vector database.")
        logger.info("Reason: Weekend, federal holiday, or genuinely no 10-K filings on this date")
else:
    logger.info(f"Daily index: {idx_url}")
    logger.info(f"10-K/10-K/A rows: {len(rows)}")

    # Filings to ingest, and (cik, company, form, filing_date, reason) tuples
    # for every row that was skipped or could not be resolved.
    provided = []
    missed = []

    logger.info("Fetching submission details for each filing...")
    for idx, row in enumerate(rows):
        if idx % 10 == 0:
            logger.info(f"Processing {idx}/{len(rows)}...")

        cik = row["cik"]
        filing_date = row["date_filed"]
        form = row["form"]
        company = row["company"]

        try:
            subs = sec.fetch_submissions(cik)
            recent = subs.get("filings", {}).get("recent", {})

            # The "recent" section is read as parallel lists, one entry per
            # filing. zip() walks them in lockstep and stops at the shortest
            # list, so ragged data yields "not found" instead of IndexError.
            # NOTE(review): the match assumes the index row's date_filed uses
            # the same string format as the feed's filingDate -- confirm.
            candidates = zip(
                recent.get("form", []),
                recent.get("filingDate", []),
                recent.get("accessionNumber", []),
                recent.get("primaryDocument", []),
            )
            # Only the first entry matching both form type and date is used.
            match = next(
                (
                    (acc, doc)
                    for cand_form, cand_date, acc, doc in candidates
                    if cand_form == form and cand_date == filing_date
                ),
                None,
            )

            if match is None:
                missed.append((cik, company, form, filing_date, "not_found_in_submissions_recent"))
                continue

            acc_num, primary_doc = match

            if acc_num in seen:
                logger.debug(f"Skipping {company} - already ingested")
                missed.append((cik, company, form, filing_date, "already_seen"))
                continue

            provided.append(ProvidedFiling(
                cik=cik,
                accessionNumber=acc_num,
                primaryDocument=primary_doc,
                filingDate=filing_date,
                form=form,
                company=company,
            ))

        except Exception as e:
            # Deliberately broad: one bad filer must not abort the whole run.
            # The failure is logged and recorded, then the loop moves on.
            logger.error(f"Error processing {company} (CIK: {cik}): {e}")
            missed.append((cik, company, form, filing_date, f"error:{e}"))

    logger.info(f"Prepared filings: {len(provided)}")
    logger.info(f"Missed/Skipped: {len(missed)}")

    if missed:
        logger.warning("Sample of missed filings:")
        for m in missed[:10]:
            logger.warning(f"  {m}")

    if not provided:
        logger.info("All filings already ingested or no new filings to process.")
    else:
        # Ingest
        logger.info("Starting ingestion...")
        df, skips = ingest_provided_filings(
            sec=sec,
            vectordb=vectordb,
            filings=provided,
            seen_accessions=seen,
            artifact_dir=ARTIFACT_DIR,
            manifest_path=MANIFEST_PATH,
            cfg=cfg,
        )

        logger.info("="*60)
        logger.info("INGESTION COMPLETE")
        logger.info("="*60)
        # `target_date` was previously referenced here without ever being
        # defined, which raised NameError right after a successful ingestion.
        logger.info(f"Date processed: {target_date}")
        logger.info(f"Total filings found: {len(rows)}")
        logger.info(f"Successfully processed: {len(df)}")
        logger.info(f"Skipped during ingestion: {len(skips)}")
        # NOTE(review): `_collection` is a private attribute of the vector
        # store wrapper; switch to a public count API if one is available.
        logger.info(f"Total vectors in DB: {vectordb._collection.count()}")

        if len(df) > 0:
            logger.info("\nProcessed companies:")
            for company in df['company'].head(10):
                logger.info(f"  ✓ {company}")

        if skips:
            logger.warning("\nSample of ingestion skips:")
            for skip in skips[:5]:
                logger.warning(f"  {skip}")

        logger.info("\nIngestion complete!")

2025-12-26 14:27:03,668 - INFO - Use pytorch device_name: mps
2025-12-26 14:27:03,669 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-12-26 14:27:06,938 - INFO - Starting SEC daily ingestion
2025-12-26 14:27:06,939 - INFO - Target date: 2025-12-22
2025-12-26 14:27:06,940 - INFO - Fetching filings for: 2025-12-22
2025-12-26 14:27:07,830 - INFO - Total filings in index: 0
2025-12-26 14:27:07,831 - INFO - No 10-K/10-K/A filings found for 2025-12-22
2025-12-26 14:27:07,836 - INFO - No 10-K filings found in the index.
2025-12-26 14:27:07,851 - INFO - ℹ No filings from this date in the vector database.
2025-12-26 14:27:07,851 - INFO - Reason: Weekend, federal holiday, or genuinely no 10-K filings on this date
