# Checkpoint B — Parse Stage Validation

Validates Stage 3: raw HTML snapshots → structured CompanyProfiles via LLM extraction.

**Prerequisites:**
- `OPENAI_API_KEY` set in `.env`
- Checkpoint A snapshot data exists (if it does not, run Cell 5, which performs a full pipeline run and collects it fresh)

In [None]:
# Cell 1: Setup + imports
import json
from pathlib import Path

from core.config import load_config

# Resolve the sources config relative to the current working directory and
# load both the settings object and the configured sources from it.
sources_path = Path("..") / "config" / "sources.yaml"
settings, sources = load_config(sources_path)

# Echo a few of the loaded values as a quick sanity check.
summary_lines = (
    f"Settings loaded: verbose={settings.verbose}, llm_model={settings.llm_model}",
    f"Sources: {len(sources.sources)} configured",
    f"Parsed dir: {settings.parsed_dir}",
)
print("\n".join(summary_lines))

In [None]:
# Cell 2: Find existing snapshots
from evidence.snapshot import FileSnapshotStore

# Create the snapshot store and keep a Path handle on its root directory.
snapshot_store = FileSnapshotStore(settings.snapshots_dir)
snapshots_dir = Path(settings.snapshots_dir)

# Get the most recent run. Only directories are considered, so stray files in
# the snapshots root (e.g. .DS_Store or .gitkeep) are never mistaken for a run,
# and a missing or non-directory root simply yields no runs.
# NOTE(review): "most recent" relies on run directory names sorting
# chronologically — confirm against the run-id format.
run_dirs = (
    sorted(entry for entry in snapshots_dir.iterdir() if entry.is_dir())
    if snapshots_dir.is_dir()
    else []
)
if run_dirs:
    latest_run = run_dirs[-1].name
    snapshots = snapshot_store.list_by_run(latest_run)
    print(f"Latest run: {latest_run}")
    print(f"Snapshots: {len(snapshots)}")
    for s in snapshots:
        print(
            f"  {s.source_id}: HTTP {s.status_code} | success={s.success} | {s.content_length} bytes"
        )
else:
    print("No existing snapshots found — run Cell 5 first for a full pipeline run")
    # Later cells iterate `snapshots`, so always leave it defined.
    snapshots = []

In [None]:
# Cell 3: Test adapter content extraction on a successful snapshot
from parsing.adapters import get_adapter

# Use the first successful snapshot (if any) as the extraction sample.
test_snapshot = next((snap for snap in snapshots if snap.success), None)

if not test_snapshot:
    print("No successful snapshot to test — run Cell 5 first")
else:
    html_bytes = snapshot_store.get_content(test_snapshot.snapshot_id)
    assert html_bytes is not None, "Content should exist"

    # Resolve the adapter for this snapshot's source type.
    adapter = get_adapter(test_snapshot.source_type)
    print(f"Adapter for '{test_snapshot.source_type}': {type(adapter).__name__}")

    # Take the metadata of the first configured source with the same
    # source_id, stringifying its values; no match means empty metadata.
    matched_source = next(
        (src for src in sources.sources if src.source_id == test_snapshot.source_id),
        None,
    )
    if matched_source is None:
        source_meta = {}
    else:
        source_meta = {
            key: str(value) for key, value in matched_source.metadata.items()
        }

    # Extraction only runs when an adapter was returned.
    if adapter:
        content_block = adapter.extract_content(test_snapshot, html_bytes, source_meta)
        print(f"\nMain text: {len(content_block.main_text)} chars")
        print(f"Meta keys: {list(content_block.meta.keys())}")
        print(f"Key links: {len(content_block.key_links)}")
        print(f"Company hint: {content_block.company_hint}")
        print("\n--- First 500 chars ---")
        print(content_block.main_text[:500])

In [None]:
# Cell 4: Test LLM extraction on a single snapshot
from core import verbose
from parsing.llm import extract_company_profile

verbose.configure(2)  # DEBUG level

# Needs both the sample snapshot and the adapter produced by Cell 3.
if not (test_snapshot and adapter):
    print("Skipped — no snapshot available")
else:
    profile, log = extract_company_profile(
        content_block=content_block,
        model=settings.llm_model,
        snapshot_id=test_snapshot.snapshot_id,
        source_id=test_snapshot.source_id,
        url=test_snapshot.canonical_url,
    )

    # The parse log is reported whether or not a profile came back.
    print(f"\nStatus: {log.status}")
    print(f"Tokens: {log.llm_tokens_used}")
    print(f"Duration: {log.duration_ms:.0f}ms")

    if not profile:
        print(f"\nErrors: {log.errors}")
    else:
        print("\n--- CompanyProfile ---")
        print(f"Name: {profile.name}")
        print(f"Domain: {profile.domain}")
        print(f"Summary: {profile.summary}")
        print(f"Tags ({len(profile.tags)}): {profile.tags}")
        print(f"Confidence: {profile.confidence}")
        print(f"Unknowns: {profile.unknowns}")
        print("\n--- Signals ---")
        for signal in profile.signals:
            print(f"  {signal.name} = {signal.value}")
            evidence = signal.evidence
            # Only the first 80 characters of the evidence text are shown.
            print(f'    Evidence: "{evidence.text[:80]}..."')
            print(f"    Context: {evidence.context}")

In [None]:
# Cell 5: Run full Checkpoint B pipeline
from orchestration.runner import get_checkpoint_b_results, run_checkpoint_b_async

verbose.configure(2)  # DEBUG level

ctx = await run_checkpoint_b_async(sources_path=sources_path)

print("\nCheckpoint B complete!")
print(f"Run ID: {ctx.run_id}")
print(f"Status: {ctx.status}")
print(
    f"Snapshots: {ctx.metrics.num_snapshots_success} success, {ctx.metrics.num_snapshots_failed} failed"
)
print(
    f"Parse: {ctx.metrics.num_parse_success} success, {ctx.metrics.num_parse_failed} failed"
)

In [None]:
# Cell 6: Inspect persisted files
# This run's artifacts are looked up under <parsed_dir>/<run_id>/.
parsed_dir = Path(settings.parsed_dir) / ctx.run_id

profiles_path = parsed_dir.joinpath("profiles.json")
parse_log_path = parsed_dir.joinpath("parse_log.json")

print(f"Parsed dir: {parsed_dir}")
for artifact in (profiles_path, parse_log_path):
    print(f"{artifact.name} exists: {artifact.exists()}")

# Print a one-look summary of every persisted profile.
if profiles_path.exists():
    profiles_data = json.loads(profiles_path.read_text())
    print(f"\nProfiles: {len(profiles_data)}")
    for entry in profiles_data:
        print(
            f"  {entry['name']} ({entry['domain']}) — confidence={entry['confidence']}"
        )
        # Only the first five tags are shown.
        print(f"    Tags: {entry['tags'][:5]}...")
        print(f"    Signals: {len(entry['signals'])}")

In [None]:
# Cell 7: Inspect parse logs
# Print each persisted parse-log entry; optional fields fall back to
# placeholders, and errors/warnings are shown only when non-empty.
if parse_log_path.exists():
    logs_data = json.loads(parse_log_path.read_text())
    print(f"Parse logs: {len(logs_data)}")
    for log_entry in logs_data:
        print(f"  {log_entry['source_id']}: {log_entry['status']}")
        print(f"    Model: {log_entry.get('llm_model', 'N/A')}")
        print(f"    Tokens: {log_entry.get('llm_tokens_used', 0)}")
        print(f"    Duration: {log_entry.get('duration_ms', 0):.0f}ms")
        for field, label in (("errors", "Errors"), ("warnings", "Warnings")):
            if log_entry.get(field):
                print(f"    {label}: {log_entry[field]}")

In [None]:
# Cell 8: Validation checklist
results = get_checkpoint_b_results(ctx)

# Per-profile checks look at the first profile only. With no profiles, an
# empty mapping stands in so each of those checks evaluates to False.
profiles = results["profiles"]
first_profile = profiles[0] if profiles else {}

checks = {
    "Pipeline completed": ctx.status.value == "completed",
    "At least 1 snapshot collected": ctx.metrics.num_snapshots_success >= 1,
    "At least 1 profile extracted": len(profiles) >= 1,
    "Profiles persisted to disk": profiles_path.exists(),
    "Parse logs persisted to disk": parse_log_path.exists(),
    "Profile has name": bool(first_profile.get("name")),
    # Passes only with five or more tags.
    "Profile has tags": len(first_profile.get("tags", [])) >= 5,
    "Profile has signals with evidence": len(first_profile.get("signals", [])) >= 1,
    "Profile has confidence score": first_profile.get("confidence", 0) > 0,
}

print("=== Checkpoint B Validation ===")
for label, ok in checks.items():
    print(f"  [{'PASS' if ok else 'FAIL'}] {label}")

all_pass = all(checks.values())
print(f"\n{'All checks passed!' if all_pass else 'Some checks FAILED'}")