In [8]:
from pathlib import Path
from sec_risk import init_chroma

# Locations of the persisted Chroma store and the ingestion manifest,
# both resolved relative to the notebook's working directory.
BASE = Path("./data")
PERSIST_DIR = str(BASE / "chroma_sec")
MANIFEST_PATH = BASE / "retrieved" / "manifest.jsonl"

# Check ChromaDB
# The vector store and the seen-accessions list are checked separately so a
# failure in one is not reported as a failure of the other.
try:
    vectordb = init_chroma(PERSIST_DIR, collection_name="sec_10k_risk_factors")
    vector_count = vectordb._collection.count()
except Exception as e:
    # Broad on purpose: this is a best-effort health check that should keep
    # the notebook running and show the underlying error to the reader.
    print(f"✗ ChromaDB could not be loaded: {e}")
else:
    print("✓ Vector database loaded")
    print(f"✓ Total vectors: {vector_count}")

# NOTE(review): load_seen_accessions is not imported in this cell; it must
# already be defined by an earlier cell for this check to succeed.
try:
    seen = load_seen_accessions(MANIFEST_PATH)
except Exception as e:
    print(f"✗ Could not load seen accessions: {e}")
else:
    print(f"✓ Seen accessions: {len(seen)}")

# Check manifest
if MANIFEST_PATH.exists():
    print(f"✓ Manifest exists at {MANIFEST_PATH}")
else:
    print(f"✗ Manifest not found at {MANIFEST_PATH}")
    print("  This is normal if you haven't ingested any documents yet")

✓ Vector database loaded
✓ Total vectors: 12528
✓ Seen accessions: 156
✓ Manifest exists at data/retrieved/manifest.jsonl


In [9]:
# Cell 2: Database statistics
# Prints the total chunk count, then summarises metadata from a bounded sample.
total_docs = vectordb._collection.count()
print(f"Total documents (chunks) in database: {total_docs}")

# Get sample of metadata
# Only the first 100 records are fetched, so every value derived below
# describes that sample and not the whole collection.
sample = vectordb._collection.get(limit=100, include=['metadatas'])
# Skip entries that carry no metadata so the .get() lookups below cannot fail
# (assumes such entries come back as None or an empty mapping -- TODO confirm).
metadatas = [m for m in (sample['metadatas'] or []) if m]

# Extract unique values
companies = {m.get('company') for m in metadatas if m.get('company')}
forms = {m.get('form') for m in metadatas if m.get('form')}
years = {m.get('fiscalYear') for m in metadatas if m.get('fiscalYear')}

# All three summaries are labelled as sample-derived to avoid reading them
# as database-wide statistics.
print(f"\nUnique companies (sample): {len(companies)}")
print(f"Form types (sample): {forms}")
print(f"Fiscal years (sample): {sorted(years) if years else 'N/A'}")

Total documents (chunks) in database: 12528

Unique companies (sample): 1
Form types: {'10-K'}
Fiscal years: [2024, 2025]


In [10]:
# Cell 3: Explore manifest
# Loads the JSONL manifest (one JSON object per non-blank line) into a
# DataFrame and prints a few summary views of the ingested filings.
if MANIFEST_PATH.exists():
    # Read with an explicit encoding instead of the platform default.
    # Assumes the manifest was written as UTF-8 -- TODO confirm against the writer.
    with open(MANIFEST_PATH, 'r', encoding='utf-8') as f:
        # Blank lines are skipped; every other line must be valid JSON.
        records = [json.loads(line) for line in f if line.strip()]

    df_manifest = pd.DataFrame(records)
    print(f"Total filings in manifest: {len(df_manifest)}")
    print(f"\nColumns: {df_manifest.columns.tolist()}")
    print("\nSample:")
    display(df_manifest.head())

    # Statistics
    print("\nFilings by form type:")
    print(df_manifest['form'].value_counts())

    print("\nTop 10 companies by number of filings:")
    print(df_manifest['company'].value_counts().head(10))
else:
    # Say so explicitly rather than skipping silently; df_manifest is not
    # defined in this case.
    print(f"✗ Manifest not found at {MANIFEST_PATH}; skipping manifest exploration")

Total filings in manifest: 156

Columns: ['cik', 'company', 'form', 'filingDate', 'accessionNumber', 'url', 'primary', 'section', 'chunks', 'embedding_model']

Sample:


Unnamed: 0,cik,company,form,filingDate,accessionNumber,url,primary,section,chunks,embedding_model
0,789019,MICROSOFT CORP,10-K,2025-07-30,0000950170-25-100235,https://www.sec.gov/Archives/edgar/data/789019...,msft-20250630.htm,Item 1A,88,
1,789019,MICROSOFT CORP,10-K,2024-07-30,0000950170-24-087843,https://www.sec.gov/Archives/edgar/data/789019...,msft-20240630.htm,Item 1A,84,
2,789019,MICROSOFT CORP,10-K,2023-07-27,0000950170-23-035122,https://www.sec.gov/Archives/edgar/data/789019...,msft-20230630.htm,Item 1A,78,
3,1067983,BERKSHIRE HATHAWAY INC,10-K,2025-02-24,0000950170-25-025210,https://www.sec.gov/Archives/edgar/data/106798...,brka-20241231.htm,Item 1A,21,
4,1067983,BERKSHIRE HATHAWAY INC,10-K,2024-02-26,0000950170-24-019719,https://www.sec.gov/Archives/edgar/data/106798...,brka-20231231.htm,Item 1A,19,



Filings by form type:
form
10-K    156
Name: count, dtype: int64

Top 10 companies by number of filings:
company
MICROSOFT CORP                        3
INTUIT INC.                           3
AT&T INC.                             3
TJX COMPANIES INC /DE/                3
O REILLY AUTOMOTIVE INC               3
Uber Technologies, Inc                3
ROYAL CARIBBEAN CRUISES LTD           3
NORTHROP GRUMMAN CORP /DE/            3
DANAHER CORP /DE/                     3
PNC FINANCIAL SERVICES GROUP, INC.    3
Name: count, dtype: int64


In [11]:
# Cell 4: Get all unique companies
# Fetches the metadata of every stored record (no limit) and builds a sorted
# list of the distinct, non-empty 'company' values.
all_results = vectordb._collection.get(include=['metadatas'])
all_companies = sorted({
    meta.get('company')
    for meta in all_results['metadatas']
    if meta.get('company')
})

print(f"Total unique companies: {len(all_companies)}")
print("\nFirst 20 companies:")
# Number the first twenty names starting from 1.
for i, company in enumerate(all_companies[:20], start=1):
    print(f"{i}. {company}")

Total unique companies: 68

First 20 companies:
1. AMGEN INC
2. AT&T INC.
3. AUTOMATIC DATA PROCESSING INC
4. AbbVie Inc.
5. Aon plc
6. Apollo Global Management, Inc.
7. Arista Networks, Inc.
8. BANK OF AMERICA CORP /DE/
9. BERKSHIRE HATHAWAY INC
10. BOSTON SCIENTIFIC CORP
11. BRISTOL MYERS SQUIBB CO
12. BlackRock, Inc.
13. Brookfield Asset Management Ltd.
14. CADENCE DESIGN SYSTEMS INC
15. CARVANA CO.
16. COCA COLA CO
17. CRH PUBLIC LTD CO
18. Cigna Group
19. CrowdStrike Holdings, Inc.
20. DANAHER CORP /DE/


In [12]:
# Cell 5: Sample document inspection
# Fetches three records with their text and metadata, then prints a short
# header, selected metadata fields and a 500-character preview for each.
sample_docs = vectordb._collection.get(limit=3, include=['documents', 'metadatas'])

divider = '=' * 60
# (label shown to the reader, metadata key it is read from)
metadata_fields = [
    ("Company", 'company'),
    ("Form", 'form'),
    ("Filing Date", 'filingDate'),
    ("CIK", 'cik'),
    ("URL", 'url'),
]

for i, doc_id in enumerate(sample_docs['ids']):
    print(f"\n{divider}")
    print(f"Document {i+1}")
    print(divider)
    meta = sample_docs['metadatas'][i]
    for label, key in metadata_fields:
        print(f"{label}: {meta.get(key)}")
    print("\nContent preview (first 500 chars):")
    print(sample_docs['documents'][i][:500])
    print("...")


Document 1
Company: MICROSOFT CORP
Form: 10-K
Filing Date: 2025-07-30
CIK: 789019
URL: https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/msft-20250630.htm

Content preview (first 500 chars):
Item 1A
...

Document 2
Company: MICROSOFT CORP
Form: 10-K
Filing Date: 2025-07-30
CIK: 789019
URL: https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/msft-20250630.htm

Content preview (first 500 chars):
ITEM 1A. RIS
K FACTORS
Our operations and financial results are subject to various risks and uncertainties, including those described below, that could adversely affect our business, operations, financial condition, results of operations, liquidity, and the trading price of our common stock.
STRATEGIC AND COMPETITIVE RISKS
We face intense competition across all markets for our products and services, which could adversely affect our results of operations.
Competition in the technology sector
Our 
...

Document 3
Company: MICROSOFT CORP
Form: 10-K
Filing Date: 2025-07