In [11]:
# Cell 1: Setup
from pathlib import Path
from sec_risk import init_chroma
import requests
import json

# Root folder for local data; the persisted Chroma store lives inside it.
BASE = Path("./data")
PERSIST_DIR = str(BASE.joinpath("chroma_sec"))

# Open the vector store through the project helper.
vectordb = init_chroma(
    PERSIST_DIR,
    collection_name="sec_10k_risk_factors",
)

# Quick sanity output so the reader can see the store was opened.
# NOTE(review): `_collection` is an underscore-prefixed attribute of the store
# object; confirm whether a public way to count vectors is available.
print(f"‚úì Vector DB loaded: {vectordb._collection.count()} vectors")
print("‚úì Using Ollama (local LLM)")

‚úì Vector DB loaded: 12528 vectors
‚úì Using Ollama (local LLM)


In [12]:
# Cell 2: Ollama helper function with proper error handling
def ask_ollama(prompt: str, model: str = "llama3.1", timeout: float = 120):
    """
    Send one prompt to the local Ollama server and return the generated text.

    A single non-streaming POST is made to the /api/generate endpoint on
    localhost:11434. Failures are reported with a printed message instead of
    an exception, so a calling cell keeps running.

    Args:
        prompt: Full prompt text sent to the model.
        model: Name of the Ollama model to use.
        timeout: Seconds handed to ``requests.post`` as its timeout. The
            default (120) is the value that used to be hard-coded; pass a
            larger value for long prompts, which can exceed two minutes.

    Returns:
        The value of the 'response' field of the JSON reply, or None when the
        request fails, times out, or the reply has no 'response' field.
    """
    try:
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                "model": model,
                "prompt": prompt,
                "stream": False
            },
            timeout=timeout
        )

        # Turn HTTP error statuses into exceptions (handled by the generic
        # branch below) before trying to read the body.
        response.raise_for_status()
        response_data = response.json()

        # Check if response key exists
        if 'response' in response_data:
            return response_data['response']
        else:
            print(f"‚ùå Unexpected response format: {response_data}")
            return None

    except requests.exceptions.ConnectionError:
        print("‚ùå Cannot connect to Ollama.")
        print("Make sure Ollama is running: open a terminal and run 'ollama serve'")
        return None
    except requests.exceptions.Timeout:
        print("‚ùå Request timed out. The prompt might be too long.")
        return None
    except Exception as e:
        # Anything else (HTTP error status, invalid JSON, ...) is reported
        # rather than raised so the notebook cell does not abort.
        print(f"‚ùå Error: {e}")
        return None

# Smoke test: send a tiny prompt so a missing or stopped Ollama server is
# noticed here rather than in the later RAG cells.
print("Testing Ollama connection...")
test = ask_ollama(
    "Say 'Hello, I am ready!' in one sentence.",
    model="llama3.1",
)
# ask_ollama returns None on failure, so a falsy result means the check failed.
print(f"‚úì Ollama is working: {test}" if test else "‚ö†Ô∏è Ollama test failed")

Testing Ollama connection...
‚úì Ollama is working: Hello, I'm ready!


In [20]:
# Cell 3: Streaming response
def ask_ollama_stream(prompt: str, model: str = "llama3.1"):
    """
    Stream a response from Ollama, printing the answer as it is generated.

    A streaming POST is made to the /api/generate endpoint on localhost:11434.
    Each non-empty line of the reply is parsed as one JSON object; its
    'response' text is printed immediately and collected.

    Args:
        prompt: Full prompt text sent to the model.
        model: Name of the Ollama model to use.

    Returns:
        The concatenated text of all received chunks, or None when the
        request fails, the server answers with an HTTP error status, or the
        stream contains an 'error' object. In the failure case a message is
        printed and any partial text is discarded.
    """
    try:
        # The context manager closes the streamed connection on every exit
        # path, including the early `break` and exceptions.
        with requests.post(
            'http://localhost:11434/api/generate',
            json={
                "model": model,
                "prompt": prompt,
                "stream": True
            },
            stream=True,
            timeout=120
        ) as response:
            # Without this check an HTTP error reply (e.g. unknown model)
            # would be iterated like a normal stream and yield "".
            response.raise_for_status()

            chunks = []
            print("ü§ñ Answer: ", end="", flush=True)

            for line in response.iter_lines():
                if not line:
                    # Skip empty keep-alive lines between JSON objects.
                    continue

                json_response = json.loads(line)

                # A failure during generation arrives as an object carrying
                # an 'error' field; surface it instead of ignoring it.
                if 'error' in json_response:
                    raise RuntimeError(json_response['error'])

                if 'response' in json_response:
                    chunk = json_response['response']
                    print(chunk, end="", flush=True)
                    chunks.append(chunk)

                if json_response.get('done', False):
                    break

            print()  # New line
            return "".join(chunks)

    except Exception as e:
        print(f"\n‚ùå Error: {e}")
        return None

def _retrieve_sources(question: str, n_context: int, max_chars: int):
    """Search the vector store and return (excerpts, sources) for a question.

    `excerpts` is a list of (company, filing date, truncated text) tuples used
    to build prompts; `sources` is the list of dicts handed back to callers.
    """
    results = vectordb.similarity_search_with_score(question, k=n_context)

    excerpts = []
    sources = []
    for doc, score in results:
        company = doc.metadata['company']
        date = doc.metadata['filingDate']
        # Keep only the start of each chunk so the prompt stays short.
        excerpts.append((company, date, doc.page_content[:max_chars]))
        sources.append({"company": company, "date": date, "score": score})

    return excerpts, sources


def ask_with_rag_stream(question: str, n_context: int = 3, model: str = "llama3.1"):
    """
    Answer a question from retrieved filing excerpts, streaming the answer.

    Args:
        question: Question used both for the similarity search and the prompt.
        n_context: Number of chunks requested from the vector store.
        model: Ollama model name forwarded to ask_ollama_stream.

    Returns:
        Dict with keys 'question', 'answer' (whatever ask_ollama_stream
        returned, possibly None) and 'sources' (one dict per retrieved chunk
        with 'company', 'date' and 'score').
    """
    print("üîç Searching...")
    excerpts, sources = _retrieve_sources(question, n_context, max_chars=400)

    context = "\n".join(
        f"[{i}] {company}: {content}"
        for i, (company, _date, content) in enumerate(excerpts, 1)
    )

    prompt = f"""Based on these risk factors, answer briefly:

{context}

Q: {question}

A:"""

    answer = ask_ollama_stream(prompt, model=model)

    companies = ', '.join(s['company'] for s in sources)
    print(f"\nüìö Sources: {companies}")

    return {"question": question, "answer": answer, "sources": sources}


def ask_with_rag(question: str, n_context: int = 5, model: str = "llama3.1"):
    """
    Answer a question from retrieved filing excerpts without streaming.

    The example cells further down call this function, but no definition of
    it exists in the notebook, so they fail with NameError on a fresh kernel.
    This version is reconstructed from the outputs recorded in those cells:
    the progress messages, the default of five sources, the fallback answer
    text, and the result keys.
    NOTE(review): the original prompt wording and excerpt length are not
    recoverable from the outputs; the ones below are assumptions - confirm.

    Args:
        question: Question used both for the similarity search and the prompt.
        n_context: Number of chunks requested from the vector store.
        model: Ollama model name forwarded to ask_ollama.

    Returns:
        Dict with keys 'question', 'answer' and 'sources' (one dict per
        retrieved chunk with 'company', 'date' and 'score'). When ask_ollama
        returns None, 'answer' holds a fixed fallback message instead.
    """
    print("üîç Searching for relevant context...")
    excerpts, sources = _retrieve_sources(question, n_context, max_chars=400)
    print(f"‚úì Retrieved {len(sources)} sources")

    context = "\n\n".join(
        f"[Source {i}] {company} ({date})\n{content}"
        for i, (company, date, content) in enumerate(excerpts, 1)
    )

    prompt = f"""You are analyzing SEC 10-K risk factor disclosures.

Here are excerpts from actual SEC filings:

{context}

Question: {question}

Based on the excerpts above, answer the question and cite the sources you rely on:"""

    print("üí≠ Asking Ollama (this may take 30-60 seconds)...")
    answer = ask_ollama(prompt, model=model)

    if answer is None:
        # ask_ollama has already printed the reason; keep the result usable.
        answer = "‚ö†Ô∏è Ollama did not respond. Check the connection."
    else:
        print("‚úì Answer generated!")

    return {"question": question, "answer": answer, "sources": sources}

In [15]:
# Cell 4: Example query 1 - Cybersecurity
response = ask_with_rag(
    "What are the main cybersecurity risks that companies face?"
)

# Full report: question, answer, then every source with its score.
if response:
    print(f"\n{'='*60}")
    print(f"‚ùì QUESTION: {response['question']}")
    print(f"{'='*60}\n")
    print(f"ü§ñ ANSWER:\n{response['answer']}")
    print(f"\n{'='*60}")
    print("üìö SOURCES:")
    print(f"{'='*60}")
    for rank, src in enumerate(response['sources'], start=1):
        print(f"\n{rank}. {src['company']} ({src['date']})")
        print(f"   Similarity Score: {src['score']:.4f}")

üîç Searching for relevant context...
‚úì Retrieved 5 sources
üí≠ Asking Ollama (this may take 30-60 seconds)...
‚úì Answer generated!

‚ùì QUESTION: What are the main cybersecurity risks that companies face?

ü§ñ ANSWER:
Based on the risk factor disclosures from SEC 10-K filings, the main cybersecurity risks that companies face can be synthesized as follows:

**Dependence on Sophisticated Technology**: Companies such as Merck & Co., Inc. (Source 1) and Bristol Myers Squibb Co (Source 3) rely heavily on sophisticated software applications and computing infrastructure, making them vulnerable to disruptions in their operations.

**Cyber-Attacks and Data Breaches**: All the companies mentioned (Merck & Co., Inc., Philip Morris International Inc., Bristol Myers Squibb Co, Johnson & Johnson, and HCA Healthcare, Inc.) highlight the risk of cyber-attacks, which could lead to data breaches, disruption of operations, reputational damage, litigation, regulatory action, significant fines or pe

In [17]:
# Cell 5: Example query 2 - Supply chain
response = ask_with_rag(
    "What supply chain risks do companies mention?",
    n_context=5,
)

# Compact report: question, answer and how many sources backed the answer.
if response:
    print(f"‚ùì QUESTION: {response['question']}\n")
    print(f"ü§ñ ANSWER:\n{response['answer']}\n")
    print(f"üìä Sources used: {len(response['sources'])}")

üîç Searching for relevant context...
‚úì Retrieved 5 sources
üí≠ Asking Ollama (this may take 30-60 seconds)...
‚úì Answer generated!
‚ùì QUESTION: What supply chain risks do companies mention?

ü§ñ ANSWER:
Based on the provided SEC 10-K filings, I have identified the supply chain risks mentioned by companies across various industries.

**Dependence on Third-Party Suppliers**

Companies like QUALCOMM INC/DE (Sources 1, 2, and 3) and Mondelez International, Inc. (Source 4) highlight their dependence on a limited number of third-party suppliers for procurement, manufacturing, assembly, and testing of products. This dependency poses risks to supply assurance, technology leadership, and reasonable margins.

**Order and Shipment Uncertainties**

QUALCOMM INC/DE (Sources 1, 2, and 3) also mentions order and shipment uncertainties that could negatively impact their results of operations.

**Global Supply Chain Risks**

Mondelez International, Inc. (Source 4) notes the risks associated wit

In [18]:
# Cell 6: Example query 3 - Climate change
response = ask_with_rag(
    "What are companies saying about climate change and environmental regulations?",
    n_context=7,
)

# Report the question and answer, then one line per source with its score.
if response:
    print(f"‚ùì QUESTION: {response['question']}\n")
    print(f"ü§ñ ANSWER:\n{response['answer']}\n")

    print("\nüìö SOURCES:")
    for rank, src in enumerate(response['sources'], start=1):
        print(f"{rank}. {src['company']} - {src['date']} (Score: {src['score']:.4f})")

üîç Searching for relevant context...
‚úì Retrieved 7 sources
üí≠ Asking Ollama (this may take 30-60 seconds)...
‚ùå Request timed out. The prompt might be too long.
‚ùì QUESTION: What are companies saying about climate change and environmental regulations?

ü§ñ ANSWER:
‚ö†Ô∏è Ollama did not respond. Check the connection.


üìö SOURCES:
1. EMERSON ELECTRIC CO - 2023-11-13 (Score: 0.5612)
2. Philip Morris International Inc. - 2025-02-06 (Score: 0.5812)
3. CRH PUBLIC LTD CO - 2025-02-26 (Score: 0.5859)
4. CRH PUBLIC LTD CO - 2024-02-29 (Score: 0.5906)
5. DANAHER CORP /DE/ - 2025-02-20 (Score: 0.5927)
6. BRISTOL MYERS SQUIBB CO - 2025-02-12 (Score: 0.6183)
7. BRISTOL MYERS SQUIBB CO - 2024-02-13 (Score: 0.6237)


In [22]:
# Cell 7: Interactive 

# Edit the text between the triple quotes to ask your own question.
your_question = """
What regulatory risks are financial companies most concerned about?
"""

# The triple-quoted literal starts and ends with a newline. Strip it so the
# similarity search, the prompt and the echoed question receive the bare text.
response = ask_with_rag(your_question.strip(), n_context=5)

if response:
    print(f"‚ùì Q: {response['question']}\n")
    print(f"ü§ñ A:\n{response['answer']}\n")
    print(f"üìä Used {len(response['sources'])} sources")

üîç Searching for relevant context...
‚úì Retrieved 5 sources
üí≠ Asking Ollama (this may take 30-60 seconds)...
‚úì Answer generated!
‚ùì Q: 
What regulatory risks are financial companies most concerned about?


ü§ñ A:
Based on the provided SEC 10-K filings, it appears that regulatory risks are a significant concern for various financial companies. The following are some of the key regulatory risks that these companies face:

1. **Governmental Regulation**: Companies like PROGRESSIVE CORP/OH/ (Source 2 and Source 3) highlight the impact of governmental regulation on their operations, underwriting, investing, and financing activities. This suggests that regulatory changes can have far-reaching effects on financial institutions.
2. **Litigation, Regulatory Audits, and Investigations**: Cigna Group (Source 4) mentions litigation, regulatory audits, and investigations as risks they face. This indicates that regulatory scrutiny is a significant concern for the insurance industry.
3. **T

In [23]:
# Debug cell
def debug_rag(question: str, n_context: int = 2):
    """Run one retrieve-then-ask round, printing every intermediate step.

    Prints the retrieved excerpts, the exact prompt sent to Ollama and the
    answer, then returns a dict with the keys 'question', 'answer' and
    'context' (the excerpt text that was embedded in the prompt).
    """
    print(f"üîç Step 1: Searching for '{question}'...")

    # Step 1: similarity search against the vector store
    hits = vectordb.similarity_search_with_score(question, k=n_context)

    print(f"‚úì Found {len(hits)} results\n")

    # Step 2: echo each hit and collect the text that goes into the prompt
    print("="*60)
    print("RETRIEVED CONTEXT:")
    print("="*60)

    excerpt_blocks = []
    for rank, (doc, score) in enumerate(hits, start=1):
        meta = doc.metadata
        company, date = meta['company'], meta['filingDate']
        # Only the first 300 characters of each chunk are shown and sent.
        content = doc.page_content[:300]

        print(f"\n[Source {rank}] {company} ({date}) - Score: {score:.4f}")
        print(f"Content: {content}...")
        print()

        excerpt_blocks.append(f"[Source {rank}] {company} ({date})\n{content}")

    context = "\n\n".join(excerpt_blocks)

    # Step 3: build the prompt and print it verbatim
    prompt = f"""You are analyzing SEC 10-K risk factor disclosures.

Here are excerpts from actual SEC filings:

{context}

Question: {question}

Based on the excerpts above, provide a brief answer (2-3 sentences):"""

    print("="*60)
    print("PROMPT BEING SENT TO OLLAMA:")
    print("="*60)
    print(prompt)
    print("\n" + "="*60)

    # Step 4: query the model and show whatever comes back
    print("\nüí≠ Asking Ollama...")
    answer = ask_ollama(prompt, model="llama3.1")

    print(f"\n{'='*60}")
    print("OLLAMA'S ANSWER:")
    print(f"{'='*60}")
    print(answer)
    print()

    return {"question": question, "answer": answer, "context": context}

# Try the debug helper on a sample question. As the last expression of the
# cell, the returned dict is also shown as the cell's output.
debug_rag(question="What are the main cybersecurity risks?")

üîç Step 1: Searching for 'What are the main cybersecurity risks?'...
‚úì Found 2 results

RETRIEVED CONTEXT:

[Source 1] BRISTOL MYERS SQUIBB CO (2024-02-13) - Score: 0.5125
Content: Information Technology and Cybersecurity Risks
We are dependent on information technology systems and face risk of cybersecurity incidents that could disrupt our business and result in theft of proprietary and confidential information....


[Source 2] BRISTOL MYERS SQUIBB CO (2025-02-12) - Score: 0.5410
Content: Information Technology and Cybersecurity Risks
We are dependent on information technology systems, including artificial intelligence programs, and face risk of cybersecurity incidents that could disrupt our business and result in theft of proprietary, confidential and personal information....

PROMPT BEING SENT TO OLLAMA:
You are analyzing SEC 10-K risk factor disclosures.

Here are excerpts from actual SEC filings:

[Source 1] BRISTOL MYERS SQUIBB CO (2024-02-13)
Information Technology and Cyber

{'question': 'What are the main cybersecurity risks?',
 'answer': "According to the SEC 10-K filings by Bristol Myers Squibb Co., the main cybersecurity risks identified are:\n\nCybersecurity incidents that could disrupt the company's business. This disruption could occur through theft of proprietary and confidential information, which was mentioned in the 2024 filing, or also include personal data as noted in the 2025 filing. Additionally, there is a mention of artificial intelligence programs being part of these IT systems, implying an elevated risk due to their complexity and potential vulnerabilities.",
 'context': '[Source 1] BRISTOL MYERS SQUIBB CO (2024-02-13)\nInformation Technology and Cybersecurity Risks\nWe are dependent on information technology systems and face risk of cybersecurity incidents that could disrupt our business and result in theft of proprietary and confidential information.\n\n[Source 2] BRISTOL MYERS SQUIBB CO (2025-02-12)\nInformation Technology and Cyberse