In [5]:
%pip uninstall chromadb pydantic -y
%pip install chromadb pydantic pydantic-settings

Found existing installation: chromadb 0.3.23
Uninstalling chromadb-0.3.23:
  Successfully uninstalled chromadb-0.3.23
Found existing installation: pydantic 2.12.5
Uninstalling pydantic-2.12.5:
  Successfully uninstalled pydantic-2.12.5
Note: you may need to restart the kernel to use updated packages.
Collecting chromadb
  Using cached chromadb-1.4.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.2 kB)
Collecting pydantic
  Using cached pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting pydantic-settings
  Using cached pydantic_settings-2.12.0-py3-none-any.whl.metadata (3.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Using cached pybase64-1.4.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Using cached posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
INFO: pip is looking at multiple versions of chromadb to determine wh

In [1]:
%pip show chromadb

Name: chromadb
Version: 1.4.0
Summary: Chroma.
Home-page: https://github.com/chroma-core/chroma
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: /Users/kyle-anthonyhay/Documents/CODE/Recature-Homework/Revature-Cognizant-Training-Exercises/December/venv/lib/python3.12/site-packages
Requires: bcrypt, build, grpcio, httpx, importlib-resources, jsonschema, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-sdk, orjson, overrides, posthog, pybase64, pydantic, pypika, pyyaml, rich, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [2]:
"""
Exercise 02: RAG Analysis - Starter Code

Analyze RAG behavior and document observations.

Prerequisites:
- Completed Exercise 01

This starter provides the RAG system for analysis.
"""

import chromadb

# ============================================================================
# SETUP - Pre-built RAG System for Analysis
# ============================================================================

print("=" * 60)
print("Exercise 02: RAG Analysis")
print("=" * 60)


class MockLLM:
    """Simple mock LLM for testing."""
    def generate(self, prompt):
        if "Context:" in prompt and len(prompt) > 100:
            context_start = prompt.find("Context:") + 8
            context_end = prompt.find("Question:")
            context = prompt[context_start:context_end].strip()
            return f"Based on the context: {context[:200]}..."
        return "I don't have enough information to answer that."


class SimpleRAG:
    """Pre-built RAG system for analysis exercises."""
    
    def __init__(self):
        self.client = chromadb.Client()
        self.collection = self.client.create_collection("rag_analysis")
        self.llm = MockLLM()
    
    def add_knowledge(self, documents, ids=None):
        if ids is None:
            ids = [f"doc_{i}" for i in range(len(documents))]
        self.collection.add(documents=documents, ids=ids)
        print(f"[RAG] Added {len(documents)} documents")
    
    def query(self, question, k=3):
        results = self.collection.query(
            query_texts=[question], n_results=k,
            include=["documents", "distances"]
        )
        
        context_docs = results['documents'][0]
        distances = results['distances'][0]
        
        context = "\n".join(context_docs)
        prompt = f"""You are a helpful assistant. Answer based ONLY on the context.
If context doesn't contain enough info, say "I don't have enough information."

Context:
{context}

Question: {question}

Answer:"""
        
        answer = self.llm.generate(prompt)
        
        return {
            "question": question,
            "answer": answer,
            "retrieved_docs": context_docs,
            "distances": distances,
            "num_sources": len(context_docs)
        }


# Initialize RAG with knowledge base
rag = SimpleRAG()
rag.add_knowledge([
    "Python was created by Guido van Rossum and first released in 1991.",
    "Machine learning enables systems to learn from experience without explicit programming.",
    "Docker is a platform for running applications in containers.",
    "REST is an architectural style using HTTP methods for networked applications.",
    "Neural networks are computing systems inspired by biological brains.",
    "Git is a version control system created by Linus Torvalds in 2005."
])


Exercise 02: RAG Analysis
[RAG] Added 6 documents


In [3]:

# ============================================================================
# PART 1: RAG Behavior Testing
# ============================================================================

print("\n" + "=" * 60)
print("Part 1: RAG Behavior Testing")
print("=" * 60)

# Direct Match Questions
print("\n--- DIRECT MATCH ---")
direct_questions = [
    "What year was Python released?",
    "What is REST?"
]

# TODO: Test each question and document results
for q in direct_questions:
    result = rag.query(q)
    print(f"Q: {q}")
    print(f"A: {result['answer'][:100]}...")
    print(f"Sources: {result['distances']}")
print("TODO: Test direct match questions")


# Synthesis Required
print("\n--- SYNTHESIS REQUIRED ---")
synthesis_questions = [
    "What are two programming languages mentioned and who created them?"
]
print("TODO: Test synthesis questions")
for q in synthesis_questions:
    result = rag.query(q)
    print(f"\nQ: {q}")
    print(f"A: {result['answer'][:150]}...")
    print(f"Retrieved docs: {result['retrieved_docs']}")
    print(f"Distances: {result['distances']}")

# No Match
print("\n--- NO MATCH ---")
no_match_questions = [
    "What is the capital of France?",
    "How do I make pizza?"
]
print("TODO: Test no-match questions")
for q in no_match_questions:
    result = rag.query(q)
    print(f"\nQ: {q}")
    print(f"A: {result['answer'][:150]}...")
    print(f"Distances (higher = less relevant): {result['distances']}")

# Ambiguous
print("\n--- AMBIGUOUS ---")
ambiguous_questions = [
    "Tell me about technology",
    "What programming tools should I use?"
]
print("TODO: Test ambiguous questions")

for q in ambiguous_questions:
    result = rag.query(q)
    print(f"\nQ: {q}")
    print(f"A: {result['answer'][:150]}...")
    print(f"Retrieved docs: {result['retrieved_docs']}")
    print(f"Distances: {result['distances']}")


Part 1: RAG Behavior Testing

--- DIRECT MATCH ---
Q: What year was Python released?
A: Based on the context: Python was created by Guido van Rossum and first released in 1991.
Git is a ve...
Sources: [0.4295929968357086, 1.6448304653167725, 1.6890949010849]
Q: What is REST?
A: Based on the context: REST is an architectural style using HTTP methods for networked applications.
...
Sources: [0.7795387506484985, 1.6609597206115723, 1.8498444557189941]
TODO: Test direct match questions

--- SYNTHESIS REQUIRED ---
TODO: Test synthesis questions

Q: What are two programming languages mentioned and who created them?
A: Based on the context: Python was created by Guido van Rossum and first released in 1991.
Neural networks are computing systems inspired by biological ...
Retrieved docs: ['Python was created by Guido van Rossum and first released in 1991.', 'Neural networks are computing systems inspired by biological brains.', 'REST is an architectural style using HTTP methods for networked a

In [5]:
# ============================================================================
# PART 2: Edge Case Analysis
# ============================================================================

print("\n" + "=" * 60)
print("Part 2: Edge Case Analysis")
print("=" * 60)

# TODO 2.1: Test edge cases
print("\n--- 1. Empty/Short Queries ---")
short_queries = ["", "?", "python"]
for q in short_queries:
    try:
        result = rag.query(q)
        print(f"\nQ: '{q}'")
        print(f"A: {result['answer'][:100]}...")
        print(f"Distances: {result['distances']}")
    except Exception as e:
        print(f"\nQ: '{q}' -> ERROR: {e}")

print("\n--- 2. Very Long Query ---")
long_query = """I am trying to understand the historical development of programming languages 
and how they have evolved over time, particularly focusing on interpreted languages that were 
designed for ease of use and readability, and I would also like to know about the creators 
of these languages and when they were first released to the public."""
result = rag.query(long_query)
print(f"Q: {long_query[:80]}...")
print(f"A: {result['answer'][:100]}...")
print(f"Distances: {result['distances']}")

print("\n--- 3. Typos ---")
typo_queries = ["waht is mashcine lerning?", "pytohn programming"]
for q in typo_queries:
    result = rag.query(q)
    print(f"\nQ: {q}")
    print(f"A: {result['answer'][:100]}...")
    print(f"Retrieved: {result['retrieved_docs'][0][:60]}...")
    print(f"Distances: {result['distances']}")

print("\n--- 4. Opposite Meaning ---")
opposite_queries = ["What is NOT machine learning?", "What did Git NOT do?"]
for q in opposite_queries:
    result = rag.query(q)
    print(f"\nQ: {q}")
    print(f"A: {result['answer'][:100]}...")
    print(f"Distances: {result['distances']}")

# TODO 2.2: Failure Mode Identification
print("\n" + "=" * 60)
print("FAILURE MODE TABLE:")
print("=" * 60)
print("""
| Failure Mode          | Example Query                  | What Happened                           | Possible Fix                    |
|-----------------------|--------------------------------|-----------------------------------------|---------------------------------|
| No relevant docs      | "What is the capital of France?"| Distances ~1.9, returned unrelated docs | Add relevance threshold (>1.5)  |
| Wrong docs retrieved  | "waht is mashcine lerning?"    | Got REST doc instead of ML (dist 1.64)  | Add spell-check preprocessing   |
| Negation ignored      | "What is NOT machine learning?"| Retrieved ML doc anyway (dist 0.66!)    | Query rewriting or NLU layer    |
| Empty query accepted  | ""                             | Returned random docs (dist ~1.73)       | Validate input before query     |
| Typo misrouting       | "pytohn programming"           | Got Neural Networks first, not Python   | Fuzzy matching or autocorrect   |
""")


Part 2: Edge Case Analysis

--- 1. Empty/Short Queries ---

Q: ''
A: Based on the context: Machine learning enables systems to learn from experience without explicit pro...
Distances: [1.728951096534729, 1.7767951488494873, 1.8401323556900024]

Q: '?'
A: Based on the context: Machine learning enables systems to learn from experience without explicit pro...
Distances: [1.7679197788238525, 1.9057769775390625, 1.9309160709381104]

Q: 'python'
A: Based on the context: Python was created by Guido van Rossum and first released in 1991.
Machine lea...
Distances: [0.9062868356704712, 1.5207011699676514, 1.5896739959716797]

--- 2. Very Long Query ---
Q: I am trying to understand the historical development of programming languages 
a...
A: Based on the context: Python was created by Guido van Rossum and first released in 1991.
Git is a ve...
Distances: [1.1133191585540771, 1.6809802055358887, 1.7537423372268677]

--- 3. Typos ---

Q: waht is mashcine lerning?
A: Based on the context: REST is a

In [6]:
# ============================================================================
# PART 3: Improvement Experiments
# ============================================================================

print("\n" + "=" * 60)
print("Part 3: Improvement Experiments")
print("=" * 60)

# ===================
# TODO 3.1: Test different k values
# ===================
print("\n--- K-VALUE EXPERIMENT ---")
test_question = "What year was Python released?"

for k in [1, 3, 5, 6]:
    result = rag.query(test_question, k=k)
    print(f"\nk={k}:")
    print(f"  Docs retrieved: {len(result['retrieved_docs'])}")
    print(f"  Distances: {[round(d, 2) for d in result['distances']]}")
    print(f"  Top doc: {result['retrieved_docs'][0][:50]}...")

print("""
K-VALUE EXPERIMENT FINDINGS:
| k value | Pros                          | Cons                           | Best For                    |
|---------|-------------------------------|--------------------------------|-----------------------------|
| k=1     | Fast, focused, less noise     | May miss relevant context      | Simple factual questions    |
| k=3     | Good balance of context       | May include some irrelevant    | General purpose (default)   |
| k=5     | More context for synthesis    | Higher chance of noise         | Complex multi-part questions|
| k=6+    | Maximum context coverage      | Includes irrelevant docs       | When unsure what's relevant |
""")

# ===================
# TODO 3.2: Threshold filtering experiment
# ===================
print("\n--- THRESHOLD FILTERING ---")

def query_with_threshold(rag, question, threshold=1.5):
    """Only use documents above relevance threshold."""
    result = rag.query(question, k=5)
    
    filtered_docs = []
    filtered_distances = []
    for doc, dist in zip(result['retrieved_docs'], result['distances']):
        if dist < threshold:
            filtered_docs.append(doc)
            filtered_distances.append(dist)
    
    return {
        "question": question,
        "answer": result['answer'],
        "retrieved_docs": filtered_docs,
        "distances": filtered_distances,
        "filtered_out": len(result['retrieved_docs']) - len(filtered_docs)
    }

# Test threshold on different queries
test_queries = [
    ("What year was Python released?", 1.0),
    ("What is the capital of France?", 1.5),
]

for q, thresh in test_queries:
    result = query_with_threshold(rag, q, threshold=thresh)
    print(f"\nQ: {q} (threshold={thresh})")
    print(f"  Kept: {len(result['retrieved_docs'])} docs")
    print(f"  Filtered out: {result['filtered_out']} docs")
    print(f"  Distances: {[round(d, 2) for d in result['distances']]}")
    if not result['retrieved_docs']:
        print("  -> Would return 'I don't have enough information'")

print("\nThreshold recommendations:")
print("  - threshold < 1.0: Very strict, only highly relevant")
print("  - threshold 1.0-1.5: Balanced filtering")  
print("  - threshold > 1.5: Loose, allows marginal matches")



Part 3: Improvement Experiments

--- K-VALUE EXPERIMENT ---

k=1:
  Docs retrieved: 1
  Distances: [0.43]
  Top doc: Python was created by Guido van Rossum and first r...

k=3:
  Docs retrieved: 3
  Distances: [0.43, 1.64, 1.69]
  Top doc: Python was created by Guido van Rossum and first r...

k=5:
  Docs retrieved: 5
  Distances: [0.43, 1.64, 1.69, 1.69, 1.85]
  Top doc: Python was created by Guido van Rossum and first r...

k=6:
  Docs retrieved: 6
  Distances: [0.43, 1.64, 1.69, 1.69, 1.85, 1.97]
  Top doc: Python was created by Guido van Rossum and first r...

K-VALUE EXPERIMENT FINDINGS:
| k value | Pros                          | Cons                           | Best For                    |
|---------|-------------------------------|--------------------------------|-----------------------------|
| k=1     | Fast, focused, less noise     | May miss relevant context      | Simple factual questions    |
| k=3     | Good balance of context       | May include some irrelevant    | G

In [None]:
# ===================
# TODO 3.3: Document Change Experiments
# ===================
print("\n--- DOCUMENT CHANGE EXPERIMENTS ---")

# Experiment 1: Add contradicting info
print("\n1. CONTRADICTING INFO:")
rag_conflict = SimpleRAG()
rag_conflict.add_knowledge([
    "Python was created by Guido van Rossum and first released in 1991.",
    "Python was created in 2020 by a team at Google.",
])
result = rag_conflict.query("When was Python created?")
print(f"Q: When was Python created?")
print(f"Retrieved docs: {result['retrieved_docs']}")
print(f"Distances: {[round(d, 2) for d in result['distances']]}")
print("-> RAG retrieves BOTH conflicting docs! LLM must handle contradiction.")

# Experiment 2: Add duplicate
print("\n2. DUPLICATE DOCUMENTS:")
rag_dup = SimpleRAG()
rag_dup.add_knowledge([
    "Python was created by Guido van Rossum and first released in 1991.",
    "Python was created by Guido van Rossum and first released in 1991.",
    "Machine learning enables systems to learn from experience.",
], ids=["python1", "python2", "ml"])
result = rag_dup.query("What year was Python released?", k=3)
print(f"Q: What year was Python released?")
print(f"Retrieved docs: {result['retrieved_docs']}")
print(f"Distances: {[round(d, 2) for d in result['distances']]}")
print("-> Duplicates waste retrieval slots with identical content.")

# Experiment 3: Missing document
print("\n3. MISSING DOCUMENT (no Python info):")
rag_missing = SimpleRAG()
rag_missing.add_knowledge([
    "Machine learning enables systems to learn from experience.",
    "Docker is a platform for running applications in containers.",
    "Git is a version control system created by Linus Torvalds.",
])
result = rag_missing.query("What year was Python released?")
print(f"Q: What year was Python released?")
print(f"Retrieved docs: {result['retrieved_docs']}")
print(f"Distances: {[round(d, 2) for d in result['distances']]}")
print("-> Still returns docs! High distances indicate no good match.")

print("""
DOCUMENT CHANGE FINDINGS:
| Scenario              | Observation                           | Mitigation                      |
|-----------------------|---------------------------------------|---------------------------------|
| Contradicting docs    | Both retrieved, LLM sees conflict     | Dedup or version control docs   |
| Duplicate docs        | Wastes k slots with same content      | Deduplicate before indexing     |
| Missing relevant doc  | Returns unrelated docs (high dist)    | Use distance threshold filter   |
""")


In [None]:
# ============================================================================
# PART 4: Written Analysis
# ============================================================================

print("\n" + "=" * 60)
print("Part 4: Written Analysis")
print("=" * 60)

print("""
RAG STRENGTHS:
(What problems does RAG solve well?)
- Grounds LLM responses in actual data (reduces hallucination)
- Works well for direct factual questions
- No retraining needed - just update the document store

RAG LIMITATIONS:
- Always returns docs even when no relevant info exists(with out current implementation)
- Ignores negation ("NOT machine learning" still retrieves ML)
- Typos break retrieval
- Can't synthesize well across multiple unrelated docs

PRODUCTION CONSIDERATIONS:
- Add distance threshold (~1.5) to reject low-quality matches
- Validate/preprocess queries 
- Deduplicate documents before indexing
- Choose k based on use case (k=1 for facts, k=3+ for context)
""")

print("\n" + "=" * 60)
print("Exercise Complete!")
print("=" * 60)
