In [1]:
"""
Exercise 01: Build a RAG Pipeline - Starter Code

Build a Retrieval-Augmented Generation system.

Prerequisites:
- pip install chromadb

Hints:
- Reading 03 (rag-basics-introduction.md) explains the pattern
- Demo 02 (demo_02_basic_rag.py) has a complete implementation
"""

import chromadb

# ============================================================================
# SETUP
# ============================================================================

print("=" * 60)
print("Exercise 01: Build a RAG Pipeline")
print("=" * 60)

client = chromadb.Client()

Exercise 01: Build a RAG Pipeline


In [2]:

# ============================================================================
# PART 1: Knowledge Base Setup
# ============================================================================

print("\n" + "=" * 60)
print("Part 1: Knowledge Base Setup")
print("=" * 60)

# TODO 1.1: Create collection with cosine distance
collection = client.create_collection(
    name="rag_knowledge_base",
    metadata={"hnsw:space": "cosine"}
)
print("TODO: Create 'rag_knowledge_base' collection")


# TODO 1.2: Add knowledge documents
knowledge_docs = [
    "Python was created by Guido van Rossum and first released in 1991. It emphasizes code readability with its notable use of significant indentation.",
    "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.",
    "Docker is a platform for developing, shipping, and running applications in containers. Containers are lightweight, standalone packages that include everything needed to run software.",
    "REST (Representational State Transfer) is an architectural style for designing networked applications. It relies on stateless, client-server communication using HTTP methods.",
    "Neural networks are computing systems inspired by biological neural networks. They consist of layers of interconnected nodes (neurons) that process information.",
    "Git is a distributed version control system for tracking changes in source code. It was created by Linus Torvalds in 2005 for Linux kernel development."
]

doc_ids = [f"doc_{i+1}" for i in range(len(knowledge_docs))]

collection.add(documents=knowledge_docs, ids=doc_ids)
print("TODO: Add knowledge documents")


Part 1: Knowledge Base Setup
TODO: Create 'rag_knowledge_base' collection
TODO: Add knowledge documents


In [3]:
# ============================================================================
# PART 2: Implement the Retriever
# ============================================================================

print("\n" + "=" * 60)
print("Part 2: Implement the Retriever")
print("=" * 60)

# TODO 2.1: Create retriever function
def retrieve(question, k=3):
    """
    Retrieve top-k relevant documents for a question.
    
    Returns:
        dict with 'documents' and 'distances'
    
    Hint: Use collection.query(query_texts=[question], n_results=k, ...)
    """
    results = collection.query(
        query_texts=[question],
        n_results=k,
        include=["documents", "distances"]
    )
    return {
        'documents': results['documents'][0],
        'distances': results['distances'][0]
    }


# TODO 2.2: Test the retriever
test_questions = [
    "Who created Python?",
    "What is machine learning?",
    "How does Docker work?"
]

for question in test_questions:
    print(f"\nQ: {question}")
    results = retrieve(question)
    for i, (doc, dist) in enumerate(zip(results['documents'], results['distances'])):
        print(f"  {i+1}. (dist: {dist:.3f}) {doc[:80]}...")



Part 2: Implement the Retriever

Q: Who created Python?
  1. (dist: 0.196) Python was created by Guido van Rossum and first released in 1991. It emphasizes...
  2. (dist: 0.832) Git is a distributed version control system for tracking changes in source code....
  3. (dist: 0.851) Machine learning is a subset of artificial intelligence that enables systems to ...

Q: What is machine learning?
  1. (dist: 0.183) Machine learning is a subset of artificial intelligence that enables systems to ...
  2. (dist: 0.624) Neural networks are computing systems inspired by biological neural networks. Th...
  3. (dist: 0.816) REST (Representational State Transfer) is an architectural style for designing n...

Q: How does Docker work?
  1. (dist: 0.244) Docker is a platform for developing, shipping, and running applications in conta...
  2. (dist: 0.742) Git is a distributed version control system for tracking changes in source code....
  3. (dist: 0.849) REST (Representational State Transfer) is an

In [7]:
# ============================================================================
# PART 3: Implement the Augmenter
# ============================================================================

print("\n" + "=" * 60)
print("Part 3: Implement the Augmenter")
print("=" * 60)

# TODO 3.1: Create prompt builder
def build_prompt(question, context_docs):
    """
    Build a prompt with the question and retrieved context.
    """
    context = "\n".join(context_docs)
    
    prompt = f"""You are a helpful assistant. Answer based ONLY on the context.
            If context doesn't contain enough info, say so.

            Context:
            {context}

            Question: {question}

            Answer:"""
    return prompt


# TODO 3.2: Test the prompt builder
sample_results = retrieve("Who created Python?")
sample_prompt = build_prompt("Who created Python?", sample_results['documents'])
print(sample_prompt)




Part 3: Implement the Augmenter
You are a helpful assistant. Answer based ONLY on the context.
            If context doesn't contain enough info, say so.

            Context:
            Python was created by Guido van Rossum and first released in 1991. It emphasizes code readability with its notable use of significant indentation.
Git is a distributed version control system for tracking changes in source code. It was created by Linus Torvalds in 2005 for Linux kernel development.
Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.

            Question: Who created Python?

            Answer:


In [11]:
# TODO 4.1: Create Mock LLM
class MockLLM:
    def generate(self, prompt):
        if "Context:" not in prompt or len(prompt) < 100:
            return "I don't have enough information to answer that question."
        
        # Extract the context section
        context_start = prompt.find("Context:") + 8
        question_start = prompt.find("Question:")
        context = prompt[context_start:question_start].strip().lower()
        
        # Extract the question
        question_end = prompt.find("Answer:")
        question = prompt[question_start + 9:question_end].strip().lower()
        
        # Check if context actually relates to question keywords
        keywords = [w for w in question.split() if len(w) > 3]
        matches = sum(1 for kw in keywords if kw in context)
        
        if matches < 1:
            return "I don't have enough information in the context to answer that question."
        
        # Return first sentence of most relevant context
        first_doc = prompt[context_start:question_start].strip().split('\n')[0]
        return f"Based on the context: {first_doc[:150]}..."


# TODO 4.2: Test the generator
llm = MockLLM()

# Test with context (using our previous functions)
sample_results = retrieve("Who created Python?")
sample_prompt = build_prompt("Who created Python?", sample_results['documents'])
print("With context:")
print(llm.generate(sample_prompt))

# Test without context
print("\nWithout context:")
print(llm.generate("What is the weather?"))

With context:
Based on the context: Python was created by Guido van Rossum and first released in 1991. It emphasizes code readability with its notable use of significant indentation....

Without context:
I don't have enough information to answer that question.


In [13]:

# ============================================================================
# PART 5: Put It All Together
# ============================================================================

print("\n" + "=" * 60)
print("Part 5: Complete RAG System")
print("=" * 60)

# TODO 5.1: Create the RAG class
class SimpleRAG:
    """
    Complete RAG system combining retrieval, augmentation, and generation.
    
    Hint: Demo 02's BasicRAG class has this exact structure
    """
    
    def __init__(self):
        # Initialize Chroma client, collection, and MockLLM
        self.client = chromadb.Client()
        self.collection = self.client.get_or_create_collection(
            name="rag_kb",
            metadata={"hnsw:space": "cosine"}
        )
        self.llm = MockLLM()
    
    def add_knowledge(self, documents):
        doc_ids = [f"doc_{i}" for i in range(len(documents))]
        self.collection.add(documents=documents, ids=doc_ids)
    
    
    def query(self, question, k=3):
        """
        Answer a question using RAG.
        """
        # 1. RETRIEVE
        results = self.collection.query(
            query_texts=[question],
            n_results=k,
            include=["documents", "distances"]
        )
        docs = results['documents'][0]
        
        # 2. AUGMENT
        context = "\n".join(docs)
        prompt = f"""You are a helpful assistant. Answer based ONLY on the context.
If context doesn't contain enough info, say so.

Context:
{context}

Question: {question}

Answer:"""
        
        # 3. GENERATE
        answer = self.llm.generate(prompt)
        
        return {
            'question': question,
            'answer': answer,
            'sources_used': docs
        }


#  TODO 5.2: Test the complete system
rag = SimpleRAG()

# Add knowledge
rag.add_knowledge([
    "Python was created by Guido van Rossum and first released in 1991.",
    "Neural networks are computing systems inspired by biological neural networks.",
    "Git is a distributed version control system created by Linus Torvalds in 2005."
])

test_rag_questions = [
    "Who created Python and when?",
    "What are neural networks?",
    "How do I use Git?",
    "What is the best pizza topping?"
]

for q in test_rag_questions:
    result = rag.query(q)
    print(f"\nQ: {result['question']}")
    print(f"A: {result['answer']}")
    print(f"Sources: {len(result['sources_used'])} docs retrieved")


Part 5: Complete RAG System

Q: Who created Python and when?
A: Based on the context: Python was created by Guido van Rossum and first released in 1991....
Sources: 3 docs retrieved

Q: What are neural networks?
A: Based on the context: Neural networks are computing systems inspired by biological neural networks....
Sources: 3 docs retrieved

Q: How do I use Git?
A: I don't have enough information in the context to answer that question.
Sources: 3 docs retrieved

Q: What is the best pizza topping?
A: I don't have enough information in the context to answer that question.
Sources: 3 docs retrieved


In [14]:
# ============================================================================
# PART 6: Enhanced Output
# ============================================================================

print("\n" + "=" * 60)
print("Part 6: Enhanced Output")
print("=" * 60)

# TODO 6.1 & 6.2: Create display function
def display_answer(result):
    """Pretty print the RAG result."""
    print("\n" + "-" * 40)
    print(f'Question: "{result["question"]}"')
    print("-" * 40)
    print(f"\nAnswer: {result['answer']}")
    print(f"\nSources: {len(result['sources_used'])} documents used")
    for i, source in enumerate(result['sources_used'], 1):
        print(f'  {i}. "{source[:50]}..."')
    print("-" * 40)


# Test with all questions
for q in test_rag_questions:
    result = rag.query(q)
    display_answer(result)

print("\n" + "=" * 60)
print("Exercise Complete!")
print("=" * 60)



Part 6: Enhanced Output

----------------------------------------
Question: "Who created Python and when?"
----------------------------------------

Answer: Based on the context: Python was created by Guido van Rossum and first released in 1991....

Sources: 3 documents used
  1. "Python was created by Guido van Rossum and first r..."
  2. "Git is a distributed version control system create..."
  3. "Neural networks are computing systems inspired by ..."
----------------------------------------

----------------------------------------
Question: "What are neural networks?"
----------------------------------------

Answer: Based on the context: Neural networks are computing systems inspired by biological neural networks....

Sources: 3 documents used
  1. "Neural networks are computing systems inspired by ..."
  2. "Git is a distributed version control system create..."
  3. "Python was created by Guido van Rossum and first r..."
----------------------------------------

--------------