In [29]:
"""
Exercise 02: Build a Semantic Search System - Starter Code

Build a complete search engine over a document corpus.

Prerequisites:
- pip install chromadb sentence-transformers

Hints:
- Demo 02 shows a complete search pipeline
- Reading 03 covers k-NN search logic
- Reading 04 covers score interpretation
"""

import chromadb
import time
# Default Chroma client: no storage path or server host is configured here.
# NOTE(review): presumably in-memory only, so the collection is lost on kernel
# restart — confirm against the Chroma docs.
client = chromadb.Client()

In [30]:
# ============================================================================
# PART 1: Document Ingestion
# ============================================================================

print("\n" + "=" * 60)
print("Part 1: Document Ingestion")
print("=" * 60)

# TODO 1.1: Create collection with Cosine distance
# "hnsw:space" selects the distance metric used by the collection's index.
collection = client.get_or_create_collection(
    name="tech_blog",
    metadata={"hnsw:space": "cosine"},
)

print("TODO: Create 'tech_blog' collection with cosine distance")


# TODO 1.2: Add the corpus with metadata
documents = [
    "Introduction to Python programming for beginners",
    "Advanced machine learning techniques using neural networks",
    "How to deploy applications using Docker containers",
    "Building REST APIs with Flask and FastAPI",
    "Understanding data structures: arrays, lists, and trees",
    "Deep learning fundamentals: CNNs and RNNs explained",
    "Kubernetes for container orchestration at scale",
    "Natural language processing with transformers",
    "Database design patterns for scalable applications",
    "Getting started with cloud computing on AWS"
]

ids = [f"doc_{i+1}" for i in range(len(documents))]

# The only metadata values this exercise allows; anything else would make
# category/difficulty filters silently miss documents.
ALLOWED_CATEGORIES = {"programming", "ai", "devops", "data"}
ALLOWED_DIFFICULTIES = {"beginner", "intermediate", "advanced"}

# One metadata dict per document, in the same order as `documents`.
metadatas = [
    {"category": "programming", "difficulty": "beginner"},      # doc_1: intro to Python
    {"category": "ai", "difficulty": "advanced"},               # doc_2: advanced ML
    {"category": "devops", "difficulty": "intermediate"},       # doc_3: Docker deployment
    {"category": "programming", "difficulty": "intermediate"},  # doc_4: REST APIs
    {"category": "programming", "difficulty": "intermediate"},  # doc_5: data structures
    {"category": "ai", "difficulty": "beginner"},               # doc_6: deep learning fundamentals
    {"category": "devops", "difficulty": "advanced"},           # doc_7: Kubernetes at scale
    {"category": "ai", "difficulty": "advanced"},               # doc_8: NLP with transformers
    {"category": "data", "difficulty": "advanced"},             # doc_9: database design
    {"category": "devops", "difficulty": "beginner"},           # doc_10: getting started on AWS
]

# Fail fast on ingestion mistakes instead of discovering them as empty
# filtered-search results later.
assert len(documents) == len(ids) == len(metadatas), "documents, ids and metadatas must align"
invalid_ids = [
    doc_id
    for doc_id, meta in zip(ids, metadatas)
    if meta["category"] not in ALLOWED_CATEGORIES
    or meta["difficulty"] not in ALLOWED_DIFFICULTIES
]
assert not invalid_ids, f"Invalid category/difficulty metadata for: {invalid_ids}"

# upsert() inserts new IDs and updates existing ones, so re-running this cell
# against the same client is idempotent and picks up edited metadata
# (add() is meant for IDs that are not in the collection yet).
collection.upsert(documents=documents, ids=ids, metadatas=metadatas)
print("TODO: Add documents with metadata")




Part 1: Document Ingestion
TODO: Create 'tech_blog' collection with cosine distance
TODO: Add documents with metadata


In [31]:

# ============================================================================
# PART 2: Search Implementation
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "Part 2: Search Implementation", "=" * 60, sep="\n")

# TODO 2.1: Implement basic search function
def search(query, k=5):
    """
    Run a similarity query against the module-level ``collection``.

    Args:
        query: Natural language search query (sent as a single query text).
        k: Maximum number of results to request.

    Returns:
        The result of ``collection.query`` with documents and distances
        requested. Each field is a list with one entry per query text, so
        the hits for ``query`` are at index 0.
    """
    requested_fields = ["documents", "distances"]
    return collection.query(
        query_texts=[query],
        n_results=k,
        include=requested_fields,
    )


# TODO 2.2: Convert distance to relevance score (0-100)
def distance_to_score(distance):
    """
    Convert a distance to a relevance score clamped to the range 0-100.

    Lower distance -> higher score, using ``(1 - distance) * 100``.
    The formula treats a distance of 0 as a perfect match (100) and any
    distance of 1 or more as irrelevant (0).

    Args:
        distance: Distance reported by the query (a number).

    Returns:
        float in [0.0, 100.0].
    """
    raw_score = (1 - distance) * 100
    # Clamp both ends: distances >= 1 would go negative, and a tiny negative
    # distance (floating-point rounding on an exact match) would exceed 100.
    return min(100.0, max(0.0, raw_score))


# TODO 2.3: Implement filtered search
def search_by_category(query, category, k=3):
    """
    Search only among documents whose ``category`` metadata equals ``category``.

    Args:
        query: Natural language search query.
        category: Category value to filter on (matched against the
            "category" metadata key).
        k: Maximum number of results to request.

    Returns:
        The result of ``collection.query`` with documents and distances
        requested; hits for ``query`` are at index 0 of each field.
    """
    return collection.query(
        query_texts=[query],
        n_results=k,
        where={"category": category},
        # "ids" is not an accepted `include` value (the query rejects it);
        # ids come back in the result without being requested.
        include=["documents", "distances"],
    )


Part 2: Search Implementation


In [33]:

# ============================================================================
# PART 3: Search Quality Testing
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "Part 3: Search Quality Testing", "=" * 60, sep="\n")

# TODO 3.1: Test these queries
test_queries = [
    "How do I start learning to code?",
    "AI and neural networks",
    "deploying apps to production",
    "working with data and databases"
]

print("TODO: Test each query and analyze top 3 results")
for query in test_queries:
    results = search(query, k=3)
    print(f"\nQuery: '{query}'")
    # Show each hit with its 1-based rank and relevance score.
    for rank, text in enumerate(results['documents'][0], start=1):
        relevance = distance_to_score(results['distances'][0][rank - 1])
        print(f"  {rank}. [{relevance:.1f}%] {text}")



# TODO 3.2: Test edge cases
print("\nEdge Case Tests:")
# Run each edge case for real and report its best hit, instead of printing
# a "___" placeholder.
edge_cases = [
    ("No match", "Italian cooking recipes"),
    ("Exact match", "Python programming for beginners"),
    ("Partial", "Python"),
]
for case_number, (label, edge_query) in enumerate(edge_cases, start=1):
    edge_results = search(edge_query, k=1)
    edge_docs = edge_results["documents"][0]
    if edge_docs:
        edge_score = distance_to_score(edge_results["distances"][0][0])
        best_hit = f"[{edge_score:.1f}%] {edge_docs[0]}"
    else:
        # Nothing came back (e.g. empty collection).
        best_hit = "(no results)"
    print(f"  {case_number}. {label}: '{edge_query}' - Results: {best_hit}")


# TODO 3.3: Implement threshold filter
def search_with_threshold(query, threshold=50, k=5):
    """
    Search, then keep only hits whose relevance score is at least ``threshold``.

    Args:
        query: Natural language search query.
        threshold: Minimum score (inclusive) a hit needs to be kept.
        k: Number of candidates requested from ``search`` before filtering.

    Returns:
        dict with "documents" (kept texts) and "scores" (their scores),
        in the order ``search`` returned them.
    """
    hits = search(query, k=k)
    kept_docs, kept_scores = [], []

    for position, _ in enumerate(hits['documents'][0]):
        relevance = distance_to_score(hits['distances'][0][position])
        if relevance >= threshold:
            kept_docs.append(hits['documents'][0][position])
            kept_scores.append(relevance)

    return {"documents": kept_docs, "scores": kept_scores}

# Banner for the threshold demo: rule, title, rule.
print("=" * 60, "TODO: Test threshold filter", "=" * 60, sep="\n")
result = search_with_threshold("AI and neural networks", threshold=50)
print(f"Results above 50%: {result}")

# Rule-of-thumb cut-offs for the two usual goals.
print(
    "\nThreshold recommendations:",
    "  High-precision (score > 50): Only very relevant",
    "  High-recall (score > 20): Don't miss related docs",
    sep="\n",
)



Part 3: Search Quality Testing
TODO: Test each query and analyze top 3 results

Query: 'How do I start learning to code?'
  1. [46.6%] Introduction to Python programming for beginners
  2. [25.8%] Getting started with cloud computing on AWS
  3. [18.5%] Understanding data structures: arrays, lists, and trees

Query: 'AI and neural networks'
  1. [55.1%] Advanced machine learning techniques using neural networks
  2. [50.9%] Deep learning fundamentals: CNNs and RNNs explained
  3. [25.2%] Natural language processing with transformers

Query: 'deploying apps to production'
  1. [59.0%] How to deploy applications using Docker containers
  2. [26.3%] Kubernetes for container orchestration at scale
  3. [18.4%] Getting started with cloud computing on AWS

Query: 'working with data and databases'
  1. [49.5%] Database design patterns for scalable applications
  2. [45.6%] Understanding data structures: arrays, lists, and trees
  3. [24.7%] Introduction to Python programming for beginners

E

In [35]:

# ============================================================================
# PART 4: Pretty Print Results
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "Part 4: Pretty Print Results", "=" * 60, sep="\n")

# TODO 4.1: Implement display_results function
def display_results(results, query, elapsed_time=0):
    """
    Print a query's hits as a ranked, human-readable list.

    Args:
        results: Query result whose 'documents' and 'distances' fields hold
            the hits for this query at index 0.
        query: The query text, echoed in the header.
        elapsed_time: Search duration in seconds, shown in the footer.
    """
    rule = "-" * 40
    print(rule)
    print(f'Search: "{query}"')
    print(rule)

    texts = results['documents'][0]
    distances = results['distances'][0]

    # Ranks are 1-based for display; distances are indexed 0-based.
    for rank, text in enumerate(texts, start=1):
        relevance = distance_to_score(distances[rank - 1])
        print(f"\n  #{rank} [Score: {relevance:.0f}] {text}")

    print("\n" + rule)
    print(f"Found {len(texts)} results in {elapsed_time:.3f} seconds")
    print(rule)


print("TODO: Implement pretty print and test with a sample query")
# `time` is already imported in the notebook's first cell, so it is not
# re-imported here. perf_counter() is the monotonic, high-resolution clock
# intended for measuring short durations; time.time() can jump if the system
# clock is adjusted.
sample_query = "machine learning"
start = time.perf_counter()
results = search(sample_query, k=3)
elapsed = time.perf_counter() - start
display_results(results, sample_query, elapsed)

print("\n" + "=" * 60)
print("Exercise Complete!")
print("=" * 60)



Part 4: Pretty Print Results
TODO: Implement pretty print and test with a sample query
----------------------------------------
Search: "machine learning"
----------------------------------------

  #1 [Score: 60] Advanced machine learning techniques using neural networks

  #2 [Score: 31] Deep learning fundamentals: CNNs and RNNs explained

  #3 [Score: 31] Natural language processing with transformers

----------------------------------------
Found 3 results in 0.058 seconds
----------------------------------------

Exercise Complete!
