# 🔍 Week 7: Hybrid Retrieval

**Learning Objectives:**
1. Understand lexical vs semantic search trade-offs
2. Implement BM25 for lexical search
3. Build hybrid retrieval combining both approaches
4. Evaluate retrieval quality with precision/recall

---

In [None]:
import hashlib
import math
import re
from collections import Counter

import numpy as np

---
# Section 1: Theory
---

## Lexical vs Semantic Search

| Aspect | Lexical (BM25) | Semantic (Embeddings) |
|--------|---------------|----------------------|
| Matches | Exact keywords | Meaning/context |
| "bank account" | ✅ Finds exact | ✅ Finds "savings" |
| Synonyms | ❌ Misses | ✅ Captures |
| Rare terms | ✅ Excellent | ⚠️ May miss |

## Why Hybrid?
Combines strengths of both: keyword precision + semantic understanding

---
# Section 2: Hands-On Implementation
---

In [None]:
class BM25:
    """BM25 lexical search over an in-memory list of text documents.

    Call :meth:`fit` with the corpus, then :meth:`search` with a query string
    to get ``(doc_index, score)`` pairs ranked by descending score.

    Args:
        k1: Scales the length-normalisation term in the score denominator and
            the ``(k1 + 1)`` factor in the numerator.
        b: Weight of the ``dl / avg_dl`` length ratio; ``0`` disables length
            normalisation, ``1`` applies it fully.
    """

    def __init__(self, k1=1.5, b=0.75):
        self.k1 = k1
        self.b = b
        self.docs = []
        self.doc_freqs = []  # one Counter (term -> count) per document
        self.idf = {}  # term -> inverse document frequency
        self.avg_dl = 0  # mean document length in tokens

    def _tokenize(self, text):
        """Lower-case ``text`` and split it into runs of word characters."""
        return re.findall(r'\w+', text.lower())

    def fit(self, documents):
        """Index ``documents`` (a sequence of strings), replacing any prior index.

        An empty corpus is accepted and leaves ``avg_dl`` at 0.
        """
        self.docs = documents
        self.doc_freqs = [Counter(self._tokenize(d)) for d in documents]
        n_docs = len(self.doc_freqs)

        # Average length must count every token (not just distinct terms) so
        # that it is on the same scale as the per-document ``dl`` in search().
        total_tokens = sum(sum(tf.values()) for tf in self.doc_freqs)
        self.avg_dl = total_tokens / n_docs if n_docs else 0

        # Document frequency: in how many documents each term appears.
        doc_counts = Counter()
        for tf in self.doc_freqs:
            doc_counts.update(tf.keys())

        # Rebuild from scratch so terms from a previous fit() do not linger.
        self.idf = {
            term: math.log((n_docs - freq + 0.5) / (freq + 0.5) + 1)
            for term, freq in doc_counts.items()
        }

    def search(self, query, top_k=5):
        """Score every indexed document against ``query``.

        Args:
            query: Query string; tokenised the same way as the documents.
                A term repeated in the query contributes once per occurrence.
            top_k: Maximum number of results to return.

        Returns:
            A list of ``(doc_index, score)`` tuples sorted by descending
            score. Documents sharing no term with the query score 0 and are
            still included.
        """
        query_terms = self._tokenize(query)
        scores = []

        for i, doc_freq in enumerate(self.doc_freqs):
            dl = sum(doc_freq.values())
            # The length-normalisation term is the same for every query term
            # of a document. avg_dl is 0 only when every document is empty,
            # in which case no term can match and the value is never used.
            if self.avg_dl:
                length_norm = self.k1 * (1 - self.b + self.b * dl / self.avg_dl)
            else:
                length_norm = self.k1

            score = 0
            for term in query_terms:
                tf = doc_freq.get(term, 0)
                if tf:
                    idf = self.idf.get(term, 0)
                    score += idf * tf * (self.k1 + 1) / (tf + length_norm)

            scores.append((i, score))

        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]

In [None]:
class HybridRetriever:
    """Combines BM25 and semantic search.

    Lexical (BM25) and semantic (cosine-similarity) scores are each min-max
    normalised to ``[0, 1]`` and blended as
    ``alpha * semantic + (1 - alpha) * lexical``.

    Args:
        alpha: Weight given to the semantic score; the lexical score gets
            ``1 - alpha``.
    """

    def __init__(self, alpha=0.5):
        self.alpha = alpha  # Weight for semantic
        self.bm25 = BM25()
        self.embeddings = []
        self.docs = []

    def _embed(self, text):
        """Return a simulated 384-dimensional embedding for ``text``.

        The vector is random but reproducible: the same text always yields
        the same vector, including across interpreter runs.
        """
        # Built-in hash() is salted per process for str, so seed from a
        # content digest instead. A private Generator also avoids reseeding
        # NumPy's global random state as a side effect.
        digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
        seed = int.from_bytes(digest[:8], "big")
        return np.random.default_rng(seed).standard_normal(384)

    def fit(self, documents):
        """Index ``documents`` for both lexical and semantic search."""
        self.docs = documents
        self.bm25.fit(documents)
        self.embeddings = [self._embed(d) for d in documents]

    def _normalize_scores(self, scores):
        """Min-max scale the score in each ``(index, score)`` pair to [0, 1].

        An empty list is returned unchanged. If every score is identical,
        all pairs are assigned 1.0.
        """
        if not scores:
            return scores
        values = [s for _, s in scores]
        max_s = max(values)
        min_s = min(values)
        if max_s == min_s:
            return [(i, 1.0) for i, _ in scores]
        span = max_s - min_s
        return [(i, (s - min_s) / span) for i, s in scores]

    def search(self, query, top_k=5):
        """Rank the indexed documents against ``query``.

        Args:
            query: Query string.
            top_k: Maximum number of results to return.

        Returns:
            A list of ``(doc_index, combined_score, document)`` tuples sorted
            by descending combined score.
        """
        # Lexical search: request a score for every document so that each
        # index has an entry before normalisation.
        bm25_results = self.bm25.search(query, top_k=len(self.docs))
        bm25_scores = dict(self._normalize_scores(bm25_results))

        # Semantic search: cosine similarity between query and each document.
        query_emb = self._embed(query)
        query_norm = np.linalg.norm(query_emb)  # invariant across documents
        semantic_results = [
            (i, np.dot(query_emb, emb) / (query_norm * np.linalg.norm(emb)))
            for i, emb in enumerate(self.embeddings)
        ]
        semantic_scores = dict(self._normalize_scores(semantic_results))

        # Combine scores
        combined = []
        for i, doc in enumerate(self.docs):
            bm25_s = bm25_scores.get(i, 0)
            sem_s = semantic_scores.get(i, 0)
            combined_score = self.alpha * sem_s + (1 - self.alpha) * bm25_s
            combined.append((i, combined_score, doc))

        combined.sort(key=lambda x: x[1], reverse=True)
        return combined[:top_k]

In [None]:
# Test hybrid retrieval: index a small corpus and run one query end to end.
# NOTE: HybridRetriever uses simulated (random) embeddings, so only the BM25
# half of the combined score reflects actual text overlap.
documents = [
    "Machine learning is a subset of artificial intelligence",
    "Deep neural networks power modern AI systems",
    "Python is popular for data science and ML",
    "Natural language processing enables text understanding",
    "Vector databases store embeddings efficiently"
]

# alpha=0.5 weights the semantic and lexical scores equally.
retriever = HybridRetriever(alpha=0.5)
retriever.fit(documents)

query = "AI and machine learning"
results = retriever.search(query, top_k=3)

# Each result is a (doc_index, combined_score, document_text) tuple.
print(f"Query: '{query}'\n")
print("Hybrid Search Results:")
for idx, score, doc in results:
    print(f"  [{score:.4f}] {doc}")

---
# Section 3: Evaluation Metrics
---

In [None]:
def precision_at_k(retrieved, relevant, k):
    """Precision@K: fraction of the top-k retrieved docs that are relevant.

    Args:
        retrieved: Ranked sequence of hashable document ids, best first.
        relevant: Iterable of the ids judged relevant.
        k: Cut-off rank. The denominator is always ``k``, even when fewer
            than ``k`` documents were retrieved.

    Returns:
        The precision as a float; ``0.0`` when ``k`` is not positive.
    """
    # A non-positive cut-off selects no results. Guarding here avoids a
    # ZeroDivisionError for k == 0 and a negative "precision" for k < 0
    # (where the slice would otherwise drop items from the end).
    if k <= 0:
        return 0.0
    retrieved_k = set(retrieved[:k])
    relevant_set = set(relevant)
    return len(retrieved_k & relevant_set) / k

def recall_at_k(retrieved, relevant, k):
    """Recall@K: share of the relevant docs found in the first k results.

    Returns 0 when ``relevant`` is empty.
    """
    top_k = set(retrieved[:k])
    wanted = set(relevant)
    if len(wanted) == 0:
        return 0
    found = [doc for doc in wanted if doc in top_k]
    return len(found) / len(wanted)

def mrr(retrieved, relevant):
    """Reciprocal rank of the first relevant doc in ``retrieved``.

    This is the per-query term of Mean Reciprocal Rank. Ranks are 1-based;
    returns 0 when no retrieved doc is relevant.
    """
    hit_ranks = (
        rank for rank, doc in enumerate(retrieved, start=1) if doc in relevant
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0
    return 1 / first_hit

# Example: `retrieved` is a ranked list of doc ids (best first) and
# `relevant` holds the ground-truth relevant ids.
retrieved = [0, 2, 1, 4, 3]
relevant = [0, 1]

# Top 3 is [0, 2, 1]: two of three are relevant (0.67), both relevant docs
# are found (1.00), and the first hit is at rank 1 (1.00).
print(f"Precision@3: {precision_at_k(retrieved, relevant, 3):.2f}")
print(f"Recall@3: {recall_at_k(retrieved, relevant, 3):.2f}")
print(f"MRR: {mrr(retrieved, relevant):.2f}")

---
# Section 4: Unit Tests
---

In [None]:
def run_tests():
    """Run smoke tests for BM25 and the metric helpers, printing progress.

    Raises:
        AssertionError: If any check fails.
    """
    print("Running Unit Tests...\n")

    # Test BM25: only the first document contains "hello".
    bm25 = BM25()
    bm25.fit(["hello world", "world peace"])
    results = bm25.search("hello", top_k=1)
    assert results[0][0] == 0
    print("✓ BM25 search test passed")

    # Test Precision: both of the top-2 results are relevant.
    assert precision_at_k([0, 1, 2], [0, 1], 2) == 1.0
    print("✓ Precision@K test passed")

    # Test MRR: the only relevant doc sits at rank 3.
    assert mrr([2, 1, 0], [0]) == 1/3
    print("✓ MRR test passed")

    print("\n🎉 All tests passed!")

# Run the checks as soon as this cell executes; a failure raises AssertionError.
run_tests()

---
# Section 5: Interview Prep
---

### Q1: When would you use hybrid search?
**Answer:** When you need both keyword precision (product names, codes) and semantic understanding (concepts, synonyms).

### Q2: How do you tune the alpha parameter?
**Answer:** Use a validation set with labeled relevance. Grid search over alpha values. Optimize for your target metric (NDCG, MRR).

---
# Section 6: Deliverable
---

**Created:** `hybrid_retriever.py` with BM25 + semantic search

**Next Week:** Re-ranking