# Search Enhancements - Interactive Learning
# تحسينات البحث - تعلم تفاعلي

This notebook covers:
- Trie data structure for autocomplete
- Auto-suggest types (document, query, topic)
- Query expansion strategies
- Faceted search implementation

تغطي هذه المفكرة:
- هيكل البيانات Trie للإكمال التلقائي
- أنواع الاقتراح (مستند، استعلام، موضوع)
- استراتيجيات توسيع الاستعلامات
- تنفيذ البحث المجزوء

## Part 1: Trie Data Structure
## الجزء 1: هيكل البيانات Trie

In [None]:
# Trie implementation for efficient autocomplete
from collections import defaultdict

class TrieNode:
    """Single node of a trie; one node per character along an inserted word.

    The attributes are declared in ``__slots__`` so instances carry no
    per-instance ``__dict__``.
    """
    __slots__ = ('children', 'is_end', 'score')

    def __init__(self):
        # Subscripting a missing character creates the child node on demand,
        # which lets Trie.insert extend a path with a single expression.
        self.children = defaultdict(TrieNode)
        # True when an inserted word terminates at this node.
        self.is_end = False
        # Score of the word ending here; only meaningful when is_end is True.
        self.score = 0.0


class Trie:
    """Trie data structure for case-insensitive prefix search.

    Words are lowercased on insertion and prefixes are lowercased on lookup,
    so every suggestion is returned in lowercase.
    """

    def __init__(self):
        self.root = TrieNode()
        self._size = 0  # number of distinct words stored

    def insert(self, word: str, score: float = 1.0):
        """Insert ``word`` (lowercased) into the trie with the given score.

        Re-inserting a word that is already present only updates its score;
        it is not counted a second time in ``_size``.
        """
        node = self.root
        for char in word.lower():
            node = node.children[char]
        if not node.is_end:
            node.is_end = True
            self._size += 1
        node.score = score

    def autocomplete(self, prefix: str, limit: int = 10):
        """Return up to ``limit`` ``(word, score)`` tuples starting with ``prefix``.

        Matching ignores case. Results come in depth-first order, visiting
        children in the order they were first inserted. An unknown prefix or
        a non-positive ``limit`` yields an empty list.
        """
        normalized = prefix.lower()
        node = self.root

        # Navigate to the node at the end of the prefix. A membership test is
        # used instead of subscripting so a failed lookup does not create
        # nodes in the defaultdict.
        for char in normalized:
            if char not in node.children:
                return []
            node = node.children[char]

        # Explicit stack instead of recursion, so word length is not bounded
        # by the interpreter's recursion limit.
        results = []
        stack = [(node, normalized)]
        while stack and len(results) < limit:
            current_node, current_word = stack.pop()
            if current_node.is_end:
                results.append((current_word, current_node.score))
            # Push children reversed so they are popped in insertion order.
            for char, child_node in reversed(list(current_node.children.items())):
                stack.append((child_node, current_word + char))
        return results

# Build the demo trie from a fixed list of document names
trie = Trie()
documents = [
    "project_plan.pdf",
    "project_specs.docx",
    "project_timeline.pdf",
    "rag_architecture.md",
    "rag_guide.pdf",
    "rag_tutorial.md",
    "machine_learning.pdf",
    "deep_learning.pdf",
    "neural_networks.pdf",
]

# Every document gets the same score of 1.0
for doc in documents:
    trie.insert(doc, 1.0)

print(f"Trie populated with {trie._size} documents")

In [None]:
# Query the trie with a few prefixes and list what comes back
test_prefixes = ["proj", "rag", "machine", "deep"]

for prefix in test_prefixes:
    print(f"\nPrefix: '{prefix}'")
    suggestions = trie.autocomplete(prefix, limit=5)

    # Each suggestion is a (name, score) pair
    for suggestion in suggestions:
        doc_name, score = suggestion
        print(f"  - {doc_name} (score: {score:.2f})")

### Exercise 1: Implement Synonym-Based Expansion
### تمرين 1: تنفيذ التوسيع المستند إلى المرادفات

In [None]:
# TODO: Implement synonym-based query expansion

# Maps a lowercase query term to the alternative phrasings that may replace it.
SYNONYMS = {
    "rag": ["retrieval augmented generation", "retrieval-augmented"],
    "vector": ["embedding", "representation", "feature vector"],
    "search": ["query", "find", "retrieve"],
    "document": ["file", "record", "entry", "paper"],
}


def expand_with_synonyms(query: str) -> list:
    """Expand query using synonym mapping.

    The query is lowercased and split on whitespace. For every word that has
    an entry in ``SYNONYMS``, one variant is produced per synonym by replacing
    that word (all of its occurrences) and leaving the other words untouched.

    Returns a list of unique strings: the lowercased query first, then the
    variants in the order they were generated.
    """
    normalized = query.lower()
    words = normalized.split()

    expanded = [normalized]
    for word in words:
        for synonym in SYNONYMS.get(word, ()):
            # Swap in the synonym only where the current word occurs.
            expanded.append(
                " ".join(synonym if w == word else w for w in words)
            )

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # output is deterministic (a set would not be).
    return list(dict.fromkeys(expanded))

# Run the expansion on a few sample queries and list every variant
test_queries = ["rag architecture", "vector search", "find document"]

for query in test_queries:
    print(f"\nQuery: '{query}'")
    expanded = expand_with_synonyms(query)
    for variant in expanded:
        print(f"  - {variant}")

### Solution / الحل

In [None]:
# Solution

# Lowercase query term -> phrasings that can stand in for it.
SYNONYMS = {
    "rag": ["retrieval augmented generation", "retrieval-augmented"],
    "vector": ["embedding", "representation", "feature vector"],
    "search": ["query", "find", "retrieve"],
    "document": ["file", "record", "entry", "paper"],
}


def expand_with_synonyms(query: str) -> list:
    """Expand query using synonym mapping.

    Lowercases the query, splits it on whitespace and, for each word found in
    ``SYNONYMS``, emits one variant per synonym in which that word is replaced
    and all other words are kept.

    Returns the lowercased query followed by the generated variants, without
    duplicates and in generation order.
    """
    normalized = query.lower()
    words = normalized.split()

    expanded = [normalized]
    for word in words:
        for synonym in SYNONYMS.get(word, ()):
            # Only the matched word is substituted; the rest stay as typed.
            expanded.append(
                " ".join(synonym if w == word else w for w in words)
            )

    # Order-preserving de-duplication keeps the output stable between runs.
    return list(dict.fromkeys(expanded))

# Show how many variants each sample query expands to, then list them
for query in test_queries:
    print(f"\nQuery: '{query}'")
    expanded = expand_with_synonyms(query)
    total = len(expanded)
    print(f"Expansions: {total}")
    for variant in expanded:
        print(f"  - {variant}")

## Part 2: Faceted Search
## الجزء 2: البحث المجزوء

In [None]:
# Implement faceted search
from datetime import datetime, timedelta

# Small in-memory corpus used by all facet examples
sample_documents = [
    {
        "name": "doc1.pdf",
        "status": "indexed",
        "type": "pdf",
        "size_bytes": 100 * 1024,
        "date": datetime(2024, 1, 15),
    },
    {
        "name": "doc2.pdf",
        "status": "indexed",
        "type": "pdf",
        "size_bytes": 500 * 1024,
        "date": datetime(2024, 1, 20),
    },
    {
        "name": "doc3.txt",
        "status": "indexed",
        "type": "text",
        "size_bytes": 10 * 1024,
        "date": datetime(2024, 1, 25),
    },
    {
        "name": "doc4.pdf",
        "status": "processing",
        "type": "pdf",
        "size_bytes": 2000 * 1024,
        "date": datetime(2024, 1, 10),
    },
    {
        "name": "doc5.pdf",
        "status": "failed",
        "type": "pdf",
        "size_bytes": 5000 * 1024,
        "date": datetime(2024, 1, 5),
    },
]

# Status facets: number of documents per status value
status_counts = {}
for doc in sample_documents:
    if doc['status'] in status_counts:
        status_counts[doc['status']] += 1
    else:
        status_counts[doc['status']] = 1

print("Status Facets:")
for status in sorted(status_counts):
    print(f"  {status}: {status_counts[status]}")

# Content type facets: number of documents per type value
type_counts = {}
for doc in sample_documents:
    if doc['type'] in type_counts:
        type_counts[doc['type']] += 1
    else:
        type_counts[doc['type']] = 1

print("\nContent Type Facets:")
for dtype in sorted(type_counts):
    print(f"  {dtype}: {type_counts[dtype]}")

In [None]:
# Size range facets: (label, lower bound in bytes, upper bound in bytes).
# A document falls in a range when lower <= size_bytes < upper.
RANGES = [
    ("0-100KB", 0, 100 * 1024),
    ("100KB-1MB", 100 * 1024, 1024 ** 2),
    ("1MB-10MB", 1024 ** 2, 10 * 1024 ** 2),
    ("10MB+", 10 * 1024 ** 2, float('inf')),
]

print("\nSize Range Facets:")
for name, min_size, max_size in RANGES:
    in_range = [d for d in sample_documents if min_size <= d['size_bytes'] < max_size]
    count = len(in_range)
    print(f"  {name}: {count}")

In [None]:
# Date range facets: (label, lower bound or None for open-ended, upper bound).
# A document falls in a range when lower <= date < upper.
now = datetime.now()
RANGES = [
    ("Last 7 days", now - timedelta(days=7), now),
    ("Last 30 days", now - timedelta(days=30), now),
    ("Older than 30 days", None, now - timedelta(days=30)),
]

print("\nDate Range Facets:")
for name, min_date, max_date in RANGES:
    # A missing lower bound means only the upper bound is checked
    count = sum(
        1
        for d in sample_documents
        if (min_date is None or min_date <= d['date']) and d['date'] < max_date
    )
    print(f"  {name}: {count}")

### Exercise 2: Custom Facet
### تمرين 2: جانب مخصص

In [None]:
# TODO: Implement custom facet calculation

def compute_custom_facet(documents, field_name, extractor):
    """Count how many documents map to each value returned by ``extractor``.

    ``extractor`` is called once per document, in order. ``field_name`` is
    accepted but not used by the computation.

    Returns a dict from extracted value to its number of occurrences.
    """
    facet_values = {}
    for value in map(extractor, documents):
        if value in facet_values:
            facet_values[value] += 1
        else:
            facet_values[value] = 1
    return facet_values

# Example: Extract document extension as facet
def get_extension(doc):
    """Return the file extension of ``doc['name']`` without the leading dot.

    Returns ``'unknown'`` when the name has no usable extension: no dot at
    all, a trailing dot ("report."), or a dot-file such as ".gitignore".
    """
    import os  # local import keeps this cell self-contained

    # splitext keeps the leading dot on the suffix and treats a leading-dot
    # name as having no extension.
    suffix = os.path.splitext(doc['name'])[1]
    return suffix[1:] or 'unknown'

print("Custom Facet (File Extension):")
extension_facet = compute_custom_facet(sample_documents, 'extension', get_extension)
# List the extensions alphabetically with their document counts
for ext in sorted(extension_facet):
    print(f"  {ext}: {extension_facet[ext]}")

## Part 3: Performance Comparison
## الجزء 3: مقارنة الأداء

In [None]:
# Compare trie vs linear search performance
import time
import random

# Generate large document list
doc_names = [f"document_{i}.pdf" for i in range(10000)]

# Build trie
trie = Trie()
for name in doc_names:
    trie.insert(name)

# Test prefixes ("file" and "report" match nothing, exercising the miss path)
test_prefixes = ["doc", "document", "file", "report"]

# The count in the header is derived from the data so it cannot drift.
print(f"Performance Comparison ({len(doc_names):,} documents):\n")
print(f"{'Prefix':<15} {'Trie (ms)':<15} {'Linear (ms)':<15} {'Speedup':<10}")
print("-" * 60)

for prefix in test_prefixes:
    # Trie search. perf_counter is the clock intended for timing short
    # intervals; time.time() can be too coarse for sub-millisecond work.
    start = time.perf_counter()
    trie_results = trie.autocomplete(prefix, limit=10)
    trie_time = (time.perf_counter() - start) * 1000

    # Linear search: scan every name, then keep the first 10 matches
    start = time.perf_counter()
    linear_results = [d for d in doc_names if d.lower().startswith(prefix)][:10]
    linear_time = (time.perf_counter() - start) * 1000

    # Guard against a zero reading so the ratio never divides by zero
    speedup = linear_time / trie_time if trie_time > 0 else 1.0

    # Format the suffix together with the number so the "x" is not pushed
    # away from the value by column padding.
    speedup_label = f"{speedup:.2f}x"
    print(f"{prefix:<15} {trie_time:<15.2f} {linear_time:<15.2f} {speedup_label:<10}")

## Part 4: Relevance Scoring
## الجزء 4: الترتيب حسب الصلة

In [None]:
# Implement relevance scoring for suggestions
from datetime import datetime, timedelta

class Suggestion:
    """An auto-suggest candidate with a base score and optional recency data.

    Attributes:
        text: The suggestion text shown to the user.
        type: The suggestion kind passed as ``s_type``.
        score: Base score; never modified by ``calculate_relevance``.
        last_accessed: ``datetime`` of the last access, or ``None``.
    """

    # Multipliers applied by calculate_relevance.
    PREFIX_BOOST = 1.5   # text starts with the query
    RECENT_BOOST = 1.2   # accessed less than 7 days ago
    MONTH_BOOST = 1.05   # accessed less than 30 days ago

    def __init__(self, text, s_type, score=1.0, last_accessed=None):
        self.text = text
        self.type = s_type
        self.score = score
        self.last_accessed = last_accessed

    def calculate_relevance(self, query, now=None):
        """Calculate relevance score based on multiple factors.

        Starts from the base ``score`` and multiplies in a prefix-match boost
        and a recency boost. The result is computed on a local value, so
        calling this repeatedly returns the same number instead of
        compounding the boosts.

        Args:
            query: Text typed so far; compared case-insensitively.
            now: Reference time for the recency factor; defaults to
                ``datetime.now()``.

        Returns:
            The boosted score as a float.
        """
        if now is None:
            now = datetime.now()

        relevance = self.score

        # Factor 1: Prefix match quality
        if self.text.lower().startswith(query.lower()):
            relevance *= self.PREFIX_BOOST

        # Factor 2: Recency (skipped when no access time is recorded)
        if self.last_accessed:
            days_since = (now - self.last_accessed).total_seconds() / 86400
            if days_since < 7:
                relevance *= self.RECENT_BOOST
            elif days_since < 30:
                relevance *= self.MONTH_BOOST

        return relevance

# Sample suggestions: three documents with different access ages and two
# queries that carry no access time
now = datetime.now()
suggestions = [
    Suggestion("project_plan.pdf", "document", 1.0, now - timedelta(days=1)),
    Suggestion("project_specs.docx", "document", 1.0, now - timedelta(days=30)),
    Suggestion("project_timeline.pdf", "document", 1.0, now - timedelta(days=60)),
    Suggestion("retrieval augmented generation", "query", 0.9),
    Suggestion("rag architecture", "query", 0.8),
]

query = "proj"
print(f"Relevance scoring for query: '{query}'\n")

# One row per suggestion: text, type, computed relevance
for s in suggestions:
    relevance = s.calculate_relevance(query)
    row = f"{s.text:<40} {s.type:<10} {relevance:<10.2f}"
    print(row)

## Summary / الملخص

**Key concepts covered / المفاهيم الرئيسية المشمولة:**

1. **Trie data structure** - Efficient prefix-based autocomplete
2. **Auto-suggest types** - Document, query, and topic suggestions
3. **Query expansion** - Synonyms and LLM-based expansions
4. **Faceted search** - Organizing results by categories
5. **Performance optimization** - Trie vs linear search comparison
6. **Relevance scoring** - Multi-factor scoring for better results

**النقاط الرئيسية المشمولة:**

1. **هيكل البيانات Trie** - إكمال تلقائي فعال بالبادئات
2. **أنواع الاقتراح** - اقتراحات المستندات والاستعلامات والمواضيع
3. **توسيع الاستعلامات** - المرادفات والتوسيع المستند إلى LLM
4. **البحث المجزوء** - تنظيم النتائج حسب الفئات
5. **تحسين الأداء** - مقارنة Trie مع البحث الخطي
6. **الترتيب حسب الصلة** - تسجيل متعدد العوامل لنتائج أفضل