In [None]:
# Context Optimization for RAG Systems
# Stage 2, Notebook 16: 上下文優化 - 去重/MMR/Token Budget

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (複製到每本 notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

for k, v in {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}.items():
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
## Cell 2: Load Dependencies and Existing Index
import json
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict, Any
from dataclasses import dataclass
from rapidfuzz import fuzz
import tiktoken
import faiss
from sentence_transformers import SentenceTransformer

# Load embedding model
print("Loading BGE-M3 model...")
embedding_model = SentenceTransformer("BAAI/bge-m3",
                                     cache_folder=f"{AI_CACHE_ROOT}/hf")
embed_dim = embedding_model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {embed_dim}")

# Load or create sample index and chunks
index_path = "indices/general.faiss"
chunks_path = "indices/chunks.jsonl"

if Path(index_path).exists() and Path(chunks_path).exists():
    # Load existing index
    index = faiss.read_index(index_path)
    chunks = []
    with open(chunks_path, 'r', encoding='utf-8') as f:
        for line in f:
            chunks.append(json.loads(line))
    print(f"Loaded {len(chunks)} chunks and FAISS index")
else:
    # Create sample data for demonstration
    print("Creating sample data for demonstration...")
    sample_texts = [
        "人工智慧是電腦科學的一個分支，致力於創建能夠執行通常需要人類智慧的任務的系統。",
        "機器學習是人工智慧的子領域，使用演算法和統計模型來使電腦系統能夠改善其性能。",
        "深度學習是機器學習的一種方法，使用具有多層的人工神經網路。",
        "自然語言處理（NLP）是人工智慧的一個領域，專注於電腦與人類語言之間的互動。",
        "電腦視覺是人工智慧的另一個重要分支，使機器能夠解釋和理解視覺世界。",
        "人工智慧技術廣泛應用於醫療、金融、交通等各個領域。",
        "機器學習演算法包括監督學習、無監督學習和強化學習等不同類型。",
        "深度學習網路包含輸入層、隱藏層和輸出層，能夠學習複雜的模式。",
        "自然語言處理技術包括文本分類、情感分析、機器翻譯等應用。",
        "電腦視覺應用包括圖像識別、物體檢測、人臉識別等技術。"
    ]

    # Create embeddings
    embeddings = embedding_model.encode(sample_texts, normalize_embeddings=True)
    embeddings = embeddings.astype(np.float32)

    # Create FAISS index
    index = faiss.IndexFlatIP(embed_dim)
    index.add(embeddings)

    # Create chunks data
    chunks = []
    for i, text in enumerate(sample_texts):
        chunks.append({
            "id": i,
            "text": text,
            "meta": {"source_id": f"sample_{i}", "page": 1}
        })

    # Save for reuse
    Path("indices").mkdir(exist_ok=True)
    faiss.write_index(index, index_path)
    with open(chunks_path, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

    print(f"Created {len(chunks)} sample chunks")

# Initialize tokenizer for token counting
tokenizer = tiktoken.get_encoding("cl100k_base")  # GPT-3.5/4 tokenizer

In [None]:
## Cell 3: Duplicate Removal Implementation
@dataclass
class RetrievalResult:
    """檢索結果數據結構"""
    text: str
    meta: Dict[str, Any]
    score: float
    chunk_id: int

def remove_duplicates(results: List[RetrievalResult],
                     similarity_threshold: float = 0.85) -> List[RetrievalResult]:
    """
    Remove duplicate or highly similar chunks using rapidfuzz

    Args:
        results: List of retrieval results
        similarity_threshold: Similarity threshold (0-1), above which chunks are considered duplicates

    Returns:
        Deduplicated list of results
    """
    if not results:
        return results

    deduplicated = []

    for current in results:
        is_duplicate = False

        # Check against already accepted chunks
        for accepted in deduplicated:
            # Use token_sort_ratio for better handling of word order differences
            similarity = fuzz.token_sort_ratio(current.text, accepted.text) / 100.0

            if similarity > similarity_threshold:
                is_duplicate = True
                # Keep the one with higher score
                if current.score > accepted.score:
                    deduplicated.remove(accepted)
                    deduplicated.append(current)
                break

        if not is_duplicate:
            deduplicated.append(current)

    print(f"Deduplication: {len(results)} -> {len(deduplicated)} chunks "
          f"(removed {len(results) - len(deduplicated)} duplicates)")

    return deduplicated

# Test deduplication
def test_deduplication():
    """Test the deduplication function"""
    test_results = [
        RetrievalResult("人工智慧是電腦科學的分支", {"source": "doc1"}, 0.9, 0),
        RetrievalResult("人工智能是計算機科學的分支", {"source": "doc2"}, 0.8, 1),  # Similar
        RetrievalResult("機器學習是AI的子領域", {"source": "doc3"}, 0.7, 2),
        RetrievalResult("深度學習使用神經網路", {"source": "doc4"}, 0.6, 3),
    ]

    print("Original results:")
    for i, result in enumerate(test_results):
        print(f"  {i}: {result.text[:30]}... (score: {result.score})")

    deduplicated = remove_duplicates(test_results, similarity_threshold=0.85)

    print("\nAfter deduplication:")
    for i, result in enumerate(deduplicated):
        print(f"  {i}: {result.text[:30]}... (score: {result.score})")

test_deduplication()

In [None]:
## Cell 4: MMR (Maximal Marginal Relevance) Implementation
def compute_mmr(query_embedding: np.ndarray,
                doc_embeddings: np.ndarray,
                selected_indices: List[int],
                lambda_param: float = 0.5) -> np.ndarray:
    """
    Compute MMR scores for remaining documents

    Args:
        query_embedding: Query vector (1, dim)
        doc_embeddings: Document vectors (n, dim)
        selected_indices: Already selected document indices
        lambda_param: Balance between relevance (1.0) and diversity (0.0)

    Returns:
        MMR scores for all documents
    """
    # Relevance scores (cosine similarity with query)
    relevance_scores = np.dot(doc_embeddings, query_embedding.T).flatten()

    if not selected_indices:
        # If no documents selected yet, return pure relevance
        return relevance_scores

    # Diversity scores (max similarity with already selected documents)
    selected_embeddings = doc_embeddings[selected_indices]
    similarity_matrix = np.dot(doc_embeddings, selected_embeddings.T)
    max_similarity = np.max(similarity_matrix, axis=1)

    # MMR formula: λ * relevance - (1-λ) * max_similarity
    mmr_scores = lambda_param * relevance_scores - (1 - lambda_param) * max_similarity

    return mmr_scores

def mmr_rerank(results: List[RetrievalResult],
               query: str,
               k: int = 5,
               lambda_param: float = 0.5) -> List[RetrievalResult]:
    """
    Rerank results using MMR to balance relevance and diversity

    Args:
        results: Initial retrieval results
        query: Original query text
        k: Number of results to return
        lambda_param: Balance parameter (1.0=pure relevance, 0.0=pure diversity)

    Returns:
        MMR-reranked results
    """
    if len(results) <= k:
        return results

    # Get embeddings for query and all documents
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    doc_texts = [r.text for r in results]
    doc_embeddings = embedding_model.encode(doc_texts, normalize_embeddings=True)

    selected_indices = []
    remaining_indices = list(range(len(results)))

    for _ in range(min(k, len(results))):
        if not remaining_indices:
            break

        # Compute MMR scores for remaining documents
        mmr_scores = compute_mmr(query_embedding, doc_embeddings,
                               selected_indices, lambda_param)

        # Find best remaining document
        best_idx = None
        best_score = float('-inf')

        for idx in remaining_indices:
            if mmr_scores[idx] > best_score:
                best_score = mmr_scores[idx]
                best_idx = idx

        # Move from remaining to selected
        selected_indices.append(best_idx)
        remaining_indices.remove(best_idx)

    # Return reranked results
    mmr_results = [results[i] for i in selected_indices]

    print(f"MMR reranking: {len(results)} -> {len(mmr_results)} chunks "
          f"(λ={lambda_param})")

    return mmr_results

# Test MMR
def test_mmr():
    """Test MMR functionality"""
    # Create test query and results
    query = "人工智慧機器學習"

    # Simulate retrieval results
    test_results = []
    for i, chunk in enumerate(chunks[:8]):  # Use first 8 chunks
        # Simulate similarity scores
        score = 0.9 - i * 0.1  # Decreasing scores
        test_results.append(RetrievalResult(
            text=chunk["text"],
            meta=chunk["meta"],
            score=score,
            chunk_id=chunk["id"]
        ))

    print("Original results (by score):")
    for i, result in enumerate(test_results):
        print(f"  {i}: {result.text[:50]}... (score: {result.score:.2f})")

    # Apply MMR with different lambda values
    for lambda_val in [0.3, 0.5, 0.8]:
        print(f"\nMMR results (λ={lambda_val}):")
        mmr_results = mmr_rerank(test_results, query, k=5, lambda_param=lambda_val)
        for i, result in enumerate(mmr_results):
            print(f"  {i}: {result.text[:50]}... (score: {result.score:.2f})")

test_mmr()

In [None]:
## Cell 5: Token Budget Management
def count_tokens(text: str) -> int:
    """Count tokens in text using tiktoken"""
    return len(tokenizer.encode(text))

def fit_token_budget(results: List[RetrievalResult],
                    max_tokens: int = 3000,
                    reserve_tokens: int = 500) -> List[RetrievalResult]:
    """
    Trim results to fit within token budget

    Args:
        results: Retrieval results to trim
        max_tokens: Maximum total tokens for context
        reserve_tokens: Tokens to reserve for system/user messages

    Returns:
        Trimmed results that fit within budget
    """
    available_tokens = max_tokens - reserve_tokens
    current_tokens = 0
    fitted_results = []

    for result in results:
        result_tokens = count_tokens(result.text)

        if current_tokens + result_tokens <= available_tokens:
            fitted_results.append(result)
            current_tokens += result_tokens
        else:
            # Try to fit partial text if it's the first result
            if not fitted_results and result_tokens > available_tokens:
                # Truncate the text to fit
                truncated_text = truncate_to_tokens(result.text, available_tokens)
                if truncated_text:
                    truncated_result = RetrievalResult(
                        text=truncated_text,
                        meta=result.meta,
                        score=result.score,
                        chunk_id=result.chunk_id
                    )
                    fitted_results.append(truncated_result)
                    current_tokens = count_tokens(truncated_text)
            break

    print(f"Token budget: {current_tokens}/{available_tokens} tokens used "
          f"({len(fitted_results)}/{len(results)} chunks)")

    return fitted_results

def truncate_to_tokens(text: str, max_tokens: int) -> str:
    """
    Truncate text to fit within token limit

    Args:
        text: Text to truncate
        max_tokens: Maximum tokens allowed

    Returns:
        Truncated text
    """
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_tokens:
        return text

    # Truncate and decode, try to end at sentence boundary
    truncated_tokens = tokens[:max_tokens]
    truncated_text = tokenizer.decode(truncated_tokens)

    # Try to end at a sentence boundary (Chinese punctuation)
    for punct in ['。', '！', '？', '；']:
        last_punct = truncated_text.rfind(punct)
        if last_punct > len(truncated_text) * 0.7:  # Don't cut too much
            return truncated_text[:last_punct + 1]

    return truncated_text

# Test token budget management
def test_token_budget():
    """Test token budget functionality"""
    # Create test results with known token counts
    test_results = []
    for i, chunk in enumerate(chunks[:6]):
        test_results.append(RetrievalResult(
            text=chunk["text"],
            meta=chunk["meta"],
            score=0.9 - i * 0.1,
            chunk_id=chunk["id"]
        ))

    print("Original results:")
    total_tokens = 0
    for i, result in enumerate(test_results):
        tokens = count_tokens(result.text)
        total_tokens += tokens
        print(f"  {i}: {tokens} tokens - {result.text[:50]}...")
    print(f"Total: {total_tokens} tokens")

    # Test different budget limits
    for budget in [500, 300, 150]:
        print(f"\nBudget limit: {budget} tokens")
        fitted = fit_token_budget(test_results, max_tokens=budget, reserve_tokens=50)
        print(f"Fitted {len(fitted)} chunks")

test_token_budget()

In [None]:
## Cell 6: Score Threshold Filtering
def filter_by_score(results: List[RetrievalResult],
                   min_score: float = 0.3) -> List[RetrievalResult]:
    """
    Filter results by minimum relevance score

    Args:
        results: Retrieval results to filter
        min_score: Minimum score threshold

    Returns:
        Filtered results above threshold
    """
    filtered = [r for r in results if r.score >= min_score]

    print(f"Score filtering: {len(results)} -> {len(filtered)} chunks "
          f"(threshold: {min_score})")

    return filtered

def adaptive_threshold(results: List[RetrievalResult],
                      min_results: int = 3,
                      score_gap_threshold: float = 0.2) -> float:
    """
    Compute adaptive score threshold based on score distribution

    Args:
        results: Retrieval results (should be sorted by score desc)
        min_results: Minimum number of results to keep
        score_gap_threshold: Minimum gap to consider a natural cut-off

    Returns:
        Adaptive threshold score
    """
    if len(results) <= min_results:
        return 0.0

    scores = [r.score for r in results]

    # Look for natural score gaps after min_results
    for i in range(min_results, len(scores) - 1):
        score_gap = scores[i] - scores[i + 1]
        if score_gap >= score_gap_threshold:
            threshold = scores[i + 1] + score_gap / 2
            print(f"Adaptive threshold: {threshold:.3f} (gap at position {i})")
            return threshold

    # Fallback: use median score if no clear gap
    median_idx = len(scores) // 2
    threshold = scores[median_idx]
    print(f"Adaptive threshold: {threshold:.3f} (median fallback)")
    return threshold

# Test score filtering
def test_score_filtering():
    """Test score filtering functionality"""
    # Create test results with varying scores
    test_scores = [0.95, 0.87, 0.76, 0.45, 0.32, 0.18, 0.12, 0.05]
    test_results = []

    for i, score in enumerate(test_scores):
        test_results.append(RetrievalResult(
            text=f"測試文本 {i+1}: " + chunks[i % len(chunks)]["text"],
            meta={"source": f"test_{i}"},
            score=score,
            chunk_id=i
        ))

    print("Original results:")
    for i, result in enumerate(test_results):
        print(f"  {i}: score={result.score:.3f} - {result.text[:40]}...")

    # Test fixed threshold
    print(f"\nFixed threshold (0.4):")
    filtered_fixed = filter_by_score(test_results, min_score=0.4)

    # Test adaptive threshold
    print(f"\nAdaptive threshold:")
    adaptive_thresh = adaptive_threshold(test_results)
    filtered_adaptive = filter_by_score(test_results, min_score=adaptive_thresh)

test_score_filtering()

In [None]:
## Cell 7: Integrated Context Optimizer
class ContextOptimizer:
    """
    Complete context optimization pipeline
    """

    def __init__(self,
                 similarity_threshold: float = 0.85,
                 mmr_lambda: float = 0.5,
                 max_tokens: int = 3000,
                 reserve_tokens: int = 500,
                 min_score: float = 0.3,
                 use_adaptive_threshold: bool = True):
        self.similarity_threshold = similarity_threshold
        self.mmr_lambda = mmr_lambda
        self.max_tokens = max_tokens
        self.reserve_tokens = reserve_tokens
        self.min_score = min_score
        self.use_adaptive_threshold = use_adaptive_threshold

    def optimize(self,
                results: List[RetrievalResult],
                query: str,
                target_k: int = 5) -> List[RetrievalResult]:
        """
        Apply complete optimization pipeline

        Args:
            results: Raw retrieval results
            query: Original query
            target_k: Target number of results

        Returns:
            Optimized results
        """
        print(f"\n=== Context Optimization Pipeline ===")
        print(f"Input: {len(results)} chunks")

        # Step 1: Score filtering
        if self.use_adaptive_threshold:
            threshold = adaptive_threshold(results)
            filtered = filter_by_score(results, min_score=threshold)
        else:
            filtered = filter_by_score(results, min_score=self.min_score)

        if not filtered:
            print("Warning: No results after score filtering!")
            return []

        # Step 2: Deduplication
        deduplicated = remove_duplicates(filtered, self.similarity_threshold)

        # Step 3: MMR reranking for diversity
        mmr_results = mmr_rerank(deduplicated, query,
                               k=min(target_k * 2, len(deduplicated)),
                               lambda_param=self.mmr_lambda)

        # Step 4: Token budget fitting
        final_results = fit_token_budget(mmr_results,
                                       self.max_tokens,
                                       self.reserve_tokens)

        print(f"Final output: {len(final_results)} chunks")

        return final_results

    def optimize_with_stats(self,
                          results: List[RetrievalResult],
                          query: str,
                          target_k: int = 5) -> Tuple[List[RetrievalResult], Dict]:
        """
        Optimize with detailed statistics

        Returns:
            Tuple of (optimized_results, stats_dict)
        """
        stats = {
            "original_count": len(results),
            "original_tokens": sum(count_tokens(r.text) for r in results),
            "original_avg_score": np.mean([r.score for r in results]) if results else 0,
        }

        optimized = self.optimize(results, query, target_k)

        stats.update({
            "final_count": len(optimized),
            "final_tokens": sum(count_tokens(r.text) for r in optimized),
            "final_avg_score": np.mean([r.score for r in optimized]) if optimized else 0,
            "compression_ratio": len(optimized) / max(1, len(results)),
            "token_reduction": 1 - (sum(count_tokens(r.text) for r in optimized) /
                                  max(1, sum(count_tokens(r.text) for r in results)))
        })

        return optimized, stats

# Initialize optimizer
optimizer = ContextOptimizer(
    similarity_threshold=0.8,
    mmr_lambda=0.6,  # Slightly favor diversity
    max_tokens=2000,
    reserve_tokens=500,
    use_adaptive_threshold=True
)

print("Context Optimizer initialized successfully!")

In [None]:
## Cell 8: Performance Comparison
def simulate_retrieval(query: str, k: int = 10) -> List[RetrievalResult]:
    """
    Simulate retrieval results for testing
    """
    # Encode query
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)

    # Search in index
    scores, indices = index.search(query_embedding.astype(np.float32), k)

    # Convert to RetrievalResult objects
    results = []
    for i, (idx, score) in enumerate(zip(indices[0], scores[0])):
        if idx >= 0 and idx < len(chunks):  # Valid index
            results.append(RetrievalResult(
                text=chunks[idx]["text"],
                meta=chunks[idx]["meta"],
                score=float(score),
                chunk_id=chunks[idx]["id"]
            ))

    return results

def compare_optimization_strategies(query: str):
    """
    Compare different optimization strategies
    """
    print(f"\n=== Optimization Strategy Comparison ===")
    print(f"Query: {query}")

    # Get raw retrieval results
    raw_results = simulate_retrieval(query, k=8)

    if not raw_results:
        print("No retrieval results found!")
        return

    print(f"\nRaw retrieval: {len(raw_results)} chunks")
    for i, result in enumerate(raw_results):
        tokens = count_tokens(result.text)
        print(f"  {i}: score={result.score:.3f}, tokens={tokens} - {result.text[:50]}...")

    # Strategy 1: No optimization (just top-k)
    baseline = raw_results[:5]
    baseline_tokens = sum(count_tokens(r.text) for r in baseline)
    print(f"\nBaseline (top-5): {len(baseline)} chunks, {baseline_tokens} tokens")

    # Strategy 2: Score filtering only
    score_filtered = filter_by_score(raw_results, min_score=0.3)[:5]
    score_tokens = sum(count_tokens(r.text) for r in score_filtered)
    print(f"Score filtering: {len(score_filtered)} chunks, {score_tokens} tokens")

    # Strategy 3: Deduplication only
    dedup_only = remove_duplicates(raw_results, similarity_threshold=0.8)[:5]
    dedup_tokens = sum(count_tokens(r.text) for r in dedup_only)
    print(f"Deduplication: {len(dedup_only)} chunks, {dedup_tokens} tokens")

    # Strategy 4: Full optimization
    optimized, stats = optimizer.optimize_with_stats(raw_results, query, target_k=5)

    print(f"\n=== Optimization Statistics ===")
    for key, value in stats.items():
        if isinstance(value, float):
            print(f"{key}: {value:.3f}")
        else:
            print(f"{key}: {value}")

    return {
        "baseline": baseline,
        "score_filtered": score_filtered,
        "dedup_only": dedup_only,
        "optimized": optimized,
        "stats": stats
    }

# Test with different queries
test_queries = [
    "人工智慧和機器學習的關係",
    "深度學習神經網路",
    "自然語言處理應用"
]

comparison_results = {}
for query in test_queries:
    comparison_results[query] = compare_optimization_strategies(query)

In [None]:
## Cell 9: Smoke Test - End-to-End Optimization
def smoke_test_optimization():
    """
    Comprehensive smoke test for context optimization
    """
    print("=== Context Optimization Smoke Test ===")

    # Test query
    test_query = "人工智慧技術應用"

    # Get retrieval results
    raw_results = simulate_retrieval(test_query, k=8)

    if not raw_results:
        print("❌ No retrieval results - check index and chunks")
        return False

    print(f"✓ Retrieved {len(raw_results)} raw results")

    # Test optimization
    try:
        optimized_results, stats = optimizer.optimize_with_stats(
            raw_results, test_query, target_k=5
        )

        print(f"✓ Optimization completed: {stats['original_count']} -> {stats['final_count']} chunks")
        print(f"✓ Token reduction: {stats['token_reduction']:.1%}")
        print(f"✓ Average score: {stats['original_avg_score']:.3f} -> {stats['final_avg_score']:.3f}")

        # Verify results are valid
        if not optimized_results:
            print("❌ No optimized results returned")
            return False

        # Check for token budget compliance
        total_tokens = sum(count_tokens(r.text) for r in optimized_results)
        max_allowed = optimizer.max_tokens - optimizer.reserve_tokens

        if total_tokens > max_allowed:
            print(f"❌ Token budget exceeded: {total_tokens} > {max_allowed}")
            return False

        print(f"✓ Token budget respected: {total_tokens}/{max_allowed} tokens")

        # Check for duplicate removal
        unique_texts = set(r.text for r in optimized_results)
        if len(unique_texts) != len(optimized_results):
            print("❌ Duplicates found in optimized results")
            return False

        print(f"✓ No duplicates in final results")

        # Display final optimized results
        print(f"\n=== Final Optimized Results ===")
        for i, result in enumerate(optimized_results):
            tokens = count_tokens(result.text)
            print(f"{i+1}. Score: {result.score:.3f}, Tokens: {tokens}")
            print(f"   Text: {result.text[:80]}...")
            print(f"   Source: {result.meta.get('source_id', 'N/A')}")

        print("\n✅ All smoke tests passed!")
        return True

    except Exception as e:
        print(f"❌ Optimization failed: {str(e)}")
        return False

# Run smoke test
success = smoke_test_optimization()

if success:
    print("\n🎉 Context optimization is working correctly!")
    print("\nKey features verified:")
    print("- ✓ Duplicate removal with configurable similarity threshold")
    print("- ✓ MMR reranking for relevance-diversity balance")
    print("- ✓ Token budget management with smart truncation")
    print("- ✓ Adaptive score threshold filtering")
    print("- ✓ End-to-end optimization pipeline")
else:
    print("\n⚠️  Some issues detected - check implementation")

# Save configuration for reuse
config = {
    "similarity_threshold": optimizer.similarity_threshold,
    "mmr_lambda": optimizer.mmr_lambda,
    "max_tokens": optimizer.max_tokens,
    "reserve_tokens": optimizer.reserve_tokens,
    "min_score": optimizer.min_score,
    "use_adaptive_threshold": optimizer.use_adaptive_threshold
}

print(f"\n📝 Current optimizer configuration:")
for key, value in config.items():
    print(f"   {key}: {value}")

## 🎯 What We Built

1. **去重演算法**: 使用 `rapidfuzz` 檢測文本相似度，移除冗餘段落
2. **MMR 重排**: 平衡相關性與多樣性，避免結果過於相似
3. **Token Budget 管理**: 動態截斷，確保不超過 context window 限制
4. **自適應分數閾值**: 根據分數分布自動決定過濾門檻
5. **完整優化流水線**: 整合所有策略的 `ContextOptimizer` 類別

## ⚠️ Pitfalls

- **過度去重**: 相似度閾值設太低會移除相關但不完全重複的內容
- **MMR λ 參數**: 需要根據具體應用調整相關性與多樣性的平衡
- **Token 計算**: 不同 tokenizer 結果差異大，需要留足夠 buffer
- **分數標準化**: 不同檢索方法的分數範圍可能不同，需要統一處理

## 🔄 Next Steps

1. **實驗不同參數組合**: 針對你的資料集調優 threshold, λ, token budget
2. **整合到檢索流水線**: 修改 `nb14` 的檢索函數加入優化步驟
3. **添加評估指標**: 測量優化前後的 Recall@k 和 relevance 變化
4. **支援更多策略**: 如 cluster-based 去重、semantic chunking 等