In [None]:
# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# AI_CACHE_ROOT can be overridden through the environment; the default is a
# path relative to the notebook's working directory.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Environment variable -> sub-directory (relative to AI_CACHE_ROOT) it points at.
_CACHE_LAYOUT = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)

for _env_name, _subdir in _CACHE_LAYOUT:
    _cache_dir = f"{AI_CACHE_ROOT}/{_subdir}"
    # Export the location first, then make sure the directory actually exists.
    os.environ[_env_name] = _cache_dir
    pathlib.Path(_cache_dir).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# =============================================================================
# Cell 2: Dependencies & Imports
# =============================================================================

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Set, Any
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
import warnings

# Silence every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Make sure the folders that later cells write CSVs and charts into exist.
for _output_dir in ("outs/eval", "outs/charts"):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)

print("Dependencies loaded successfully")

In [None]:
# =============================================================================
# Cell 3: Evaluation Dataset Setup
# =============================================================================


def create_eval_dataset():
    """Build the toy retrieval benchmark used throughout this notebook.

    Returns:
        A ``(eval_data, documents)`` tuple where

        * ``eval_data`` is a list of dicts with keys ``query_id``, ``query``,
          ``relevant_doc_ids`` (a ``set`` of document IDs) and ``category``;
        * ``documents`` is a list of dicts with keys ``doc_id`` and ``text``.

        Fresh containers are created on every call.
    """
    # One row per query: (query_id, query, relevant_doc_ids, category).
    query_fields = ("query_id", "query", "relevant_doc_ids", "category")
    query_rows = [
        ("q001", "什麼是 RAG 檢索增強生成？", {"doc_001", "doc_002", "doc_005"}, "concept"),
        ("q002", "如何建立 FAISS 向量索引？", {"doc_003", "doc_007", "doc_012"}, "technical"),
        ("q003", "bge-m3 嵌入模型的特點", {"doc_004", "doc_008", "doc_010"}, "model"),
        ("q004", "中文文本分段策略", {"doc_006", "doc_009", "doc_011"}, "preprocessing"),
        ("q005", "重排器如何提升檢索效果？", {"doc_002", "doc_013", "doc_015"}, "reranking"),
    ]

    # One row per corpus document: (doc_id, text).
    document_rows = [
        ("doc_001", "RAG（檢索增強生成）是一種結合檢索和生成的 AI 技術..."),
        ("doc_002", "檢索增強生成透過外部知識庫提升模型回答的準確性..."),
        ("doc_003", "FAISS 是 Facebook 開發的向量相似度搜尋庫..."),
        ("doc_004", "bge-m3 是中英文多語言嵌入模型，支援密集檢索..."),
        ("doc_005", "生成式 AI 結合檢索技術可以有效減少幻覺問題..."),
        ("doc_006", "中文分段需要考慮標點符號和語意完整性..."),
        ("doc_007", "建立 FAISS 索引需要先將文本轉換為向量表示..."),
        ("doc_008", "多語言嵌入模型在跨語言檢索中表現優異..."),
        ("doc_009", "遞歸式文本分割器可以保持語境完整性..."),
        ("doc_010", "bge 系列模型在中文語義匹配任務上效果顯著..."),
        ("doc_011", "文本預處理包括清理、分段和去重等步驟..."),
        ("doc_012", "向量索引的選擇影響檢索速度和準確率..."),
        ("doc_013", "重排器使用交叉注意力機制進行精確匹配..."),
        ("doc_014", "混合檢索結合關鍵字和語義搜尋的優勢..."),
        ("doc_015", "雙階段檢索先召回候選再重排可提升效果..."),
    ]

    eval_data = [dict(zip(query_fields, row)) for row in query_rows]
    documents = [{"doc_id": doc_id, "text": text} for doc_id, text in document_rows]

    return eval_data, documents


# Materialize the benchmark once; later cells read `eval_queries` and `doc_corpus`.
eval_queries, doc_corpus = create_eval_dataset()

print("Created evaluation dataset:")
for _label, _value in (
    ("Queries", len(eval_queries)),
    ("Documents", len(doc_corpus)),
    ("Sample query", eval_queries[0]["query"]),
):
    print(f"- {_label}: {_value}")

In [None]:
# =============================================================================
# Cell 4: Load RAG Components
# =============================================================================


class SimpleRetriever:
    """Minimal dense retriever: sentence-transformer embeddings + exact FAISS search.

    Documents are embedded with L2-normalized vectors and stored in an
    ``IndexFlatIP`` index, so the inner-product scores returned by
    ``retrieve`` are cosine similarities.
    """

    def __init__(self, embedding_model="BAAI/bge-m3"):
        """Load the embedding model.

        Args:
            embedding_model: Model name or path passed to ``SentenceTransformer``.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.documents = []
        self.doc_embeddings = None
        self.index = None

    def build_index(self, documents: List[Dict]):
        """Embed ``documents`` and (re)build the FAISS index from scratch.

        Args:
            documents: Dicts that each carry at least a ``"text"`` key (embedded)
                and a ``"doc_id"`` key (returned by ``retrieve``).

        Raises:
            ValueError: If ``documents`` is empty; there is nothing to embed and
                the embedding dimension could not be determined.
        """
        if not documents:
            raise ValueError("Cannot build an index from an empty document list.")

        self.documents = documents
        texts = [doc["text"] for doc in documents]

        # Generate embeddings
        print("Generating embeddings...")
        self.doc_embeddings = self.embedding_model.encode(
            texts, normalize_embeddings=True, show_progress_bar=True
        ).astype("float32")

        # Build FAISS index (inner product == cosine for normalized vectors)
        dimension = self.doc_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)
        self.index.add(self.doc_embeddings)

        print(f"Built index with {self.index.ntotal} documents")

    def retrieve(self, query: str, k: int = 10) -> List[Tuple[str, float]]:
        """Return up to ``k`` ``(doc_id, score)`` pairs, best match first.

        Args:
            query: Free-text query to embed and search for.
            k: Maximum number of hits. Values larger than the index size are
                clamped; non-positive values yield an empty list.

        Raises:
            ValueError: If ``build_index`` has not been called yet.
        """
        if self.index is None:
            raise ValueError("Index not built. Call build_index() first.")

        # Never ask FAISS for more neighbours than it holds: the surplus slots
        # would come back as padding (index -1) rather than real documents.
        k = min(k, self.index.ntotal)
        if k <= 0:
            return []

        # Encode query
        query_vector = self.embedding_model.encode(
            [query], normalize_embeddings=True
        ).astype("float32")

        # Search
        scores, indices = self.index.search(query_vector, k)

        # Return (doc_id, score) pairs
        results = []
        for idx, score in zip(indices[0], scores[0]):
            # The lower bound matters: a padding index of -1 would otherwise
            # silently resolve to the *last* document via negative indexing.
            if 0 <= idx < len(self.documents):
                doc_id = self.documents[int(idx)]["doc_id"]
                results.append((doc_id, float(score)))

        return results


# Initialize retriever and build index.
# SimpleRetriever() loads its default embedding model ("BAAI/bge-m3"), and
# build_index() embeds every text in `doc_corpus` into a FAISS index.
# The module-level `retriever` is reused by the later evaluation, simulation
# and smoke-test cells.
retriever = SimpleRetriever()
retriever.build_index(doc_corpus)

print("RAG components loaded successfully")

In [None]:
# =============================================================================
# Cell 5: Retrieval Metrics Implementation
# =============================================================================


def recall_at_k(retrieved_docs: List[str], relevant_docs: Set[str], k: int) -> float:
    """Recall@k: share of the relevant documents that show up in the top ``k``.

    Args:
        retrieved_docs: Ranked document IDs, best first.
        relevant_docs: Ground-truth relevant document IDs.
        k: Cut-off rank.

    Returns:
        ``|top-k ∩ relevant| / |relevant|``, or ``0.0`` when there are no
        relevant documents. Repeated IDs in the ranking are counted once.
    """
    if not relevant_docs:
        return 0.0

    # A set comprehension de-duplicates hits that occur more than once.
    found = {doc_id for doc_id in retrieved_docs[:k] if doc_id in relevant_docs}
    return len(found) / len(relevant_docs)


def precision_at_k(retrieved_docs: List[str], relevant_docs: Set[str], k: int) -> float:
    """Precision@k: share of the top-``k`` retrieved documents that are relevant.

    The denominator is the number of documents actually present in the top
    ``k`` (``min(k, len(retrieved_docs))``), so a retriever that returns fewer
    than ``k`` results is not penalized for the missing slots.

    Args:
        retrieved_docs: Ranked document IDs, best first.
        relevant_docs: Ground-truth relevant document IDs.
        k: Cut-off rank.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when ``k`` is not positive or nothing
        was retrieved (previously the latter raised ``ZeroDivisionError``).
    """
    if k <= 0:
        return 0.0

    top_k = retrieved_docs[:k]
    if not top_k:
        # Nothing retrieved: precision is reported as 0 instead of dividing by 0.
        return 0.0

    relevant_retrieved = set(top_k).intersection(relevant_docs)

    return len(relevant_retrieved) / len(top_k)


def average_precision(retrieved_docs: List[str], relevant_docs: Set[str]) -> float:
    """Average Precision (AP) of a ranked list against binary relevance labels.

    Precision is sampled at every rank where a *new* relevant document
    appears, and the sum is divided by the total number of relevant documents.
    A document ID that occurs again later in the ranking still occupies its
    rank but is not credited a second time, which keeps AP within ``[0, 1]``.

    Args:
        retrieved_docs: Ranked document IDs, best first.
        relevant_docs: Ground-truth relevant document IDs.

    Returns:
        AP in ``[0, 1]``; ``0.0`` when there are no relevant documents.
    """
    if not relevant_docs:
        return 0.0

    ap_sum = 0.0
    credited = set()  # relevant IDs already counted as a hit

    for rank, doc_id in enumerate(retrieved_docs, start=1):
        if doc_id in relevant_docs and doc_id not in credited:
            credited.add(doc_id)
            ap_sum += len(credited) / rank

    return ap_sum / len(relevant_docs)


def ndcg_at_k(retrieved_docs: List[str], relevant_docs: Set[str], k: int) -> float:
    """Normalized Discounted Cumulative Gain at ``k`` with binary gains.

    Each relevant document contributes ``1 / log2(rank + 1)`` (1-based rank)
    the first time it appears in the top ``k``; irrelevant documents and
    repeated IDs contribute nothing. The ideal DCG assumes the top
    ``min(k, len(relevant_docs))`` ranks are all relevant.

    Args:
        retrieved_docs: Ranked document IDs, best first.
        relevant_docs: Ground-truth relevant document IDs.
        k: Cut-off rank.

    Returns:
        nDCG in ``[0, 1]`` as a Python float; ``0.0`` when there are no
        relevant documents or ``k`` is not positive.
    """
    if not relevant_docs or k <= 0:
        return 0.0

    # DCG@k: enumerate is 0-based, so the discount for position i is log2(i + 2).
    dcg = 0.0
    credited = set()  # relevant IDs already rewarded, so duplicates add no gain
    for i, doc_id in enumerate(retrieved_docs[:k]):
        if doc_id in relevant_docs and doc_id not in credited:
            credited.add(doc_id)
            dcg += 1.0 / np.log2(i + 2)

    # IDCG@k: best achievable DCG given how many relevant documents exist.
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(k, len(relevant_docs))))

    return float(dcg / idcg) if idcg > 0 else 0.0


def coverage(
    all_retrieved_docs: List[List[str]], total_relevant_docs: Set[str]
) -> float:
    """Fraction of the relevant-document pool surfaced by at least one query.

    Args:
        all_retrieved_docs: One ranked ID list per query.
        total_relevant_docs: Union of the relevant IDs over all queries.

    Returns:
        ``|retrieved-by-any-query ∩ relevant| / |relevant|``, or ``0.0`` when
        the relevant pool is empty.
    """
    if not total_relevant_docs:
        return 0.0

    # Pool every ID that any query returned, regardless of rank.
    surfaced = set().union(*all_retrieved_docs)
    hit_count = len(surfaced.intersection(total_relevant_docs))
    return hit_count / len(total_relevant_docs)


class RetrievalEvaluator:
    """Run a retriever over a labelled query set and summarize ranking metrics.

    The retriever only needs a ``retrieve(query, k=...)`` method that returns
    ranked ``(doc_id, score)`` pairs.
    """

    def __init__(self, retriever, eval_queries, k_values=(1, 3, 5, 10)):
        """Store the evaluation configuration.

        Args:
            retriever: Object exposing ``retrieve(query, k)``.
            eval_queries: Dicts with ``query_id``, ``query``,
                ``relevant_doc_ids`` and ``category`` keys.
            k_values: Cut-off ranks for Recall/Precision/nDCG. Copied into a
                private list so instances never share (or leak mutations of)
                the default or the caller's sequence.
        """
        self.retriever = retriever
        self.eval_queries = eval_queries
        self.k_values = list(k_values)

    def evaluate(self, max_k=10) -> Dict[str, Any]:
        """Evaluate every query and aggregate the results.

        Args:
            max_k: Ranking depth used for Average Precision and coverage. If
                any entry of ``k_values`` exceeds it, the retriever is asked
                for that many results so the ``@k`` metrics are not computed
                on a truncated ranking.

        Returns:
            A dict with ``query_results`` (one metrics dict per query),
            ``aggregate_metrics`` (means over queries plus ``coverage``) and
            ``by_category`` (mean Recall@k per category). The last two are
            empty when there are no queries.
        """
        results = {"query_results": [], "aggregate_metrics": {}, "by_category": {}}

        all_retrieved = []
        all_relevant = set()

        print("Running retrieval evaluation...")

        if not self.eval_queries:
            # Nothing to score; aggregating an empty frame would raise KeyError.
            return results

        # Retrieve deep enough for the largest requested cut-off.
        depth = max([max_k, *self.k_values])

        for query_data in self.eval_queries:
            relevant_docs = query_data["relevant_doc_ids"]

            # Collect all relevant docs
            all_relevant.update(relevant_docs)

            # Retrieve documents
            retrieved_results = self.retriever.retrieve(query_data["query"], k=depth)
            retrieved_doc_ids = [doc_id for doc_id, _ in retrieved_results]
            # AP and coverage keep their documented depth of max_k.
            ids_within_max_k = retrieved_doc_ids[:max_k]
            all_retrieved.append(ids_within_max_k)

            # Calculate metrics for this query
            query_metrics = {
                "query_id": query_data["query_id"],
                "category": query_data["category"],
                "num_relevant": len(relevant_docs),
            }

            # Recall@k, Precision@k, nDCG@k for different k values
            for k in self.k_values:
                query_metrics[f"recall@{k}"] = recall_at_k(
                    retrieved_doc_ids, relevant_docs, k
                )
                query_metrics[f"precision@{k}"] = precision_at_k(
                    retrieved_doc_ids, relevant_docs, k
                )
                query_metrics[f"ndcg@{k}"] = ndcg_at_k(
                    retrieved_doc_ids, relevant_docs, k
                )

            # Average Precision
            query_metrics["average_precision"] = average_precision(
                ids_within_max_k, relevant_docs
            )

            results["query_results"].append(query_metrics)

        # Calculate aggregate metrics (plain floats, so the dict is JSON-friendly)
        df = pd.DataFrame(results["query_results"])

        aggregate = {}
        for k in self.k_values:
            aggregate[f"mean_recall@{k}"] = float(df[f"recall@{k}"].mean())
            aggregate[f"mean_precision@{k}"] = float(df[f"precision@{k}"].mean())
            aggregate[f"mean_ndcg@{k}"] = float(df[f"ndcg@{k}"].mean())

        aggregate["mean_average_precision"] = float(df["average_precision"].mean())
        aggregate["coverage"] = coverage(all_retrieved, all_relevant)

        results["aggregate_metrics"] = aggregate

        # By category analysis
        by_category = {}
        for category in df["category"].unique():
            cat_df = df[df["category"] == category]
            by_category[category] = {
                f"mean_recall@{k}": float(cat_df[f"recall@{k}"].mean())
                for k in self.k_values
            }

        results["by_category"] = by_category

        return results


print("Retrieval metrics implementation completed")

In [None]:
# =============================================================================
# Cell 6: Baseline Retrieval Evaluation
# =============================================================================

# Score the plain vector retriever; `baseline_results` is reused by later cells.
evaluator = RetrievalEvaluator(retriever, eval_queries)
baseline_results = evaluator.evaluate()
baseline_aggregate = baseline_results["aggregate_metrics"]

# Headline numbers first.
print("=== Baseline Retrieval Results ===")
print(f"Mean Average Precision: {baseline_aggregate['mean_average_precision']:.3f}")
print(f"Coverage: {baseline_aggregate['coverage']:.3f}")
print()

# One section per rank-cut-off metric: (display name, key fragment).
metric_sections = [("Recall", "recall"), ("Precision", "precision"), ("nDCG", "ndcg")]
for position, (display_name, key_fragment) in enumerate(metric_sections):
    # Every section after the first is preceded by a blank line.
    leading_break = "" if position == 0 else "\n"
    print(f"{leading_break}{display_name}@k:")
    for k in [1, 3, 5, 10]:
        metric_value = baseline_aggregate[f"mean_{key_fragment}@{k}"]
        print(f"  {display_name}@{k}: {metric_value:.3f}")

# Per-query table for ad-hoc inspection.
query_df = pd.DataFrame(baseline_results["query_results"])
print(f"\nQuery-level results saved to DataFrame with {len(query_df)} queries")

In [None]:
# =============================================================================
# Cell 7: Hybrid & Reranker Comparison (Simplified)
# =============================================================================


def simulate_hybrid_retrieval(
    query: str, k: int = 10, seed: int = 42
) -> List[Tuple[str, float]]:
    """Simulate hybrid (BM25 + vector) retrieval for demo purposes.

    No real BM25 is computed. Each of the ``2 * k`` vector candidates gets a
    pseudo-random "keyword" score, drawn from a higher range when any
    whitespace-separated query token occurs in the document text, and the
    final score is ``0.7 * vector_score + 0.3 * keyword_score``.

    Args:
        query: Query text.
        k: Number of results to return.
        seed: Seed of the private random generator. The default reproduces the
            sequence the notebook has always used; the process-wide ``random``
            state is left untouched.

    Returns:
        Up to ``k`` ``(doc_id, hybrid_score)`` pairs, best first.
    """
    import random

    # Get baseline vector results (twice as many, to give re-scoring some room)
    vector_results = retriever.retrieve(query, k=k * 2)

    # A private generator keeps the simulation reproducible without reseeding
    # the global RNG that other cells may rely on.
    rng = random.Random(seed)

    # Build the doc_id -> lower-cased text lookup once instead of scanning the
    # corpus for every (token, candidate) pair. First occurrence wins.
    text_by_id = {}
    for doc in retriever.documents:
        text_by_id.setdefault(doc["doc_id"], doc["text"].lower())

    query_tokens = query.lower().split()

    hybrid_results = []
    for doc_id, vector_score in vector_results:
        # Simulate BM25 component; an unknown doc_id counts as "no keyword match".
        doc_text = text_by_id.get(doc_id, "")
        if any(token in doc_text for token in query_tokens):
            bm25_sim = rng.uniform(0.1, 0.8)
        else:
            bm25_sim = rng.uniform(0.0, 0.3)

        # Combine scores (alpha=0.7 for vector, 0.3 for BM25)
        hybrid_score = 0.7 * vector_score + 0.3 * bm25_sim
        hybrid_results.append((doc_id, hybrid_score))

    # Sort by hybrid score and return top-k
    hybrid_results.sort(key=lambda pair: pair[1], reverse=True)
    return hybrid_results[:k]


def simulate_reranked_retrieval(query: str, k: int = 10) -> List[Tuple[str, float]]:
    """Simulate a reranker by nudging scores with the ground-truth labels.

    Demo only: the relevant IDs are looked up in ``eval_queries`` by exact
    query text. Relevant candidates are boosted (capped at 1.0) and the rest
    are slightly penalized before re-sorting.

    Args:
        query: Query text; queries absent from ``eval_queries`` get no boost.
        k: Number of results to return (``2 * k`` candidates are rescored).

    Returns:
        Up to ``k`` ``(doc_id, adjusted_score)`` pairs, best first.
    """
    candidates = retriever.retrieve(query, k=k * 2)

    # Labels of the first evaluation entry whose text matches, else no labels.
    relevant_docs = next(
        (entry["relevant_doc_ids"] for entry in eval_queries if entry["query"] == query),
        set(),
    )

    def _adjust(doc_id, score):
        # Boost relevant documents, slightly penalize irrelevant ones.
        if doc_id in relevant_docs:
            return min(1.0, score * 1.2 + 0.1)
        return score * 0.95

    rescored = [(doc_id, _adjust(doc_id, score)) for doc_id, score in candidates]

    # Stable descending sort, then keep the top-k.
    return sorted(rescored, key=lambda pair: pair[1], reverse=True)[:k]


# Create comparison retrievers
class HybridRetriever:
    """Adapter exposing ``simulate_hybrid_retrieval`` through a ``retrieve`` method."""

    def retrieve(self, query: str, k: int = 10) -> List[Tuple[str, float]]:
        """Delegate to ``simulate_hybrid_retrieval(query, k)`` and return its result."""
        return simulate_hybrid_retrieval(query, k)


class RerankedRetriever:
    """Adapter exposing ``simulate_reranked_retrieval`` through a ``retrieve`` method."""

    def retrieve(self, query: str, k: int = 10) -> List[Tuple[str, float]]:
        """Delegate to ``simulate_reranked_retrieval(query, k)`` and return its result."""
        return simulate_reranked_retrieval(query, k)


# Retrieval strategies under comparison, keyed by display name.
strategies = {
    "Vector": retriever,
    "Hybrid": HybridRetriever(),
    "Reranked": RerankedRetriever(),
}

# Run the same evaluation for every strategy.
comparison_results = {}
for strategy_name in strategies:
    strategy_retriever = strategies[strategy_name]
    print(f"Evaluating {strategy_name} retrieval...")
    evaluator = RetrievalEvaluator(strategy_retriever, eval_queries)
    comparison_results[strategy_name] = evaluator.evaluate()

# Side-by-side table: one row per metric, one column per strategy.
print("\n=== Retrieval Strategy Comparison ===")
metrics_to_compare = ["mean_recall@5", "mean_precision@5", "mean_ndcg@5", "coverage"]

aggregate_by_strategy = {
    name: outcome["aggregate_metrics"] for name, outcome in comparison_results.items()
}
# Build the full metric table, then keep only the selected rows in the given order.
comparison_df = pd.DataFrame(aggregate_by_strategy).loc[metrics_to_compare]

print(comparison_df.round(3))

In [None]:
# =============================================================================
# Cell 8: Results Visualization & Export
# =============================================================================

# 2x2 figure: three metric-vs-k line panels plus one bar panel.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle("Retrieval Metrics Comparison", fontsize=16)

k_values = [1, 3, 5, 10]

# Panels 1-3: (axes, key fragment in aggregate_metrics, label, marker).
line_panels = [
    (axes[0, 0], "recall", "Recall", "o"),
    (axes[0, 1], "precision", "Precision", "s"),
    (axes[1, 0], "ndcg", "nDCG", "^"),
]
for panel_ax, key_fragment, panel_label, marker_style in line_panels:
    for strategy in strategies:
        strategy_aggregate = comparison_results[strategy]["aggregate_metrics"]
        curve = [strategy_aggregate[f"mean_{key_fragment}@{k}"] for k in k_values]
        panel_ax.plot(k_values, curve, marker=marker_style, label=strategy)
    panel_ax.set_title(f"{panel_label}@k")
    panel_ax.set_xlabel("k")
    panel_ax.set_ylabel(panel_label)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

# Panel 4: grouped bars for the metrics that do not depend on k.
overall_metrics = ["mean_average_precision", "coverage"]
strategy_names = list(strategies)
x = np.arange(len(overall_metrics))
width = 0.25
bar_ax = axes[1, 1]

for slot, strategy in enumerate(strategy_names):
    strategy_aggregate = comparison_results[strategy]["aggregate_metrics"]
    bar_heights = [strategy_aggregate[metric] for metric in overall_metrics]
    # Shift each strategy by one bar width so the bars sit side by side.
    bar_ax.bar(x + slot * width, bar_heights, width, label=strategy)

bar_ax.set_title("Overall Metrics")
bar_ax.set_xlabel("Metrics")
bar_ax.set_ylabel("Score")
bar_ax.set_xticks(x + width)
bar_ax.set_xticklabels(["MAP", "Coverage"])
bar_ax.legend()
bar_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(
    "outs/charts/retrieval_metrics_comparison.png", dpi=300, bbox_inches="tight"
)
plt.show()

# Output locations (the chart was written by the plotting code above).
detailed_csv = "outs/eval/retrieval_metrics_detailed.csv"
aggregate_csv = "outs/eval/retrieval_metrics_aggregate.csv"
category_csv = "outs/eval/retrieval_metrics_by_category.csv"
chart_png = "outs/charts/retrieval_metrics_comparison.png"

# Per-query metrics: one row per (strategy, query).
detailed_results = [
    {"strategy": strategy_name, **query_result}
    for strategy_name, results in comparison_results.items()
    for query_result in results["query_results"]
]
detailed_df = pd.DataFrame(detailed_results)
detailed_df.to_csv(detailed_csv, index=False)

# Aggregate metrics: one row per strategy (index), one column per metric.
aggregate_by_strategy = {
    strategy_name: results["aggregate_metrics"]
    for strategy_name, results in comparison_results.items()
}
aggregate_df = pd.DataFrame(aggregate_by_strategy).T
aggregate_df.to_csv(aggregate_csv)

# Category breakdown: one row per (strategy, category).
category_data = [
    {"strategy": strategy_name, "category": category, **metrics}
    for strategy_name, results in comparison_results.items()
    for category, metrics in results["by_category"].items()
]
category_df = pd.DataFrame(category_data)
category_df.to_csv(category_csv, index=False)

print("Results exported to:")
for exported_path in (detailed_csv, aggregate_csv, category_csv, chart_png):
    print(f"- {exported_path}")

In [None]:
# =============================================================================
# Cell 9: Smoke Test
# =============================================================================


def smoke_test_metrics():
    """Verify the metric functions on a hand-computed example, then demo a live query.

    Every metric is compared with a value derived by hand for the fixture
    below, so the "calculation correct" messages are backed by exact checks
    rather than range checks only. The live section prints metrics for the
    first evaluation query using the module-level ``retriever``.

    Returns:
        ``True`` when all checks pass.

    Raises:
        AssertionError: If any metric deviates from its expected value.
    """
    print("=== Smoke Test: Metrics Calculation ===")

    # Fixture: relevant documents sit at ranks 1, 3 and 5.
    retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"]
    relevant = {"doc1", "doc3", "doc5"}
    tolerance = 1e-6

    # Test Recall@k
    recall_3 = recall_at_k(retrieved, relevant, 3)
    expected_recall_3 = 2 / 3  # doc1, doc3 found out of 3 relevant
    assert (
        abs(recall_3 - expected_recall_3) < tolerance
    ), f"Recall@3 failed: {recall_3} != {expected_recall_3}"

    # Test Precision@k
    precision_3 = precision_at_k(retrieved, relevant, 3)
    expected_precision_3 = 2 / 3  # 2 relevant out of 3 retrieved
    assert (
        abs(precision_3 - expected_precision_3) < tolerance
    ), f"Precision@3 failed: {precision_3} != {expected_precision_3}"

    # Test nDCG@k: gains at ranks 1, 3, 5 versus the ideal ranks 1, 2, 3
    ndcg_5 = ndcg_at_k(retrieved, relevant, 5)
    expected_ndcg_5 = (1 + 1 / np.log2(4) + 1 / np.log2(6)) / (
        1 + 1 / np.log2(3) + 1 / np.log2(4)
    )
    assert 0 <= ndcg_5 <= 1, f"nDCG@5 out of range: {ndcg_5}"
    assert (
        abs(ndcg_5 - expected_ndcg_5) < tolerance
    ), f"nDCG@5 failed: {ndcg_5} != {expected_ndcg_5}"

    # Test AP: precision is 1/1, 2/3 and 3/5 at the three relevant ranks
    ap = average_precision(retrieved, relevant)
    expected_ap = (1 / 1 + 2 / 3 + 3 / 5) / 3
    assert 0 <= ap <= 1, f"AP out of range: {ap}"
    assert abs(ap - expected_ap) < tolerance, f"AP failed: {ap} != {expected_ap}"

    print("✓ Recall@k calculation correct")
    print("✓ Precision@k calculation correct")
    print("✓ nDCG@k calculation correct")
    print("✓ Average Precision calculation correct")
    print("✓ All metrics smoke tests passed!")

    # Test with actual evaluation data
    print("\n=== Live Test Results ===")
    sample_query = eval_queries[0]
    sample_results = retriever.retrieve(sample_query["query"], k=5)
    sample_retrieved = [doc_id for doc_id, _ in sample_results]
    sample_relevant = sample_query["relevant_doc_ids"]

    print(f"Query: {sample_query['query']}")
    print(f"Retrieved: {sample_retrieved}")
    print(f"Relevant: {sample_relevant}")
    print(f"Recall@5: {recall_at_k(sample_retrieved, sample_relevant, 5):.3f}")
    print(f"Precision@5: {precision_at_k(sample_retrieved, sample_relevant, 5):.3f}")
    print(f"nDCG@5: {ndcg_at_k(sample_retrieved, sample_relevant, 5):.3f}")

    return True


# Sanity-check the metric functions before printing the closing summary.
smoke_test_metrics()

# Headline Recall@5 figures: the baseline, and the best across all strategies.
baseline_recall_at_5 = baseline_results["aggregate_metrics"]["mean_recall@5"]
best_recall_at_5 = max(
    comparison_results[name]["aggregate_metrics"]["mean_recall@5"]
    for name in strategies
)

print("\n🎉 nb60: Retrieval Metrics Evaluation completed successfully!")
print("📊 Results exported to outs/eval/ and outs/charts/")
print(f"📈 Baseline Mean Recall@5: {baseline_recall_at_5:.3f}")
print(f"📈 Best strategy Recall@5: {best_recall_at_5:.3f}")