In [None]:
# === E3: Retrieval-Generation Evaluation System ===
# 檢索生成分離評估系統 (Decoupled RAG Evaluation)

# Cell 1: Environment Setup and Shared Cache
import os, pathlib, torch
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Shared cache bootstrap: route the Hugging Face and PyTorch caches to one
# root directory, overridable through the AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")
for env_name, cache_dir in (
    ("HF_HOME", f"{AI_CACHE_ROOT}/hf"),
    ("TRANSFORMERS_CACHE", f"{AI_CACHE_ROOT}/hf/transformers"),
    ("HF_DATASETS_CACHE", f"{AI_CACHE_ROOT}/hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", f"{AI_CACHE_ROOT}/hf/hub"),
    ("TORCH_HOME", f"{AI_CACHE_ROOT}/torch"),
):
    # Export the variable first, then make sure the directory exists.
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

# Report the resolved cache root and whether CUDA is usable.
print(f"[Cache] Root: {AI_CACHE_ROOT}")
print(f"[GPU] Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"[GPU] Device: {torch.cuda.get_device_name(0)}")

In [None]:
# Check that the evaluation dependencies are importable.
# NOTE: nothing is installed here. A missing package is only reported, together
# with the pip command to run, and execution continues after the except block.
try:
    import evaluate
    import bert_score
    from sentence_transformers import SentenceTransformer
    import faiss

    print("[Packages] All evaluation packages loaded successfully")
except ImportError as e:
    # Report which import failed and how to install the full set.
    print(f"[Error] Missing package: {e}")
    print("Run: pip install evaluate bert-score sentence-transformers faiss-cpu")

In [None]:
# ===================================================================
# Cell 2: Evaluation Dataset Preparation
# 評估資料集準備

import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset
import json
from typing import List, Dict, Tuple, Optional


class EvaluationDatasetManager:
    """Manages evaluation datasets for RAG systems.

    Attributes are populated by the loader methods:
        eval_data: the evaluation ``Dataset`` (one row per query).
        documents: the document corpus as a list of strings; the
            ``relevant_doc_ids`` of every row index into this list.
    """

    def __init__(self):
        self.eval_data = None
        self.documents = None

    def create_synthetic_qa_dataset(self, size: int = 50) -> Dataset:
        """Create a synthetic QA dataset for evaluation.

        Five hand-written QA pairs over a ten-document corpus are cycled until
        ``size`` rows exist, so rows repeat once ``size`` exceeds five.

        Args:
            size: Number of evaluation rows to produce.

        Returns:
            The created dataset (also stored on ``self.eval_data``); the corpus
            is stored on ``self.documents``.
        """

        # Sample documents (technology domain)
        documents = [
            "Python is a high-level programming language known for its simplicity and readability. It was created by Guido van Rossum in 1991.",
            "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
            "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data.",
            "Natural language processing (NLP) is a field of AI that focuses on the interaction between computers and human language.",
            "Computer vision is a field of AI that trains computers to interpret and understand visual information from the world.",
            "Transformers are a type of neural network architecture that has revolutionized natural language processing tasks.",
            "BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained language model developed by Google.",
            "GPT (Generative Pre-trained Transformer) is an autoregressive language model that generates human-like text.",
            "Retrieval-Augmented Generation (RAG) combines information retrieval with text generation for better factual accuracy.",
            "Vector databases store high-dimensional vectors and enable efficient similarity search for AI applications.",
        ]

        # Create QA pairs based on documents
        qa_pairs = [
            {
                "query": "Who created Python and when?",
                "relevant_doc_ids": [0],
                "ground_truth": "Python was created by Guido van Rossum in 1991.",
                "context": documents[0],
            },
            {
                "query": "What is machine learning?",
                "relevant_doc_ids": [1],
                "ground_truth": "Machine learning is a subset of artificial intelligence that enables computers to learn from experience without explicit programming.",
                "context": documents[1],
            },
            {
                "query": "How do deep learning and neural networks relate?",
                "relevant_doc_ids": [2],
                "ground_truth": "Deep learning uses neural networks with multiple layers to model complex patterns in data.",
                "context": documents[2],
            },
            {
                "query": "What does NLP focus on?",
                "relevant_doc_ids": [3],
                "ground_truth": "NLP focuses on the interaction between computers and human language.",
                "context": documents[3],
            },
            {
                "query": "What is the purpose of computer vision?",
                "relevant_doc_ids": [4],
                "ground_truth": "Computer vision trains computers to interpret and understand visual information.",
                "context": documents[4],
            },
        ]

        # Expand to requested size by cycling through patterns
        expanded_qa = [
            {"id": f"eval_{i:03d}", **qa_pairs[i % len(qa_pairs)]}
            for i in range(size)
        ]

        self.documents = documents
        self.eval_data = Dataset.from_list(expanded_qa)

        print(f"[Dataset] Created {len(expanded_qa)} evaluation examples")
        print(f"[Dataset] {len(documents)} documents in knowledge base")

        return self.eval_data

    @staticmethod
    def _unpack_passages(passages) -> Tuple[List[str], List[int]]:
        """Return (passage texts, is_selected flags) as two parallel lists.

        Accepts either a dict of parallel lists (the layout the Hugging Face
        ms_marco v1.1 dataset card describes) or a list of per-passage dicts.
        Missing flags are padded with 0 so both lists have the same length.
        """
        if isinstance(passages, dict):
            texts = list(passages.get("passage_text") or [])
            flags = list(passages.get("is_selected") or [])
        else:
            entries = [p for p in (passages or []) if isinstance(p, dict)]
            texts = [p.get("passage_text", "") for p in entries]
            flags = [p.get("is_selected", 0) for p in entries]

        flags = flags[: len(texts)]
        flags += [0] * (len(texts) - len(flags))
        return texts, flags

    def load_msmarco_subset(self, size: int = 100) -> Optional[Dataset]:
        """Load MS MARCO QA subset (if available).

        Scans the first ``size`` validation examples. An example is kept when
        it has at least one answer and at least one passage flagged as
        selected; its passages are appended to the corpus and the selected
        ones become its ``relevant_doc_ids`` (indices into ``self.documents``).

        Args:
            size: Number of streamed examples to inspect (not the number kept).

        Returns:
            The loaded dataset, or ``None`` when loading failed or no usable
            example was found. On success ``self.eval_data`` and
            ``self.documents`` are both set; on failure they are left as-is.
        """
        try:
            # Try to load MS MARCO from HuggingFace
            dataset = load_dataset(
                "ms_marco", "v1.1", split="validation", streaming=True
            )

            examples = []
            documents: List[str] = []
            for i, example in enumerate(dataset):
                if i >= size:
                    break

                answers = example.get("answers") or []
                texts, flags = self._unpack_passages(example.get("passages"))

                # Corpus positions this example's passages will occupy.
                offset = len(documents)
                relevant_ids = [
                    offset + j for j, flag in enumerate(flags) if flag
                ]

                # Without an answer or a labelled passage the example cannot
                # be scored for generation or retrieval, so skip it.
                if not answers or not relevant_ids:
                    continue

                documents.extend(texts)
                examples.append(
                    {
                        "id": f"msmarco_{i:03d}",
                        "query": example["query"],
                        "relevant_doc_ids": relevant_ids,
                        "ground_truth": answers[0],
                        # First selected passage serves as the gold context.
                        "context": documents[relevant_ids[0]],
                    }
                )

            if examples:
                self.documents = documents
                self.eval_data = Dataset.from_list(examples)
                print(f"[Dataset] Loaded {len(examples)} MS MARCO examples")
                print(f"[Dataset] {len(documents)} documents in knowledge base")
                return self.eval_data

        except Exception as e:
            # Best-effort loader: network, schema, or dependency problems all
            # fall back to the synthetic dataset chosen by the caller.
            print(f"[Warning] Could not load MS MARCO: {e}")
            print("[Fallback] Using synthetic dataset...")

        return None


# Initialize dataset manager
dataset_manager = EvaluationDatasetManager()

# Try MS MARCO first, fallback to synthetic
# (load_msmarco_subset returns None when it could not produce a dataset).
eval_dataset = dataset_manager.load_msmarco_subset(size=50)
if eval_dataset is None:
    eval_dataset = dataset_manager.create_synthetic_qa_dataset(size=50)

# Print the first row as a quick sanity check; the context is cut to its
# first 100 characters.
print("\n[Sample] First evaluation example:")
print(f"Query: {eval_dataset[0]['query']}")
print(f"Ground Truth: {eval_dataset[0]['ground_truth']}")
print(f"Context: {eval_dataset[0]['context'][:100]}...")

In [None]:
# ===================================================================
# Cell 3: Retrieval Evaluation System
# 檢索評估系統

import math
from collections import defaultdict


class RetrievalEvaluator:
    """Dense retriever over a FAISS index plus standard IR metrics.

    Documents and queries are embedded with a sentence-transformers model,
    L2-normalised, and compared by inner product (i.e. cosine similarity).
    The metric methods are pure functions of ranked id lists and treat
    relevance as binary.
    """

    def __init__(
        self, embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    ):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.document_embeddings = None
        self.faiss_index = None

    def index_documents(self, documents: List[str]) -> None:
        """Create FAISS index for documents.

        Args:
            documents: Corpus to index; position in this list is the doc id.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("Cannot index an empty document list")

        print(f"[Indexing] Encoding {len(documents)} documents...")

        # Generate embeddings; cast to contiguous float32 before normalising
        # because the FAISS helpers operate in place on float32 arrays.
        embeddings = self.embedding_model.encode(documents, show_progress_bar=True)
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Normalize embeddings so inner product equals cosine similarity
        faiss.normalize_L2(embeddings)
        self.document_embeddings = embeddings  # stored in normalised form

        # Inner-product index over the normalised vectors
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        self.faiss_index = index

        print(
            f"[Indexing] Created FAISS index with {self.faiss_index.ntotal} documents"
        )

    @staticmethod
    def _drop_padding(
        doc_ids: List[int], scores: List[float]
    ) -> Tuple[List[int], List[float]]:
        """Remove result slots whose id is negative, keeping rank order.

        FAISS pads results with id -1 when fewer than k vectors are available;
        such ids must not reach callers that index a document list with them.
        """
        kept = [(d, s) for d, s in zip(doc_ids, scores) if d >= 0]
        return [d for d, _ in kept], [s for _, s in kept]

    def retrieve(self, query: str, k: int = 10) -> Tuple[List[int], List[float]]:
        """Retrieve top-k documents for a query.

        Args:
            query: Query text.
            k: Maximum number of results; non-positive values yield no results.

        Returns:
            ``(doc_ids, scores)`` in descending score order. Fewer than ``k``
            entries are returned when the index holds fewer documents.

        Raises:
            ValueError: If no documents have been indexed yet.
        """
        if self.faiss_index is None:
            raise ValueError("Documents must be indexed first")
        if k <= 0:
            return [], []

        # Encode and normalise the query the same way as the documents
        query_embedding = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype=np.float32
        )
        faiss.normalize_L2(query_embedding)

        # Search
        scores, doc_ids = self.faiss_index.search(query_embedding, k)

        return self._drop_padding(doc_ids[0].tolist(), scores[0].tolist())

    def calculate_recall_at_k(
        self, retrieved_docs: List[int], relevant_docs: List[int], k: int
    ) -> float:
        """Calculate Recall@K: fraction of relevant docs found in the top k.

        Returns 0.0 when there are no relevant documents.
        """
        relevant_set = set(relevant_docs)
        if not relevant_set:
            return 0.0

        hits = len(set(retrieved_docs[:k]) & relevant_set)
        return hits / len(relevant_set)

    def calculate_precision_at_k(
        self, retrieved_docs: List[int], relevant_docs: List[int], k: int
    ) -> float:
        """Calculate Precision@K: relevant docs in the top k, divided by k.

        The denominator is always ``k``, even if fewer docs were retrieved.
        Returns 0.0 when ``k`` is 0.
        """
        if k == 0:
            return 0.0

        hits = len(set(retrieved_docs[:k]) & set(relevant_docs))
        return hits / k

    def calculate_mrr(
        self, retrieved_docs: List[int], relevant_docs: List[int]
    ) -> float:
        """Calculate the reciprocal rank of the first relevant document.

        This is the per-query term; averaging it over queries gives MRR.
        Returns 0.0 when no retrieved document is relevant.
        """
        relevant_set = set(relevant_docs)

        for rank, doc_id in enumerate(retrieved_docs, 1):
            if doc_id in relevant_set:
                return 1.0 / rank

        return 0.0

    def calculate_ndcg_at_k(
        self, retrieved_docs: List[int], relevant_docs: List[int], k: int
    ) -> float:
        """Calculate NDCG@K (simplified binary relevance).

        Returns 0.0 when the ideal DCG is zero (no relevant docs or k <= 0).
        """
        relevant_set = set(relevant_docs)

        # DCG of the actual ranking: gain 1 at every relevant position.
        actual_dcg = sum(
            1.0 / math.log2(position + 2)
            for position, doc_id in enumerate(retrieved_docs[: max(k, 0)])
            if doc_id in relevant_set
        )

        # Ideal ranking places every distinct relevant doc first.
        ideal_hits = min(len(relevant_set), max(k, 0))
        ideal_dcg = sum(1.0 / math.log2(position + 2) for position in range(ideal_hits))

        return actual_dcg / ideal_dcg if ideal_dcg > 0 else 0.0


# Initialize retrieval evaluator
retrieval_evaluator = RetrievalEvaluator()

# Index documents
if dataset_manager.documents:
    retrieval_evaluator.index_documents(dataset_manager.documents)
else:
    print("[Warning] No documents available for indexing")

# Test retrieval evaluation. Skipped when indexing was skipped above, because
# retrieve() raises ValueError until an index exists.
if len(eval_dataset) > 0 and retrieval_evaluator.faiss_index is not None:
    sample_query = eval_dataset[0]["query"]
    retrieved_ids, scores = retrieval_evaluator.retrieve(sample_query, k=5)

    print(f"\n[Retrieval Test] Query: {sample_query}")
    print(f"[Retrieval Test] Retrieved doc IDs: {retrieved_ids}")
    print(f"[Retrieval Test] Scores: {[f'{s:.3f}' for s in scores]}")

    # Calculate metrics for this example
    relevant_ids = eval_dataset[0]["relevant_doc_ids"]
    recall_5 = retrieval_evaluator.calculate_recall_at_k(retrieved_ids, relevant_ids, 5)
    precision_5 = retrieval_evaluator.calculate_precision_at_k(
        retrieved_ids, relevant_ids, 5
    )
    mrr = retrieval_evaluator.calculate_mrr(retrieved_ids, relevant_ids)
    ndcg_5 = retrieval_evaluator.calculate_ndcg_at_k(retrieved_ids, relevant_ids, 5)

    print(f"[Metrics] Recall@5: {recall_5:.3f}")
    print(f"[Metrics] Precision@5: {precision_5:.3f}")
    print(f"[Metrics] MRR: {mrr:.3f}")
    print(f"[Metrics] NDCG@5: {ndcg_5:.3f}")

In [None]:
# ===================================================================
# Cell 4: Generation Evaluation System
# 生成評估系統

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import re


class GenerationEvaluator:
    """Evaluates generation quality with faithfulness and groundedness metrics.

    The faithfulness, groundedness, attribution and factual-consistency scores
    are lexical heuristics over lower-cased word tokens (punctuation is
    ignored); only ``calculate_bert_score`` uses a learned model.
    """

    # Function words ignored by the keyword-overlap consistency check.
    _COMMON_WORDS = frozenset(
        {
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
            "is",
            "are",
            "was",
            "were",
            "be",
            "been",
            "being",
        }
    )

    def __init__(self, device: str = "auto"):
        self.device = device
        self.nli_model = None
        self.bert_scorer = None
        self._load_models()

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """Split text into lower-cased word tokens, dropping punctuation."""
        return re.findall(r"\w+", text.lower())

    def _load_models(self):
        """Load models for evaluation.

        The two resources are loaded independently so that a failure of one
        does not disable the other.
        """
        try:
            # Load NLI model for faithfulness/consistency checking
            # NOTE(review): per its model card DialoGPT is a dialogue
            # generation checkpoint, not an NLI classifier, and no method of
            # this class reads self.nli_model. Confirm the intended model
            # before relying on it.
            print("[Loading] NLI model for faithfulness evaluation...")
            self.nli_model = pipeline(
                "text-classification",
                model="microsoft/DialoGPT-medium",  # Lightweight alternative
                device=0 if torch.cuda.is_available() else -1,
            )
        except Exception as e:
            print(f"[Warning] Could not load some evaluation models: {e}")
            print("[Fallback] Using simplified heuristic evaluations")

        try:
            # Initialize BERTScore (the module itself is used as the scorer)
            print("[Loading] BERTScore for semantic similarity...")
            self.bert_scorer = bert_score
        except NameError as e:
            # The bert_score import is optional elsewhere in the notebook.
            print(f"[Warning] Could not load some evaluation models: {e}")
            print("[Fallback] Using simplified heuristic evaluations")

    def calculate_faithfulness(self, generated_text: str, source_context: str) -> float:
        """
        Calculate faithfulness score (0-1)
        Measures if generated text is consistent with source context

        Simplified lexical version: the fraction of distinct generated tokens
        that also occur in the context. In production, use an NLI model.
        Returns 0.0 when either input is empty or has no word tokens.
        """
        if not generated_text or not source_context:
            return 0.0

        generated_words = set(self._tokenize(generated_text))
        if not generated_words:
            return 0.0

        context_words = set(self._tokenize(source_context))
        return len(generated_words & context_words) / len(generated_words)

    def calculate_groundedness(
        self, generated_text: str, source_documents: List[str]
    ) -> float:
        """
        Calculate groundedness score (0-1)
        Measures if generated text is supported by source documents

        Implemented as faithfulness against the concatenation of all
        source documents.
        """
        if not generated_text or not source_documents:
            return 0.0

        # Combine all source documents
        combined_context = " ".join(source_documents)

        # Use faithfulness calculation as proxy for groundedness
        return self.calculate_faithfulness(generated_text, combined_context)

    def calculate_attribution(
        self, generated_text: str, source_context: str
    ) -> Dict[str, float]:
        """
        Calculate attribution metrics

        Returns a dict with:
            citation_count: number of ``[n]`` or ``(source: n)`` markers.
            has_citations: bool, whether any marker was found.
            source_coverage: fraction of non-empty context sentences (split
                on ".") with one of their first three words in the generated
                text; 0.0 for an empty context.
        """
        generated_text = generated_text or ""
        attribution_metrics = {}

        # Check for explicit citations (simplified)
        citation_pattern = r"\[(\d+)\]|\(source: ?\d+\)"
        citations = re.findall(citation_pattern, generated_text)

        attribution_metrics["citation_count"] = len(citations)
        attribution_metrics["has_citations"] = len(citations) > 0

        # Calculate source coverage (how much of context is reflected).
        # Empty fragments from the split are excluded so a trailing "." does
        # not lower the score; matching is on whole words, not substrings.
        sentences = [s for s in (source_context or "").split(".") if s.strip()]
        generated_tokens = set(self._tokenize(generated_text))
        covered_sentences = sum(
            1
            for sentence in sentences
            if any(word in generated_tokens for word in self._tokenize(sentence)[:3])
        )

        attribution_metrics["source_coverage"] = (
            covered_sentences / len(sentences) if sentences else 0.0
        )

        return attribution_metrics

    def calculate_bert_score(
        self, generated_text: str, reference_text: str
    ) -> Dict[str, float]:
        """Calculate BERTScore for semantic similarity.

        Returns zeros when the scorer is unavailable, when either text is
        empty, or when scoring raises.
        """
        zero_scores = {"bert_precision": 0.0, "bert_recall": 0.0, "bert_f1": 0.0}

        # Nothing to compare, or BERTScore was not loaded.
        if self.bert_scorer is None or not generated_text or not reference_text:
            return zero_scores

        try:
            P, R, F1 = self.bert_scorer.score(
                [generated_text], [reference_text], lang="en", verbose=False
            )

            return {
                "bert_precision": P.item(),
                "bert_recall": R.item(),
                "bert_f1": F1.item(),
            }

        except Exception as e:
            print(f"[Warning] BERTScore calculation failed: {e}")
            return zero_scores

    def calculate_factual_consistency(
        self, generated_text: str, ground_truth: str
    ) -> float:
        """
        Simple factual consistency check using keyword overlap

        Returns the fraction of ground-truth key terms (word tokens minus
        common function words) that appear in the generated text, or 0.0 when
        either input is empty or the ground truth has no key terms.
        """
        if not generated_text or not ground_truth:
            return 0.0

        truth_terms = set(self._tokenize(ground_truth)) - self._COMMON_WORDS
        if not truth_terms:
            return 0.0

        generated_terms = set(self._tokenize(generated_text)) - self._COMMON_WORDS
        return len(generated_terms & truth_terms) / len(truth_terms)


# Initialize generation evaluator
generation_evaluator = GenerationEvaluator()

# Test generation evaluation on one hand-written answer. The context is the
# first corpus document (empty string if no corpus is loaded) and the
# reference is the ground truth of the first evaluation row.
sample_generated = (
    "Python was created by Guido van Rossum in 1991 and is known for its simplicity."
)
sample_context = dataset_manager.documents[0] if dataset_manager.documents else ""
sample_ground_truth = eval_dataset[0]["ground_truth"]

print("\n[Generation Eval Test]")
print(f"Generated: {sample_generated}")
print(f"Context: {sample_context}")
print(f"Ground Truth: {sample_ground_truth}")

# Calculate generation metrics
faithfulness = generation_evaluator.calculate_faithfulness(
    sample_generated, sample_context
)
groundedness = generation_evaluator.calculate_groundedness(
    sample_generated, [sample_context]
)
attribution = generation_evaluator.calculate_attribution(
    sample_generated, sample_context
)
bert_scores = generation_evaluator.calculate_bert_score(
    sample_generated, sample_ground_truth
)
factual_consistency = generation_evaluator.calculate_factual_consistency(
    sample_generated, sample_ground_truth
)

# Plain string: this line has no placeholders, so no f-prefix is needed.
print("\n[Generation Metrics]")
print(f"Faithfulness: {faithfulness:.3f}")
print(f"Groundedness: {groundedness:.3f}")
print(f"Attribution: {attribution}")
print(f"BERTScore: {bert_scores}")
print(f"Factual Consistency: {factual_consistency:.3f}")

In [None]:
# ===================================================================
# Cell 5: End-to-End RAG Evaluation
# 端到端 RAG 評估


class RAGEvaluator:
    """End-to-end evaluation of RAG systems.

    Combines a retrieval evaluator (ranking plus IR metrics) with a generation
    evaluator (text-quality metrics) and reports per-metric mean, standard
    deviation and count over an evaluation dataset.
    """

    # Number of top-ranked documents passed to the generator as context and
    # used as the cutoff for the retrieval term of the end-to-end score.
    CONTEXT_TOP_K = 3

    def __init__(
        self,
        retrieval_evaluator: RetrievalEvaluator,
        generation_evaluator: GenerationEvaluator,
    ):
        self.retrieval_eval = retrieval_evaluator
        self.generation_eval = generation_evaluator

    def evaluate_rag_system(
        self,
        eval_dataset: Dataset,
        documents: List[str],
        generator_func: callable,
        k_values: Optional[List[int]] = None,
    ) -> Dict:
        """
        Comprehensive RAG system evaluation

        Args:
            eval_dataset: Dataset with queries, relevant docs, ground truth
            documents: Document corpus
            generator_func: Function that takes (query, context) -> generated_text
            k_values: List of k values for Recall@k, Precision@k and NDCG@k;
                defaults to [1, 3, 5, 10]

        Returns:
            Dict with the keys "retrieval_metrics", "generation_metrics" and
            "end_to_end_metrics"; each maps a metric name to a dict holding
            "mean", "std" and "count".

        Raises:
            ValueError: If ``k_values`` is an empty list.
        """
        if k_values is None:
            k_values = [1, 3, 5, 10]
        if not k_values:
            raise ValueError("k_values must contain at least one cutoff")
        max_k = max(k_values)

        results = {
            "retrieval_metrics": defaultdict(list),
            "generation_metrics": defaultdict(list),
            "end_to_end_metrics": defaultdict(list),
        }
        retrieval_metrics = results["retrieval_metrics"]
        generation_metrics = results["generation_metrics"]

        print(f"[RAG Eval] Evaluating {len(eval_dataset)} examples...")

        for idx, example in enumerate(eval_dataset):
            if idx % 10 == 0:
                print(f"[Progress] {idx}/{len(eval_dataset)}")

            query = example["query"]
            relevant_doc_ids = example["relevant_doc_ids"]
            ground_truth = example["ground_truth"]

            # Step 1: Retrieval evaluation
            retrieved_ids, scores = self.retrieval_eval.retrieve(query, k=max_k)

            for k in k_values:
                retrieval_metrics[f"recall@{k}"].append(
                    self.retrieval_eval.calculate_recall_at_k(
                        retrieved_ids, relevant_doc_ids, k
                    )
                )
                retrieval_metrics[f"precision@{k}"].append(
                    self.retrieval_eval.calculate_precision_at_k(
                        retrieved_ids, relevant_doc_ids, k
                    )
                )
                retrieval_metrics[f"ndcg@{k}"].append(
                    self.retrieval_eval.calculate_ndcg_at_k(
                        retrieved_ids, relevant_doc_ids, k
                    )
                )

            # MRR (computed once per query)
            mrr = self.retrieval_eval.calculate_mrr(retrieved_ids, relevant_doc_ids)
            retrieval_metrics["mrr"].append(mrr)

            # Step 2: Generation evaluation
            # Use top-3 retrieved documents as context. Ids outside the corpus
            # are dropped, including negative ids (which would otherwise wrap
            # around to the end of the list).
            top_retrieved_docs = [
                documents[doc_id]
                for doc_id in retrieved_ids[: self.CONTEXT_TOP_K]
                if 0 <= doc_id < len(documents)
            ]

            # Without any context there is nothing to generate from, so this
            # example contributes to the retrieval metrics only.
            if not top_retrieved_docs:
                continue

            combined_context = "\n".join(top_retrieved_docs)

            # Generate response using provided generator function; a failing
            # generator is scored as an empty answer rather than aborting.
            try:
                generated_text = generator_func(query, combined_context)
            except Exception as e:
                print(f"[Warning] Generation failed for example {idx}: {e}")
                generated_text = ""

            # Calculate generation metrics
            faithfulness = self.generation_eval.calculate_faithfulness(
                generated_text, combined_context
            )
            groundedness = self.generation_eval.calculate_groundedness(
                generated_text, top_retrieved_docs
            )
            factual_consistency = self.generation_eval.calculate_factual_consistency(
                generated_text, ground_truth
            )
            bert_scores = self.generation_eval.calculate_bert_score(
                generated_text, ground_truth
            )
            attribution = self.generation_eval.calculate_attribution(
                generated_text, combined_context
            )

            generation_metrics["faithfulness"].append(faithfulness)
            generation_metrics["groundedness"].append(groundedness)
            generation_metrics["factual_consistency"].append(factual_consistency)
            generation_metrics["bert_f1"].append(bert_scores["bert_f1"])
            generation_metrics["source_coverage"].append(
                attribution["source_coverage"]
            )

            # End-to-end metrics (combine retrieval + generation)
            # E2E score: weighted combination of retrieval and generation
            # performance. Recall@3 is computed explicitly here so the score
            # does not depend on which cutoffs k_values happens to contain.
            retrieval_score = self.retrieval_eval.calculate_recall_at_k(
                retrieved_ids, relevant_doc_ids, self.CONTEXT_TOP_K
            )
            generation_score = (faithfulness + factual_consistency) / 2
            e2e_score = 0.4 * retrieval_score + 0.6 * generation_score

            results["end_to_end_metrics"]["e2e_score"].append(e2e_score)

        # Aggregate results (compute means); metrics with no values are omitted
        aggregated_results = {}
        for category in [
            "retrieval_metrics",
            "generation_metrics",
            "end_to_end_metrics",
        ]:
            aggregated_results[category] = {
                metric: {
                    "mean": np.mean(values),
                    "std": np.std(values),
                    "count": len(values),
                }
                for metric, values in results[category].items()
                if values
            }

        return aggregated_results


# Simple generator function for testing
def simple_generator(query: str, context: str) -> str:
    """Build a templated answer from the opening of the context.

    Stand-in for a real LLM: the first two "."-separated pieces of the context
    are stripped, empty ones dropped, and the rest joined with ". ". The
    sentence prefix is picked by which question word the lower-cased query
    contains ("who"/"when", then "what", otherwise a generic prefix).
    """
    lowered_query = query.lower()

    # Keep only the non-empty pieces among the first two context fragments.
    pieces = []
    for fragment in context.split(".")[:2]:
        cleaned = fragment.strip()
        if cleaned:
            pieces.append(cleaned)
    relevant_context = ". ".join(pieces)

    # Template selection, checked in priority order.
    if "who" in lowered_query or "when" in lowered_query:
        return f"Based on the provided context: {relevant_context}."
    if "what" in lowered_query:
        return f"According to the information: {relevant_context}."
    return f"The answer is: {relevant_context}."


# Initialize RAG evaluator
rag_evaluator = RAGEvaluator(retrieval_evaluator, generation_evaluator)

# Run evaluation on a subset for demonstration
eval_subset = eval_dataset.select(
    range(min(10, len(eval_dataset)))
)  # First 10 examples (fewer if the dataset is smaller)

print("\n[RAG Evaluation] Running end-to-end evaluation...")
# k_values is passed explicitly as [1, 3, 5] instead of the method's default.
rag_results = rag_evaluator.evaluate_rag_system(
    eval_subset, dataset_manager.documents, simple_generator, k_values=[1, 3, 5]
)

In [None]:
# ===================================================================
# Cell 6: Evaluation Report Generation
# 評估報告生成

import matplotlib.pyplot as plt
import seaborn as sns


class EvaluationReporter:
    """Render aggregated RAG evaluation results as text, plots and advice.

    Every public method takes the aggregated ``results`` mapping:
    category name (``"retrieval_metrics"``, ``"generation_metrics"``,
    ``"end_to_end_metrics"``) -> metric name ->
    ``{"mean": ..., "std": ..., "count": ...}``. Categories that are absent
    from ``results`` are skipped.
    """

    # (results key, heading) pairs, in the order the summary prints them.
    _SUMMARY_SECTIONS = (
        ("retrieval_metrics", "\n📊 RETRIEVAL PERFORMANCE"),
        ("generation_metrics", "\n✍️  GENERATION PERFORMANCE"),
        ("end_to_end_metrics", "\n🎯 END-TO-END PERFORMANCE"),
    )

    # (results key, metric key, axis label) candidates for the radar chart.
    _RADAR_METRICS = (
        ("retrieval_metrics", "recall@5", "Recall@5"),
        ("retrieval_metrics", "mrr", "MRR"),
        ("generation_metrics", "faithfulness", "Faithfulness"),
        ("generation_metrics", "factual_consistency", "Factual Consistency"),
    )

    def __init__(self):
        """Apply the plotting style used by every figure this reporter draws."""
        plt.style.use("default")
        sns.set_palette("husl")

    def print_results_summary(self, results: Dict) -> None:
        """Print a formatted ``mean ± std (n=count)`` table per category.

        Args:
            results: Aggregated evaluation results (see the class docstring).
        """
        print("\n" + "=" * 60)
        print("🔍 RAG SYSTEM EVALUATION REPORT")
        print("=" * 60)

        for category, heading in self._SUMMARY_SECTIONS:
            if category not in results:
                continue
            print(heading)
            print("-" * 30)
            for metric, stats in results[category].items():
                print(
                    f"{metric:15}: {stats['mean']:.3f} ± {stats['std']:.3f} (n={stats['count']})"
                )

        print("\n" + "=" * 60)

    @staticmethod
    def _plot_metric_bars(ax, metric_stats: Dict, title: str, color=None) -> None:
        """Draw one bar per metric (mean, with std as error bar) on ``ax``."""
        names = list(metric_stats)
        means = [metric_stats[name]["mean"] for name in names]
        stds = [metric_stats[name]["std"] for name in names]
        positions = range(len(names))

        # Only pass ``color`` when one was requested so the default palette
        # still applies otherwise.
        bar_kwargs = {} if color is None else {"color": color}
        ax.bar(positions, means, yerr=stds, capsize=5, alpha=0.7, **bar_kwargs)
        ax.set_xticks(positions)
        ax.set_xticklabels(names, rotation=45, ha="right")
        ax.set_title(title)
        ax.set_ylabel("Score")
        ax.grid(True, alpha=0.3)

    @staticmethod
    def _recall_at_k_series(retrieval_stats: Dict):
        """Return ``(k_labels, mean_recalls)`` ordered by ascending numeric K.

        Only metrics whose name starts with ``"recall@"`` are used. Sorting
        numerically keeps the curve left-to-right in K regardless of the
        order in which the metrics were inserted into the dict.
        """
        series = []
        for name, stats in retrieval_stats.items():
            if not name.startswith("recall@"):
                continue
            k_label = name.split("@", 1)[1]
            try:
                order = float(k_label)
            except ValueError:
                # A non-numeric suffix sorts last instead of aborting the plot.
                order = float("inf")
            series.append((order, k_label, stats["mean"]))

        series.sort(key=lambda item: item[0])
        return [label for _, label, _ in series], [mean for _, _, mean in series]

    def _plot_recall_curve(self, ax, retrieval_stats: Dict) -> None:
        """Plot mean Recall@K against K on ``ax``."""
        k_labels, recall_means = self._recall_at_k_series(retrieval_stats)
        ax.plot(k_labels, recall_means, marker="o", linewidth=2, markersize=8)
        ax.set_title("Recall@K Performance")
        ax.set_xlabel("K (Number of Retrieved Documents)")
        ax.set_ylabel("Recall@K")
        ax.grid(True, alpha=0.3)

    def _plot_radar(self, fig, placeholder_ax, results: Dict) -> None:
        """Draw the key-metric radar chart in the bottom-right grid cell.

        A radar chart needs polar axes, so when at least three key metrics
        are available the Cartesian ``placeholder_ax`` is removed and a polar
        subplot is created in the same cell. With fewer metrics a notice is
        written on the placeholder instead.
        """
        labels = []
        scores = []
        for category, metric, label in self._RADAR_METRICS:
            if metric in results[category]:
                labels.append(label)
                scores.append(results[category][metric]["mean"])

        if len(labels) < 3:
            placeholder_ax.text(
                0.5,
                0.5,
                "Insufficient data\nfor radar chart",
                ha="center",
                va="center",
                transform=placeholder_ax.transAxes,
            )
            placeholder_ax.set_title("Overall Performance Profile")
            return

        placeholder_ax.remove()
        radar_ax = fig.add_subplot(2, 2, 4, projection="polar")

        angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False)
        # Repeat the first point so the outline closes into a polygon.
        closed_angles = np.concatenate((angles, angles[:1]))
        closed_scores = scores + scores[:1]

        radar_ax.plot(
            closed_angles, closed_scores, "o-", linewidth=2, label="RAG System"
        )
        radar_ax.fill(closed_angles, closed_scores, alpha=0.25)
        radar_ax.set_xticks(angles)
        radar_ax.set_xticklabels(labels)
        radar_ax.set_ylim(0, 1)
        radar_ax.set_title("Overall Performance Profile")
        radar_ax.grid(True)

    def create_performance_plots(self, results: Dict) -> None:
        """Show a 2x2 figure summarising the evaluation.

        Panels: retrieval metric bars, generation metric bars, the Recall@K
        curve, and a radar chart of key metrics. A panel whose input
        category is missing from ``results`` is left empty.

        Args:
            results: Aggregated evaluation results (see the class docstring).
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle("RAG System Evaluation Results", fontsize=16, fontweight="bold")

        has_retrieval = "retrieval_metrics" in results
        has_generation = "generation_metrics" in results

        if has_retrieval:
            self._plot_metric_bars(
                axes[0, 0],
                results["retrieval_metrics"],
                "Retrieval Performance Metrics",
            )
        if has_generation:
            self._plot_metric_bars(
                axes[0, 1],
                results["generation_metrics"],
                "Generation Performance Metrics",
                color="orange",
            )
        if has_retrieval:
            self._plot_recall_curve(axes[1, 0], results["retrieval_metrics"])
        if has_retrieval and has_generation:
            self._plot_radar(fig, axes[1, 1], results)

        plt.tight_layout()
        plt.show()

    @staticmethod
    def _mean_below(metric_stats: Dict, metric: str, threshold: float) -> bool:
        """Return True when ``metric`` is present and its mean is under ``threshold``."""
        return metric in metric_stats and metric_stats[metric]["mean"] < threshold

    def generate_recommendations(self, results: Dict) -> List[str]:
        """Turn low metric means into human-readable improvement advice.

        Thresholds: Recall@5 < 0.5, MRR < 0.3, faithfulness < 0.6,
        factual consistency < 0.5, end-to-end score < 0.6. Metrics that are
        absent never trigger advice.

        Args:
            results: Aggregated evaluation results (see the class docstring).

        Returns:
            One message per triggered threshold, in the order above; a single
            "looks good" message when none is triggered.
        """
        recommendations = []

        retrieval_data = results.get("retrieval_metrics", {})
        if self._mean_below(retrieval_data, "recall@5", 0.5):
            recommendations.append(
                "🔍 Low Recall@5 detected. Consider: (1) Using better embedding models "
                "(e.g., bge-large), (2) Improving document chunking strategy, "
                "(3) Adding query expansion or reformulation"
            )
        if self._mean_below(retrieval_data, "mrr", 0.3):
            recommendations.append(
                "📊 Low MRR suggests relevant documents are not ranked highly. "
                "Consider: (1) Using reranking models, (2) Hybrid search (BM25 + vector), "
                "(3) Fine-tuning embedding models on domain data"
            )

        generation_data = results.get("generation_metrics", {})
        if self._mean_below(generation_data, "faithfulness", 0.6):
            recommendations.append(
                "✍️  Low faithfulness score indicates generated text may not align with sources. "
                "Consider: (1) Using instruction-tuned models, (2) Improving prompt templates, "
                "(3) Adding citation requirements in prompts"
            )
        if self._mean_below(generation_data, "factual_consistency", 0.5):
            recommendations.append(
                "❌ Low factual consistency. Consider: (1) Using more factual-aware LLMs, "
                "(2) Adding fact-checking post-processing, (3) Constraining generation "
                "to be more conservative"
            )

        e2e_data = results.get("end_to_end_metrics", {})
        if self._mean_below(e2e_data, "e2e_score", 0.6):
            recommendations.append(
                "🎯 Overall system performance is below target. Focus on: "
                "(1) The component with lower individual scores, "
                "(2) Better retrieval-generation alignment, "
                "(3) End-to-end fine-tuning approaches"
            )

        if not recommendations:
            recommendations.append(
                "✅ System performance looks good! Consider: (1) Testing on larger datasets, "
                "(2) Evaluating on out-of-domain queries, (3) A/B testing with users"
            )

        return recommendations


# Build the reporter; its constructor also applies the matplotlib/seaborn style.
reporter = EvaluationReporter()

# Text summary of the aggregated results computed in the previous cell.
print("\n🔎 Generating Evaluation Report...")
reporter.print_results_summary(rag_results)

# 2x2 figure with the same results.
print("\n📊 Creating Performance Visualizations...")
reporter.create_performance_plots(rag_results)

# Threshold-based advice, printed as a numbered list starting at 1.
print("\n💡 IMPROVEMENT RECOMMENDATIONS")
print("=" * 50)
recommendations = reporter.generate_recommendations(rag_results)
for i, rec in enumerate(recommendations, 1):
    print(f"\n{i}. {rec}")

In [None]:
# ===================================================================
# Cell 7: Advanced Evaluation Features
# 進階評估功能


class AdvancedEvaluator:
    """Advanced evaluation features for production RAG systems.

    Offers per-query-type evaluation, an embedding-model ablation study and
    bootstrap confidence intervals over raw metric scores.
    """

    # Ordered (query type, regex) rules; the first matching rule wins.
    # Question words must match as whole words, so "show" is not read as
    # "how" and "whole" is not read as "who". Content words match at the
    # start of a word, so inflections such as "explains" still count.
    _QUERY_TYPE_RULES = (
        ("factual", r"\b(?:who|whom|whose|when|where)\b"),
        ("definitional", r"\b(?:what\b|define|explain)"),
        ("procedural", r"\b(?:how|why)\b"),
        ("comparative", r"\b(?:compare|difference|versus)"),
    )

    def __init__(self):
        # Not written by any method of this class; kept for callers that
        # want to accumulate their own run records.
        self.evaluation_history = []

    @staticmethod
    def _categorize_query(query: str) -> str:
        """Classify ``query`` as factual/definitional/procedural/comparative/other."""
        import re  # local import: keeps the class independent of cell-level imports

        query_lower = query.lower()
        for query_type, pattern in AdvancedEvaluator._QUERY_TYPE_RULES:
            if re.search(pattern, query_lower):
                return query_type
        return "other"

    def evaluate_query_types(
        self,
        eval_dataset: "Dataset",
        rag_evaluator: "RAGEvaluator",
        documents: List[str],
        generator_func: callable,
        min_examples: int = 3,
        k_values: Optional[List[int]] = None,
    ) -> Dict:
        """Evaluate performance separately for each query type.

        Args:
            eval_dataset: Dataset whose examples expose a ``"query"`` field
                and which supports ``select(indices)``.
            rag_evaluator: Evaluator providing ``evaluate_rag_system``.
            documents: Corpus passed through to the evaluator.
            generator_func: ``(query, context) -> answer`` callable.
            min_examples: Query types with fewer examples are not evaluated.
            k_values: Retrieval cut-offs; defaults to ``[3, 5]``.

        Returns:
            Mapping from query type to that type's aggregated results.
        """
        cutoffs = [3, 5] if k_values is None else list(k_values)

        # Group example indices by query type (insertion order preserved).
        query_types: Dict[str, List[int]] = {}
        for idx, example in enumerate(eval_dataset):
            query_type = self._categorize_query(example["query"])
            query_types.setdefault(query_type, []).append(idx)

        print(f"\n[Query Analysis] Found {len(query_types)} query types:")
        for qtype, indices in query_types.items():
            print(f"  {qtype}: {len(indices)} queries")

        type_results = {}
        for qtype, indices in query_types.items():
            if len(indices) < min_examples:
                # Too few examples for a meaningful mean/std.
                continue
            subset = eval_dataset.select(indices)
            type_results[qtype] = rag_evaluator.evaluate_rag_system(
                subset, documents, generator_func, k_values=cutoffs
            )
            print(f"\n[{qtype.upper()}] Completed evaluation")

        return type_results

    def ablation_study(
        self,
        eval_dataset: "Dataset",
        documents: List[str],
        generator_func: callable,
        generation_eval: "Optional[GenerationEvaluator]" = None,
        embedding_models: Optional[List[str]] = None,
    ) -> Dict:
        """Compare embedding models on a small subset of ``eval_dataset``.

        For each model a fresh ``RetrievalEvaluator`` is built, the corpus is
        indexed, and the RAG pipeline is evaluated on at most the first five
        examples.

        Args:
            eval_dataset: Dataset supporting ``len`` and ``select``.
            documents: Corpus to index for every model.
            generator_func: ``(query, context) -> answer`` callable.
            generation_eval: Generation evaluator to pair with each retriever.
                Defaults to the notebook-level ``generation_evaluator``.
            embedding_models: Model names to compare; defaults to the
                MiniLM-L6-v2 and mpnet-base-v2 sentence-transformers models.

        Returns:
            Mapping ``"embedding_<model short name>"`` -> aggregated results.
        """
        if generation_eval is None:
            # Backward-compatible default: the evaluator created at notebook level.
            generation_eval = generation_evaluator
        if embedding_models is None:
            embedding_models = [
                "sentence-transformers/all-MiniLM-L6-v2",
                "sentence-transformers/all-mpnet-base-v2",
            ]

        # The subset does not depend on the model, so select it once.
        subset = eval_dataset.select(range(min(5, len(eval_dataset))))

        ablation_results = {}
        for model_name in embedding_models:
            print(f"\n[Ablation] Testing embedding model: {model_name}")

            retrieval_eval = RetrievalEvaluator(model_name)
            retrieval_eval.index_documents(documents)
            rag_eval = RAGEvaluator(retrieval_eval, generation_eval)

            results = rag_eval.evaluate_rag_system(subset, documents, generator_func)
            ablation_results[f"embedding_{model_name.split('/')[-1]}"] = results

        return ablation_results

    @staticmethod
    def _as_sample(values):
        """Return ``values`` as a flat float array, or None if it is not raw scores.

        Raw scores are a list/tuple/ndarray of at least two numbers.
        Aggregated ``{"mean", "std", "count"}`` dicts are rejected, because a
        bootstrap cannot be run on summary statistics.
        """
        if not isinstance(values, (list, tuple, np.ndarray)):
            return None
        try:
            sample = np.asarray(values, dtype=float).ravel()
        except (TypeError, ValueError):
            return None
        return sample if sample.size >= 2 else None

    def confidence_intervals(
        self,
        results: Dict,
        confidence_level: float = 0.95,
        n_bootstrap: int = 1000,
        seed: Optional[int] = None,
    ) -> Dict:
        """Compute percentile-bootstrap confidence intervals for metric means.

        Args:
            results: Mapping category -> metric -> raw per-example scores
                (list, tuple or ndarray). Entries holding aggregated
                statistics instead of raw scores are skipped.
            confidence_level: Two-sided coverage, strictly between 0 and 1.
            n_bootstrap: Number of bootstrap resamples per metric.
            seed: Optional seed making the resampling reproducible.

        Returns:
            Mapping category -> metric ->
            ``{"mean", "ci_lower", "ci_upper", "confidence_level"}`` for every
            metric with raw scores; ``{}`` when none has them.

        Raises:
            ValueError: If ``confidence_level`` is outside (0, 1) or
                ``n_bootstrap`` is not positive.
        """
        if not 0.0 < confidence_level < 1.0:
            raise ValueError(
                f"confidence_level must be in (0, 1), got {confidence_level}"
            )
        if n_bootstrap < 1:
            raise ValueError(f"n_bootstrap must be positive, got {n_bootstrap}")

        # Percentage cut off in each tail, e.g. 2.5 for a 95% interval.
        tail = (1.0 - confidence_level) / 2.0 * 100.0
        rng = np.random.default_rng(seed)

        intervals: Dict = {}
        skipped = False
        for category, metrics in results.items():
            if not isinstance(metrics, dict):
                skipped = True
                continue
            for metric, values in metrics.items():
                sample = self._as_sample(values)
                if sample is None:
                    skipped = True
                    continue
                # Each row is one resample (with replacement) of the scores.
                resampled = rng.choice(
                    sample, size=(n_bootstrap, sample.size), replace=True
                )
                ci_lower, ci_upper = np.percentile(
                    resampled.mean(axis=1), [tail, 100.0 - tail]
                )
                intervals.setdefault(category, {})[metric] = {
                    "mean": float(sample.mean()),
                    "ci_lower": float(ci_lower),
                    "ci_upper": float(ci_upper),
                    "confidence_level": confidence_level,
                }

        if skipped or not intervals:
            print("[Info] Confidence interval calculation requires raw evaluation values")
            print(
                "[Info] Consider storing individual scores during evaluation for CI computation"
            )

        return intervals


# Run advanced evaluations
advanced_evaluator = AdvancedEvaluator()

print("\n🔬 Running Advanced Evaluations...")

# Query type analysis
print("\n1. Query Type Analysis")
# evaluate_query_types takes (eval_dataset, rag_evaluator, documents,
# generator_func); the evaluator must be passed explicitly, otherwise the
# remaining arguments shift by one and the call raises TypeError.
query_type_results = advanced_evaluator.evaluate_query_types(
    eval_subset, rag_evaluator, dataset_manager.documents, simple_generator
)

# Print the headline metric of each evaluated query type, when present.
for qtype, results in query_type_results.items():
    print(f"\n[{qtype.upper()}] Performance Summary:")
    if "retrieval_metrics" in results and "recall@5" in results["retrieval_metrics"]:
        recall = results["retrieval_metrics"]["recall@5"]["mean"]
        print(f"  Recall@5: {recall:.3f}")
    if (
        "generation_metrics" in results
        and "faithfulness" in results["generation_metrics"]
    ):
        faithfulness = results["generation_metrics"]["faithfulness"]["mean"]
        print(f"  Faithfulness: {faithfulness:.3f}")

# Ablation study (if sufficient resources)
print("\n2. Ablation Study")
# Best effort: loading extra embedding models may fail (download, memory),
# and that should not abort the rest of the notebook.
try:
    ablation_results = advanced_evaluator.ablation_study(
        eval_subset, dataset_manager.documents, simple_generator
    )

    print("\n[Ablation] Embedding Model Comparison:")
    for model_variant, results in ablation_results.items():
        if (
            "retrieval_metrics" in results
            and "recall@5" in results["retrieval_metrics"]
        ):
            recall = results["retrieval_metrics"]["recall@5"]["mean"]
            print(f"  {model_variant}: Recall@5 = {recall:.3f}")

except Exception as e:
    print(f"[Ablation] Skipped due to resource constraints: {e}")

In [None]:
# ===================================================================
# Cell 8: Smoke Test & Validation
# 驗收測試


def run_evaluation_smoke_test():
    """Run a quick smoke test of the evaluation pipeline.

    Five checks run in order: dataset creation, retrieval evaluation,
    generation evaluation, end-to-end evaluation and report generation.
    A failing check is reported and counted; it never aborts the run.
    Later checks that need an object an earlier check failed to create fail
    with a message naming the missing prerequisite.

    Returns:
        True when every check passed, False otherwise.
    """

    print("\n🧪 EVALUATION SYSTEM SMOKE TEST")
    print("=" * 40)

    test_results = {"passed": 0, "total": 0}

    # Objects created by one check and consumed by later ones.
    artifacts = {}

    # Retrieval scores may be NumPy scalars (float32 is not a Python float),
    # so accept NumPy numeric types as well.
    numeric_types = (int, float, np.integer, np.floating)

    def require(*names):
        """Return the named artifacts, or fail clearly if any is missing."""
        missing = [name for name in names if name not in artifacts]
        if missing:
            raise RuntimeError(
                "prerequisite from an earlier check is missing: " + ", ".join(missing)
            )
        return tuple(artifacts[name] for name in names)

    def check_dataset_creation():
        # A private manager is used so the smoke test cannot change whatever
        # state the notebook-level dataset_manager holds.
        test_dataset = EvaluationDatasetManager().create_synthetic_qa_dataset(size=3)
        artifacts["dataset"] = test_dataset
        assert len(test_dataset) == 3, f"expected 3 examples, got {len(test_dataset)}"
        assert "query" in test_dataset[0], "example has no 'query' field"
        assert "ground_truth" in test_dataset[0], "example has no 'ground_truth' field"

    def check_retrieval():
        test_retrieval_eval = RetrievalEvaluator()
        test_docs = [
            "Document 1 about Python",
            "Document 2 about AI",
            "Document 3 about ML",
        ]
        artifacts["retrieval_eval"] = test_retrieval_eval
        artifacts["docs"] = test_docs
        test_retrieval_eval.index_documents(test_docs)

        retrieved_ids, scores = test_retrieval_eval.retrieve("Python programming", k=2)
        assert len(retrieved_ids) == 2, f"expected 2 ids, got {len(retrieved_ids)}"
        assert len(scores) == 2, f"expected 2 scores, got {len(scores)}"
        assert all(
            isinstance(score, numeric_types) for score in scores
        ), "retrieval scores are not numeric"

        recall = test_retrieval_eval.calculate_recall_at_k([0, 1], [0], 2)
        assert 0 <= recall <= 1, f"recall out of range: {recall}"

    def check_generation():
        test_generation_eval = GenerationEvaluator()
        artifacts["generation_eval"] = test_generation_eval

        faithfulness = test_generation_eval.calculate_faithfulness(
            "Python is a programming language",
            "Python is a high-level programming language",
        )
        assert 0 <= faithfulness <= 1, f"faithfulness out of range: {faithfulness}"

        attribution = test_generation_eval.calculate_attribution(
            "Python was created by Guido", "Python was created by Guido van Rossum"
        )
        assert isinstance(attribution, dict), "attribution is not a dict"
        assert "source_coverage" in attribution, "attribution lacks 'source_coverage'"

    def check_end_to_end():
        test_dataset, test_retrieval_eval, test_generation_eval, test_docs = require(
            "dataset", "retrieval_eval", "generation_eval", "docs"
        )
        test_rag_eval = RAGEvaluator(test_retrieval_eval, test_generation_eval)

        def dummy_generator(query, context):
            return f"Answer: {context[:50]}"

        small_dataset = test_dataset.select([0])  # Just one example
        results = test_rag_eval.evaluate_rag_system(
            small_dataset, test_docs, dummy_generator, k_values=[1]
        )
        artifacts["results"] = results

        for category in ("retrieval_metrics", "generation_metrics", "end_to_end_metrics"):
            assert category in results, f"results lack '{category}'"

    def check_report_generation():
        (results,) = require("results")
        test_reporter = EvaluationReporter()
        recommendations = test_reporter.generate_recommendations(results)
        assert isinstance(recommendations, list), "recommendations is not a list"
        assert len(recommendations) > 0, "no recommendations were produced"

    checks = [
        ("Dataset creation", check_dataset_creation),
        ("Retrieval evaluation", check_retrieval),
        ("Generation evaluation", check_generation),
        ("End-to-end evaluation", check_end_to_end),
        ("Report generation", check_report_generation),
    ]

    for label, check in checks:
        test_results["total"] += 1
        try:
            check()
        except Exception as e:  # a smoke test reports any failure and moves on
            print(f"❌ {label}: FAILED - {e}")
        else:
            print(f"✅ {label}: PASSED")
            test_results["passed"] += 1

    # Final results
    print(
        f"\n📊 SMOKE TEST RESULTS: {test_results['passed']}/{test_results['total']} PASSED"
    )

    if test_results["passed"] == test_results["total"]:
        print("🎉 All tests passed! Evaluation system is ready.")
        return True

    print("⚠️  Some tests failed. Check the error messages above.")
    return False


# Run smoke test
smoke_test_passed = run_evaluation_smoke_test()

In [None]:
# ===================================================================
# Cell 9: Usage Examples & Best Practices
# 使用範例與最佳實踐

# This cell only prints reference text; nothing inside the two string
# literals below is executed.
print("\n📚 USAGE EXAMPLES & BEST PRACTICES")
print("=" * 50)

# Illustrative code snippets, shown to the reader as plain text.
usage_examples = """
# 1. 基本評估流程 (Basic Evaluation Pipeline)
from evaluation_system import RAGEvaluator, RetrievalEvaluator, GenerationEvaluator

# Initialize evaluators
retrieval_eval = RetrievalEvaluator("sentence-transformers/all-MiniLM-L6-v2")
generation_eval = GenerationEvaluator()
rag_eval = RAGEvaluator(retrieval_eval, generation_eval)

# Index your documents
retrieval_eval.index_documents(your_documents)

# Define your generator function
def your_llm_generator(query: str, context: str) -> str:
    # Your LLM call here
    return generated_response

# Run evaluation
results = rag_eval.evaluate_rag_system(
    eval_dataset, your_documents, your_llm_generator
)

# 2. 自訂評估指標 (Custom Evaluation Metrics)
class CustomEvaluator(GenerationEvaluator):
    def calculate_domain_relevance(self, generated_text, domain_keywords):
        # Your domain-specific evaluation logic
        pass

# 3. 批量模型比較 (Batch Model Comparison)
models_to_test = [
    ("model_a", model_a_generator),
    ("model_b", model_b_generator),
]

comparison_results = {}
for model_name, generator in models_to_test:
    results = rag_eval.evaluate_rag_system(eval_dataset, documents, generator)
    comparison_results[model_name] = results

# 4. 持續評估監控 (Continuous Evaluation Monitoring)
def setup_evaluation_monitoring():
    # Set up regular evaluation runs
    # Log results to monitoring system
    # Alert on performance degradation
    pass
"""

# Checklist of evaluation best practices, shown to the reader as plain text.
best_practices = """
🎯 EVALUATION BEST PRACTICES:

1. 資料集品質 (Dataset Quality):
   • 使用多樣化的查詢類型 (factual, definitional, comparative)
   • 確保 ground truth 的準確性與完整性
   • 定期更新評估資料集以反映實際使用情況

2. 指標選擇 (Metric Selection):
   • 檢索：優先 Recall@k 和 MRR，關注相關文檔的排序
   • 生成：結合事實性 (faithfulness) 和流暢性指標
   • 端到端：設計符合業務目標的複合指標

3. 評估頻率 (Evaluation Frequency):
   • 開發階段：每次模型變更後評估
   • 生產階段：定期評估 (週/月) + 關鍵變更後評估
   • A/B 測試：並行評估不同系統版本

4. 計算資源管理 (Resource Management):
   • 使用較小的評估集進行快速迭代
   • 完整評估使用較大資料集
   • 考慮使用 4bit/8bit 量化以降低記憶體需求

5. 結果解釋 (Result Interpretation):
   • 關注趨勢而非絕對數值
   • 結合定量指標與定性分析
   • 考慮不同查詢類型的差異表現
"""

print(usage_examples)
print(best_practices)

In [None]:
# ===================================================================
# Final Summary
# Print the closing banner and a static summary of the notebook.
print("\n" + "=" * 60)
print("🎊 NOTEBOOK COMPLETION SUMMARY")
print("=" * 60)

# The only dynamic part is the last line, which reflects `smoke_test_passed`
# from the smoke-test cell; everything else is fixed descriptive text.
completion_summary = f"""
✅ COMPLETED COMPONENTS:
• Dataset Management: Synthetic QA dataset creation + MS MARCO integration
• Retrieval Evaluation: Recall@k, Precision@k, MRR, NDCG metrics
• Generation Evaluation: Faithfulness, groundedness, attribution scoring
• End-to-End Pipeline: Integrated RAG system evaluation
• Advanced Features: Query type analysis, ablation studies
• Reporting System: Automated report generation with visualizations
• Validation: Comprehensive smoke test suite

🔧 CORE CAPABILITIES:
• Decoupled evaluation of retrieval and generation components
• Multiple embedding model support with easy switching
• Production-ready evaluation pipeline with error handling
• Extensible architecture for custom metrics and evaluators
• Memory-efficient evaluation with 4bit/8bit support

📊 EVALUATION METRICS IMPLEMENTED:
• Retrieval: Recall@k, Precision@k, MRR, NDCG@k
• Generation: Faithfulness, Groundedness, Attribution, BERTScore
• End-to-End: Composite RAG performance scoring

💡 KEY LEARNINGS:
• 分離式評估讓你能精確定位系統瓶頸 (檢索 vs 生成)
• 不同查詢類型需要不同的評估策略和指標
• 自動化評估流程對於持續改進 RAG 系統至關重要
• 基準資料集的品質直接影響評估結果的可靠性

Smoke Test Status: {'✅ PASSED' if smoke_test_passed else '❌ FAILED'}
"""

print(completion_summary)

In [None]:
# ===================================================================
# SMOKE TEST CELL (Run this to verify everything works)
# 驗收測試 (運行此 Cell 驗證所有功能)

print("\n🚀 FINAL SMOKE TEST - Run this cell to verify the complete system")
print("=" * 60)


def comprehensive_smoke_test():
    """Exercise dataset creation, indexing, evaluation and reporting once.

    Builds fresh evaluator objects, evaluates a single example and prints
    the summary report.

    Returns:
        True when every stage finishes without raising; False otherwise,
        after printing the error message and its traceback.
    """
    import traceback

    def test_generator(query, context):
        # Answer with the text before the first period of the context.
        lead = context.partition(".")[0]
        return f"Based on the context: {lead}."

    try:
        print("\n1. Setting up test dataset...")
        manager = EvaluationDatasetManager()
        qa_examples = manager.create_synthetic_qa_dataset(size=5)

        print("2. Initializing evaluators...")
        retriever_eval = RetrievalEvaluator()
        generator_eval = GenerationEvaluator()
        pipeline_eval = RAGEvaluator(retriever_eval, generator_eval)

        print("3. Indexing documents...")
        retriever_eval.index_documents(manager.documents)

        print("4. Running RAG evaluation...")
        single_example = qa_examples.select([0])
        outcome = pipeline_eval.evaluate_rag_system(
            single_example, manager.documents, test_generator, k_values=[1, 3]
        )

        print("5. Generating evaluation report...")
        EvaluationReporter().print_results_summary(outcome)

        print("\n✅ COMPREHENSIVE SMOKE TEST PASSED!")
        print("🎯 The evaluation system is ready for production use.")
    except Exception as e:
        print(f"\n❌ SMOKE TEST FAILED: {e}")
        traceback.print_exc()
        return False

    return True


# Run the comprehensive test
final_test_result = comprehensive_smoke_test()


## 6. 本章小結

### ✅ 完成項目
- **分離式評估架構**：實作了檢索器與生成器的獨立評估能力
- **全面評估指標**：包含 Recall@k, Precision@k, MRR, NDCG (檢索) 及 Faithfulness, Groundedness, Attribution (生成)
- **端到端評估流程**：整合檢索與生成的整體系統評估管線  
- **進階分析功能**：查詢類型分析、消融研究、信心區間計算
- **自動化報告生成**：視覺化結果展示與改進建議生成

### 🔍 核心概念與原理

**1. 分離式評估 (Decoupled Evaluation)**
- **檢索評估**：獨立衡量文檔檢索的準確性和相關性，不受生成品質影響
- **生成評估**：專注於文本生成的事實性、一致性和可歸因性
- **優勢**：能精確定位系統瓶頸，針對性改進特定組件

**2. 檢索評估指標體系**
- **Recall@k**：衡量檢索到的相關文檔覆蓋率
- **Precision@k**：衡量檢索結果中相關文檔的比例
- **MRR (Mean Reciprocal Rank)**：關注第一個相關文檔的排序位置
- **NDCG@k**：考慮排序質量的歸一化折扣累積增益

**3. 生成評估維度**
- **Faithfulness (忠實性)**：生成內容與源文檔的一致性
- **Groundedness (有據性)**：回答是否有充分的文檔支撐
- **Attribution (可歸因性)**：能否追溯到具體的源文檔位置

**4. 端到端評估哲學**
- **複合指標設計**：權衡檢索召回率與生成質量
- **業務對齊**：評估指標應反映實際應用場景的成功標準

### ⚠️ 常見陷阱與注意事項

**1. 評估資料集偏差**
- **問題**：合成資料集可能不反映真實查詢分布
- **解決**：結合多種資料來源，定期用真實用戶查詢更新評估集

**2. 指標過度優化**
- **問題**：單一指標優化可能損害整體系統性能
- **解決**：使用多維指標組合，關注指標間的平衡

**3. 計算資源消耗**
- **問題**：大規模評估可能消耗大量 GPU 記憶體
- **解決**：使用 4bit 量化、批次處理、分階段評估

**4. 生成評估的主觀性**
- **問題**：事實性和流暢性評估存在主觀判斷
- **解決**：結合自動化指標與人工評估，建立標準評估準則

### 🎯 下一步建議與延伸方向

**立即可行的改進 (本週內)**
1. **擴展評估資料集**：整合更多領域的標準 QA 資料集 (SQuAD, Natural Questions)
2. **優化記憶體使用**：實作批次評估並在推論時停用梯度計算 (`torch.no_grad()`)，以支援更大規模測試
3. **增強報告功能**：添加 LaTeX/PDF 報告導出和 Slack 通知整合

**中期發展目標 (未來 2-4 週)**
1. **多語言評估支援**：擴展到中文 RAG 系統評估，使用中文特化指標
2. **在線評估監控**：建立持續評估管線，監控生產系統性能退化
3. **對抗性評估**：添加 prompt injection、幻覺檢測等安全性評估

**長期規劃 (1-2 個月)**
1. **人機協作評估**：整合人工標註和眾包評估流程
2. **因果分析**：建立組件間性能相關性分析，理解檢索-生成耦合效應  
3. **自適應評估**：根據系統性能自動調整評估策略和指標權重

## 🔄 與其他 Notebook 的連接

**前置依賴**
- `nb26_rag_basic_faiss.ipynb`：基礎 RAG 系統實作
- `nb13_function_calling_tools.ipynb`：工具使用和函數調用

**後續建議順序**
1. **nb29_multi_agent_collaboration.ipynb**：多代理協作系統 (利用本章評估方法測試多代理性能)
2. **nb30_auto_pipeline_endtoend.ipynb**：自動化端到端流程 (整合評估作為品質門控)
3. **nb25_domain_specific_tuning.ipynb**：領域特定微調 (使用評估系統驗證微調效果)

**可選並行開發**
- **nb24_dpo_vs_rlhf.ipynb**：偏好優化方法 (可使用生成評估指標作為自動化偏好信號)
- **nb31_gradio_chat_ui.ipynb**：Web UI 開發 (整合評估結果展示功能)

## 🤔 階段性決策點

基於本章完成的評估系統，你現在有幾個發展方向可以選擇：

### 選項 A：多代理協作 (nb29) 
- **優勢**：可以立即使用剛建立的評估系統測試多代理性能
- **適合**：希望快速看到複雜系統整合效果
- **投入**：中等，主要是架構設計

### 選項 B：領域微調 (nb25)
- **優勢**：評估系統可以量化微調前後的效果差異
- **適合**：希望深入理解模型優化技術
- **投入**：較高，需要訓練資源

### 選項 C：自動化流程 (nb30)
- **優勢**：打造完整的生產級 RAG 系統
- **適合**：注重實用性和部署就緒度
- **投入**：中等，偏重工程整合

**我的建議順序**：nb29 → nb30 → nb25，理由是先建立完整的系統架構，再進行模型層面的優化。

你希望接下來專注哪個方向？或者你有其他優先考慮的 notebook 主題？