In [None]:
# nb61_eval_groundedness_rules.ipynb
# Stage 7: Evaluation & Observability
# Goal: Rule-based + semantic groundedness evaluation for RAG answers

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this block into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, sub-directory below AI_CACHE_ROOT) pairs, applied in order.
_CACHE_LAYOUT = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)
for env_name, subdir in _CACHE_LAYOUT:
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    # Export the location and make sure the directory exists on disk.
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# =============================================================================
# Cell 2: Import & Setup
# =============================================================================
import json
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
import jieba
from collections import Counter
import warnings

# Suppress every warning for the rest of the kernel session so library notices
# do not clutter cell output.
# NOTE(review): this also hides deprecation warnings triggered by this
# notebook's own code; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Create the output directory (relative to the current working directory)
# that later cells write figures, JSON, CSV and the markdown report into.
pathlib.Path("outs/eval").mkdir(parents=True, exist_ok=True)


@dataclass
class GroundednessResult:
    """Container for the outcome of one groundedness evaluation.

    Bundles the evaluated inputs (query, answer, sources) with the individual
    metric scores, the combined score and the grounded / not-grounded verdict.
    """

    # Inputs that were evaluated.
    query: str
    answer: str
    sources: List[str]
    # Lexical-overlap scores between the answer and the sources.
    jaccard_score: float
    containment_score: float
    # Embedding-similarity score between the answer and the sources.
    semantic_score: float
    # Single score derived from the rule-based and semantic scores.
    combined_score: float
    # Verdict obtained by comparing combined_score with a threshold.
    is_grounded: bool
    # Supporting per-metric data behind the scores above.
    details: Dict


print("✅ Imports and setup complete")


In [None]:
# =============================================================================
# Cell 3: Sample RAG Q&A Dataset Preparation
# =============================================================================

# Create sample RAG evaluation dataset.
# Each item is a dict with a "query", the generated "answer" (which may carry
# [n] citation markers) and the list of retrieved "sources".
# NOTE(review): the third item's sources discuss Transformer/CNN/RNN while the
# answer describes the history of deep learning and has no citation markers --
# presumably an intentionally ungrounded example; confirm.
sample_qa_data = [
    {
        "query": "什麼是 Transformer 架構的核心創新？",
        "answer": "Transformer 架構的核心創新是自注意力機制（self-attention mechanism），它讓模型能夠同時關注序列中的所有位置，捕捉長距離依賴關係。[1][2]",
        "sources": [
            "Transformer 是基於注意力機制的神經網路架構，由 Vaswani 等人在 2017 年提出。其核心是自注意力機制，能夠計算序列中任意兩個位置之間的依賴關係。",
            "自注意力允許模型在處理每個詞時，同時考慮整個序列的信息，這使得 Transformer 能夠捕捉長距離的語義依賴。",
            "相比於 RNN 和 CNN，Transformer 的並行計算能力更強，訓練效率更高。",
        ],
    },
    {
        "query": "RAG 系統的主要組件有哪些？",
        "answer": "RAG 系統主要包含檢索器（retriever）、生成器（generator）和知識庫三個核心組件。檢索器負責從知識庫中找到相關文檔，生成器則基於檢索到的信息生成最終答案。[1]",
        "sources": [
            "檢索增強生成（RAG）系統結合了檢索和生成兩個步驟。首先用檢索器從大型知識庫中找到相關文檔片段。",
            "然後將檢索到的文檔與原始查詢一起輸入到生成模型中，生成最終的回答。",
            "RAG 的優勢在於能夠利用外部知識，提供更準確和時效性更強的答案。",
        ],
    },
    {
        "query": "深度學習的發展歷史如何？",
        "answer": "深度學習起源於 1940 年代的人工神經網路研究，經歷了多次起伏。2006 年 Hinton 提出深度信念網路，2012 年 AlexNet 在 ImageNet 上的突破性表現標誌著深度學習時代的到來。",
        "sources": [
            "Transformer 架構在自然語言處理領域取得了重大突破，BERT 和 GPT 系列模型都基於這一架構。",
            "卷積神經網路（CNN）在計算機視覺任務中表現優異，ResNet 解決了深層網路的梯度消失問題。",
            "循環神經網路（RNN）適合處理序列數據，LSTM 和 GRU 是其重要變種。",
        ],
    },
]

print(f"✅ Prepared {len(sample_qa_data)} sample Q&A pairs for evaluation")

In [None]:
# =============================================================================
# Cell 4: Rule-based Groundedness Metrics
# =============================================================================


class RuleBasedGroundedness:
    """Rule-based groundedness evaluation using lexical overlap.

    Text is normalised (citation markers and punctuation removed, lower-cased)
    and tokenised; the answer's token set is then compared with the source
    token sets via Jaccard similarity and containment.
    """

    def __init__(self, language="zh"):
        # "zh" selects jieba segmentation; any other value splits on whitespace.
        self.language = language

    def tokenize_chinese(self, text: str) -> List[str]:
        """Normalise ``text`` and split it into tokens.

        Despite the name, this also handles non-Chinese text: when
        ``self.language`` is not ``"zh"`` the normalised text is split on
        whitespace instead of being segmented with jieba.

        Args:
            text: Raw answer or source text.

        Returns:
            Lower-cased tokens in document order (duplicates preserved).
        """
        # Remove citations [1], [2] etc.
        text = re.sub(r"\[\d+\]", "", text)
        # Replace punctuation with spaces, collapse whitespace, lower-case.
        text = re.sub(r"[^\w\s]", " ", text)
        text = re.sub(r"\s+", " ", text).strip().lower()

        if self.language == "zh":
            # Keep only multi-character, non-blank segments. No stopword list
            # is applied; single-character words are dropped by the length test.
            return [t for t in jieba.cut(text) if len(t) > 1 and t.strip()]

        return text.split()

    def jaccard_similarity(
        self, answer_tokens: List[str], source_tokens: List[str]
    ) -> float:
        """Return |A ∩ S| / |A ∪ S| over the unique tokens of both lists.

        Two empty inputs score 1.0; exactly one empty input scores 0.0.
        """
        set_a = set(answer_tokens)
        set_s = set(source_tokens)

        if not set_a and not set_s:
            return 1.0
        if not set_a or not set_s:
            return 0.0

        # Both sets are non-empty here, so the union can never be empty.
        return len(set_a & set_s) / len(set_a | set_s)

    def containment_score(
        self, answer_tokens: List[str], source_tokens: List[str]
    ) -> float:
        """Return the fraction of unique answer tokens that occur in the sources.

        An answer without tokens scores 1.0 (nothing in it is unsupported).
        """
        if not answer_tokens:
            return 1.0

        set_a = set(answer_tokens)
        return len(set_a & set(source_tokens)) / len(set_a)

    def evaluate_against_sources(self, answer: str, sources: List[str]) -> Dict:
        """Score ``answer`` against all sources combined and against each source.

        Args:
            answer: Generated answer text.
            sources: Retrieved source passages.

        Returns:
            Dict with the pooled ``jaccard_score`` and ``containment_score``,
            the token lists (``answer_tokens``, ``source_tokens``), their
            counts, and ``source_scores``: one ``{"source_id", "jaccard",
            "containment"}`` entry per source.
        """
        answer_tokens = self.tokenize_chinese(answer)

        # Tokenise every source exactly once and reuse the result for both the
        # pooled and the per-source metrics.
        per_source_tokens = [self.tokenize_chinese(source) for source in sources]
        all_source_tokens = [tok for toks in per_source_tokens for tok in toks]

        source_scores = [
            {
                "source_id": i,
                "jaccard": self.jaccard_similarity(answer_tokens, toks),
                "containment": self.containment_score(answer_tokens, toks),
            }
            for i, toks in enumerate(per_source_tokens)
        ]

        return {
            "jaccard_score": self.jaccard_similarity(answer_tokens, all_source_tokens),
            "containment_score": self.containment_score(
                answer_tokens, all_source_tokens
            ),
            "answer_tokens": answer_tokens,
            "source_tokens": all_source_tokens,
            "source_scores": source_scores,
            "num_answer_tokens": len(answer_tokens),
            "num_source_tokens": len(all_source_tokens),
        }


# Demo: score the first two sample items with the rule-based metrics only
rule_eval = RuleBasedGroundedness()

print("🧪 Testing rule-based groundedness on sample data:")
for example_no, sample in enumerate(sample_qa_data[:2], start=1):
    rule_metrics = rule_eval.evaluate_against_sources(
        sample["answer"], sample["sources"]
    )
    # One output line per metric, emitted in a fixed order.
    for line in (
        f"\nExample {example_no}:",
        f"  Jaccard: {rule_metrics['jaccard_score']:.3f}",
        f"  Containment: {rule_metrics['containment_score']:.3f}",
        f"  Answer tokens: {rule_metrics['num_answer_tokens']}",
        f"  Source tokens: {rule_metrics['num_source_tokens']}",
    ):
        print(line)

print("✅ Rule-based groundedness evaluation ready")

In [None]:
# =============================================================================
# Cell 5: Semantic Similarity Groundedness (BGE-M3)
# =============================================================================


class SemanticGroundedness:
    """Semantic groundedness evaluation using sentence embeddings.

    Each answer sentence is matched to its most similar source sentence; the
    overall score is the mean of those best-match similarities.
    """

    def __init__(self, model_name: str = "BAAI/bge-m3", device: str = "auto"):
        """Load the embedding model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            device: Explicit device string such as ``"cpu"`` or ``"cuda"``.
                ``"auto"`` (the default) or ``None`` lets SentenceTransformer
                choose the device itself.
        """
        print(f"Loading embedding model: {model_name}")
        # "auto" is not a valid torch device string; forwarding it verbatim
        # makes model construction fail. None delegates device selection.
        resolved_device = None if device in (None, "auto") else device
        self.model = SentenceTransformer(model_name, device=resolved_device)
        print(f"✅ Model loaded on device: {self.model.device}")

    def split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences for fine-grained comparison.

        Citation markers such as ``[1]`` are removed first. Only the full-width
        terminators 。！？； act as sentence boundaries.
        """
        # Remove citations
        text = re.sub(r"\[\d+\]", "", text)
        # Split by Chinese punctuation and drop empty fragments
        return [s.strip() for s in re.split(r"[。！？；]", text) if s.strip()]

    def calculate_semantic_similarity(self, answer: str, sources: List[str]) -> Dict:
        """Calculate semantic similarity between the answer and the sources.

        Args:
            answer: Generated answer text.
            sources: Retrieved source passages.

        Returns:
            Dict with ``overall_score`` (mean best-match similarity),
            ``sentence_details`` (one entry per answer sentence),
            ``num_answer_sentences`` and ``num_source_sentences``. When either
            side has no sentences the score is 0.0, ``sentence_details`` is
            empty, and a legacy empty ``details`` list is included as well.
        """
        answer_sentences = self.split_into_sentences(answer)

        # Split each source on its own so that a source lacking a trailing
        # terminator is not glued onto the first sentence of the next source.
        source_sentences = [
            sentence
            for source in sources
            for sentence in self.split_into_sentences(source)
        ]

        if not answer_sentences or not source_sentences:
            # Same keys as the normal result so callers can read the counts
            # without special-casing; "details" is kept for older callers.
            return {
                "overall_score": 0.0,
                "details": [],
                "sentence_details": [],
                "num_answer_sentences": len(answer_sentences),
                "num_source_sentences": len(source_sentences),
            }

        # Encode sentences
        answer_embeddings = np.asarray(
            self.model.encode(answer_sentences, normalize_embeddings=True)
        )
        source_embeddings = np.asarray(
            self.model.encode(source_sentences, normalize_embeddings=True)
        )

        sentence_scores = []
        for i, ans_emb in enumerate(answer_embeddings):
            # Dot product against every source embedding; the largest value
            # identifies the best matching source sentence.
            similarities = np.dot(source_embeddings, ans_emb)
            best_match_idx = int(np.argmax(similarities))

            sentence_scores.append(
                {
                    "answer_sentence": answer_sentences[i],
                    "best_match_source": source_sentences[best_match_idx],
                    "similarity": float(similarities[best_match_idx]),
                    "answer_sent_id": i,
                    # Plain int so the record stays JSON-serialisable.
                    "source_sent_id": best_match_idx,
                }
            )

        # Overall score: average of all sentence similarities
        overall_score = float(np.mean([s["similarity"] for s in sentence_scores]))

        return {
            "overall_score": overall_score,
            "sentence_details": sentence_scores,
            "num_answer_sentences": len(answer_sentences),
            "num_source_sentences": len(source_sentences),
        }


# Load semantic evaluation model (low VRAM mode); on any failure fall back to
# a different model loaded on CPU.
try:
    semantic_eval = SemanticGroundedness("BAAI/bge-m3")
except Exception as load_error:
    print(f"⚠️ GPU model loading failed: {load_error}")
    print("🔄 Fallback to CPU mode...")
    semantic_eval = SemanticGroundedness("BAAI/bge-small-zh-v1.5", device="cpu")

print("🧪 Testing semantic groundedness:")
for example_no, sample in enumerate(sample_qa_data[:2], start=1):
    semantic_metrics = semantic_eval.calculate_semantic_similarity(
        sample["answer"], sample["sources"]
    )
    # One output line per reported value, emitted in a fixed order.
    for line in (
        f"\nExample {example_no}:",
        f"  Semantic score: {semantic_metrics['overall_score']:.3f}",
        f"  Answer sentences: {semantic_metrics['num_answer_sentences']}",
        f"  Source sentences: {semantic_metrics['num_source_sentences']}",
    ):
        print(line)

print("✅ Semantic groundedness evaluation ready")

In [None]:
# =============================================================================
# Cell 6: Combined Groundedness Score
# =============================================================================


class GroundednessEvaluator:
    """Combined groundedness evaluator with configurable weights.

    The rule score is the mean of the Jaccard and containment scores; the
    combined score is ``rule_weight * rule_score + semantic_weight *
    semantic_score``. An answer counts as grounded when the combined score is
    at least ``threshold``.
    """

    def __init__(
        self,
        rule_weight: float = 0.4,
        semantic_weight: float = 0.6,
        threshold: float = 0.6,
        semantic_evaluator: Optional["SemanticGroundedness"] = None,
        rule_evaluator: Optional["RuleBasedGroundedness"] = None,
    ):
        """Configure weights, threshold and the two underlying evaluators.

        Args:
            rule_weight: Weight of the rule-based score.
            semantic_weight: Weight of the semantic score.
            threshold: Minimum combined score for ``is_grounded`` to be True.
            semantic_evaluator: Object providing
                ``calculate_semantic_similarity(answer, sources)``. Defaults to
                the notebook-level ``semantic_eval`` instance.
            rule_evaluator: Object providing
                ``evaluate_against_sources(answer, sources)``. Defaults to a
                new ``RuleBasedGroundedness()``.
        """
        self.rule_evaluator = (
            rule_evaluator if rule_evaluator is not None else RuleBasedGroundedness()
        )
        # Fall back to the already-loaded notebook-level model so existing
        # callers keep working; passing an evaluator explicitly removes the
        # dependency on cell execution order.
        self.semantic_evaluator = (
            semantic_evaluator if semantic_evaluator is not None else semantic_eval
        )
        self.rule_weight = rule_weight
        self.semantic_weight = semantic_weight
        self.threshold = threshold

    def evaluate(
        self, query: str, answer: str, sources: List[str]
    ) -> GroundednessResult:
        """Run both evaluators and combine their scores.

        Args:
            query: The user question (stored on the result, not scored).
            answer: Generated answer text.
            sources: Retrieved source passages.

        Returns:
            A ``GroundednessResult`` carrying the individual scores, the
            combined score, the verdict and both evaluators' raw outputs
            under ``details``.
        """
        # Rule-based evaluation
        rule_result = self.rule_evaluator.evaluate_against_sources(answer, sources)

        # Semantic evaluation
        semantic_result = self.semantic_evaluator.calculate_semantic_similarity(
            answer, sources
        )

        # Use average of jaccard and containment for rule score
        rule_score = (
            rule_result["jaccard_score"] + rule_result["containment_score"]
        ) / 2
        semantic_score = semantic_result["overall_score"]

        # Coerce to builtin float/bool so the result stays JSON-friendly even
        # if an injected evaluator returns numpy scalars.
        combined_score = float(
            self.rule_weight * rule_score + self.semantic_weight * semantic_score
        )
        is_grounded = bool(combined_score >= self.threshold)

        details = {
            "rule_based": rule_result,
            "semantic": semantic_result,
            "weights": {
                "rule_weight": self.rule_weight,
                "semantic_weight": self.semantic_weight,
            },
            "component_scores": {
                "rule_score": rule_score,
                "semantic_score": semantic_score,
            },
        }

        return GroundednessResult(
            query=query,
            answer=answer,
            sources=sources,
            jaccard_score=rule_result["jaccard_score"],
            containment_score=rule_result["containment_score"],
            semantic_score=semantic_score,
            combined_score=combined_score,
            is_grounded=is_grounded,
            details=details,
        )


# Build the combined evaluator used by the remaining cells
evaluator = GroundednessEvaluator(rule_weight=0.4, semantic_weight=0.6, threshold=0.6)

print("🧪 Testing combined groundedness evaluation:")
for example_no, sample in enumerate(sample_qa_data, start=1):
    combined = evaluator.evaluate(sample["query"], sample["answer"], sample["sources"])
    verdict_icon = "✅" if combined.is_grounded else "❌"
    # One output line per reported value, emitted in a fixed order.
    for line in (
        f"\nExample {example_no}:",
        f"  Query: {sample['query'][:50]}...",
        f"  Jaccard: {combined.jaccard_score:.3f}",
        f"  Containment: {combined.containment_score:.3f}",
        f"  Semantic: {combined.semantic_score:.3f}",
        f"  Combined: {combined.combined_score:.3f}",
        f"  Grounded: {verdict_icon}",
    ):
        print(line)

print("✅ Combined groundedness evaluator ready")

In [None]:
# =============================================================================
# Cell 7: Batch Evaluation & Analysis
# =============================================================================


def run_batch_evaluation(
    evaluator: GroundednessEvaluator, qa_data: List[Dict]
) -> List[GroundednessResult]:
    """Evaluate every Q&A item and collect the successful results.

    Items whose evaluation raises are reported on stdout and skipped, so the
    returned list can be shorter than ``qa_data``. A progress line is printed
    after every fifth item that was processed successfully.
    """
    total = len(qa_data)
    print(f"🔄 Running batch evaluation on {total} samples...")

    collected = []
    for position in range(total):
        sample = qa_data[position]
        try:
            collected.append(
                evaluator.evaluate(sample["query"], sample["answer"], sample["sources"])
            )

            processed = position + 1
            if processed % 5 == 0:
                print(f"  Processed {processed}/{total} samples")
        except Exception as e:
            # Best effort: report the failure and move on to the next item.
            print(f"⚠️ Error processing sample {position}: {e}")

    return collected


def analyze_results(results: List[GroundednessResult]) -> Dict:
    """Summarise a batch of results.

    Returns an empty dict for an empty batch. Otherwise returns the sample
    count, the number and rate of grounded samples, and mean/std/min/max/median
    for each of the jaccard, containment, semantic and combined scores.
    """
    if not results:
        return {}

    # Metric label -> attribute on GroundednessResult holding that score.
    score_attrs = {
        "jaccard": "jaccard_score",
        "containment": "containment_score",
        "semantic": "semantic_score",
        "combined": "combined_score",
    }
    # Statistic label -> numpy reducer, in output order.
    reducers = (
        ("mean", np.mean),
        ("std", np.std),
        ("min", np.min),
        ("max", np.max),
        ("median", np.median),
    )

    n_total = len(results)
    n_grounded = sum(1 for r in results if r.is_grounded)

    score_stats = {}
    for metric, attr in score_attrs.items():
        values = [getattr(r, attr) for r in results]
        score_stats[metric] = {label: reduce_fn(values) for label, reduce_fn in reducers}

    return {
        "total_samples": n_total,
        "grounded_count": n_grounded,
        "grounded_rate": n_grounded / n_total,
        "score_stats": score_stats,
    }


# Run batch evaluation
batch_results = run_batch_evaluation(evaluator, sample_qa_data)
analysis = analyze_results(batch_results)

if not analysis:
    # analyze_results returns {} when no sample could be evaluated; indexing
    # into it would raise KeyError, so report the situation instead.
    print("⚠️ No samples were evaluated successfully; nothing to summarise.")
else:
    print("\n📊 Batch Evaluation Results:")
    print(f"  Total samples: {analysis['total_samples']}")
    print(f"  Grounded samples: {analysis['grounded_count']}")
    print(f"  Grounded rate: {analysis['grounded_rate']:.1%}")

    print("\n📈 Score Statistics:")
    for metric, stats in analysis["score_stats"].items():
        print(
            f"  {metric.capitalize()}: μ={stats['mean']:.3f}, σ={stats['std']:.3f}, range=[{stats['min']:.3f}, {stats['max']:.3f}]"
        )

In [None]:
# =============================================================================
# Cell 8: Visualization & Low-score Case Study
# =============================================================================


def plot_groundedness_distribution(
    results: List[GroundednessResult],
    threshold: float = 0.6,
    output_dir: str = "outs/eval",
):
    """Plot score histograms and a metric-correlation heatmap.

    Saves ``groundedness_distribution.png`` and
    ``groundedness_correlation.png`` into ``output_dir`` and shows both figures.

    Args:
        results: Evaluation results to visualise. Nothing is plotted when empty.
        threshold: Value marked on the Combined histogram; pass the
            evaluator's threshold so the line matches the verdicts.
        output_dir: Directory for the PNG files (created if missing).
    """
    if not results:
        # An empty DataFrame has no score columns, so plotting would fail.
        print("⚠️ No results to plot")
        return

    # Extract scores
    scores_df = pd.DataFrame(
        [
            {
                "Jaccard": r.jaccard_score,
                "Containment": r.containment_score,
                "Semantic": r.semantic_score,
                "Combined": r.combined_score,
                "Grounded": r.is_grounded,
            }
            for r in results
        ]
    )

    out_dir = pathlib.Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 2x2 grid: one histogram per metric
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle("Groundedness Score Distributions", fontsize=16)

    metrics = ["Jaccard", "Containment", "Semantic", "Combined"]
    for ax, metric in zip(axes.flat, metrics):
        metric_mean = scores_df[metric].mean()

        ax.hist(
            scores_df[metric], bins=15, alpha=0.7, color="skyblue", edgecolor="black"
        )
        ax.axvline(
            metric_mean,
            color="red",
            linestyle="--",
            label=f"Mean: {metric_mean:.3f}",
        )

        if metric == "Combined":
            # Only the combined score is compared against the threshold.
            ax.axvline(
                threshold,
                color="orange",
                linestyle="-",
                linewidth=2,
                label=f"Threshold: {threshold}",
            )

        ax.set_title(f"{metric} Score Distribution")
        ax.set_xlabel("Score")
        ax.set_ylabel("Count")
        ax.legend()
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(
        out_dir / "groundedness_distribution.png", dpi=300, bbox_inches="tight"
    )
    plt.show()

    # Correlation heatmap
    corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        scores_df[metrics].corr(),
        annot=True,
        cmap="coolwarm",
        center=0,
        square=True,
        fmt=".3f",
        ax=corr_ax,
    )
    corr_ax.set_title("Groundedness Metrics Correlation")
    corr_fig.tight_layout()
    corr_fig.savefig(
        out_dir / "groundedness_correlation.png", dpi=300, bbox_inches="tight"
    )
    plt.show()


def identify_low_score_cases(
    results: List[GroundednessResult], threshold: float = 0.5
) -> List[GroundednessResult]:
    """Print and return the results whose combined score is below ``threshold``.

    For every such case the query, a 100-character answer preview, all scores
    and the answer/source token counts are written to stdout.
    """
    flagged = []
    for candidate in results:
        if candidate.combined_score < threshold:
            flagged.append(candidate)

    print(f"\n🔍 Low Score Case Analysis (threshold < {threshold}):")
    print(f"  Found {len(flagged)} low-score cases out of {len(results)} total")

    for case_no, flagged_case in enumerate(flagged, start=1):
        print(f"\n--- Case {case_no} ---")
        print(f"Query: {flagged_case.query}")
        print(f"Answer: {flagged_case.answer[:100]}...")
        print(f"Combined Score: {flagged_case.combined_score:.3f}")
        print(f"  - Jaccard: {flagged_case.jaccard_score:.3f}")
        print(f"  - Containment: {flagged_case.containment_score:.3f}")
        print(f"  - Semantic: {flagged_case.semantic_score:.3f}")

        # Token counts recorded by the rule-based evaluator
        token_stats = flagged_case.details["rule_based"]
        print(f"Answer tokens: {token_stats['num_answer_tokens']}")
        print(f"Source tokens: {token_stats['num_source_tokens']}")

    return flagged


# Create visualizations and list low-score cases, but only when the batch
# evaluation produced at least one result.
# NOTE(review): `low_cases` is only defined when `batch_results` is non-empty,
# and the 0.4 cutoff used here differs from the function's default of 0.5.
if batch_results:
    plot_groundedness_distribution(batch_results)
    low_cases = identify_low_score_cases(batch_results, threshold=0.4)

print("✅ Visualization and case analysis complete")

In [None]:
# =============================================================================
# Cell 9: Integration with RAG Pipeline
# =============================================================================


class RAGWithGroundednessCheck:
    """RAG pipeline wrapper that gates answers on their groundedness score.

    Every checked answer is recorded in ``quality_log`` so acceptance rate and
    score trends can be summarised afterwards.
    """

    def __init__(self, groundedness_evaluator: "GroundednessEvaluator"):
        """Store the evaluator (anything exposing ``evaluate(query, answer, sources)``)."""
        self.evaluator = groundedness_evaluator
        self.quality_log = []

    def generate_with_check(
        self,
        query: str,
        retrieved_sources: List[str],
        generated_answer: str,
        min_groundedness: float = 0.5,
    ) -> Dict:
        """Check an already generated answer and accept or reject it.

        Args:
            query: The user question.
            retrieved_sources: Passages the answer should be grounded in.
            generated_answer: Answer text produced upstream.
            min_groundedness: Minimum combined score required to accept.

        Returns:
            Dict with the query, the original and final answers, the full
            evaluation result, the action ("accept" or "reject") and the score.
            On rejection the final answer is replaced by a warning message.
        """
        # Evaluate groundedness
        result = self.evaluator.evaluate(query, generated_answer, retrieved_sources)

        # Determine action based on groundedness
        if result.combined_score >= min_groundedness:
            action = "accept"
            final_answer = generated_answer
        else:
            action = "reject"
            final_answer = f"⚠️ 回答可信度不足 (分數: {result.combined_score:.2f}), 建議重新檢索或人工核實。"

        # Log quality metrics, including the gate value that produced `action`
        # so later analysis can tell which threshold was in force.
        self.quality_log.append(
            {
                "query": query,
                "groundedness_score": result.combined_score,
                "action": action,
                "timestamp": pd.Timestamp.now(),
                "min_groundedness": min_groundedness,
                "component_scores": {
                    "jaccard": result.jaccard_score,
                    "containment": result.containment_score,
                    "semantic": result.semantic_score,
                },
            }
        )

        return {
            "query": query,
            "original_answer": generated_answer,
            "final_answer": final_answer,
            "groundedness_result": result,
            "action": action,
            "quality_score": result.combined_score,
        }

    def get_quality_summary(self, low_quality_threshold: float = 0.5) -> Dict:
        """Summarise the logged quality metrics.

        Args:
            low_quality_threshold: Scores strictly below this value count
                towards ``low_quality_count``.

        Returns:
            Empty dict when nothing has been logged; otherwise the query count,
            acceptance rate, mean score, low-quality count and the mean score
            per hour of day (``quality_trend``). All values are builtin
            ``int``/``float`` so the summary can be serialised to JSON.
        """
        if not self.quality_log:
            return {}

        df = pd.DataFrame(self.quality_log)
        scores = df["groundedness_score"]
        hourly_mean = scores.groupby(df["timestamp"].dt.hour).mean()

        return {
            "total_queries": len(df),
            "accepted_rate": float((df["action"] == "accept").mean()),
            "mean_groundedness": float(scores.mean()),
            "low_quality_count": int((scores < low_quality_threshold).sum()),
            "quality_trend": {
                int(hour): float(mean) for hour, mean in hourly_mean.items()
            },
        }


# Wrap the combined evaluator in the gating pipeline
rag_pipeline = RAGWithGroundednessCheck(evaluator)

print("🧪 Testing RAG pipeline with groundedness checking:")

# Two simulated RAG responses: the first answer restates its sources, the
# second answers about a topic its sources do not mention.
test_cases = [
    {
        "query": "什麼是深度學習？",
        "sources": [
            "深度學習是機器學習的子領域，使用多層神經網路來學習數據表示。",
            "深度學習在圖像識別、自然語言處理等領域取得突破。",
        ],
        "answer": "深度學習是機器學習的重要分支，通過多層神經網路來自動學習數據的複雜表示和模式。[1]",
    },
    {
        "query": "區塊鏈的應用領域？",
        "sources": [
            "Transformer 架構revolutionized自然語言處理",
            "BERT 和 GPT 都基於 Transformer",
        ],
        "answer": "區塊鏈廣泛應用於金融、供應鏈管理、數字身份認證等多個領域，具有去中心化和不可篡改的特性。",
    },
]

for case_no, simulated in enumerate(test_cases, start=1):
    gate_outcome = rag_pipeline.generate_with_check(
        simulated["query"],
        simulated["sources"],
        simulated["answer"],
        min_groundedness=0.6,
    )

    for line in (
        f"\n--- Test Case {case_no} ---",
        f"Query: {gate_outcome['query']}",
        f"Quality Score: {gate_outcome['quality_score']:.3f}",
        f"Action: {gate_outcome['action']}",
        f"Final Answer: {gate_outcome['final_answer'][:100]}...",
    ):
        print(line)

# Aggregate view over everything the pipeline has logged so far
quality_summary = rag_pipeline.get_quality_summary()
print("\n📊 Pipeline Quality Summary:")
print(f"  Total queries: {quality_summary['total_queries']}")
print(f"  Acceptance rate: {quality_summary['accepted_rate']:.1%}")
print(f"  Mean groundedness: {quality_summary['mean_groundedness']:.3f}")

print("✅ RAG pipeline integration complete")

In [None]:
# =============================================================================
# Cell 10: Smoke Test & Export Results
# =============================================================================


def _json_default(obj):
    """Convert numpy scalars/arrays to builtin types for ``json.dump``."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def export_evaluation_results(
    results: List["GroundednessResult"],
    analysis: Dict,
    output_path: str = "outs/eval/groundedness_results.json",
    evaluator_config: Optional[Dict] = None,
):
    """Export evaluation results to JSON.

    Args:
        results: Per-sample evaluation results.
        analysis: Summary statistics stored under ``"summary"``.
        output_path: Destination file; missing parent directories are created.
        evaluator_config: Dict with ``rule_weight``, ``semantic_weight`` and
            ``threshold`` to record in the metadata. Defaults to the settings
            of the notebook-level ``evaluator``.

    Returns:
        The exported structure (metadata, summary, detailed_results).
    """
    if evaluator_config is None:
        # Backward-compatible default: read the notebook-level evaluator.
        evaluator_config = {
            "rule_weight": evaluator.rule_weight,
            "semantic_weight": evaluator.semantic_weight,
            "threshold": evaluator.threshold,
        }

    export_data = {
        "metadata": {
            "evaluation_timestamp": pd.Timestamp.now().isoformat(),
            "total_samples": len(results),
            "evaluator_config": evaluator_config,
        },
        "summary": analysis,
        "detailed_results": [
            {
                "query": result.query,
                "answer": result.answer,
                "sources": result.sources,
                "scores": {
                    "jaccard": result.jaccard_score,
                    "containment": result.containment_score,
                    "semantic": result.semantic_score,
                    "combined": result.combined_score,
                },
                "is_grounded": result.is_grounded,
            }
            for result in results
        ],
    }

    # Make sure the target directory exists before opening the file.
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # `default` handles numpy values (e.g. float32, int64, bool_) that the
    # standard encoder rejects.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(export_data, f, ensure_ascii=False, indent=2, default=_json_default)

    print(f"✅ Results exported to {output_path}")
    return export_data


def create_groundedness_report(
    results: List["GroundednessResult"],
    analysis: Dict,
    output_path: str = "outs/eval/groundedness_report.md",
    low_threshold: float = 0.5,
    max_cases: int = 3,
    evaluator_config: Optional[Dict] = None,
):
    """Create a markdown report of a groundedness evaluation.

    Args:
        results: Per-sample evaluation results.
        analysis: Summary with ``total_samples``, ``grounded_count``,
            ``grounded_rate`` and ``score_stats`` (which must contain a
            ``combined`` entry).
        output_path: Destination file; missing parent directories are created.
        low_threshold: Results with a combined score strictly below this value
            are listed as low-groundedness cases.
        max_cases: Maximum number of low-groundedness cases to list; the
            lowest-scoring ones are listed first.
        evaluator_config: Dict with ``rule_weight``, ``semantic_weight`` and
            ``threshold`` for the Configuration section. Defaults to the
            settings of the notebook-level ``evaluator``.
    """
    if evaluator_config is None:
        # Backward-compatible default: read the notebook-level evaluator.
        evaluator_config = {
            "rule_weight": evaluator.rule_weight,
            "semantic_weight": evaluator.semantic_weight,
            "threshold": evaluator.threshold,
        }

    report = f"""# Groundedness Evaluation Report

Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

## Summary

- **Total Samples**: {analysis['total_samples']}
- **Grounded Samples**: {analysis['grounded_count']} ({analysis['grounded_rate']:.1%})
- **Mean Combined Score**: {analysis['score_stats']['combined']['mean']:.3f}

## Score Statistics

| Metric | Mean | Std | Min | Max | Median |
|--------|------|-----|-----|-----|--------|
"""

    for metric, stats in analysis["score_stats"].items():
        report += f"| {metric.capitalize()} | {stats['mean']:.3f} | {stats['std']:.3f} | {stats['min']:.3f} | {stats['max']:.3f} | {stats['median']:.3f} |\n"

    report += f"""
## Configuration

- **Rule Weight**: {evaluator_config['rule_weight']}
- **Semantic Weight**: {evaluator_config['semantic_weight']}
- **Threshold**: {evaluator_config['threshold']}

## Low Groundedness Cases

"""

    # Sort ascending so the listed cases really are the lowest-scoring ones
    # rather than whichever happen to come first in the input.
    low_cases = sorted(
        (r for r in results if r.combined_score < low_threshold),
        key=lambda r: r.combined_score,
    )
    if not low_cases:
        report += f"_No cases scored below {low_threshold}._\n"

    for i, case in enumerate(low_cases[:max_cases]):
        report += f"""
### Case {i+1} (Score: {case.combined_score:.3f})

**Query**: {case.query}

**Answer**: {case.answer[:200]}...

**Scores**: Jaccard={case.jaccard_score:.3f}, Containment={case.containment_score:.3f}, Semantic={case.semantic_score:.3f}

"""

    report += f"""
## Recommendations

1. **Rule-based vs Semantic**: Consider adjusting weights based on your use case
2. **Threshold Tuning**: Current threshold may need adjustment based on domain requirements
3. **Low Score Investigation**: Review cases below {low_threshold} for common patterns
4. **Integration**: Consider adding groundedness checks to production RAG pipeline

"""

    # Make sure the target directory exists before opening the file.
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"✅ Report generated: {output_path}")


# Run smoke test
print("🧪 Running final smoke test...")

# A single answer that paraphrases its only source
smoke_query = "什麼是機器學習？"
smoke_answer = "機器學習是人工智慧的分支，讓電腦從數據中自動學習模式。[1]"
smoke_sources = ["機器學習是AI的重要組成部分，通過算法讓機器從數據中學習並做出預測。"]

smoke_result = evaluator.evaluate(smoke_query, smoke_answer, smoke_sources)
smoke_score = smoke_result.combined_score

# The score must exist, lie in [0, 1], and the verdict must be a real bool.
assert smoke_score > 0, "Smoke test failed: No score generated"
assert 0 <= smoke_score <= 1, "Smoke test failed: Score out of range"
assert isinstance(
    smoke_result.is_grounded, bool
), "Smoke test failed: Invalid grounded flag"

print("✅ Smoke test passed!")
print(f"  Sample score: {smoke_score:.3f}")
print(f"  Is grounded: {smoke_result.is_grounded}")

# Export results (JSON, markdown report and CSV) when there is something to export
if batch_results and analysis:
    export_data = export_evaluation_results(batch_results, analysis)
    create_groundedness_report(batch_results, analysis)

    # Flat per-sample table for easy analysis
    csv_rows = []
    for r in batch_results:
        csv_rows.append(
            {
                "query": r.query,
                "jaccard_score": r.jaccard_score,
                "containment_score": r.containment_score,
                "semantic_score": r.semantic_score,
                "combined_score": r.combined_score,
                "is_grounded": r.is_grounded,
                "answer_length": len(r.answer),
                "num_sources": len(r.sources),
            }
        )
    results_df = pd.DataFrame(csv_rows)

    results_df.to_csv(
        "outs/eval/groundedness_scores.csv", index=False, encoding="utf-8"
    )
    print("✅ CSV export complete: outs/eval/groundedness_scores.csv")

# Closing notes for the reader: key takeaways, pitfalls and suggested next
# steps, printed as one static block of text.
print(
    """
🎯 Key Takeaways:
1. Rule-based metrics (Jaccard, containment) catch lexical overlap
2. Semantic similarity captures meaning alignment even without exact matches
3. Combined scoring balances precision and recall
4. Integration with RAG pipeline enables quality gating
5. Threshold tuning critical for production deployment

⚠️ Pitfalls to avoid:
- Over-relying on lexical overlap for technical content
- Ignoring citation misalignment in answers
- Not accounting for paraphrasing in semantic evaluation
- Setting thresholds without domain-specific validation

🔄 Next steps:
- Tune weights and thresholds for your specific domain
- Add human evaluation benchmark for validation
- Implement real-time groundedness monitoring
- Consider query-type specific evaluation strategies
"""
)

# Final marker that the notebook ran to the end.
print("✅ nb61_eval_groundedness_rules.ipynb complete!")

In [None]:
# Quick smoke test - run this cell independently to verify setup
print("🔥 Groundedness Evaluation Smoke Test")

# Minimal test case
test_query = "什麼是 Transformer？"
test_answer = "Transformer 是一種基於注意力機制的神經網路架構，用於自然語言處理。[1]"
test_sources = [
    "Transformer 是 Google 在 2017 年提出的神經網路架構，核心是自注意力機制。"
]

# Quick rule-based check (imports repeated so the cell runs on a fresh kernel)
from collections import Counter
import re


def quick_jaccard(answer, sources):
    """Dependency-free Jaccard overlap between an answer and its sources.

    Citation markers such as ``[1]`` are ignored. Each CJK character is its own
    token (no word segmenter is available here), while runs of other word
    characters (Latin letters, digits, underscore) form one token each.

    Args:
        answer: Answer text.
        sources: List of source passages; they are pooled before comparison.

    Returns:
        |A ∩ S| / |A ∪ S| over the two token sets, or 0.0 when either is empty.
    """

    def _token_set(text):
        # Drop citation markers so "[1]" does not contribute a "1" token.
        text = re.sub(r"\[\d+\]", "", text.lower())
        # Single CJK character, or a run of non-CJK word characters.
        return set(re.findall(r"[\u4e00-\u9fff]|[^\W\u4e00-\u9fff]+", text))

    ans_tokens = _token_set(answer)
    src_tokens = _token_set(" ".join(sources))

    if not ans_tokens or not src_tokens:
        return 0.0

    # Both sets are non-empty, so the union is non-empty as well.
    return len(ans_tokens & src_tokens) / len(ans_tokens | src_tokens)


score = quick_jaccard(test_answer, test_sources)
print(f"✅ Quick Jaccard score: {score:.3f}")
print(f"✅ Score validation: {'PASS' if 0 <= score <= 1 else 'FAIL'}")
print("🎯 Ready for full evaluation pipeline!")