In [None]:
# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, sub-directory below AI_CACHE_ROOT) pairs. Each variable
# is pointed at its directory, and the directory is created if it is missing.
_CACHE_LAYOUT = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)
for _env_name, _subdir in _CACHE_LAYOUT:
    _cache_dir = f"{AI_CACHE_ROOT}/{_subdir}"
    os.environ[_env_name] = _cache_dir
    pathlib.Path(_cache_dir).mkdir(parents=True, exist_ok=True)

# One-line status report: cache root in use and whether CUDA is visible to torch.
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Dependencies & Imports
import re, json, time
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
import numpy as np
from rapidfuzz import fuzz
import logging

# Set up logging: configure the root logger at INFO level and create the
# module-level `logger` that the classes in the following cells log through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Cell 3: Groundedness Detection Core
class GroundednessDetector:
    """
    Detects whether generated content is grounded in provided sources
    using keyword overlap + semantic similarity.

    A claim is scored against every source as
    ``0.3 * keyword_overlap + 0.7 * semantic_similarity``; it counts as
    grounded when the best combined score reaches ``threshold``.
    """

    # A token is either a single CJK ideograph or a run of non-CJK word
    # characters. Chinese text has no spaces between words, so a plain ``\w+``
    # would collapse a whole clause into one token and make overlap ~always 0.
    _TOKEN_RE = re.compile(
        r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"
        r"|[^\W\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]+"
    )

    def __init__(self, embed_model: str = "BAAI/bge-m3", threshold: float = 0.7):
        """
        Args:
            embed_model: Model name or path passed to ``SentenceTransformer``.
            threshold: Minimum combined score for a claim to count as grounded.
        """
        self.embed_model = SentenceTransformer(embed_model)
        self.threshold = threshold

    def extract_claims(self, text: str) -> List[str]:
        """
        Extract factual claims from text (simple sentence splitting).

        Numeric citation markers such as ``[1]`` are removed, the text is split
        on Chinese sentence punctuation, and only stripped fragments longer
        than 10 characters are kept.
        """
        # Remove citations like [1], [2] first
        clean_text = re.sub(r"\[\d+\]", "", text)
        # Split by Chinese punctuation
        fragments = (s.strip() for s in re.split(r"[。！？；]", clean_text))
        return [fragment for fragment in fragments if len(fragment) > 10]

    def _tokenize(self, text: str) -> set:
        """Lower-case ``text`` and return its set of tokens (see ``_TOKEN_RE``)."""
        return set(self._TOKEN_RE.findall(text.lower()))

    def keyword_overlap_score(self, claim: str, source: str) -> float:
        """
        Calculate keyword overlap between claim and source.

        Returns the Jaccard similarity of the two token sets, where CJK text is
        tokenised per ideograph and other text per word. Returns 0.0 when
        either side has no tokens.
        """
        claim_tokens = self._tokenize(claim)
        source_tokens = self._tokenize(source)

        if not claim_tokens or not source_tokens:
            return 0.0

        # Both sets are non-empty here, so the union cannot be empty.
        return len(claim_tokens & source_tokens) / len(claim_tokens | source_tokens)

    def semantic_similarity_score(self, claim: str, source: str) -> float:
        """
        Calculate semantic similarity between one claim and one source.

        Returns the dot product of the two normalized embeddings, clamped to be
        non-negative. Any embedding failure is logged and reported as 0.0.
        """
        try:
            embeddings = self.embed_model.encode(
                [claim, source], normalize_embeddings=True
            )
            similarity = float(np.dot(embeddings[0], embeddings[1]))
            return max(0.0, similarity)  # Ensure non-negative
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            return 0.0

    def _semantic_scores(self, claim: str, sources: List[str]) -> List[float]:
        """
        Score ``claim`` against every source with a single embedding call.

        Encoding the claim once together with all sources avoids re-embedding
        the claim for each source. On failure the error is logged and every
        source gets 0.0, mirroring ``semantic_similarity_score``.
        """
        try:
            embeddings = np.asarray(
                self.embed_model.encode([claim, *sources], normalize_embeddings=True)
            )
            # Row 0 is the claim; the remaining rows are the sources in order.
            similarities = embeddings[1:] @ embeddings[0]
            return [max(0.0, float(value)) for value in similarities]
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            return [0.0] * len(sources)

    def check_groundedness(self, claim: str, sources: List[str]) -> Dict[str, Any]:
        """
        Check if a claim is grounded in the provided sources.

        Returns a dict with:
            grounded: bool, True when ``best_score >= self.threshold``.
            best_source_idx: index of the best-scoring source, or -1 when no
                source scored above 0 (including when ``sources`` is empty).
            best_score: highest combined score (0.0 when there are no sources).
            scores: list of per-source dicts with ``source_idx``, ``keyword``,
                ``semantic`` and ``combined`` entries.
        """
        if not sources:
            # Same keys as the normal path so callers can read "best_score"
            # without special-casing an empty source list.
            return {
                "grounded": False,
                "best_source_idx": -1,
                "best_score": 0.0,
                "scores": [],
            }

        semantic_scores = self._semantic_scores(claim, sources)

        best_score = 0.0
        best_idx = -1
        all_scores = []

        for i, (source, semantic_score) in enumerate(zip(sources, semantic_scores)):
            keyword_score = self.keyword_overlap_score(claim, source)

            # Weighted combination (favor semantic similarity for Chinese)
            combined_score = 0.3 * keyword_score + 0.7 * semantic_score
            all_scores.append(
                {
                    "source_idx": i,
                    "keyword": keyword_score,
                    "semantic": semantic_score,
                    "combined": combined_score,
                }
            )

            # Strict ">" keeps the earliest source on ties and leaves -1 when
            # every source scores 0.
            if combined_score > best_score:
                best_score = combined_score
                best_idx = i

        return {
            "grounded": best_score >= self.threshold,
            "best_source_idx": best_idx,
            "best_score": best_score,
            "scores": all_scores,
        }

In [None]:
# Cell 4: Citation Verification
class CitationVerifier:
    """Checks bracketed numeric citations in a text against a list of sources."""

    def extract_citations(self, text: str) -> List[int]:
        """Return every citation number written as ``[N]`` in ``text``, in order of appearance."""
        return [int(match.group(1)) for match in re.finditer(r"\[(\d+)\]", text)]

    def verify_citations(self, text: str, sources: List[Dict]) -> Dict[str, Any]:
        """
        Classify each citation in ``text`` as valid or invalid.

        Citations are 1-based: ``[N]`` is valid when ``1 <= N <= len(sources)``.
        Only the length of ``sources`` is used here.

        Returns a dict with ``total_citations``, ``valid_citations`` (a count),
        ``invalid_citations`` (the offending numbers), ``citation_rate``
        (valid / total, 0.0 when there are no citations) and ``has_citations``.
        """
        cited_numbers = self.extract_citations(text)
        source_count = len(sources)

        # Single pass: route each number into the matching bucket.
        in_range: List[int] = []
        out_of_range: List[int] = []
        for number in cited_numbers:
            if 1 <= number <= source_count:
                in_range.append(number)
            else:
                out_of_range.append(number)

        total = len(cited_numbers)
        denominator = total if total else 1  # avoid dividing by zero

        return {
            "total_citations": total,
            "valid_citations": len(in_range),
            "invalid_citations": out_of_range,
            "citation_rate": len(in_range) / denominator,
            "has_citations": total > 0,
        }

In [None]:
# Cell 5: Reviewer Role Implementation
@dataclass
class ReviewResult:
    """Result of content review, as built by ``ReviewerAgent.analyze_content``."""

    # True only when average groundedness, citation rate and citation validity all pass.
    is_grounded: bool
    # Mean of the best combined score per extracted claim (0.0 when no claims).
    groundedness_score: float
    # Human-readable issue messages; despite the name, groundedness issues land here too.
    citation_issues: List[str]
    # Suggested fixes, one appended alongside each reported issue.
    recommendations: List[str]
    # Claim counts, up to three ungrounded claims, and the raw citation statistics.
    detailed_analysis: Dict[str, Any]


class ReviewerAgent:
    """
    Reviewer role: Check groundedness and citation accuracy.

    Combines a ``GroundednessDetector`` (per-claim scoring) with a
    ``CitationVerifier`` (citation number checks) into a single pass/fail review.
    """

    def __init__(
        self,
        llm_adapter=None,
        groundedness_threshold: float = 0.7,
        citation_threshold: float = 0.8,
    ):
        """
        Args:
            llm_adapter: Stored on the instance; not used by the methods of this class.
            groundedness_threshold: Threshold handed to the groundedness detector,
                also used as the minimum acceptable average score.
            citation_threshold: Minimum acceptable ``citation_rate``.
        """
        self.llm_adapter = llm_adapter
        self.detector = GroundednessDetector(threshold=groundedness_threshold)
        self.verifier = CitationVerifier()
        self.citation_threshold = citation_threshold

    def analyze_content(self, content: str, sources: List[Dict]) -> ReviewResult:
        """
        Comprehensive content analysis.

        Args:
            content: Text to review.
            sources: List of ``{text: str, meta: dict}``; may be empty, in which
                case every claim scores 0.0.

        Returns:
            A ``ReviewResult``; ``is_grounded`` requires the average groundedness
            and the citation rate to reach their thresholds and no invalid citations.
        """
        # Extract claims from content
        claims = self.detector.extract_claims(content)
        logger.info(f"Extracted {len(claims)} claims for analysis")

        # Check groundedness for each claim
        source_texts = [s["text"] for s in sources]
        grounded_claims = 0
        total_groundedness_score = 0.0
        ungrounded_claims = []

        for claim in claims:
            result = self.detector.check_groundedness(claim, source_texts)
            if result["grounded"]:
                grounded_claims += 1
            else:
                ungrounded_claims.append(claim)
            # A result without "best_score" (e.g. the no-sources case) counts
            # as 0.0 instead of aborting the whole review with a KeyError.
            total_groundedness_score += result.get("best_score", 0.0)

        avg_groundedness = total_groundedness_score / max(1, len(claims))

        # Verify citations
        citation_result = self.verifier.verify_citations(content, sources)

        # Generate issues and recommendations
        issues = []
        recommendations = []

        if avg_groundedness < self.detector.threshold:
            issues.append(f"整體有據可依程度偏低 ({avg_groundedness:.2f})")
            recommendations.append("請增加更多具體事實支撐或引用來源")

        if ungrounded_claims:
            issues.append(f"發現 {len(ungrounded_claims)} 條缺乏依據的聲明")
            recommendations.append(
                "請為以下聲明提供來源支撐：" + "; ".join(ungrounded_claims[:2])
            )

        if citation_result["citation_rate"] < self.citation_threshold:
            issues.append(f"引用比例偏低 ({citation_result['citation_rate']:.2f})")
            recommendations.append("請增加引用標註 [1], [2] 等")

        if citation_result["invalid_citations"]:
            issues.append(f"發現無效引用: {citation_result['invalid_citations']}")
            recommendations.append("請檢查引用編號是否正確")

        is_grounded = (
            avg_groundedness >= self.detector.threshold
            and citation_result["citation_rate"] >= self.citation_threshold
            and not citation_result["invalid_citations"]
        )

        return ReviewResult(
            is_grounded=is_grounded,
            groundedness_score=avg_groundedness,
            citation_issues=issues,
            recommendations=recommendations,
            detailed_analysis={
                "claims_total": len(claims),
                "claims_grounded": grounded_claims,
                "ungrounded_claims": ungrounded_claims[:3],  # Show first 3
                "citation_stats": citation_result,
            },
        )

    def generate_review_feedback(self, result: ReviewResult) -> str:
        """
        Generate natural language feedback based on review result.

        The text consists of a pass/fail header, an optional bullet list of
        issues, an optional bullet list of recommendations, and a footer with
        the groundedness score and the number of claims checked.
        """
        if result.is_grounded:
            parts = ["✅ **審核通過**: 內容具有良好的事實依據且引用規範。\n\n"]
        else:
            parts = ["⚠️  **需要修改**: 發現以下問題需要處理：\n\n"]

        # Both optional sections share one layout: heading, bullets, blank line.
        for heading, items in (
            ("**發現問題:**\n", result.citation_issues),
            ("**修改建議:**\n", result.recommendations),
        ):
            if items:
                parts.append(heading)
                parts.extend(f"- {item}\n" for item in items)
                parts.append("\n")

        parts.append(f"**有據可依得分:** {result.groundedness_score:.2f}/1.0\n")
        parts.append(f"**已檢核聲明:** {result.detailed_analysis['claims_total']} 條")

        return "".join(parts)

In [None]:
# Cell 6: Integration with Blackboard
class ReviewerBlackboardIntegration:
    """Integration helper for Reviewer with shared blackboard"""

    def __init__(self, reviewer: "ReviewerAgent"):
        """
        Args:
            reviewer: Object providing ``analyze_content(content, sources)`` and
                ``generate_review_feedback(result)``.
        """
        self.reviewer = reviewer

    def review_writer_output(self, blackboard: Dict[str, Any]) -> Dict[str, Any]:
        """
        Review writer output using researcher sources.

        Reads ``writer_output`` and ``researcher_sources`` from ``blackboard``.
        When there is writer output, the blackboard is updated in place with
        ``review_result``, ``review_feedback``, ``review_passed``,
        ``review_time`` (seconds spent in the analysis) and ``review_timestamp``
        (epoch seconds). The same blackboard object is returned; it is returned
        untouched when there is nothing to review.
        """
        # Get writer output and researcher sources. "or" also normalises an
        # explicit None stored under either key.
        writer_output = blackboard.get("writer_output") or ""
        researcher_sources = blackboard.get("researcher_sources") or []

        if not writer_output:
            logger.warning("No writer output to review")
            return blackboard

        if not researcher_sources:
            # Deliberately continue: the review still reports citation problems.
            logger.warning("No sources available for grounding check")

        # Perform review. perf_counter is monotonic, so the measured duration
        # cannot be distorted by system clock adjustments.
        started = time.perf_counter()
        review_result = self.reviewer.analyze_content(writer_output, researcher_sources)
        review_time = time.perf_counter() - started

        # Generate feedback
        feedback = self.reviewer.generate_review_feedback(review_result)

        # Update blackboard
        blackboard.update(
            {
                "review_result": review_result,
                "review_feedback": feedback,
                "review_passed": review_result.is_grounded,
                "review_time": review_time,
                "review_timestamp": time.time(),
            }
        )

        logger.info(
            f"Review completed in {review_time:.2f}s - "
            f"{'PASSED' if review_result.is_grounded else 'FAILED'}"
        )

        return blackboard

    def suggest_revisions(self, blackboard: Dict[str, Any]) -> List[str]:
        """
        Generate specific revision suggestions.

        Uses the ``review_result`` stored on the blackboard: one suggestion for
        each of the first two ungrounded claims (quoted, cut to 50 characters),
        plus one when the content has no citations at all. Returns an empty
        list when no review result is present.
        """
        review_result = blackboard.get("review_result")
        if not review_result:
            return []

        suggestions = []

        # Add specific suggestions based on analysis
        ungrounded_claims = review_result.detailed_analysis.get("ungrounded_claims", [])
        for claim in ungrounded_claims[:2]:  # Show top 2
            # Only mark the excerpt with an ellipsis when text was actually cut.
            excerpt = claim if len(claim) <= 50 else claim[:50] + "..."
            suggestions.append(f"為此聲明添加來源引用: '{excerpt}'")

        citation_stats = review_result.detailed_analysis.get("citation_stats", {})
        if citation_stats.get("total_citations", 0) == 0:
            suggestions.append("內容完全缺乏引用，請添加 [1], [2] 等引用標註")

        return suggestions

In [None]:
# Cell 7: End-to-End Testing
def test_reviewer_workflow():
    """
    Exercise the reviewer end to end on mock data.

    Reviews one well-cited draft and one unsupported draft, printing the
    feedback for each, then runs the well-cited draft through the blackboard
    integration. Returns ``(good_result, bad_result, updated_blackboard)``.
    """

    def _source(text, source_id, title):
        # Shape of one researcher source entry: text plus metadata.
        return {"text": text, "meta": {"source_id": source_id, "title": title}}

    # Mock researcher sources
    mock_sources = [
        _source(
            "大型語言模型（LLM）是基於 Transformer 架構的深度學習模型，能夠理解和生成人類語言。",
            "wiki_llm",
            "大型語言模型",
        ),
        _source(
            "RAG（檢索增強生成）技術結合了資訊檢索和文本生成，可以提高模型回答的準確性。",
            "rag_paper",
            "RAG 技術論文",
        ),
        _source(
            "BERT 模型在 2018 年由 Google 提出，使用雙向編碼器表示。",
            "bert_paper",
            "BERT 原始論文",
        ),
    ]

    # Mock writer outputs (good and bad examples)
    good_output = """大型語言模型（LLM）是現代 AI 的重要發展 [1]。這些模型基於 Transformer 架構，
    具有強大的語言理解和生成能力。RAG 技術進一步提升了模型的準確性 [2]，通過結合檢索和生成機制，
    讓模型能夠存取外部知識庫。"""

    bad_output = """人工智慧將在 2025 年完全取代人類工作。所有的程式設計師都會失業。
    量子計算機已經能夠破解所有加密算法。這些都是確定無疑的事實。"""

    # Initialize reviewer
    reviewer = ReviewerAgent(groundedness_threshold=0.6, citation_threshold=0.3)

    # Tests 1 and 2 follow the same steps: banner, analysis, feedback, verdict.
    scenarios = (
        ("=== 測試 1: 高品質內容 ===", good_output),
        ("\n=== 測試 2: 低品質內容 ===", bad_output),
    )
    outcomes = []
    for banner, draft in scenarios:
        print(banner)
        outcome = reviewer.analyze_content(draft, mock_sources)
        print(reviewer.generate_review_feedback(outcome))
        print(f"通過審核: {outcome.is_grounded}")
        outcomes.append(outcome)

    # Test blackboard integration
    print("\n=== 測試 3: 黑板整合 ===")
    integration = ReviewerBlackboardIntegration(reviewer)
    updated_blackboard = integration.review_writer_output(
        {"writer_output": good_output, "researcher_sources": mock_sources}
    )

    print("審核結果已更新到黑板")
    print(f"通過: {updated_blackboard['review_passed']}")
    print(f"耗時: {updated_blackboard['review_time']:.2f}s")

    good_result, bad_result = outcomes
    return good_result, bad_result, updated_blackboard


# Run the end-to-end workflow when this cell executes and keep the returned
# (result1, result2, updated_blackboard) tuple for inspection.
test_results = test_reviewer_workflow()

In [None]:
# Cell 8: Smoke Test
def smoke_test():
    """
    Quick smoke test for core functionality.

    Prints one status line each for the groundedness detector, the citation
    verifier and the reviewer agent; nothing is asserted or returned.
    """
    print("🔥 Reviewer Groundedness Smoke Test")

    claim_text = "大型語言模型使用 Transformer 架構"
    source_text = "LLM 基於 Transformer 架構構建，具有強大的語言能力"

    # Groundedness detector: one claim against one source.
    grounding = GroundednessDetector(threshold=0.5).check_groundedness(
        claim_text, [source_text]
    )
    is_grounded, top_score = grounding["grounded"], grounding["best_score"]
    print(f"✓ Groundedness detection: {is_grounded} (score: {top_score:.2f})")

    # Citation verifier: extraction only.
    found = CitationVerifier().extract_citations("這是一個測試句子 [1] 和另一個引用 [2]")
    print(f"✓ Citation extraction: {found}")

    # Reviewer agent with default thresholds.
    review = ReviewerAgent().analyze_content(
        claim_text, [{"text": source_text, "meta": {}}]
    )
    print(f"✓ Review analysis: grounded={review.is_grounded}")

    print("🎯 Smoke test completed successfully!")


# Execute the smoke test when this cell runs.
smoke_test()