In [None]:
# Stage 7 - nb62 | Text Generation Quality Evaluation
# Goals: Rouge-L, chrF++, human annotation, comparative analysis

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# Each Hugging Face / PyTorch cache variable is pointed at a sub-directory of
# AI_CACHE_ROOT, and that directory is created when it does not exist yet.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

for k, v in (
    ("HF_HOME", AI_CACHE_ROOT + "/hf"),
    ("TRANSFORMERS_CACHE", AI_CACHE_ROOT + "/hf/transformers"),
    ("HF_DATASETS_CACHE", AI_CACHE_ROOT + "/hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", AI_CACHE_ROOT + "/hf/hub"),
    ("TORCH_HOME", AI_CACHE_ROOT + "/torch"),
):
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Dependencies and Setup
import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
import time
from datetime import datetime

# Third-party dependencies: metrics (rouge-score, sacrebleu) and the optional
# Gradio UI. Missing packages are installed on demand, then imported.
import importlib
import subprocess
import sys


def _pip_install(package: str) -> None:
    """Install `package` into the environment of the running kernel.

    Uses ``sys.executable -m pip`` so the package lands in the interpreter that
    executes this notebook; a bare ``pip`` on PATH may belong to another
    environment.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # Let the import system see packages installed after interpreter start-up.
    importlib.invalidate_caches()


# Text evaluation metrics
try:
    from rouge_score import rouge_scorer

    print("✓ rouge-score available")
except ImportError:
    print("⚠️ Installing rouge-score...")
    _pip_install("rouge-score")
    from rouge_score import rouge_scorer

try:
    import sacrebleu

    print("✓ sacrebleu available")
except ImportError:
    print("⚠️ Installing sacrebleu...")
    _pip_install("sacrebleu")
    import sacrebleu

# Optional: Gradio for human evaluation interface
try:
    import gradio as gr

    print("✓ gradio available")
except ImportError:
    print("⚠️ Installing gradio...")
    _pip_install("gradio")
    import gradio as gr

# Create output directories
for dir_name in ["outs/eval", "outs/reports", "data/eval"]:
    pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

print("Dependencies loaded successfully!")

In [None]:
# Cell 3: Test Dataset and Reference Answers Setup
@dataclass
class EvalSample:
    """A single evaluation item: a question paired with its reference answer.

    Generated answers are not stored on the sample; they are passed to the
    evaluator separately, alongside the sample they answer.
    """

    id: str  # sample identifier, e.g. "qa_001"
    question: str  # question / prompt text
    reference: str  # reference answer that generated text is scored against
    context: str = ""  # optional supporting context; empty by default
    domain: str = "general"  # domain label used to group results


class QualityEvaluator:
    """Score generated text against a reference with several automatic metrics.

    Metrics: Rouge-L F1 (``rouge_score``), chrF++ (``sacrebleu``) and a
    character-length ratio, combined into a weighted composite score.

    Rouge-L is computed over tokens from :meth:`tokenize_mixed`. The default
    ``rouge_score`` tokenizer keeps only ``[a-z0-9]`` and would discard every
    CJK character, so Chinese answers would score (close to) zero.
    """

    class _MixedScriptTokenizer:
        """Adapter exposing ``tokenize_mixed`` through the ``tokenize(text)``
        method that ``RougeScorer`` calls on a custom tokenizer."""

        def tokenize(self, text: str) -> List[str]:
            return QualityEvaluator.tokenize_mixed(text)

    def __init__(self):
        # Rouge scorer that works for both Chinese and English text.
        try:
            self.rouge_scorer = rouge_scorer.RougeScorer(
                ["rougeL"],
                use_stemmer=False,
                tokenizer=self._MixedScriptTokenizer(),
            )
        except TypeError:
            # Older rouge-score releases do not accept `tokenizer`; fall back to
            # the ASCII-only default and say so, since CJK scores will be wrong.
            print(
                "⚠️ rouge-score is too old for custom tokenizers; "
                "Rouge-L will ignore non-ASCII text. Upgrade rouge-score."
            )
            self.rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    @staticmethod
    def tokenize_mixed(text: str) -> List[str]:
        """Split text into lower-cased tokens suitable for mixed CJK/Latin input.

        Runs of ASCII letters/digits form one token each; every other
        alphanumeric character (e.g. a CJK ideograph) becomes its own token.
        Punctuation and whitespace only separate tokens and are dropped.

        Args:
            text: Raw text.

        Returns:
            List of tokens; empty for empty or punctuation-only input.
        """
        tokens: List[str] = []
        word: List[str] = []
        for ch in text.lower():
            if ch.isascii() and ch.isalnum():
                word.append(ch)
                continue
            if word:
                tokens.append("".join(word))
                word = []
            if ch.isalnum():
                # Non-ASCII letter or digit: scored at character level.
                tokens.append(ch)
        if word:
            tokens.append("".join(word))
        return tokens

    def compute_rouge_l(self, generated: str, reference: str) -> float:
        """Compute the Rouge-L F1 score of `generated` against `reference`.

        Returns 0.0 (after printing the error) if the scorer fails.
        """
        try:
            # RougeScorer.score takes (target, prediction) in that order.
            scores = self.rouge_scorer.score(reference, generated)
            return scores["rougeL"].fmeasure
        except Exception as e:
            print(f"Rouge-L error: {e}")
            return 0.0

    def compute_chrf(self, generated: str, reference: str) -> float:
        """Compute the chrF++ score, rescaled from 0-100 to 0-1.

        Returns 0.0 (after printing the error) if sacrebleu fails.
        """
        try:
            # word_order=2 adds word 1-/2-grams to the character n-grams (chrF++).
            score = sacrebleu.sentence_chrf(generated, [reference], word_order=2)
            return score.score / 100.0  # Normalize to 0-1
        except Exception as e:
            print(f"chrF++ error: {e}")
            return 0.0

    def compute_length_penalty(self, generated: str, reference: str) -> float:
        """Return the shorter/longer length ratio of the stripped texts.

        1.0 means equal character length; values approach 0 as the lengths
        diverge. Returns 0.0 when either text is empty after stripping.
        """
        if not reference.strip():
            return 0.0

        gen_len = len(generated.strip())
        ref_len = len(reference.strip())

        if gen_len == 0:
            return 0.0

        # Symmetric penalty: too short and too long are treated alike.
        return min(gen_len, ref_len) / max(gen_len, ref_len)

    def evaluate_sample(self, sample: "EvalSample", generated: str) -> Dict[str, float]:
        """Evaluate one generated text against the sample's reference.

        Returns:
            Dict with ``rouge_l``, ``chrf_plus``, ``length_penalty`` and
            ``composite`` (0.4 / 0.4 / 0.2 weighted sum of the three).
        """
        metrics = {
            "rouge_l": self.compute_rouge_l(generated, sample.reference),
            "chrf_plus": self.compute_chrf(generated, sample.reference),
            "length_penalty": self.compute_length_penalty(generated, sample.reference),
        }

        # Composite score (weighted average)
        metrics["composite"] = (
            0.4 * metrics["rouge_l"]
            + 0.4 * metrics["chrf_plus"]
            + 0.2 * metrics["length_penalty"]
        )

        return metrics


# Build the demo evaluation set
def create_test_dataset() -> List[EvalSample]:
    """Return the five hard-coded question/reference pairs used for evaluation."""
    # (id, domain, question, reference) — one row per sample.
    rows = (
        (
            "qa_001",
            "tech",
            "什麼是檢索增強生成（RAG）？",
            "檢索增強生成（RAG）是一種結合資訊檢索和文本生成的技術，通過從外部知識庫檢索相關資訊來增強語言模型的生成能力，提高回答的準確性和時效性。",
        ),
        (
            "qa_002",
            "tech",
            "請解釋機器學習中的過擬合現象。",
            "過擬合是指模型在訓練數據上表現很好，但在新的、未見過的數據上表現較差的現象。這通常是因為模型過於複雜，學習了訓練數據中的噪音和特殊情況，而非通用規律。",
        ),
        (
            "qa_003",
            "general",
            "台灣的地理位置有什麼特色？",
            "台灣位於亞洲東部，西太平洋島弧上，東臨太平洋，西隔台灣海峽與中國大陸相望。地處熱帶與亞熱帶交界，地形多山，平原主要分布在西部沿海。",
        ),
        (
            "qa_004",
            "general",
            "如何提高團隊溝通效率？",
            "提高團隊溝通效率的方法包括：建立清晰的溝通流程、使用合適的溝通工具、定期舉行團隊會議、鼓勵開放透明的對話、確保資訊及時共享、培養積極傾聽的習慣。",
        ),
        (
            "qa_005",
            "tech",
            "什麼是區塊鏈技術的核心概念？",
            "區塊鏈是一種分散式帳本技術，核心概念包括：去中心化、不可篡改性、透明性和共識機制。每個區塊包含交易記錄，並通過密碼學方法鏈接，形成安全可信的數據儲存系統。",
        ),
    )

    return [
        EvalSample(id=sample_id, question=question, reference=reference, domain=domain)
        for sample_id, domain, question, reference in rows
    ]


# Instantiate the evaluation set and the metric evaluator used by later cells
test_samples = create_test_dataset()
evaluator = QualityEvaluator()

print("Created test dataset with {} samples".format(len(test_samples)))
# Preview the first two questions (cut to 50 characters)
for sample in test_samples[:2]:
    print("Sample {}: {}...".format(sample.id, sample.question[:50]))

In [None]:
# Cell 4: Human Evaluation Interface (Gradio)
class HumanEvaluator:
    """Human evaluation helper: Gradio rating UI plus annotation aggregation.

    Ratings submitted through the UI are kept in ``self.annotations`` and
    appended, one JSON object per line, to ``ANNOTATIONS_FILE``.
    """

    # JSONL file that submitted ratings are appended to.
    ANNOTATIONS_FILE = "outs/eval/human_annotations.jsonl"

    def __init__(self):
        self.annotations = []
        self.current_sample_idx = 0

    def create_evaluation_interface(
        self, samples: List["EvalSample"], generated_texts: List[str]
    ):
        """Build the Gradio Blocks app for rating generated texts.

        Args:
            samples: Evaluation samples shown to the annotator.
            generated_texts: Generated answers, index-aligned with `samples`;
                a missing entry is shown as an empty text box.

        Returns:
            The ``gr.Blocks`` app (not launched).
        """

        def _valid_index(raw_idx):
            """Return `raw_idx` as an in-range int, or None when unusable.

            The number field yields None when cleared, and nothing stops the
            annotator from typing a negative or too-large index.
            """
            try:
                idx = int(raw_idx)
            except (TypeError, ValueError):
                return None
            return idx if 0 <= idx < len(samples) else None

        def evaluate_text(
            sample_idx: int,
            generated: str,
            relevance: int,
            fluency: int,
            informativeness: int,
            overall: int,
            comments: str,
        ):
            """Record one rating in memory and append it to the JSONL file."""
            idx = _valid_index(sample_idx)
            if idx is None:
                # Never store a rating against the wrong (or no) sample.
                return "無效的樣本索引"

            sample = samples[idx]
            annotation = {
                "sample_id": sample.id,
                "question": sample.question,
                "reference": sample.reference,
                "generated": generated,
                "relevance": relevance,
                "fluency": fluency,
                "informativeness": informativeness,
                "overall": overall,
                "comments": comments,
                "timestamp": datetime.now().isoformat(),
            }
            self.annotations.append(annotation)

            # Save to file (create the folder if the setup cell was skipped)
            annotations_path = Path(self.ANNOTATIONS_FILE)
            annotations_path.parent.mkdir(parents=True, exist_ok=True)
            with open(annotations_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(annotation, ensure_ascii=False) + "\n")

            return f"評估已保存！已完成 {len(self.annotations)} 個樣本"

        def load_sample(sample_idx: int):
            """Return (question, reference, generated, info) for the UI fields."""
            idx = _valid_index(sample_idx)
            if idx is None:
                return "", "", "", "無效的樣本索引"

            sample = samples[idx]
            generated = generated_texts[idx] if idx < len(generated_texts) else ""
            return (
                sample.question,
                sample.reference,
                generated,
                f"樣本 {idx + 1}/{len(samples)} - {sample.domain}",
            )

        # Create interface
        with gr.Blocks(title="文本品質人工評估") as demo:
            gr.Markdown("# 文本生成品質人工評估介面")
            gr.Markdown("請根據以下標準評估生成的文本品質（1-5分，5分最高）")

            with gr.Row():
                sample_idx = gr.Number(label="樣本索引", value=0, precision=0)
                load_btn = gr.Button("載入樣本")

            info_display = gr.Textbox(label="樣本資訊", interactive=False)
            question_display = gr.Textbox(label="問題", lines=2, interactive=False)
            reference_display = gr.Textbox(label="參考答案", lines=3, interactive=False)
            generated_display = gr.Textbox(label="生成文本", lines=4, interactive=True)

            with gr.Row():
                relevance = gr.Slider(1, 5, value=3, step=1, label="相關性 (1-5)")
                fluency = gr.Slider(1, 5, value=3, step=1, label="流暢性 (1-5)")

            with gr.Row():
                informativeness = gr.Slider(1, 5, value=3, step=1, label="資訊性 (1-5)")
                overall = gr.Slider(1, 5, value=3, step=1, label="整體品質 (1-5)")

            comments = gr.Textbox(
                label="評論（可選）", lines=2, placeholder="其他意見或建議..."
            )

            with gr.Row():
                submit_btn = gr.Button("提交評估", variant="primary")
                result_display = gr.Textbox(label="提交結果", interactive=False)

            # Event handlers
            load_btn.click(
                load_sample,
                inputs=[sample_idx],
                outputs=[
                    question_display,
                    reference_display,
                    generated_display,
                    info_display,
                ],
            )

            submit_btn.click(
                evaluate_text,
                inputs=[
                    sample_idx,
                    generated_display,
                    relevance,
                    fluency,
                    informativeness,
                    overall,
                    comments,
                ],
                outputs=[result_display],
            )

        return demo

    def load_annotations(self, file_path: str = ANNOTATIONS_FILE) -> List[Dict]:
        """Load annotations from a JSONL file.

        Lines that are blank or not valid JSON are skipped. A missing file
        yields an empty list.
        """
        annotations = []
        if Path(file_path).exists():
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        annotations.append(json.loads(line.strip()))
                    except json.JSONDecodeError:
                        continue
        return annotations

    def compute_human_metrics(self, annotations: List[Dict]) -> Dict[str, float]:
        """Aggregate human ratings into mean scores.

        Args:
            annotations: Records with ``relevance``, ``fluency``,
                ``informativeness`` and ``overall`` ratings.

        Returns:
            Empty dict for no annotations; otherwise the four means, the sample
            standard deviation of ``overall`` and ``num_annotations``.
            ``std_overall`` is 0.0 for a single annotation, where the sample
            standard deviation is undefined and pandas would return NaN.
        """
        if not annotations:
            return {}

        df = pd.DataFrame(annotations)
        std_overall = float(df["overall"].std()) if len(df) > 1 else 0.0
        metrics = {
            "avg_relevance": float(df["relevance"].mean()),
            "avg_fluency": float(df["fluency"].mean()),
            "avg_informativeness": float(df["informativeness"].mean()),
            "avg_overall": float(df["overall"].mean()),
            "std_overall": std_overall,
            "num_annotations": len(annotations),
        }

        return metrics


# Create the shared HumanEvaluator; the Gradio app itself is only built when
# create_evaluation_interface() is called.
human_evaluator = HumanEvaluator()
print("Human evaluation interface ready!")

In [None]:
# Cell 5: Multi-Model Comparative Evaluation
class ComparativeEvaluator:
    """Compare multiple models/configurations on the same test set.

    Generation is mocked: answers come from a fixed lookup table keyed by model
    name and question, so the pipeline runs without any real model.
    """

    # Canned answers per mock model, keyed by question. Built once at class
    # creation instead of on every mock_model_generate() call.
    _MOCK_RESPONSES = {
        "baseline": {
            "什麼是檢索增強生成（RAG）？": "RAG是一種AI技術，結合了檢索和生成。",
            "請解釋機器學習中的過擬合現象。": "過擬合就是模型在訓練數據上效果好，但新數據上效果差。",
            "台灣的地理位置有什麼特色？": "台灣是個島嶼，在亞洲。",
            "如何提高團隊溝通效率？": "可以開會、用工具。",
            "什麼是區塊鏈技術的核心概念？": "區塊鏈是分散式技術。",
        },
        "improved": {
            "什麼是檢索增強生成（RAG）？": "檢索增強生成（RAG）是結合資訊檢索與文本生成的先進技術，能夠從外部知識庫檢索相關資訊並增強語言模型的回答能力，提升準確性。",
            "請解釋機器學習中的過擬合現象。": "過擬合是機器學習中常見問題，指模型在訓練數據上表現優異，但對新數據的泛化能力較差。這通常因為模型過於複雜，學習了訓練數據的噪音。",
            "台灣的地理位置有什麼特色？": "台灣位於西太平洋島弧，東臨太平洋，西隔台灣海峽與大陸相望，地處熱帶與亞熱帶交界，具有獨特的地理優勢。",
            "如何提高團隊溝通效率？": "提高團隊溝通效率需要建立清晰流程、選用適當工具、定期會議、促進開放對話、確保資訊共享，並培養良好的傾聽習慣。",
            "什麼是區塊鏈技術的核心概念？": "區塊鏈核心概念包括去中心化、不可篡改性、透明性和共識機制。透過密碼學方法將交易記錄鏈接成安全可信的分散式帳本。",
        },
        "advanced": {
            "什麼是檢索增強生成（RAG）？": "檢索增強生成（Retrieval-Augmented Generation, RAG）是一種創新的人工智慧架構，將傳統的資訊檢索系統與大型語言模型深度整合。RAG系統首先從龐大的外部知識庫中檢索與查詢相關的文檔片段，然後將這些上下文資訊與原始問題一起輸入到生成模型中，使模型能夠產生更準確、更具時效性和更具專業性的回答。這種方法有效解決了語言模型知識截止日期的限制，並大幅提升了生成內容的事實準確性。",
            "請解釋機器學習中的過擬合現象。": "過擬合（Overfitting）是機器學習領域的核心挑戰之一，表現為模型在訓練數據集上達到極高的準確率，卻在驗證集或測試集上表現顯著下降。這種現象的根本原因在於模型複雜度過高，導致其不僅學習了數據中的真實模式，還記憶了訓練樣本中的隨機噪音、離群值和特定細節。過擬合的模型缺乏泛化能力，無法有效處理未見過的新數據。常見的緩解策略包括正則化技術、交叉驗證、早停法、數據擴增以及降低模型複雜度等方法。",
            "台灣的地理位置有什麼特色？": "台灣島地理位置極為獨特且戰略重要，位於北緯22°至25°、東經120°至122°之間，座落在歐亞大陸板塊與菲律賓海板塊的交界處。島嶼東臨浩瀚的太平洋，西隔寬約130公里的台灣海峽與中國大陸福建省相望，南端巴士海峽連接南海，北部則面向東海。台灣正處於北回歸線穿越的熱帶與亞熱帶氣候交界帶，造就了豐富的生物多樣性。地形以山地為主體，中央山脈縱貫南北，平原主要分布在西部沿海，形成了「高山海島」的獨特地理特色。",
            "如何提高團隊溝通效率？": "提升團隊溝通效率是現代組織管理的關鍵議題，需要系統性的策略規劃。首要步驟是建立標準化的溝通流程與協議，明確不同情境下的溝通方式、責任歸屬和決策權限。技術層面應選擇適合團隊規模與工作性質的協作工具，如即時通訊平台、專案管理系統和視訊會議軟體。組織層面需要建立定期的團隊會議機制，包括日常站會、週期性回顧和專案里程碑討論。文化層面要培養開放透明的溝通氛圍，鼓勵成員主動分享資訊、提出疑問和建設性意見。此外，還需要強化積極傾聽技巧、提供溝通技能培訓，並建立有效的反饋機制以持續優化溝通品質。",
            "什麼是區塊鏈技術的核心概念？": "區塊鏈（Blockchain）技術的核心概念建立在四大支柱之上：去中心化、不可篡改性、透明性和共識機制。去中心化意味著系統不依賴單一的中央機構控制，而是由分散在網路中的多個節點共同維護。不可篡改性透過密碼學雜湊函數和鏈式結構實現，每個區塊都包含前一個區塊的雜湊值，形成不可逆的時間戳記錄鏈。透明性確保所有交易記錄對網路參與者公開可見，提升系統的可審計性。共識機制（如工作量證明、權益證明等）確保網路節點對新區塊的有效性達成一致，維護分散式帳本的完整性。這些核心特性使區塊鏈成為構建可信任數位經濟基礎設施的革命性技術，在金融服務、供應鏈管理、數位身份驗證等領域展現巨大潛力。",
        },
    }

    def __init__(self, evaluator: "QualityEvaluator"):
        self.evaluator = evaluator
        # Per-sample result records of the most recent evaluate_models() call.
        self.results = []

    def mock_model_generate(
        self, model_name: str, question: str, context: str = ""
    ) -> str:
        """Return the canned answer of a mock model for a question.

        Args:
            model_name: "baseline", "improved" or "advanced"; any other name
                uses the baseline answers.
            question: Question text; must match a table key exactly.
            context: Accepted for interface parity with a real generator; the
                mock ignores it.

        Returns:
            The canned answer, or a generic placeholder sentence when the
            question is not in the table.
        """
        model_responses = self._MOCK_RESPONSES.get(
            model_name, self._MOCK_RESPONSES["baseline"]
        )
        return model_responses.get(
            question, f"這是{model_name}模型對於「{question}」的回答。"
        )

    def evaluate_models(
        self, models: List[str], samples: List["EvalSample"]
    ) -> pd.DataFrame:
        """Generate (mock) and score an answer for every model/sample pair.

        The raw records are also kept on ``self.results``.

        Returns:
            DataFrame with one row per (model, sample): identifying columns,
            the generated text and every metric from the evaluator.
        """
        results = []

        for model_name in models:
            print(f"Evaluating model: {model_name}")

            for sample in samples:
                # Generate response (mock)
                generated = self.mock_model_generate(
                    model_name, sample.question, sample.context
                )

                # Evaluate quality
                metrics = self.evaluator.evaluate_sample(sample, generated)

                results.append(
                    {
                        "model": model_name,
                        "sample_id": sample.id,
                        "domain": sample.domain,
                        "question": sample.question,
                        "reference": sample.reference,
                        "generated": generated,
                        **metrics,
                    }
                )

        self.results = results
        return pd.DataFrame(results)

    def compute_model_summary(self, results_df: pd.DataFrame) -> pd.DataFrame:
        """Compute per-model mean/std of each metric plus the sample count.

        Returns:
            DataFrame with a ``model`` column, ``<metric>_mean`` and
            ``<metric>_std`` columns (rounded to 4 decimals) and ``num_samples``.
        """
        summary = (
            results_df.groupby("model")
            .agg(
                {
                    "rouge_l": ["mean", "std"],
                    "chrf_plus": ["mean", "std"],
                    "length_penalty": ["mean", "std"],
                    "composite": ["mean", "std"],
                    "sample_id": "count",
                }
            )
            .round(4)
        )

        # Flatten the (metric, statistic) MultiIndex into "metric_statistic".
        summary.columns = ["_".join(col).strip() for col in summary.columns]
        summary = summary.rename(columns={"sample_id_count": "num_samples"})

        return summary.reset_index()


# Run comparative evaluation
# Model names are keys of the mock response table in ComparativeEvaluator.
models_to_compare = ["baseline", "improved", "advanced"]
comparative_evaluator = ComparativeEvaluator(evaluator)

print("Running comparative evaluation...")
# results_df: one row per (model, sample); summary_df: one row per model.
results_df = comparative_evaluator.evaluate_models(models_to_compare, test_samples)
summary_df = comparative_evaluator.compute_model_summary(results_df)

print("\n=== Model Comparison Summary ===")
print(summary_df.to_string(index=False))

In [None]:
# Cell 6: Quality Score Aggregation and Reporting
class QualityReporter:
    """Generate and persist quality evaluation reports."""

    def __init__(self):
        # Fixed at construction; used as the suffix of every output file name.
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    @staticmethod
    def _preview(text, limit: int = 200) -> str:
        """Return `text` cut to `limit` characters, with "..." only if cut."""
        text = str(text)
        return text if len(text) <= limit else text[:limit] + "..."

    def generate_detailed_report(
        self,
        results_df: pd.DataFrame,
        summary_df: pd.DataFrame,
        human_metrics: Dict = None,
    ) -> str:
        """Build the Markdown evaluation report.

        Args:
            results_df: Per-sample results with ``model``, ``sample_id``,
                ``domain``, ``question``, ``generated`` and ``composite``.
            summary_df: Per-model summary with ``model`` and the
                ``<metric>_mean`` / ``<metric>_std`` columns.
            human_metrics: Optional aggregated human ratings; the section is
                omitted when empty or None.

        Returns:
            The report as one Markdown string. With an empty `summary_df` the
            report ends after a "no results" note.
        """

        report_lines = [
            "# Text Generation Quality Evaluation Report",
            f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "## Executive Summary",
            f"- Evaluated {len(results_df['model'].unique())} models",
            f"- Tested on {len(results_df['sample_id'].unique())} samples",
            f"- Domains: {', '.join(results_df['domain'].unique())}",
            "",
            "## Automatic Metrics Summary",
            "| Model | Rouge-L | chrF++ | Length Penalty | Composite Score |",
            "| --- | --- | --- | --- | --- |",
        ]

        for _, row in summary_df.iterrows():
            report_lines.append(
                f"| {row['model']} | "
                f"{row['rouge_l_mean']:.3f}±{row['rouge_l_std']:.3f} | "
                f"{row['chrf_plus_mean']:.3f}±{row['chrf_plus_std']:.3f} | "
                f"{row['length_penalty_mean']:.3f}±{row['length_penalty_std']:.3f} | "
                f"{row['composite_mean']:.3f}±{row['composite_std']:.3f} |"
            )

        # Add human evaluation metrics if available
        if human_metrics:
            report_lines.extend(
                [
                    "",
                    "## Human Evaluation Metrics",
                    f"- Number of annotations: {human_metrics.get('num_annotations', 0)}",
                    f"- Average relevance: {human_metrics.get('avg_relevance', 0):.2f}/5",
                    f"- Average fluency: {human_metrics.get('avg_fluency', 0):.2f}/5",
                    f"- Average informativeness: {human_metrics.get('avg_informativeness', 0):.2f}/5",
                    f"- Average overall quality: {human_metrics.get('avg_overall', 0):.2f}/5 (±{human_metrics.get('std_overall', 0):.2f})",
                ]
            )

        if summary_df.empty:
            # idxmax() on an empty column raises; nothing to rank anyway.
            report_lines.extend(
                ["", "## Key Findings", "- No model results available"]
            )
            return "\n".join(report_lines)

        # Best performing model (highest mean composite score)
        best_row = summary_df.loc[summary_df["composite_mean"].idxmax()]
        best_model = best_row["model"]
        best_score = best_row["composite_mean"]

        report_lines.extend(
            [
                "",
                "## Key Findings",
                f"- **Best performing model**: {best_model} (composite score: {best_score:.3f})",
                "- **Rouge-L**: Measures overlap of longest common subsequences",
                "- **chrF++**: Character-level F-score with word order consideration",
                "- **Length Penalty**: Penalizes responses too short or too long",
                "",
                "## Recommendations",
                "1. Consider the trade-off between automatic metrics and human preferences",
                "2. Evaluate on larger, domain-specific datasets for production use",
                "3. Include task-specific metrics (e.g., factuality, coherence)",
                "4. Regularly update evaluation benchmarks",
                "",
                "## Sample Outputs (Best vs Worst)",
            ]
        )

        # Best: the two highest-scoring outputs of the best model.
        best_samples = results_df[results_df["model"] == best_model].nlargest(
            2, "composite"
        )
        for idx, (_, sample) in enumerate(best_samples.iterrows()):
            report_lines.extend(
                [
                    f"### Sample {idx+1} (Score: {sample['composite']:.3f})",
                    f"**Question**: {sample['question']}",
                    f"**Generated**: {self._preview(sample['generated'])}",
                    "",
                ]
            )

        # Worst: the two lowest-scoring outputs across all models, so the
        # section delivers the comparison its heading announces.
        worst_samples = results_df.nsmallest(2, "composite")
        for idx, (_, sample) in enumerate(worst_samples.iterrows()):
            report_lines.extend(
                [
                    f"### Lowest-scoring Sample {idx+1} - {sample['model']} (Score: {sample['composite']:.3f})",
                    f"**Question**: {sample['question']}",
                    f"**Generated**: {self._preview(sample['generated'])}",
                    "",
                ]
            )

        return "\n".join(report_lines)

    def save_results(
        self,
        results_df: pd.DataFrame,
        summary_df: pd.DataFrame,
        report_text: str,
        human_metrics: Dict = None,
    ):
        """Write results CSV, summary CSV, Markdown report and metrics JSON.

        Files go to ``outs/eval`` and ``outs/reports`` (created if missing) and
        are suffixed with the reporter's timestamp.

        Returns:
            Dict mapping ``results_file``, ``summary_file``, ``report_file`` and
            ``metrics_file`` to the written paths.
        """
        # Do not rely on the setup cell having created the output folders.
        for out_dir in ("outs/eval", "outs/reports"):
            Path(out_dir).mkdir(parents=True, exist_ok=True)

        # Save detailed results
        results_file = f"outs/eval/quality_results_{self.timestamp}.csv"
        results_df.to_csv(results_file, index=False, encoding="utf-8")
        print(f"✓ Detailed results saved to: {results_file}")

        # Save summary
        summary_file = f"outs/eval/quality_summary_{self.timestamp}.csv"
        summary_df.to_csv(summary_file, index=False, encoding="utf-8")
        print(f"✓ Summary saved to: {summary_file}")

        # Save report
        report_file = f"outs/reports/quality_report_{self.timestamp}.md"
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(report_text)
        print(f"✓ Report saved to: {report_file}")

        # Save metrics JSON
        metrics_data = {
            "timestamp": self.timestamp,
            "num_models": len(results_df["model"].unique()),
            "num_samples": len(results_df["sample_id"].unique()),
            "summary": summary_df.to_dict("records"),
            "human_metrics": human_metrics or {},
        }

        def _to_builtin(value):
            # numpy scalars expose .item(); anything else is stored as text.
            return value.item() if hasattr(value, "item") else str(value)

        metrics_file = f"outs/eval/quality_metrics_{self.timestamp}.json"
        with open(metrics_file, "w", encoding="utf-8") as f:
            json.dump(
                metrics_data, f, ensure_ascii=False, indent=2, default=_to_builtin
            )
        print(f"✓ Metrics saved to: {metrics_file}")

        return {
            "results_file": results_file,
            "summary_file": summary_file,
            "report_file": report_file,
            "metrics_file": metrics_file,
        }


# Build the reporter (its timestamp names every output file)
reporter = QualityReporter()

# Pick up human annotations already on disk; metrics stay None when there are none
human_annotations = human_evaluator.load_annotations()
human_metrics = None
if human_annotations:
    human_metrics = human_evaluator.compute_human_metrics(human_annotations)

# Render the Markdown report
report_text = reporter.generate_detailed_report(
    results_df, summary_df, human_metrics
)

# Write CSV / Markdown / JSON outputs and keep their paths
saved_files = reporter.save_results(
    results_df, summary_df, report_text, human_metrics
)

print("\n=== Quality Evaluation Report Generated ===")
# Show at most the first 1000 characters of the report
print(report_text if len(report_text) <= 1000 else report_text[:1000] + "...")

In [None]:
# Cell 7: Smoke Test - Run evaluation on subset
def run_smoke_test():
    """Score two mock models on the first three samples and range-check metrics.

    Prints the per-model mean composite score and whether rouge_l, chrf_plus
    and composite all lie in [0, 1].

    Returns:
        DataFrame with one row per (model, sample) and the three metrics.
    """
    print("🔥 Running smoke test...")

    # Small subset: first 3 samples, 2 models
    subset = test_samples[:3]
    model_names = ["baseline", "improved"]

    print(f"Testing {len(model_names)} models on {len(subset)} samples")

    scorer = QualityEvaluator()
    checked_metrics = ("rouge_l", "chrf_plus", "composite")
    rows = []

    for model_name in model_names:
        for item in subset:
            answer = comparative_evaluator.mock_model_generate(
                model_name, item.question
            )
            scores = scorer.evaluate_sample(item, answer)

            row = {"model": model_name, "sample_id": item.id}
            for metric_name in checked_metrics:
                row[metric_name] = scores[metric_name]
            rows.append(row)

    smoke_df = pd.DataFrame(rows)

    # Per-model mean of the composite score
    print("\n=== Smoke Test Results ===")
    for model_name in model_names:
        mean_composite = smoke_df.loc[
            smoke_df["model"] == model_name, "composite"
        ].mean()
        print(f"{model_name}: avg composite = {mean_composite:.3f}")

    # Every checked metric must lie inside the closed interval [0, 1]
    in_range = all(
        smoke_df[metric_name].between(0, 1).all() for metric_name in checked_metrics
    )

    if not in_range:
        print("❌ Some metrics outside expected range")
    else:
        print("✅ All metrics within expected range [0, 1]")

    return smoke_df


# Run the smoke test and keep its per-(model, sample) metric table
smoke_results = run_smoke_test()

In [None]:
# Cell 8: Advanced Quality Analysis
class AdvancedQualityAnalyzer:
    """Secondary analyses over the per-sample results table."""

    def analyze_domain_performance(self, results_df: pd.DataFrame) -> pd.DataFrame:
        """Mean rouge_l / chrf_plus / composite per (model, domain), 3 decimals."""
        metric_columns = ["rouge_l", "chrf_plus", "composite"]
        grouped_means = results_df.groupby(["model", "domain"])[metric_columns].mean()
        return grouped_means.round(3).reset_index()

    def identify_failure_cases(
        self, results_df: pd.DataFrame, threshold: float = 0.3
    ) -> pd.DataFrame:
        """Rows whose composite score is below `threshold`, lowest score first."""
        wanted_columns = [
            "model",
            "sample_id",
            "domain",
            "question",
            "composite",
            "generated",
        ]
        below_threshold = results_df["composite"] < threshold
        return results_df.loc[below_threshold, wanted_columns].sort_values("composite")

    def compute_consistency_metrics(self, results_df: pd.DataFrame) -> Dict[str, float]:
        """Coefficient of variation (std / mean) of each metric, per model.

        Keys are ``"<model>_<metric>_cv"``. When the mean is not greater than
        zero the value is ``inf``.
        """
        cv_by_key = {}

        for model_name in results_df["model"].unique():
            rows_for_model = results_df.loc[results_df["model"] == model_name]

            for metric_name in ("rouge_l", "chrf_plus", "composite"):
                values = rows_for_model[metric_name]
                mean_value = values.mean()
                if mean_value > 0:
                    ratio = values.std() / mean_value
                else:
                    ratio = float("inf")
                cv_by_key[f"{model_name}_{metric_name}_cv"] = ratio

        return cv_by_key


# Advanced analysis: per-domain means, low-scoring cases, score consistency
analyzer = AdvancedQualityAnalyzer()

domain_perf = analyzer.analyze_domain_performance(results_df)
print("\n=== Performance by Domain ===", domain_perf.to_string(index=False), sep="\n")

failure_cases = analyzer.identify_failure_cases(results_df, threshold=0.4)
print("\n=== Failure Cases (composite < 0.4) ===")
print("Found {} failure cases".format(len(failure_cases)))
if len(failure_cases) > 0:
    print(
        failure_cases.loc[:, ["model", "sample_id", "composite"]].to_string(
            index=False
        )
    )

consistency = analyzer.compute_consistency_metrics(results_df)
print("\n=== Consistency Metrics (Coefficient of Variation) ===")
for metric, value in consistency.items():
    print("{}: {:.3f}".format(metric, value))

In [None]:
# Cell 9: Integration with RAG Groundedness (Optional)
def integrate_groundedness_check(
    results_df: pd.DataFrame,
    context_samples: List[str] = None,
    samples: List["EvalSample"] = None,
) -> pd.DataFrame:
    """Add a token-overlap groundedness score to the evaluation results.

    Args:
        results_df: Per-sample results with ``sample_id``, ``generated``,
            ``rouge_l``, ``chrf_plus`` and ``length_penalty`` columns.
        context_samples: Contexts index-aligned with `samples`. Defaults to
            five built-in mock contexts matching the demo dataset.
        samples: Evaluation samples whose order defines the context index of
            each ``sample_id``. Defaults to the notebook-level ``test_samples``.

    Returns:
        A copy of `results_df` with ``groundedness`` (Jaccard similarity of
        generated vs. context tokens) and ``composite_with_groundedness``
        (0.3 / 0.3 / 0.2 / 0.2 weighted sum). Rows whose sample id is unknown
        or has no context get a groundedness of 0.0.
    """

    def _tokenize(text: str) -> set:
        """Token set: ASCII alphanumeric runs as words, other letters/digits
        (e.g. CJK) as single characters; punctuation and spaces are dropped.

        Whitespace splitting alone would turn a whole Chinese sentence into
        one token, so the overlap would always be zero.
        """
        tokens = set()
        word = []
        for ch in text.lower():
            if ch.isascii() and ch.isalnum():
                word.append(ch)
                continue
            if word:
                tokens.add("".join(word))
                word = []
            if ch.isalnum():
                tokens.add(ch)
        if word:
            tokens.add("".join(word))
        return tokens

    def simple_groundedness_score(generated: str, context: str) -> float:
        """Jaccard similarity between generated and context token sets."""
        if not context or not generated:
            return 0.0

        gen_tokens = _tokenize(generated)
        ctx_tokens = _tokenize(context)

        if not gen_tokens:
            return 0.0

        union = gen_tokens | ctx_tokens
        return len(gen_tokens & ctx_tokens) / len(union) if union else 0.0

    if samples is None:
        # Backward-compatible default: the notebook's global dataset.
        samples = test_samples

    if context_samples is None:
        # Mock context for each sample (same order as the demo dataset)
        context_samples = [
            "檢索增強生成是結合資訊檢索和文本生成的技術，通過外部知識庫增強語言模型能力。",
            "過擬合是機器學習中模型在訓練數據表現好但在新數據表現差的現象。",
            "台灣位於亞洲東部，是西太平洋上的島嶼，具有獨特地理位置。",
            "團隊溝通效率可透過建立流程、使用工具、定期會議等方式提升。",
            "區塊鏈是分散式帳本技術，具有去中心化、不可篡改等特性。",
        ]

    # sample id -> position of its first occurrence, built once.
    position_by_id = {}
    for position, sample in enumerate(samples):
        position_by_id.setdefault(sample.id, position)

    def _context_for(sample_id) -> str:
        position = position_by_id.get(sample_id)
        if position is None or position >= len(context_samples):
            return ""
        return context_samples[position]

    results_with_groundedness = results_df.copy()
    results_with_groundedness["groundedness"] = [
        simple_groundedness_score(generated, _context_for(sample_id))
        for generated, sample_id in zip(
            results_with_groundedness["generated"],
            results_with_groundedness["sample_id"],
        )
    ]

    # Update composite score to include groundedness
    results_with_groundedness["composite_with_groundedness"] = (
        0.3 * results_with_groundedness["rouge_l"]
        + 0.3 * results_with_groundedness["chrf_plus"]
        + 0.2 * results_with_groundedness["length_penalty"]
        + 0.2 * results_with_groundedness["groundedness"]
    )

    return results_with_groundedness


# Extend the results with groundedness and compare per-model means
results_with_groundedness = integrate_groundedness_check(results_df)

print("\n=== Updated Results with Groundedness ===")
groundedness_summary = (
    results_with_groundedness[
        ["model", "composite", "groundedness", "composite_with_groundedness"]
    ]
    .groupby("model")
    .mean()
    .round(3)
)
print(groundedness_summary.to_string())

In [None]:
# Cell 10: Summary and Next Steps
print("\n" + "=" * 60)
print("📊 STAGE 7 - NB62 QUALITY EVALUATION SUMMARY")
print("=" * 60)

# Static sections of the summary: heading -> bullet lines.
summary_sections = {
    "✅ COMPLETED": [
        "Rouge-L and chrF++ automatic metrics implementation",
        "Human evaluation interface with Gradio",
        "Multi-model comparative evaluation pipeline",
        "Comprehensive reporting and CSV export",
        "Groundedness integration for RAG applications",
        "Domain-specific performance analysis",
        "Failure case identification and consistency metrics",
    ],
    "🔑 CORE CONCEPTS": [
        "Rouge-L: Longest common subsequence F1 score",
        "chrF++: Character-level F-score with word order",
        "Human evaluation: Relevance, fluency, informativeness",
        "Composite scoring: Weighted combination of metrics",
        "Groundedness: Context-grounded generation quality",
        "Consistency: Coefficient of variation across samples",
    ],
    "⚠️ PITFALLS": [
        "Automatic metrics may not correlate with human judgment",
        "Small test sets can give misleading results",
        "Domain-specific metrics often more valuable than general ones",
        "Human annotation requires careful guideline design",
        "Groundedness checking needs sophisticated NLI models",
    ],
    "🎯 NEXT STEPS": [
        "Scale evaluation to larger, diverse datasets",
        "Implement semantic similarity metrics (BERTScore)",
        "Add task-specific metrics (factuality, coherence)",
        "Integrate with continuous evaluation pipelines",
        "Build evaluation leaderboards for model comparison",
    ],
}
for heading, bullets in summary_sections.items():
    print(f"\n{heading}:")
    for bullet in bullets:
        print(f"• {bullet}")

print("\n📁 OUTPUT FILES:")
for file_type, file_path in saved_files.items():
    print(f"• {file_type}: {file_path}")

print("\n🔍 KEY FINDINGS:")
best_model_row = summary_df.loc[summary_df["composite_mean"].idxmax()]
print(f"• Best performing model: {best_model_row['model']}")
print(f"• Best composite score: {best_model_row['composite_mean']:.3f}")
# results_df has one row per (model, sample); count distinct samples so this
# figure agrees with the "Tested on N samples" line of the written report.
print(f"• Total samples evaluated: {results_df['sample_id'].nunique()}")
print(f"• Model outputs scored: {len(results_df)}")
print(f"• Models compared: {len(results_df['model'].unique())}")

print("\n" + "=" * 60)
print("Quality evaluation framework ready for production use! 🚀")
print("=" * 60)