In [None]:
# nb64 | 彙總報表與排行榜系統
# Goal: 整合多維度評估指標，生成模型比較排行榜與視覺化報表

# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (this cell is meant to be copied into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Environment variable -> sub-directory below AI_CACHE_ROOT.
_CACHE_LAYOUT = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)

for k, _subdir in _CACHE_LAYOUT:
    v = f"{AI_CACHE_ROOT}/{_subdir}"
    # Export the location and make sure the directory exists.
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

_gpu_available = torch.cuda.is_available()
print("[Cache]", AI_CACHE_ROOT, "| GPU:", _gpu_available)

In [None]:
# Cell 2: 載入評估結果與依賴
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any
import warnings

# Suppress all warnings for the remainder of the kernel session.
warnings.filterwarnings("ignore")

# Ensure the output tree exists: outs/, outs/reports/, outs/charts/
outs_dir = Path("outs")
for _needed_dir in (outs_dir, outs_dir / "reports", outs_dir / "charts"):
    _needed_dir.mkdir(exist_ok=True)

print("📊 載入評估結果模組")

In [None]:
# Cell 3: 報表資料結構設計
class EvalReportBuilder:
    """Collects the JSON outputs of the upstream evaluation notebooks.

    Attributes (all set in ``__init__``):
        metrics: maps a dimension name ("retrieval", "groundedness",
            "quality", "performance") to the parsed JSON object, or to ``{}``
            when the corresponding file could not be used.
        models: starts as an empty dict; no method of this class writes to it.
        timestamp: creation time formatted as ``YYYYmmdd_HHMMSS``.
    """

    def __init__(self):
        self.metrics = {}
        self.models = {}
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    def _load_metrics_file(self, key, filepath, label, count_key, unit):
        """Read one JSON metrics file into ``self.metrics[key]``.

        A missing file, malformed JSON, or a top-level value that is not a
        JSON object all leave ``self.metrics[key]`` as ``{}`` and print a
        warning, so callers can treat "no usable data" uniformly.

        Args:
            key: slot in ``self.metrics`` to fill.
            filepath: path of the JSON file to read (UTF-8).
            label: human-readable metric name used in the success message.
            count_key: top-level key whose length is reported on success.
            unit: suffix printed after the count in the success message.
        """
        # Default first, so every failure path leaves a well-defined empty slot.
        self.metrics[key] = {}
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"⚠️  檔案不存在: {filepath}")
            return
        except json.JSONDecodeError as exc:
            print(f"⚠️  JSON 解析失敗: {filepath} ({exc})")
            return

        # Downstream code calls .get() on the payload, so only objects are usable.
        if not isinstance(data, dict):
            print(f"⚠️  格式不符（預期為 JSON 物件）: {filepath}")
            return

        self.metrics[key] = data
        count = len(data.get(count_key) or [])
        print(f"✅ 載入{label}: {count} {unit}")

    def load_retrieval_metrics(self, filepath="outs/retrieval_metrics.json"):
        """Load retrieval metrics (produced by nb60) into ``metrics["retrieval"]``."""
        self._load_metrics_file("retrieval", filepath, "檢索指標", "results", "項結果")

    def load_groundedness_metrics(self, filepath="outs/groundedness_metrics.json"):
        """Load groundedness metrics (produced by nb61) into ``metrics["groundedness"]``."""
        self._load_metrics_file(
            "groundedness", filepath, "真實性指標", "results", "項結果"
        )

    def load_quality_metrics(self, filepath="outs/text_quality_metrics.json"):
        """Load text-quality metrics (produced by nb62) into ``metrics["quality"]``."""
        self._load_metrics_file("quality", filepath, "品質指標", "results", "項結果")

    def load_performance_metrics(self, filepath="outs/performance_baseline.json"):
        """Load performance metrics (produced by nb63) into ``metrics["performance"]``."""
        self._load_metrics_file("performance", filepath, "效能指標", "models", "個模型")

In [None]:
# Cell 4: 檢索指標彙總
def aggregate_retrieval_metrics(retrieval_data):
    """Summarise per-query retrieval metrics as mean / standard deviation.

    Args:
        retrieval_data: mapping with a "results" list; each entry may carry a
            "metrics" mapping with "recall_at_5", "recall_at_10" and
            "ndcg_at_10". A metric absent from an entry is counted as 0.

    Returns:
        ``{}`` when there is no data or no results; otherwise a dict with
        ``<metric>_mean`` / ``<metric>_std`` for the three metrics plus
        ``total_queries``.
    """
    results = retrieval_data.get("results", []) if retrieval_data else []
    if not results:
        return {}

    # One metrics mapping per query; entries without "metrics" contribute zeros.
    per_query = [entry.get("metrics", {}) for entry in results]

    summary = {}
    for metric_name in ("recall_at_5", "recall_at_10", "ndcg_at_10"):
        column = [row.get(metric_name, 0) for row in per_query]
        summary[f"{metric_name}_mean"] = np.mean(column)
        summary[f"{metric_name}_std"] = np.std(column)

    summary["total_queries"] = len(results)
    return summary

In [None]:
# Cell 5: 品質指標彙總
def aggregate_quality_metrics(groundedness_data, quality_data):
    """Summarise groundedness and text-quality results as mean / std.

    Args:
        groundedness_data: mapping with a "results" list whose entries may
            carry "groundedness_score" (a missing score counts as 0).
        quality_data: mapping with a "results" list whose entries may carry
            "rouge_l" and "chrf" (missing values count as 0).

    Returns:
        Dict holding only the sections for which results were present:
        ``groundedness_mean/std/samples`` and/or
        ``rouge_l_mean/std``, ``chrf_mean/std``, ``quality_samples``.
    """
    summary = {}

    # --- Groundedness -------------------------------------------------------
    grounded_rows = groundedness_data.get("results", []) if groundedness_data else []
    if grounded_rows:
        scores = [row.get("groundedness_score", 0) for row in grounded_rows]
        summary["groundedness_mean"] = np.mean(scores)
        summary["groundedness_std"] = np.std(scores)
        summary["groundedness_samples"] = len(scores)

    # --- Text quality (Rouge-L, chrF++) -------------------------------------
    quality_rows = quality_data.get("results", []) if quality_data else []
    if quality_rows:
        for field in ("rouge_l", "chrf"):
            column = [row.get(field, 0) for row in quality_rows]
            summary[f"{field}_mean"] = np.mean(column)
            summary[f"{field}_std"] = np.std(column)
        summary["quality_samples"] = len(quality_rows)

    return summary

In [None]:
# Cell 6: 效能指標彙總
def aggregate_performance_metrics(performance_data):
    """Pick the model with the highest throughput and report its figures.

    Args:
        performance_data: mapping with a "models" list; each entry may carry
            "model_name", "tokens_per_second", "avg_latency_ms" and
            "vram_peak_gb".

    Returns:
        ``{}`` when there is no data or the model list is empty; otherwise a
        dict with ``best_model_name``, ``best_tokens_per_sec``,
        ``best_latency_ms``, ``best_vram_peak_gb`` and ``total_models_tested``.
        Ties on throughput resolve to the earliest model in the list.
    """
    if not performance_data:
        return {}

    models = performance_data.get("models", [])
    if not models:
        return {}

    def _throughput(model):
        # A missing or null throughput is treated as 0 so it can be compared.
        return model.get("tokens_per_second") or 0

    # max() keeps the first maximal element, and it still selects a model when
    # every throughput is 0/missing, so tested models are never dropped.
    best_model = max(models, key=_throughput)

    return {
        "best_model_name": best_model.get("model_name", "unknown"),
        "best_tokens_per_sec": _throughput(best_model),
        "best_latency_ms": best_model.get("avg_latency_ms", 0),
        "best_vram_peak_gb": best_model.get("vram_peak_gb", 0),
        "total_models_tested": len(models),
    }

In [None]:
# Cell 7: 綜合排行榜生成
def generate_leaderboard(
    builder: EvalReportBuilder, weights=None, tps_reference=100.0
):
    """Aggregate every loaded metric family and compute a weighted composite score.

    Args:
        builder: report builder whose ``metrics`` dict holds the raw payloads
            under "retrieval", "groundedness", "quality" and "performance".
        weights: optional mapping overriding any of the default weights
            ``{"retrieval": 0.4, "quality": 0.4, "performance": 0.2}``.
            Keys that are omitted keep their default.
        tps_reference: tokens/sec that maps to a performance score of 1.0
            (throughput above it is capped at 1.0). Must be positive.

    Returns:
        Dict with ``composite_score``, the three aggregate dicts
        (``retrieval``, ``quality``, ``performance``) and ``timestamp``.
        A dimension with no data contributes nothing to the composite; the
        remaining weights are NOT re-normalised.

    Raises:
        ValueError: if ``tps_reference`` is not positive.
    """
    if tps_reference <= 0:
        raise ValueError("tps_reference must be positive")

    effective_weights = {"retrieval": 0.4, "quality": 0.4, "performance": 0.2}
    if weights:
        effective_weights.update(weights)

    # Aggregate all metrics
    retrieval_agg = aggregate_retrieval_metrics(builder.metrics.get("retrieval", {}))
    quality_agg = aggregate_quality_metrics(
        builder.metrics.get("groundedness", {}), builder.metrics.get("quality", {})
    )
    perf_agg = aggregate_performance_metrics(builder.metrics.get("performance", {}))

    composite_score = 0

    if retrieval_agg:
        # Retrieval score: average of mean Recall@5 and mean nDCG@10.
        retrieval_score = (
            retrieval_agg.get("recall_at_5_mean", 0)
            + retrieval_agg.get("ndcg_at_10_mean", 0)
        ) / 2
        composite_score += effective_weights["retrieval"] * retrieval_score

    if quality_agg:
        # Quality score: average of mean groundedness and mean Rouge-L.
        quality_score = (
            quality_agg.get("groundedness_mean", 0) + quality_agg.get("rouge_l_mean", 0)
        ) / 2
        composite_score += effective_weights["quality"] * quality_score

    if perf_agg:
        # Performance score uses throughput only (latency is not part of the
        # score), scaled against tps_reference and clamped to [0, 1].
        throughput = perf_agg.get("best_tokens_per_sec", 0)
        perf_score = max(0.0, min(throughput / tps_reference, 1.0))
        composite_score += effective_weights["performance"] * perf_score

    return {
        "composite_score": composite_score,
        "retrieval": retrieval_agg,
        "quality": quality_agg,
        "performance": perf_agg,
        "timestamp": builder.timestamp,
    }

In [None]:
# Cell 8: Markdown 報表輸出
def generate_markdown_report(
    leaderboard_data, output_path="outs/reports/leaderboard.md"
):
    """Render the leaderboard as a Markdown report and write it to disk.

    Args:
        leaderboard_data: dict with "timestamp", "composite_score" and the
            aggregate dicts "retrieval", "quality" and "performance" (each may
            be empty; missing values render as 0 / "N/A").
        output_path: destination file. Its parent directory is created if it
            does not exist yet.

    Returns:
        The Markdown text that was written.
    """
    ret = leaderboard_data["retrieval"]
    qual = leaderboard_data["quality"]
    perf = leaderboard_data["performance"]

    md_content = f"""# RAG 系統評估排行榜

> 生成時間：{leaderboard_data['timestamp']}
> 綜合分數：{leaderboard_data['composite_score']:.3f} / 1.000

## 📊 整體表現摘要

| 維度 | 主要指標 | 分數 | 備註 |
|------|----------|------|------|
| 🔍 檢索效能 | Recall@5 | {ret.get('recall_at_5_mean', 0):.3f} | {ret.get('total_queries', 0)} 條查詢 |
| 📝 品質指標 | Groundedness | {qual.get('groundedness_mean', 0):.3f} | 真實性評分 |
| ⚡ 系統效能 | Tokens/sec | {perf.get('best_tokens_per_sec', 0):.1f} | {perf.get('best_model_name', 'N/A')} |

## 🎯 詳細指標

### 檢索系統
"""

    # Detail sections are emitted only for dimensions that actually have data.
    if ret:
        md_content += f"""
- **Recall@5**: {ret.get('recall_at_5_mean', 0):.3f} ± {ret.get('recall_at_5_std', 0):.3f}
- **Recall@10**: {ret.get('recall_at_10_mean', 0):.3f} ± {ret.get('recall_at_10_std', 0):.3f}
- **nDCG@10**: {ret.get('ndcg_at_10_mean', 0):.3f} ± {ret.get('ndcg_at_10_std', 0):.3f}
- **測試查詢數**: {ret.get('total_queries', 0)}
"""

    md_content += "\n### 生成品質\n"
    if qual:
        md_content += f"""
- **Groundedness**: {qual.get('groundedness_mean', 0):.3f} ± {qual.get('groundedness_std', 0):.3f}
- **Rouge-L**: {qual.get('rouge_l_mean', 0):.3f} ± {qual.get('rouge_l_std', 0):.3f}
- **chrF++**: {qual.get('chrf_mean', 0):.3f} ± {qual.get('chrf_std', 0):.3f}
"""

    md_content += "\n### 系統效能\n"
    if perf:
        md_content += f"""
- **最佳模型**: {perf.get('best_model_name', 'N/A')}
- **輸出速度**: {perf.get('best_tokens_per_sec', 0):.1f} tokens/sec
- **平均延遲**: {perf.get('best_latency_ms', 0):.1f} ms
- **VRAM 峰值**: {perf.get('best_vram_peak_gb', 0):.1f} GB
"""

    # Threshold-based recommendations: Recall@5 < 0.6, groundedness < 0.7,
    # throughput < 20 tokens/sec. Missing values count as 0 and therefore
    # trigger the "needs improvement" wording.
    retrieval_advice = (
        "召回率偏低，建議調整 chunk 策略"
        if ret.get("recall_at_5_mean", 0) < 0.6
        else "檢索表現良好"
    )
    quality_advice = (
        "需要改善答案真實性"
        if qual.get("groundedness_mean", 0) < 0.7
        else "答案品質達標"
    )
    perf_advice = (
        "建議使用量化或更高效模型"
        if perf.get("best_tokens_per_sec", 0) < 20
        else "效能表現佳"
    )

    md_content += f"""

## 📈 趨勢分析

> 建議檢視 `outs/charts/` 目錄下的視覺化圖表以獲得更詳細的趨勢分析。

## 🎯 改進建議

根據當前評估結果：

1. **檢索優化**: {retrieval_advice}
2. **品質提升**: {quality_advice}
3. **效能調優**: {perf_advice}

---
*本報表由 nb64 自動生成 | ragent-text-lab*
"""

    # Create the destination directory so a custom output_path works without
    # relying on an earlier cell having created it.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(md_content)

    print(f"📄 排行榜報表已生成: {output_path}")
    return md_content

In [None]:
# Cell 9: 視覺化圖表生成
def generate_charts(leaderboard_data, charts_dir="outs/charts"):
    """Render a two-panel summary figure and save it as ``metrics_summary.png``.

    Left panel: bar chart of Recall@5, groundedness (each only when that
    dimension has data) and the composite score. Right panel: a throughput
    line, drawn only when performance data is present.

    Args:
        leaderboard_data: dict with "composite_score" and the aggregate dicts
            "retrieval", "quality" and "performance".
        charts_dir: output directory; created (including parents) if missing.

    Returns:
        Path of the written PNG, as a string.
    """
    charts_path = Path(charts_dir)
    # parents=True so a nested / not-yet-created charts_dir does not fail.
    charts_path.mkdir(parents=True, exist_ok=True)

    # Font fallback list that includes CJK-capable fonts.
    # NOTE: this changes matplotlib's global rcParams for the whole session.
    plt.rcParams["font.sans-serif"] = ["SimHei", "Arial Unicode MS", "DejaVu Sans"]
    plt.rcParams["axes.unicode_minus"] = False

    # Figure: key-metric bar chart (left) + throughput line (right)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Collect the bars to draw; the composite score is always present.
    metrics = []
    values = []

    if leaderboard_data["retrieval"]:
        metrics.append("Recall@5")
        values.append(leaderboard_data["retrieval"].get("recall_at_5_mean", 0))

    if leaderboard_data["quality"]:
        metrics.append("Groundedness")
        values.append(leaderboard_data["quality"].get("groundedness_mean", 0))

    metrics.append("Composite Score")
    values.append(leaderboard_data["composite_score"])

    bars = ax1.bar(metrics, values, color=["#3498db", "#e74c3c", "#f39c12"])
    ax1.set_title("Key Metrics Summary", fontsize=14, fontweight="bold")
    ax1.set_ylabel("Score")
    ax1.set_ylim(0, 1)

    # Add value labels on bars
    for bar, value in zip(bars, values):
        height = bar.get_height()
        ax1.text(
            bar.get_x() + bar.get_width() / 2.0,
            height + 0.01,
            f"{value:.3f}",
            ha="center",
            va="bottom",
        )

    # Performance timeline.
    # NOTE(review): the first two points (10, 15) are hard-coded demo values,
    # not measurements; only the last point comes from the leaderboard data.
    if leaderboard_data["performance"]:
        perf_data = leaderboard_data["performance"]
        ax2.plot(
            [1, 2, 3],
            [10, 15, perf_data.get("best_tokens_per_sec", 20)],
            marker="o",
            linewidth=2,
            markersize=6,
        )
        ax2.set_title("Performance Trend", fontsize=14, fontweight="bold")
        ax2.set_xlabel("Evaluation Run")
        ax2.set_ylabel("Tokens/sec")
        ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    chart_path = charts_path / "metrics_summary.png"
    # Save and close this specific figure rather than whatever pyplot
    # currently considers "current".
    fig.savefig(chart_path, dpi=150, bbox_inches="tight")
    plt.close(fig)

    print(f"📊 圖表已生成: {chart_path}")

    return str(chart_path)

In [None]:
# Cell 10: 主執行流程與 Smoke Test
def run_eval_reports():
    """Run the whole reporting pipeline and return the leaderboard dict.

    Steps: load the four metric files through an ``EvalReportBuilder``
    (substituting built-in sample data for any family that ends up empty),
    build the leaderboard, write the Markdown report and the chart, then dump
    the leaderboard to ``outs/reports/leaderboard.json``.
    """
    print("🚀 開始生成評估報表...")

    builder = EvalReportBuilder()

    # --- Demo fallbacks, used only when a metric family has no data ---------
    retrieval_rows = [(0.75, 0.85, 0.72), (0.68, 0.78, 0.69), (0.72, 0.82, 0.71)]
    sample_retrieval = {
        "results": [
            {
                "metrics": {
                    "recall_at_5": r5,
                    "recall_at_10": r10,
                    "ndcg_at_10": ndcg,
                }
            }
            for r5, r10, ndcg in retrieval_rows
        ]
    }

    sample_groundedness = {
        "results": [{"groundedness_score": score} for score in (0.82, 0.79, 0.85)]
    }

    sample_quality = {
        "results": [
            {"rouge_l": rouge, "chrf": chrf}
            for rouge, chrf in ((0.65, 0.58), (0.62, 0.55), (0.68, 0.61))
        ]
    }

    sample_performance = {
        "models": [
            {
                "model_name": "Qwen2.5-7B-Instruct",
                "tokens_per_second": 28.5,
                "avg_latency_ms": 85.2,
                "vram_peak_gb": 8.4,
            }
        ]
    }

    # (metrics key, loader, fallback sample, notice printed when falling back)
    load_plan = (
        ("retrieval", builder.load_retrieval_metrics, sample_retrieval, "📝 使用範例檢索資料"),
        (
            "groundedness",
            builder.load_groundedness_metrics,
            sample_groundedness,
            "📝 使用範例 Groundedness 資料",
        ),
        ("quality", builder.load_quality_metrics, sample_quality, "📝 使用範例品質資料"),
        (
            "performance",
            builder.load_performance_metrics,
            sample_performance,
            "📝 使用範例效能資料",
        ),
    )
    for key, loader, sample, notice in load_plan:
        loader()
        if not builder.metrics.get(key):
            builder.metrics[key] = sample
            print(notice)

    leaderboard = generate_leaderboard(builder)

    # Emit the human-readable artefacts; their return values are not needed here.
    generate_markdown_report(leaderboard)
    generate_charts(leaderboard)

    # Persist the raw leaderboard next to the Markdown report.
    json_path = "outs/reports/leaderboard.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(leaderboard, f, ensure_ascii=False, indent=2)

    print(f"💾 排行榜資料已保存: {json_path}")

    return leaderboard


# Smoke Test
if __name__ == "__main__":
    leaderboard_result = run_eval_reports()

    # The composite must be positive and all three sections must be present.
    assert leaderboard_result["composite_score"] > 0, "綜合分數應大於 0"
    for _required_key, _failure_msg in (
        ("retrieval", "應包含檢索指標"),
        ("quality", "應包含品質指標"),
        ("performance", "應包含效能指標"),
    ):
        assert _required_key in leaderboard_result, _failure_msg

    for _line in (
        "✅ Smoke Test 通過",
        f"🎯 綜合分數: {leaderboard_result['composite_score']:.3f}",
        "📋 報表生成完成，請檢查 outs/reports/ 目錄",
    ):
        print(_line)

In [None]:
# Per-dimension weights and report-format switches.
# NOTE(review): nothing visible in this notebook reads these two dicts — confirm
# whether they are meant to be wired into the report functions.
weights = dict(
    retrieval=0.4,  # retrieval effectiveness
    quality=0.4,  # answer quality
    performance=0.2,  # system performance
)

report_formats = dict(
    markdown=True,  # write the Markdown report
    json=True,  # save the JSON data
    charts=True,  # render the charts
    csv=False,  # CSV export (optional)
)

In [None]:
# 快速驗證腳本
def smoke_test_reports():
    """Quick check: build a leaderboard from minimal in-memory data.

    Only retrieval and performance payloads are supplied; the function asserts
    that the composite score is positive and that a timestamp is returned.
    """
    builder = EvalReportBuilder()

    # Inject minimal payloads directly instead of reading files.
    builder.metrics.update(
        retrieval={"results": [{"metrics": {"recall_at_5": 0.7}}]},
        performance={"models": [{"model_name": "test", "tokens_per_second": 25}]},
    )

    leaderboard = generate_leaderboard(builder)

    assert leaderboard["composite_score"] > 0
    assert "timestamp" in leaderboard

    print("✅ 報表系統驗證通過")


# Run the quick verification immediately when this cell executes.
smoke_test_reports()