In [None]:
# nb39_eval_agentic_tasks.ipynb
# 多代理任務評估基準

# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this block into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Environment variable -> cache directory, all rooted at AI_CACHE_ROOT.
cache_env_dirs = {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}
for env_name in cache_env_dirs:
    cache_dir = cache_env_dirs[env_name]
    # Export the location first, then make sure the directory exists.
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

# One-line status: the cache root in use and torch.cuda.is_available().
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Import and Setup
import json
import time
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Mock imports - replace with actual modules from previous notebooks
from shared_utils.agents.orchestrator import Orchestrator
from shared_utils.agents.roles import Researcher, Planner, Writer, Reviewer
from shared_utils.agents.blackboard import Blackboard
from shared_utils.rag.retriever import Retriever
from shared_utils.metrics.text_eval import calculate_rouge, calculate_consistency
from shared_utils.adapters.llm_adapter import LLMAdapter

# Create the output directory (relative to the current working directory).
# generate_evaluation_report and create_evaluation_charts write their
# CSV / JSON / PNG files into it.
outs_dir = Path("outs/eval_agentic")
outs_dir.mkdir(parents=True, exist_ok=True)

# Status message confirming the setup cell finished.
print("評估環境初始化完成")

In [None]:
# Cell 3: Evaluation Task Definitions
@dataclass
class EvalTask:
    """Definition of one benchmark task for the agentic evaluation.

    ``query``, ``max_time_seconds`` and ``domain`` are forwarded to the
    orchestrator by ``evaluate_agentic_task``; ``expected_sections`` feeds
    ``calculate_section_coverage``; ``name``, ``complexity`` and ``domain``
    are copied into the report built by ``generate_evaluation_report``.
    """

    id: str  # task identifier, e.g. "task_001"; stored as AgenticsMetrics.task_id
    name: str  # human-readable title shown in progress output and the report
    query: str  # prompt text passed to the orchestrator
    domain: str  # topic tag; values used in this notebook: "tech", "edu", "general"
    complexity: str  # simple|medium|complex
    expected_sections: List[str]  # section keywords the final output should mention
    max_time_seconds: int = 300  # passed to the orchestrator as ``max_time``
    min_citations: int = 2  # NOTE(review): not read by any code visible in this notebook


# Standard evaluation tasks: one per complexity level (simple, medium,
# complex). The ids are what AgenticsMetrics.task_id refers to. The task
# names, queries and section keywords are Traditional Chinese runtime data
# and must stay as written.
EVAL_TASKS = [
    EvalTask(
        id="task_001",
        name="RAG技術簡介",
        query="請解釋什麼是RAG（檢索增強生成），並說明其核心優勢和應用場景",
        domain="tech",
        complexity="simple",
        expected_sections=["定義", "核心組件", "優勢", "應用"],
        max_time_seconds=180,
        min_citations=3,
    ),
    EvalTask(
        id="task_002",
        name="多模態AI發展分析",
        query="分析當前多模態AI的發展趨勢，比較主要技術路線，並預測未來發展方向",
        domain="tech",
        complexity="medium",
        expected_sections=["現狀分析", "技術比較", "發展趨勢", "未來預測"],
        max_time_seconds=240,
        min_citations=5,
    ),
    EvalTask(
        id="task_003",
        name="教育AI倫理考量",
        query="深入探討AI在教育領域應用的倫理議題，包括隱私保護、公平性、透明度等，並提出具體建議",
        domain="edu",
        complexity="complex",
        expected_sections=["背景", "主要倫理議題", "案例分析", "解決方案", "實施建議"],
        max_time_seconds=300,
        min_citations=7,
    ),
]

# Report how many tasks were defined.
print(f"定義了 {len(EVAL_TASKS)} 個評估任務")

In [None]:
# Cell 4: Metrics Calculation Framework
@dataclass
class AgenticsMetrics:
    """Scorecard for a single agentic task run."""

    task_id: str
    completion_rate: float  # fraction of the task completed, 0-1
    citation_accuracy: float  # citation quality score, 0-1
    content_consistency: float  # internal consistency score, 0-1
    section_coverage: float  # fraction of expected sections covered, 0-1
    execution_time: float  # wall time of the run in seconds
    total_tokens: int
    error_count: int
    retry_count: int

    def overall_score(self) -> float:
        """Return the weighted sum of the quality metrics and a speed term.

        Weights: completion 0.25, citation 0.20, consistency 0.20,
        coverage 0.20, efficiency 0.15.
        """
        # Speed term: 180 s (3 min) is the baseline. The run time is floored
        # at 30 s before dividing and the ratio is capped at 1.0, so any run
        # at or under the baseline gets full marks.
        floored_time = max(self.execution_time, 30)
        efficiency = min(1.0, 180.0 / floored_time)

        weighted_parts = (
            (0.25, self.completion_rate),
            (0.20, self.citation_accuracy),
            (0.20, self.content_consistency),
            (0.20, self.section_coverage),
            (0.15, efficiency),
        )

        # Accumulate left to right in the order listed above.
        score = 0.0
        for weight, value in weighted_parts:
            score += weight * value
        return score


def calculate_citation_accuracy(content: str, citations: List[str]) -> float:
    """Return the fraction of citations whose ``[n]`` marker appears in content.

    Citations are numbered from 1 in list order; citation ``n`` counts as
    referenced when the literal text ``[n]`` occurs anywhere in ``content``.
    An empty citation list scores 0.0.
    """
    if not citations:
        return 0.0

    total = len(citations)
    # Presence check only: the citation text itself is never inspected.
    referenced = sum(1 for number in range(1, total + 1) if f"[{number}]" in content)
    return referenced / total


def calculate_section_coverage(content: str, expected_sections: List[str]) -> float:
    """Return the fraction of expected sections mentioned in content.

    Matching is case-insensitive substring search. A section name is split
    on whitespace and counts as covered when at least one of its words
    occurs in ``content``. With no expected sections the result is 1.0.
    """
    haystack = content.lower()
    if not expected_sections:
        return 1.0

    # Plain keyword matching; semantic similarity would be a stronger check.
    hits = sum(
        1
        for title in expected_sections
        if any(word in haystack for word in title.lower().split())
    )
    return hits / len(expected_sections)


def evaluate_agentic_task(
    orchestrator: Orchestrator, task: EvalTask
) -> AgenticsMetrics:
    """Run one task through the orchestrator and score the outcome.

    Args:
        orchestrator: Object whose ``execute_task(query=..., max_time=...,
            domain=...)`` result is read with ``.get`` for the keys
            ``final_output``, ``citations``, ``total_tokens``,
            ``error_count`` and ``retry_count``.
        task: Task definition supplying the query, time budget, domain and
            expected sections.

    Returns:
        An ``AgenticsMetrics`` for ``task.id``. If the run or the scoring
        raises, the exception is printed and an all-zero record with
        ``error_count=1`` is returned instead of propagating.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted during a long run.
    start_time = time.perf_counter()

    try:
        # Execute task through orchestrator
        result = orchestrator.execute_task(
            query=task.query, max_time=task.max_time_seconds, domain=task.domain
        )

        execution_time = time.perf_counter() - start_time

        # Extract results, defaulting any missing key
        final_content = result.get("final_output", "")
        citations = result.get("citations", [])
        total_tokens = result.get("total_tokens", 0)
        error_count = result.get("error_count", 0)
        retry_count = result.get("retry_count", 0)

        # Length heuristic: 200 or more characters counts as fully complete,
        # shorter output scores proportionally.
        completion_rate = min(1.0, len(final_content) / 200.0)
        citation_accuracy = calculate_citation_accuracy(final_content, citations)
        content_consistency = calculate_consistency(final_content)
        section_coverage = calculate_section_coverage(
            final_content, task.expected_sections
        )

        return AgenticsMetrics(
            task_id=task.id,
            completion_rate=completion_rate,
            citation_accuracy=citation_accuracy,
            content_consistency=content_consistency,
            section_coverage=section_coverage,
            execution_time=execution_time,
            total_tokens=total_tokens,
            error_count=error_count,
            retry_count=retry_count,
        )

    except Exception as e:
        # Deliberate best-effort: one failing task must not abort the whole
        # evaluation, so it is reported and recorded as a zero-score run.
        execution_time = time.perf_counter() - start_time
        print(f"Task {task.id} failed: {e}")

        return AgenticsMetrics(
            task_id=task.id,
            completion_rate=0.0,
            citation_accuracy=0.0,
            content_consistency=0.0,
            section_coverage=0.0,
            execution_time=execution_time,
            total_tokens=0,
            error_count=1,
            retry_count=0,
        )


# Status message confirming the metrics cell finished defining its helpers.
print("評估指標框架建立完成")

In [None]:
# Cell 5: Execute Evaluation and Collect Results
def run_agentic_evaluation(tasks: List[EvalTask]) -> List[AgenticsMetrics]:
    """Evaluate every task in order and return one metrics record per task.

    Builds a single orchestrator, runs each task through
    ``evaluate_agentic_task`` and prints a short per-task summary.
    """
    # Mock wiring - adjust to the actual implementation. The retriever is
    # left as None until real RAG components are plugged in.
    orchestrator = Orchestrator(
        llm_adapter=LLMAdapter(
            model_id="Qwen/Qwen2.5-7B-Instruct",
            backend="transformers",
            device_map="auto",
            torch_dtype="auto",
        ),
        retriever=None,
        config={"max_iterations": 5, "timeout_seconds": 300, "retry_attempts": 3},
    )

    collected: List[AgenticsMetrics] = []

    print("開始執行代理任務評估...")
    for current_task in tasks:
        print(f"\n執行任務: {current_task.name} ({current_task.complexity})")

        outcome = evaluate_agentic_task(orchestrator, current_task)
        collected.append(outcome)

        # Per-task console summary, one line per metric.
        summary_lines = (
            f"完成度: {outcome.completion_rate:.2f}",
            f"引用準確性: {outcome.citation_accuracy:.2f}",
            f"內容一致性: {outcome.content_consistency:.2f}",
            f"章節覆蓋度: {outcome.section_coverage:.2f}",
            f"總分: {outcome.overall_score():.2f}",
            f"執行時間: {outcome.execution_time:.1f}s",
        )
        for line in summary_lines:
            print(line)

    return collected


# The real evaluation is disabled for the demo; uncomment the next line to
# run it (it constructs the LLM adapter and orchestrator).
# eval_results = run_agentic_evaluation(EVAL_TASKS)

# Hand-written demo metrics, one record per entry in EVAL_TASKS (matching
# task ids, same order). These are illustrative values, not measurements.
demo_results = [
    AgenticsMetrics(
        task_id="task_001",
        completion_rate=0.85,
        citation_accuracy=0.75,
        content_consistency=0.80,
        section_coverage=0.90,
        execution_time=156.3,
        total_tokens=1247,
        error_count=0,
        retry_count=1,
    ),
    AgenticsMetrics(
        task_id="task_002",
        completion_rate=0.78,
        citation_accuracy=0.82,
        content_consistency=0.75,
        section_coverage=0.85,
        execution_time=198.7,
        total_tokens=1856,
        error_count=1,
        retry_count=2,
    ),
    AgenticsMetrics(
        task_id="task_003",
        completion_rate=0.72,
        citation_accuracy=0.68,
        content_consistency=0.71,
        section_coverage=0.80,
        execution_time=245.2,
        total_tokens=2134,
        error_count=0,
        retry_count=1,
    ),
]

# Downstream cells read eval_results, so the demo data stands in for a real run.
eval_results = demo_results
print(f"\n評估完成，共收集 {len(eval_results)} 個結果")

In [None]:
# Cell 6: Report Generation and Visualization
def generate_evaluation_report(
    results: List[AgenticsMetrics], tasks: List[EvalTask]
) -> pd.DataFrame:
    """Build the per-task report table and save it with a JSON summary.

    Each result is paired with the task whose ``id`` equals the result's
    ``task_id``. When no task has that id, the task at the same list
    position is used; a result with neither is left out of the report.

    Args:
        results: Metrics records to report on.
        tasks: Task definitions supplying name, complexity and domain.

    Returns:
        A DataFrame with one row per paired result: every ``AgenticsMetrics``
        field plus ``task_name``, ``complexity``, ``domain``,
        ``overall_score``, ``tokens_per_second`` and ``success_rate``.

    Side effects:
        Writes ``agentic_eval_results_<timestamp>.csv`` and
        ``agentic_eval_summary_<timestamp>.json`` into ``outs_dir`` and
        prints both paths.
    """
    tasks = list(tasks)
    # Pair by id rather than by position so reordered or filtered results
    # still pick up the metadata of the task they belong to.
    tasks_by_id = {task.id: task for task in tasks}

    df_data = []
    for position, result in enumerate(results):
        task = tasks_by_id.get(result.task_id)
        if task is None:
            if position >= len(tasks):
                continue  # no task metadata available for this result
            task = tasks[position]

        row = asdict(result)
        row.update(
            {
                "task_name": task.name,
                "complexity": task.complexity,
                "domain": task.domain,
                "overall_score": result.overall_score(),
                # Floor the duration at 1 s to avoid dividing by ~0.
                "tokens_per_second": result.total_tokens
                / max(result.execution_time, 1),
                "success_rate": 1.0 if result.error_count == 0 else 0.0,
            }
        )
        df_data.append(row)

    df = pd.DataFrame(df_data)

    # Save detailed results; one timestamp is shared by both output files.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = outs_dir / f"agentic_eval_results_{timestamp}.csv"
    df.to_csv(results_file, index=False, encoding="utf-8")

    # pandas reductions return NumPy scalars; the integer sum in particular
    # (numpy.int64) is rejected by json.dump, so convert everything to
    # built-in Python numbers before serialising.
    summary_stats = {
        "avg_overall_score": float(df["overall_score"].mean()),
        "avg_completion_rate": float(df["completion_rate"].mean()),
        "avg_citation_accuracy": float(df["citation_accuracy"].mean()),
        "avg_content_consistency": float(df["content_consistency"].mean()),
        "avg_section_coverage": float(df["section_coverage"].mean()),
        "avg_execution_time": float(df["execution_time"].mean()),
        "success_rate": float(df["success_rate"].mean()),
        "total_tokens": int(df["total_tokens"].sum()),
        "avg_tokens_per_second": float(df["tokens_per_second"].mean()),
    }

    # Save summary
    summary_file = outs_dir / f"agentic_eval_summary_{timestamp}.json"
    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(summary_stats, f, indent=2, ensure_ascii=False)

    print(f"評估報表已保存: {results_file}")
    print(f"摘要統計已保存: {summary_file}")

    return df


def create_evaluation_charts(df: pd.DataFrame):
    """Draw a 2x2 overview figure for the report table, save it and show it.

    Panels: overall score per complexity label, the four quality metrics per
    task, execution time against overall score, and tokens per second per
    task. The figure is written as a 300-dpi PNG into ``outs_dir``.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle("多代理任務評估結果", fontsize=16)
    (ax_overall, ax_metrics), (ax_scatter, ax_tokens) = axes

    task_count = len(df)
    task_labels = [f"T{i+1}" for i in range(task_count)]

    # Top-left: overall score, one bar per row placed at its complexity label
    ax_overall.bar(
        df["complexity"],
        df["overall_score"],
        color=["lightblue", "lightgreen", "lightcoral"],
    )
    ax_overall.set_title("整體分數 vs 任務複雜度")
    ax_overall.set_ylabel("整體分數")
    ax_overall.set_ylim(0, 1)

    # Top-right: grouped bars, one group per task and one bar per metric
    metric_series = (
        ("completion_rate", "完成度"),
        ("citation_accuracy", "引用準確性"),
        ("content_consistency", "內容一致性"),
        ("section_coverage", "章節覆蓋度"),
    )
    group_positions = np.arange(task_count)
    bar_width = 0.2
    for offset, (column, legend_label) in enumerate(metric_series):
        ax_metrics.bar(
            group_positions + offset * bar_width,
            df[column],
            bar_width,
            label=legend_label,
            alpha=0.8,
        )

    ax_metrics.set_title("各項指標表現")
    ax_metrics.set_ylabel("分數")
    ax_metrics.set_xlabel("任務")
    # Centre each tick under its group of four bars.
    ax_metrics.set_xticks(group_positions + bar_width * 1.5)
    ax_metrics.set_xticklabels(task_labels)
    ax_metrics.legend()
    ax_metrics.set_ylim(0, 1)

    # Bottom-left: execution time vs overall score, coloured by complexity
    complexity_palette = {"simple": "green", "medium": "orange", "complex": "red"}
    for level, colour in complexity_palette.items():
        rows = df["complexity"] == level
        if not rows.any():
            continue
        ax_scatter.scatter(
            df.loc[rows, "execution_time"],
            df.loc[rows, "overall_score"],
            c=colour,
            label=level,
            s=100,
            alpha=0.7,
        )

    ax_scatter.set_xlabel("執行時間 (秒)")
    ax_scatter.set_ylabel("整體分數")
    ax_scatter.set_title("執行時間 vs 整體分數")
    ax_scatter.legend()

    # Bottom-right: token throughput per task
    ax_tokens.bar(range(task_count), df["tokens_per_second"], color="skyblue")
    ax_tokens.set_title("Token 生成效率")
    ax_tokens.set_ylabel("Tokens/秒")
    ax_tokens.set_xlabel("任務")
    ax_tokens.set_xticks(range(task_count))
    ax_tokens.set_xticklabels(task_labels)

    plt.tight_layout()

    # Save before showing so the PNG is written from the fully laid-out figure.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    chart_file = outs_dir / f"agentic_eval_charts_{stamp}.png"
    plt.savefig(chart_file, dpi=300, bbox_inches="tight")
    plt.show()

    print(f"評估圖表已保存: {chart_file}")


# Build the report table (also writes the CSV and JSON summary), then chart it
df_results = generate_evaluation_report(eval_results, EVAL_TASKS)
create_evaluation_charts(df_results)

# Console summary: the mean of each headline column, one line per metric
print(
    "\n=== 評估摘要 ===",
    f"平均整體分數: {df_results['overall_score'].mean():.3f}",
    f"平均完成度: {df_results['completion_rate'].mean():.3f}",
    f"平均引用準確性: {df_results['citation_accuracy'].mean():.3f}",
    f"平均內容一致性: {df_results['content_consistency'].mean():.3f}",
    f"平均章節覆蓋度: {df_results['section_coverage'].mean():.3f}",
    f"平均執行時間: {df_results['execution_time'].mean():.1f} 秒",
    f"成功率: {df_results['success_rate'].mean():.3f}",
    sep="\n",
)

In [None]:
# Cell 7: Smoke Test
def smoke_test_agentic_eval():
    """Run five quick assertions over the evaluation helpers.

    Covers task construction, the overall score, citation accuracy, section
    coverage and report generation. The last check calls
    ``generate_evaluation_report``, so it writes a CSV and a JSON file into
    the output directory. Returns True when every assertion passes.
    """

    print("=== 代理任務評估系統煙霧測試 ===")

    # Test 1: a task can be built and keeps its field values
    sample_task = EvalTask(
        id="smoke_001",
        name="測試任務",
        query="簡單測試查詢",
        domain="general",
        complexity="simple",
        expected_sections=["介紹", "結論"],
        max_time_seconds=60,
    )

    assert sample_task.id == "smoke_001"
    assert sample_task.complexity == "simple"
    print("✓ 任務定義測試通過")

    # Test 2: the overall score stays in range for plausible metrics
    sample_metrics = AgenticsMetrics(
        task_id="smoke_001",
        completion_rate=0.8,
        citation_accuracy=0.7,
        content_consistency=0.75,
        section_coverage=0.85,
        execution_time=45.0,
        total_tokens=500,
        error_count=0,
        retry_count=0,
    )

    overall_score = sample_metrics.overall_score()
    assert 0 <= overall_score <= 1
    assert overall_score > 0.5  # Should be reasonable score
    print(f"✓ 指標計算測試通過 (總分: {overall_score:.3f})")

    # Test 3: both citation markers are present in the text
    cited_text = "這是測試內容 [1]，包含引用 [2]。"
    cited_sources = ["來源1", "來源2"]
    citation_acc = calculate_citation_accuracy(cited_text, cited_sources)
    assert citation_acc == 1.0  # Both citations referenced
    print(f"✓ 引用準確性計算測試通過 (準確性: {citation_acc:.3f})")

    # Test 4: both section keywords occur in the text
    sectioned_text = "這是介紹部分的內容。最後我們得出結論。"
    wanted_sections = ["介紹", "結論"]
    coverage = calculate_section_coverage(sectioned_text, wanted_sections)
    assert coverage == 1.0  # Both sections covered
    print(f"✓ 章節覆蓋度計算測試通過 (覆蓋度: {coverage:.3f})")

    # Test 5: a one-row report is produced and carries the score column
    report_df = generate_evaluation_report([sample_metrics], [sample_task])
    assert len(report_df) == 1
    assert "overall_score" in report_df.columns
    print("✓ 報表生成測試通過")

    print("\n🎉 所有煙霧測試通過！代理任務評估系統運作正常。")

    return True


# Run the smoke test when the cell executes. A failed assertion raises here;
# otherwise smoke_test_result is True.
smoke_test_result = smoke_test_agentic_eval()