In [None]:
# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Point each cache-related environment variable at a sub-directory of
# AI_CACHE_ROOT and create that directory if it does not exist yet.
for k, v in {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}.items():
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)
# Report the cache root in use and whether torch can see a CUDA device.
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: GoT 核心概念與 DAG 結構
from dataclasses import dataclass, field
from typing import Dict, List, Set, Any, Optional, Union, Callable
from enum import Enum
import asyncio
import json
from collections import defaultdict, deque


class NodeType(Enum):
    """Kinds of node a DAG can contain."""

    TASK = "task"  # runs a concrete task
    CONDITION = "condition"  # conditional branch decision
    AGGREGATION = "agg"  # aggregates results
    PARALLEL = "parallel"  # parallel execution group


class NodeStatus(Enum):
    """Execution states stored in ``DAGNode.status``."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"


@dataclass
class DAGNode:
    """Definition of one DAG node together with its mutable execution state."""

    node_id: str
    node_type: NodeType
    task_description: str
    dependencies: Set[str] = field(default_factory=set)  # ids of prerequisite nodes
    agent_role: Optional[str] = None  # executing role (researcher/planner/writer/reviewer)
    condition_func: Optional[Callable] = None  # decision function of a condition node
    aggregation_func: Optional[Callable] = None  # merge function of an aggregation node
    metadata: Dict[str, Any] = field(default_factory=dict)

    # Execution state
    status: NodeStatus = NodeStatus.PENDING
    result: Any = None
    error: Optional[str] = None
    start_time: Optional[float] = None
    end_time: Optional[float] = None


@dataclass
class DAGExecutionContext:
    """Execution context of a DAG: its nodes, edges and accumulated outputs."""

    # node id -> node definition
    nodes: Dict[str, DAGNode] = field(default_factory=dict)
    # defaults to a defaultdict(set), so indexing an unknown key creates an empty set
    edges: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set))
    # presumably node id -> result of that node; verify against the executor
    results: Dict[str, Any] = field(default_factory=dict)
    blackboard: Dict[str, Any] = field(default_factory=dict)
    # NOTE(review): no visible code in this file appends to execution_log — confirm intended use
    execution_log: List[Dict] = field(default_factory=list)


# Confirmation message: the GoT core structures have been defined.
print("✓ GoT 核心結構定義完成")

In [None]:
# Cell 3: DAG 節點類型設計
class DAGBuilder:
    """Fluent builder that assembles a ``DAGExecutionContext`` node by node.

    Each ``add_*`` method registers one node, records a
    ``dependency -> dependent`` edge for every entry in ``dependencies`` and
    returns ``self`` so calls can be chained. ``build()`` validates the graph
    and returns the populated context.
    """

    def __init__(self):
        self.context = DAGExecutionContext()

    def _register_node(self, node: DAGNode) -> "DAGBuilder":
        """Store ``node`` in the context and record its incoming edges.

        Re-using an existing node id replaces the old node. The edges that
        were created for the old node's dependencies are removed first, so
        ``context.edges`` never disagrees with ``node.dependencies`` (stale
        edges would make the in-degree bookkeeping of a topological sort
        wrong).
        """
        previous = self.context.nodes.get(node.node_id)
        if previous is not None:
            for dep in previous.dependencies:
                dependents = self.context.edges.get(dep)
                if dependents is not None:
                    dependents.discard(node.node_id)

        self.context.nodes[node.node_id] = node
        for dep in node.dependencies:
            self.context.edges[dep].add(node.node_id)
        return self

    def add_task_node(
        self,
        node_id: str,
        description: str,
        agent_role: str,
        dependencies: Optional[List[str]] = None,
        **metadata,
    ) -> "DAGBuilder":
        """Add a task node executed by ``agent_role``.

        Args:
            node_id: Identifier of the node.
            description: Task description stored on the node.
            agent_role: Role name stored on the node as ``agent_role``.
            dependencies: Ids of prerequisite nodes; ``None`` means none.
            **metadata: Extra key/values stored in ``node.metadata``.

        Returns:
            This builder, for chaining.
        """
        return self._register_node(
            DAGNode(
                node_id=node_id,
                node_type=NodeType.TASK,
                task_description=description,
                agent_role=agent_role,
                dependencies=set(dependencies or []),
                metadata=metadata,
            )
        )

    def add_condition_node(
        self,
        node_id: str,
        description: str,
        condition_func: Callable,
        dependencies: Optional[List[str]] = None,
        **metadata,
    ) -> "DAGBuilder":
        """Add a condition node whose decision is computed by ``condition_func``.

        Args:
            node_id: Identifier of the node.
            description: Description stored on the node.
            condition_func: Callable stored on the node as ``condition_func``.
            dependencies: Ids of prerequisite nodes; ``None`` means none.
            **metadata: Extra key/values stored in ``node.metadata``.

        Returns:
            This builder, for chaining.
        """
        return self._register_node(
            DAGNode(
                node_id=node_id,
                node_type=NodeType.CONDITION,
                task_description=description,
                condition_func=condition_func,
                dependencies=set(dependencies or []),
                metadata=metadata,
            )
        )

    def add_aggregation_node(
        self,
        node_id: str,
        description: str,
        aggregation_func: Callable,
        dependencies: List[str],
        **metadata,
    ) -> "DAGBuilder":
        """Add an aggregation node that merges results via ``aggregation_func``.

        Args:
            node_id: Identifier of the node.
            description: Description stored on the node.
            aggregation_func: Callable stored on the node as ``aggregation_func``.
            dependencies: Ids of the nodes whose results are aggregated.
                ``None`` is tolerated and treated as no dependencies, matching
                the other ``add_*`` methods.
            **metadata: Extra key/values stored in ``node.metadata``.

        Returns:
            This builder, for chaining.
        """
        return self._register_node(
            DAGNode(
                node_id=node_id,
                node_type=NodeType.AGGREGATION,
                task_description=description,
                aggregation_func=aggregation_func,
                dependencies=set(dependencies or []),
                metadata=metadata,
            )
        )

    def build(self) -> DAGExecutionContext:
        """Validate the DAG and return its execution context.

        Raises:
            ValueError: If the graph contains a cycle or a node depends on an
                id that was never added.
        """
        self._validate_dag()
        return self.context

    def _validate_dag(self):
        """Check that the graph is acyclic and every dependency exists.

        Raises:
            ValueError: On a cycle, or on a dependency id with no node.
        """
        # Cycle check first, then dependency existence (original order kept).
        if self._has_cycle():
            raise ValueError("DAG contains cycles")

        for node_id, node in self.context.nodes.items():
            for dep in node.dependencies:
                if dep not in self.context.nodes:
                    raise ValueError(
                        f"Node {node_id} depends on non-existent node {dep}"
                    )

    def _has_cycle(self) -> bool:
        """Return True if following ``context.edges`` leads back to a node.

        Uses an iterative depth-first search with an explicit stack, so very
        long dependency chains cannot exhaust Python's recursion limit.
        """
        in_progress, done = 1, 2
        state = {}

        for root in self.context.nodes:
            if root in state:
                continue

            state[root] = in_progress
            stack = [(root, iter(self.context.edges.get(root, ())))]

            while stack:
                current, neighbors = stack[-1]
                for neighbor in neighbors:
                    neighbor_state = state.get(neighbor)
                    if neighbor_state == in_progress:
                        # Back edge to a node still on the DFS path: a cycle.
                        return True
                    if neighbor_state is None:
                        state[neighbor] = in_progress
                        stack.append(
                            (neighbor, iter(self.context.edges.get(neighbor, ())))
                        )
                        break
                else:
                    # Every neighbor explored: this node is finished.
                    state[current] = done
                    stack.pop()

        return False


# 範例：建構簡單研究 DAG
def create_research_dag():
    """Build the sample research-report DAG.

    The graph runs an initial research task, optionally a deep-research task
    gated by a condition node, an expert-interview task, an aggregation of
    the research results, and then outline, draft and review tasks in
    sequence.

    Returns:
        The validated ``DAGExecutionContext`` produced by ``DAGBuilder.build()``.
    """

    def needs_deep_research(context):
        # Deep research is needed when the initial research is shorter than 500 characters.
        return len(context.results.get("initial_research", "")) < 500

    def aggregate_research(context):
        # Join whichever research results are present, in a fixed order.
        source_keys = ("initial_research", "deep_research", "expert_interview")
        return "\n\n".join(
            context.results[key] for key in source_keys if key in context.results
        )

    builder = DAGBuilder()

    # Research stage
    builder.add_task_node("initial_research", "進行初步主題研究", "researcher")
    builder.add_condition_node(
        "check_depth", "檢查是否需要深度研究", needs_deep_research, ["initial_research"]
    )
    builder.add_task_node(
        "deep_research", "深度專題研究", "researcher", ["check_depth"], condition_value=True
    )
    builder.add_task_node(
        "expert_interview", "專家訪談摘要", "researcher", ["initial_research"]
    )
    builder.add_aggregation_node(
        "research_summary",
        "研究結果聚合",
        aggregate_research,
        ["initial_research", "deep_research", "expert_interview"],
    )

    # Planning, writing and review stage
    builder.add_task_node("outline_plan", "建立報告大綱", "planner", ["research_summary"])
    builder.add_task_node("draft_writing", "撰寫初稿", "writer", ["outline_plan"])
    builder.add_task_node("review_check", "內容審查", "reviewer", ["draft_writing"])

    return builder.build()


# Test DAG construction: build the sample DAG and list its nodes
test_dag = create_research_dag()
print(f"✓ DAG 建構完成，共 {len(test_dag.nodes)} 個節點")
print(f"  節點：{list(test_dag.nodes.keys())}")

In [None]:
# Cell 4: DAG 執行引擎實作
import time
import asyncio
from concurrent.futures import ThreadPoolExecutor


class DAGExecutor:
    """Executes a DAG level by level, running each level's nodes concurrently.

    Nodes are ordered with Kahn's algorithm, grouped into dependency levels,
    and every level is awaited with ``asyncio.gather``. A node failure is
    recorded on the node (``status``/``error``) instead of aborting the run.
    """

    def __init__(self, agent_registry: Optional[Dict[str, Any]] = None):
        """
        Args:
            agent_registry: Mapping of role name -> agent object; defaults to
                an empty registry.
        """
        self.agent_registry = agent_registry or {}
        # Nothing in this class submits work to the pool; it is kept as a
        # public attribute for backward compatibility.
        self.executor = ThreadPoolExecutor(max_workers=4)

    async def execute_dag(self, context: DAGExecutionContext) -> Dict[str, Any]:
        """Run every executable node of ``context`` and return its results.

        Args:
            context: The DAG to execute; node state and ``context.results``
                are updated in place.

        Returns:
            ``context.results`` (node id -> result of each completed node).
        """
        print("🚀 開始執行 DAG...")

        # Compute the topological order
        execution_order = self._topological_sort(context)
        print(f"📋 執行順序：{execution_order}")

        # Group into levels that can run in parallel
        execution_levels = self._group_by_level(context, execution_order)

        for level_idx, level_nodes in enumerate(execution_levels):
            print(f"\n🔄 執行第 {level_idx + 1} 層：{level_nodes}")

            # Launch every runnable node of this level concurrently
            tasks = []
            for node_id in level_nodes:
                if self._can_execute_node(context, node_id):
                    task = self._execute_node_async(context, node_id)
                    tasks.append(task)

            if tasks:
                # return_exceptions=True: one failing node must not cancel its
                # siblings; the failure is already recorded on the node.
                await asyncio.gather(*tasks, return_exceptions=True)

        print(f"\n✅ DAG 執行完成")
        return context.results

    def _topological_sort(self, context: DAGExecutionContext) -> List[str]:
        """Return the node ids in topological order (Kahn's algorithm).

        Nodes whose in-degree never reaches zero are left out of the result.
        """
        in_degree = defaultdict(int)

        # In-degree = number of declared dependencies
        for node_id in context.nodes:
            in_degree[node_id] = len(context.nodes[node_id].dependencies)

        queue = deque([node_id for node_id, degree in in_degree.items() if degree == 0])
        result = []

        while queue:
            node_id = queue.popleft()
            result.append(node_id)

            # Release the successors of the node just emitted
            for neighbor in context.edges.get(node_id, []):
                in_degree[neighbor] -= 1
                if in_degree[neighbor] == 0:
                    queue.append(neighbor)

        return result

    def _group_by_level(
        self, context: DAGExecutionContext, execution_order: List[str]
    ) -> List[List[str]]:
        """Group ``execution_order`` into dependency levels.

        A node joins the first level in which all of its dependencies belong
        to earlier levels, so the nodes of one level can run in parallel.
        """
        levels = []
        processed = set()

        while len(processed) < len(execution_order):
            current_level = []

            for node_id in execution_order:
                if node_id in processed:
                    continue

                # Ready once every dependency sits in an earlier level
                node = context.nodes[node_id]
                if all(dep in processed for dep in node.dependencies):
                    current_level.append(node_id)

            for node_id in current_level:
                processed.add(node_id)

            if current_level:
                levels.append(current_level)
            else:
                break  # no progress possible; avoid an infinite loop

        return levels

    def _can_execute_node(self, context: DAGExecutionContext, node_id: str) -> bool:
        """Decide whether ``node_id`` may run now.

        A node may run when each dependency is COMPLETED. Aggregation nodes
        additionally accept SKIPPED dependencies: a skipped conditional branch
        simply contributes no result, and the aggregation merges whatever is
        available. Without this, one skipped branch would leave the
        aggregation and everything downstream of it PENDING forever.

        Side effect: a node whose ``condition_value`` metadata differs from
        the result of a condition-node dependency is marked SKIPPED.
        """
        node = context.nodes[node_id]
        tolerates_skipped = node.node_type == NodeType.AGGREGATION

        for dep_id in node.dependencies:
            dep_node = context.nodes[dep_id]

            if tolerates_skipped and dep_node.status == NodeStatus.SKIPPED:
                continue

            if dep_node.status != NodeStatus.COMPLETED:
                return False

            # Condition dependencies gate the node on their boolean result
            if dep_node.node_type == NodeType.CONDITION:
                condition_result = dep_node.result
                node_condition = node.metadata.get("condition_value")
                if node_condition is not None and condition_result != node_condition:
                    node.status = NodeStatus.SKIPPED
                    print(f"⏭️  節點 {node_id} 被條件跳過")
                    return False

        return True

    async def _execute_node_async(self, context: DAGExecutionContext, node_id: str):
        """Run one node, recording status, timing, result and error on it.

        Raises:
            Exception: Re-raises whatever the node's handler raised, after
                marking the node FAILED.
        """
        node = context.nodes[node_id]
        node.status = NodeStatus.RUNNING
        node.start_time = time.time()

        try:
            print(f"▶️  執行節點: {node_id} ({node.task_description})")

            if node.node_type == NodeType.TASK:
                result = await self._execute_task_node(context, node)
            elif node.node_type == NodeType.CONDITION:
                result = await self._execute_condition_node(context, node)
            elif node.node_type == NodeType.AGGREGATION:
                result = await self._execute_aggregation_node(context, node)
            else:
                raise ValueError(f"Unknown node type: {node.node_type}")

            node.result = result
            node.status = NodeStatus.COMPLETED
            context.results[node_id] = result

            node.end_time = time.time()
            elapsed = node.end_time - node.start_time
            print(f"✅ 節點 {node_id} 完成 ({elapsed:.2f}s)")

        except Exception as e:
            node.status = NodeStatus.FAILED
            node.error = str(e)
            node.end_time = time.time()
            print(f"❌ 節點 {node_id} 失敗: {e}")
            raise

    async def _execute_task_node(
        self, context: DAGExecutionContext, node: DAGNode
    ) -> str:
        """Simulate a task node; no agent is actually invoked here.

        Returns a role-tagged message when the node's role is registered,
        otherwise a generic mock result.
        """
        if node.agent_role and node.agent_role in self.agent_registry:
            # A real agent call would go here; this base class only simulates it.
            await asyncio.sleep(0.5)  # simulated execution time
            return f"[{node.agent_role}] 完成任務: {node.task_description}"
        else:
            await asyncio.sleep(0.3)  # simulated execution time
            return f"Mock result for: {node.task_description}"

    async def _execute_condition_node(
        self, context: DAGExecutionContext, node: DAGNode
    ) -> bool:
        """Evaluate the node's ``condition_func``; True when none is set."""
        if node.condition_func:
            return node.condition_func(context)
        return True

    async def _execute_aggregation_node(
        self, context: DAGExecutionContext, node: DAGNode
    ) -> str:
        """Run the node's ``aggregation_func``; a placeholder string when none is set."""
        if node.aggregation_func:
            return node.aggregation_func(context)
        return "Aggregated result"


# Confirmation message: the DAG execution engine has been defined.
print("✓ DAG 執行引擎完成")

In [None]:
# Cell 5: 與 Agents 整合
# 簡化的 Agent 系統整合
class MockAgent:
    """Stand-in agent that answers with a canned, role-specific message."""

    def __init__(self, role: str):
        self.role = role

    async def execute(self, task: str, context: Dict = None) -> str:
        """Pretend to work on ``task`` and return a role-specific report line.

        Args:
            task: Task description embedded in the returned message.
            context: Accepted for interface compatibility; not read.

        Returns:
            The canned message for this agent's role, or a generic
            ``[role] ...`` message for roles without a template.
        """
        await asyncio.sleep(0.2)  # simulated thinking time

        canned_replies = {
            "researcher": "📊 研究報告：{task} - 發現了 3 個關鍵點和 5 篇相關文獻",
            "planner": "📋 規劃大綱：{task} - 建立了 4 個章節的詳細架構",
            "writer": "✍️  寫作內容：{task} - 完成了 800 字的專業內容",
            "reviewer": "🔍 審查結果：{task} - 檢查了 5 個要點，建議 2 處修改",
        }

        template = canned_replies.get(self.role)
        if template is None:
            return f"[{self.role}] 完成：{task}"
        return template.format(task=task)


# Build the agent registry: role name -> MockAgent playing that role
agent_registry = {
    "researcher": MockAgent("researcher"),
    "planner": MockAgent("planner"),
    "writer": MockAgent("writer"),
    "reviewer": MockAgent("reviewer"),
}


# 增強版 DAG 執行器（整合真實 Agent）
class AgentDAGExecutor(DAGExecutor):
    """DAG executor whose task nodes are delegated to registered agents."""

    async def _execute_task_node(
        self, context: DAGExecutionContext, node: DAGNode
    ) -> str:
        """Run a task node through the agent registered for its role.

        Nodes without a role, or with a role missing from the registry, fall
        back to the base-class implementation.

        Returns:
            Whatever the agent's ``execute`` coroutine returns.
        """
        role = node.agent_role
        if not role or role not in self.agent_registry:
            # No matching agent: use the base implementation
            return await super()._execute_task_node(context, node)

        agent = self.agent_registry[role]

        # Context handed to the agent: upstream results, blackboard, node metadata
        upstream_results = {
            dep_id: context.results.get(dep_id) for dep_id in node.dependencies
        }
        task_context = {
            "dependencies": upstream_results,
            "blackboard": context.blackboard,
            "node_metadata": node.metadata,
        }

        return await agent.execute(node.task_description, task_context)


# Confirmation message: agent integration has been defined.
print("✓ Agent 整合完成")

In [None]:
# Cell 6: 實際案例：研究報告 DAG
# 建立完整的研究報告 DAG 案例
def create_comprehensive_research_dag():
    """Build the comprehensive research-report DAG.

    Stages, in registration order: initial research, two condition checks,
    two conditionally executed research tasks, research synthesis, planning,
    three writing tasks, draft assembly, reviews and the final report.

    Returns:
        The validated ``DAGExecutionContext`` produced by ``DAGBuilder.build()``.
    """

    def needs_expert_review(context):
        """Expert review is needed when the initial research exceeds 1000 characters."""
        return len(context.results.get("initial_research", "")) > 1000

    def needs_data_analysis(context):
        """Data analysis is needed when the initial research mentions data or statistics."""
        research = context.results.get("initial_research", "")
        if "數據" in research:
            return True
        return "statistics" in research.lower()

    def combine_research_sources(context):
        """Merge the available, non-empty research sources under upper-cased headers."""
        source_keys = ("initial_research", "expert_research", "data_analysis")
        return "\n\n".join(
            f"=== {key.upper()} ===\n{context.results[key]}"
            for key in source_keys
            if key in context.results and context.results[key]
        )

    def assemble_draft(ctx):
        """Concatenate the three written sections into one draft."""
        section_keys = ("intro_section", "main_content", "conclusion")
        return "\n\n".join([ctx.results.get(key, "") for key in section_keys])

    def final_quality_check(context):
        """Combine the content review with the expert review when one exists."""
        review_result = context.results.get("content_review", "")
        expert_result = context.results.get("expert_review", "")

        if not expert_result:
            return f"內容審查：{review_result}"
        return f"綜合審查：\n內容審查：{review_result}\n專家審查：{expert_result}"

    builder = DAGBuilder()

    # Level 1: initial research
    builder.add_task_node("initial_research", "執行初步主題研究收集", "researcher")

    # Level 2: condition branches
    builder.add_condition_node(
        "check_expert_need", "檢查是否需要專家審查", needs_expert_review, ["initial_research"]
    )
    builder.add_condition_node(
        "check_data_need", "檢查是否需要數據分析", needs_data_analysis, ["initial_research"]
    )

    # Level 3: conditionally executed research
    builder.add_task_node(
        "expert_research",
        "專家諮詢與深度研究",
        "researcher",
        ["check_expert_need"],
        condition_value=True,
    )
    builder.add_task_node(
        "data_analysis",
        "相關數據分析",
        "researcher",
        ["check_data_need"],
        condition_value=True,
    )

    # Level 4: research aggregation
    builder.add_aggregation_node(
        "research_synthesis",
        "研究結果綜合",
        combine_research_sources,
        ["initial_research", "expert_research", "data_analysis"],
    )

    # Level 5: planning tasks that share the same dependency
    builder.add_task_node(
        "outline_creation", "建立報告架構大綱", "planner", ["research_synthesis"]
    )
    builder.add_task_node(
        "style_guide", "確定寫作風格指南", "planner", ["research_synthesis"]
    )

    # Level 6: writing tasks that share the same dependencies
    writing_sections = [
        ("intro_section", "撰寫引言章節"),
        ("main_content", "撰寫主要內容"),
        ("conclusion", "撰寫結論建議"),
    ]
    for section_id, section_description in writing_sections:
        builder.add_task_node(
            section_id, section_description, "writer", ["outline_creation", "style_guide"]
        )

    # Level 7: content assembly
    builder.add_aggregation_node(
        "draft_assembly",
        "組裝完整初稿",
        assemble_draft,
        ["intro_section", "main_content", "conclusion"],
    )

    # Level 8: reviews
    builder.add_task_node("content_review", "內容品質審查", "reviewer", ["draft_assembly"])
    builder.add_task_node(
        "expert_review",
        "專家最終審查",
        "reviewer",
        ["draft_assembly", "check_expert_need"],
        condition_value=True,
    )

    # Level 9: final integration
    builder.add_aggregation_node(
        "final_report",
        "最終報告生成",
        final_quality_check,
        ["content_review", "expert_review"],
    )

    return builder.build()


# Build the comprehensive DAG (this cell only constructs it; it is not executed here)
comprehensive_dag = create_comprehensive_research_dag()
print(f"🏗️  建立綜合研究 DAG：{len(comprehensive_dag.nodes)} 個節點")

# Show the DAG structure: one line per node with its type and dependencies
print("\n📊 DAG 節點結構：")
for node_id, node in comprehensive_dag.nodes.items():
    deps = f"依賴: {list(node.dependencies)}" if node.dependencies else "無依賴"
    print(f"  {node_id} ({node.node_type.value}) - {deps}")

In [None]:
# Cell 7: 可視化與監控
import matplotlib.pyplot as plt
import networkx as nx
from datetime import datetime

def visualize_dag(context: DAGExecutionContext, save_path: str = None):
    """Draw the DAG with each node coloured by its execution status.

    Args:
        context: DAG whose nodes and dependencies are drawn.
        save_path: When given, the figure is also written to this path
            before being shown.
    """
    # Status value -> node colour (also drives the legend)
    color_map = {
        "pending": "lightgray",
        "running": "yellow",
        "completed": "lightgreen",
        "failed": "red",
        "skipped": "orange",
    }

    graph = nx.DiGraph()

    # Nodes first, so their order in the graph follows context.nodes
    for node_id, node in context.nodes.items():
        graph.add_node(
            node_id,
            node_type=node.node_type.value,
            status=node.status.value,
            agent=node.agent_role or "system",
        )

    # Then one edge per dependency, pointing dependency -> dependent
    for node_id, node in context.nodes.items():
        for dep in node.dependencies:
            graph.add_edge(dep, node_id)

    plt.figure(figsize=(14, 10))
    layout = nx.spring_layout(graph, k=3, iterations=50)

    node_colors = []
    for graph_node in graph.nodes():
        status_value = context.nodes[graph_node].status.value
        node_colors.append(color_map.get(status_value, "gray"))

    nx.draw(
        graph,
        layout,
        node_color=node_colors,
        node_size=3000,
        font_size=8,
        font_weight="bold",
        arrows=True,
        arrowsize=20,
        edge_color="gray",
        with_labels=True,
    )

    # Legend: one coloured patch per status
    legend_elements = []
    for status, color in color_map.items():
        legend_elements.append(
            plt.Rectangle((0, 0), 1, 1, color=color, label=status.title())
        )
    plt.legend(handles=legend_elements, loc="upper right")

    plt.title("Graph of Thought (GoT) DAG 執行狀態", fontsize=14, fontweight="bold")
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.show()

def generate_execution_report(context: DAGExecutionContext) -> str:
    """Render a Markdown report of a DAG run.

    The report contains overall status counts, the five slowest nodes, and
    one section per node with its type, status, role, error and a preview of
    its result (truncated to 100 characters).

    Args:
        context: The DAG whose node state is reported.

    Returns:
        The report as a single newline-joined string.
    """
    report = []
    report.append("# GoT DAG 執行報告")
    report.append(f"生成時間：{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")

    # Summary statistics
    total_nodes = len(context.nodes)
    completed = sum(1 for n in context.nodes.values() if n.status == NodeStatus.COMPLETED)
    failed = sum(1 for n in context.nodes.values() if n.status == NodeStatus.FAILED)
    skipped = sum(1 for n in context.nodes.values() if n.status == NodeStatus.SKIPPED)

    report.append("## 執行統計")
    report.append(f"- 總節點數：{total_nodes}")
    report.append(f"- 成功完成：{completed}")
    report.append(f"- 執行失敗：{failed}")
    report.append(f"- 條件跳過：{skipped}")
    report.append("")

    # Timing analysis: only nodes that both started and finished
    execution_times = [
        (node.node_id, node.end_time - node.start_time)
        for node in context.nodes.values()
        if node.start_time and node.end_time
    ]

    if execution_times:
        execution_times.sort(key=lambda x: x[1], reverse=True)
        report.append("## 執行時間分析（前5名）")
        for node_id, duration in execution_times[:5]:
            report.append(f"- {node_id}: {duration:.2f}秒")
        report.append("")

    # Per-node details
    status_icons = {"completed": "✅", "failed": "❌", "skipped": "⏭️"}
    report.append("## 節點執行詳情")
    for node_id, node in context.nodes.items():
        status_icon = status_icons.get(node.status.value, "⏸️")
        report.append(f"### {status_icon} {node_id}")
        report.append(f"- 類型：{node.node_type.value}")
        report.append(f"- 狀態：{node.status.value}")
        if node.agent_role:
            report.append(f"- 執行角色：{node.agent_role}")
        if node.error:
            report.append(f"- 錯誤：{node.error}")

        # Test against None rather than truthiness, so falsy results such as
        # a condition node's False still appear in the report.
        result = getattr(node, "result", None)
        if result is not None:
            result_text = str(result)
            if result_text:
                if len(result_text) > 100:
                    result_preview = result_text[:100] + "..."
                else:
                    result_preview = result_text
                report.append(f"- 結果預覽：{result_preview}")
        report.append("")

    return "\n".join(report)

# Confirmation message: the visualisation and monitoring helpers have been defined.
print("✓ 可視化與監控模組完成")

In [None]:
# Cell 8: 失敗處理與重試
class DAGFailureHandler:
    """Retry and checkpoint support for a DAG run by a ``DAGExecutor``."""

    def __init__(self, executor: DAGExecutor):
        self.executor = executor

    async def retry_failed_nodes(
        self, context: DAGExecutionContext, max_retries: int = 3
    ) -> bool:
        """Re-run FAILED nodes and everything downstream of them.

        Args:
            context: The DAG to repair; updated in place.
            max_retries: Maximum number of retry rounds.

        Returns:
            True when no node is FAILED (immediately, or after a retry round);
            False when failures remain after ``max_retries`` rounds.
        """
        failed_nodes = [
            node_id
            for node_id, node in context.nodes.items()
            if node.status == NodeStatus.FAILED
        ]

        if not failed_nodes:
            print("✅ 沒有失敗節點需要重試")
            return True

        print(f"🔄 發現 {len(failed_nodes)} 個失敗節點，開始重試...")

        for retry_count in range(max_retries):
            print(f"\n🔄 重試第 {retry_count + 1} 次")

            # Reset the failed nodes
            for node_id in failed_nodes:
                node = context.nodes[node_id]
                node.status = NodeStatus.PENDING
                node.error = None
                node.result = None

            # Failed nodes plus all of their transitive dependents
            nodes_to_rerun = self._get_affected_nodes(context, failed_nodes)
            print(f"📋 需要重新執行的節點：{nodes_to_rerun}")

            try:
                await self._execute_partial_dag(context, nodes_to_rerun)

                # Check whether anything is still failing
                current_failed = [
                    node_id
                    for node_id, node in context.nodes.items()
                    if node.status == NodeStatus.FAILED
                ]

                if not current_failed:
                    print(f"✅ 重試成功！所有節點執行完成")
                    return True
                else:
                    failed_nodes = current_failed
                    print(f"⚠️  仍有 {len(failed_nodes)} 個節點失敗")

            except Exception as e:
                # Best effort: report the problem and move on to the next round
                print(f"❌ 重試過程中發生錯誤：{e}")

        print(f"❌ 達到最大重試次數，仍有失敗節點：{failed_nodes}")
        return False

    def _get_affected_nodes(
        self, context: DAGExecutionContext, failed_nodes: List[str]
    ) -> List[str]:
        """Return the failed nodes plus every node that transitively depends on them.

        Each node is expanded once (worklist traversal), so graphs with many
        shared descendants are not re-walked repeatedly. Ids known to the
        context are returned in ``context.nodes`` order, followed by any
        unknown ids from ``failed_nodes``.
        """
        affected = set(failed_nodes)
        frontier = list(failed_nodes)

        while frontier:
            current = frontier.pop()
            for candidate_id, candidate_node in context.nodes.items():
                if current in candidate_node.dependencies and candidate_id not in affected:
                    affected.add(candidate_id)
                    frontier.append(candidate_id)

        ordered = [node_id for node_id in context.nodes if node_id in affected]
        ordered.extend(
            node_id
            for node_id in dict.fromkeys(failed_nodes)
            if node_id not in context.nodes
        )
        return ordered

    async def _execute_partial_dag(
        self, context: DAGExecutionContext, nodes_to_execute: List[str]
    ):
        """Re-run only ``nodes_to_execute``, against the full context.

        The nodes are executed inside the original context rather than a
        trimmed copy. A copy containing only the affected nodes loses their
        upstream dependencies, so their in-degree never reaches zero and
        nothing would be scheduled. Using the full context keeps upstream
        status and condition results visible to the executor's readiness
        check.
        """
        targets = {nid for nid in nodes_to_execute if nid in context.nodes}

        # Reset the target nodes and drop their stale results
        for node_id in targets:
            node = context.nodes[node_id]
            if node.status in (NodeStatus.FAILED, NodeStatus.COMPLETED):
                node.status = NodeStatus.PENDING
                node.error = None
                node.result = None
                context.results.pop(node_id, None)

        # Reuse the executor's ordering so targets run in dependency order
        execution_order = self.executor._topological_sort(context)
        execution_levels = self.executor._group_by_level(context, execution_order)

        for level_idx, level_nodes in enumerate(execution_levels):
            runnable = [
                node_id
                for node_id in level_nodes
                if node_id in targets
                and self.executor._can_execute_node(context, node_id)
            ]
            if not runnable:
                continue

            print(f"\n🔄 執行第 {level_idx + 1} 層：{runnable}")
            # return_exceptions=True: failures are recorded on the nodes and
            # inspected by the caller afterwards.
            await asyncio.gather(
                *(
                    self.executor._execute_node_async(context, node_id)
                    for node_id in runnable
                ),
                return_exceptions=True,
            )

    def create_checkpoint(self, context: DAGExecutionContext) -> Dict:
        """Snapshot results, blackboard and per-node status/result/error.

        The ``results`` and ``blackboard`` dicts are shallow-copied.
        """
        checkpoint = {
            "timestamp": time.time(),
            "results": context.results.copy(),
            "blackboard": context.blackboard.copy(),
            "node_states": {},
        }

        for node_id, node in context.nodes.items():
            checkpoint["node_states"][node_id] = {
                "status": node.status.value,
                "result": node.result,
                "error": node.error,
            }

        return checkpoint

    def restore_checkpoint(self, context: DAGExecutionContext, checkpoint: Dict):
        """Restore ``context`` from a checkpoint made by ``create_checkpoint``.

        Node ids in the checkpoint that are absent from ``context`` are ignored.
        """
        context.results = checkpoint["results"].copy()
        context.blackboard = checkpoint["blackboard"].copy()

        for node_id, state in checkpoint["node_states"].items():
            if node_id in context.nodes:
                node = context.nodes[node_id]
                node.status = NodeStatus(state["status"])
                node.result = state["result"]
                node.error = state["error"]


# Confirmation message: the failure-handling and retry helpers have been defined.
print("✓ 失敗處理與重試機制完成")

In [None]:
# Cell 9: Smoke Test - 完整 DAG 執行驗證
async def run_got_smoke_test():
    """End-to-end smoke test of the GoT DAG pipeline.

    Builds the comprehensive DAG, executes it, writes a Markdown report and a
    visualisation under ``outs/got_reports`` and prints a verification
    summary.

    Returns:
        A dict with ``success``, ``total_nodes``, ``completed_nodes``,
        ``execution_time`` (0 when the first run raised) and ``final_results``
        (number of node results produced).
    """
    print("🧪 開始 GoT DAG Smoke Test")
    print("=" * 50)

    # 1. Build the test DAG
    print("\n1️⃣ 建立測試 DAG...")
    test_dag = create_comprehensive_research_dag()

    # 2. Create the executor
    print("\n2️⃣ 初始化執行器...")
    executor = AgentDAGExecutor(agent_registry)
    failure_handler = DAGFailureHandler(executor)

    # 3. Take the initial (pre-run) checkpoint
    print("\n3️⃣ 建立初始檢查點...")
    checkpoint = failure_handler.create_checkpoint(test_dag)

    # Defined up front so the later steps work on the failure path too
    results = {}
    execution_time = 0

    # 4. Execute the DAG
    print("\n4️⃣ 執行完整 DAG...")
    try:
        start_time = time.time()
        results = await executor.execute_dag(test_dag)
        execution_time = time.time() - start_time

        print(f"\n✅ DAG 執行完成！總耗時：{execution_time:.2f}秒")
        print(f"📊 產生結果數：{len(results)}")

    except Exception as e:
        print(f"\n❌ DAG 執行失敗：{e}")
        print("\n🔄 嘗試失敗處理...")

        # Roll back to the clean pre-run state, run the DAG again, then retry
        # whatever is marked FAILED. Retrying straight after the rollback
        # would find no FAILED node (the checkpoint predates execution) and
        # report success without running anything.
        failure_handler.restore_checkpoint(test_dag, checkpoint)
        try:
            await executor.execute_dag(test_dag)
            retry_success = await failure_handler.retry_failed_nodes(test_dag)
        except Exception as retry_error:
            print(f"❌ 重試過程中發生錯誤：{retry_error}")
            retry_success = False

        # restore_checkpoint replaced the results dict, so re-read it
        results = test_dag.results

        if retry_success:
            print("✅ 重試成功！")
        else:
            print("❌ 重試失敗")

    # 5. Generate the report
    print("\n5️⃣ 生成執行報告...")
    report = generate_execution_report(test_dag)

    # Save the report
    import os

    os.makedirs("outs/got_reports", exist_ok=True)
    report_path = f"outs/got_reports/dag_execution_{int(time.time())}.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"📄 報告已儲存：{report_path}")

    # 6. Visualise the DAG (best effort)
    print("\n6️⃣ 生成 DAG 可視化...")
    try:
        viz_path = f"outs/got_reports/dag_visualization_{int(time.time())}.png"
        visualize_dag(test_dag, viz_path)
        print(f"📊 可視化已儲存：{viz_path}")
    except ImportError:
        print("⚠️  matplotlib 未安裝，跳過可視化")
    except Exception as e:
        print(f"⚠️  可視化失敗：{e}")

    # 7. Verify the key results
    print("\n7️⃣ 驗證測試結果...")

    # Was a final report produced?
    if "final_report" in results:
        print("✅ 最終報告生成成功")
    else:
        print("❌ 最終報告生成失敗")

    # Execution path
    completed_nodes = [
        nid
        for nid, node in test_dag.nodes.items()
        if node.status == NodeStatus.COMPLETED
    ]
    print(f"✅ 完成節點數：{len(completed_nodes)}/{len(test_dag.nodes)}")

    # Condition branches
    condition_nodes = [
        nid
        for nid, node in test_dag.nodes.items()
        if node.node_type == NodeType.CONDITION
    ]
    print(f"🔀 條件節點處理：{len(condition_nodes)} 個")

    # Aggregation nodes
    agg_nodes = [
        nid
        for nid, node in test_dag.nodes.items()
        if node.node_type == NodeType.AGGREGATION
    ]
    print(f"🔗 聚合節點處理：{len(agg_nodes)} 個")

    print("\n" + "=" * 50)
    print("🎉 GoT DAG Smoke Test 完成！")

    return {
        "success": len(completed_nodes) > 0,
        "total_nodes": len(test_dag.nodes),
        "completed_nodes": len(completed_nodes),
        "execution_time": execution_time,
        "final_results": len(results),
    }


# Run the smoke test (top-level await, as supported by the notebook kernel)
smoke_result = await run_got_smoke_test()
print(f"\n📋 測試摘要：{smoke_result}")

In [None]:
# Smoke test result verification
# Minimal check: at least 4 DAG nodes must have completed
assert (
    smoke_result["completed_nodes"] >= 4
), f"期望至少4個節點完成，實際：{smoke_result['completed_nodes']}"
assert smoke_result["success"], "DAG 執行應該成功"
assert smoke_result["final_results"] > 0, "應該產生最終結果"

# Summary of the features exercised by the notebook
print("✅ 所有煙霧測試驗證通過！")
print("\n🎯 GoT 核心功能：")
print("  ✓ DAG 節點定義與建構")
print("  ✓ 拓撲排序與並行執行")
print("  ✓ 條件分支與聚合節點")
print("  ✓ Agent 角色整合")
print("  ✓ 失敗處理與重試機制")
print("  ✓ 執行監控與可視化")