In [None]:
# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Route the Hugging Face and PyTorch caches into sub-folders of the shared
# root: each variable is exported and its directory created in turn.
for k, v in (
    ("HF_HOME", f"{AI_CACHE_ROOT}/hf"),
    ("TRANSFORMERS_CACHE", f"{AI_CACHE_ROOT}/hf/transformers"),
    ("HF_DATASETS_CACHE", f"{AI_CACHE_ROOT}/hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", f"{AI_CACHE_ROOT}/hf/hub"),
    ("TORCH_HOME", f"{AI_CACHE_ROOT}/torch"),
):
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

# Report the cache root in use and whether PyTorch can see a CUDA device.
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
import time
import asyncio
from enum import Enum
from typing import Dict, Any, Callable, Optional, List
from dataclasses import dataclass, field
from datetime import datetime, timedelta
import logging
import json
from functools import wraps
import random


# Mock LLM adapter for testing
class MockLLMAdapter:
    """Stand-in LLM client that fails at random, for exercising resilience code.

    Args:
        failure_rate: Probability that a call to ``generate`` raises. ``0``
            never fails; ``1`` always fails.
        latency: Seconds to sleep on every call to imitate model processing
            time. Defaults to 0.1; pass ``0`` to skip the sleep (handy in
            fast tests).

    Attributes:
        call_count: Number of ``generate`` calls made so far, including the
            ones that raised.
    """

    def __init__(self, failure_rate=0.3, latency=0.1):
        self.failure_rate = failure_rate
        self.latency = latency
        self.call_count = 0

    def generate(self, messages, **kwargs):
        """Return a canned response or raise a simulated failure.

        Args:
            messages: Chat messages; accepted for interface compatibility and
                not inspected.
            **kwargs: Extra generation options; accepted and ignored.

        Returns:
            A string of the form ``"Response #<n>: Generated successfully"``
            where ``<n>`` is the current ``call_count``.

        Raises:
            TimeoutError: Simulated timeout (about half of the failures).
            RuntimeError: Simulated generation failure (the other half).
        """
        self.call_count += 1
        if self.latency > 0:  # time.sleep() rejects negative values
            time.sleep(self.latency)  # simulate processing time

        if random.random() < self.failure_rate:
            # A second draw picks which kind of failure to simulate.
            if random.random() < 0.5:
                raise TimeoutError("Model response timeout")
            raise RuntimeError("Model generation failed")

        return f"Response #{self.call_count}: Generated successfully"


# Progress marker: reached only if the imports and MockLLMAdapter above ran without error.
print("✓ Dependencies loaded")

In [None]:
class FailureType(Enum):
    """Categories used to label a failed call.

    The comments below state which exception classify_error maps to each member.
    """

    TIMEOUT = "timeout"  # TimeoutError
    RATE_LIMIT = "rate_limit"  # RuntimeError whose message mentions "rate"
    MODEL_ERROR = "model_error"  # any other RuntimeError
    NETWORK_ERROR = "network_error"  # ConnectionError
    VALIDATION_ERROR = "validation_error"  # ValueError
    UNKNOWN = "unknown"  # every exception not covered above


@dataclass
class FailureInfo:
    """Record describing one failed call."""

    failure_type: FailureType  # category of the failure
    error_message: str  # human-readable error text
    timestamp: datetime  # time attached to the failure (the demo in this file passes datetime.now())
    retry_count: int = 0  # compared with RetryConfig.max_attempts in RetryStrategy.should_retry
    context: Dict[str, Any] = field(default_factory=dict)  # free-form extra data; a fresh dict per instance


def classify_error(exception: Exception) -> FailureType:
    """Map an exception to the FailureType used by the retry logic.

    Args:
        exception: The exception raised by the failed call.

    Returns:
        TIMEOUT for TimeoutError; RATE_LIMIT for a RuntimeError whose message
        mentions a rate (see below), MODEL_ERROR for any other RuntimeError;
        NETWORK_ERROR for ConnectionError; VALIDATION_ERROR for ValueError;
        UNKNOWN otherwise.
    """
    import re  # local import: the notebook's import cell does not provide ``re``

    if isinstance(exception, TimeoutError):
        return FailureType.TIMEOUT
    if isinstance(exception, RuntimeError):
        message = str(exception).lower()
        # Match "rate"/"rates" only as a standalone word (or "ratelimit"). A
        # plain substring test would also fire on words such as "generate",
        # "iterate" or "accurate" and wrongly mark those errors as retryable
        # rate limits.
        if re.search(r"(?<![a-z])rates?(?![a-z])|ratelimit", message):
            return FailureType.RATE_LIMIT
        return FailureType.MODEL_ERROR
    if isinstance(exception, ConnectionError):
        return FailureType.NETWORK_ERROR
    if isinstance(exception, ValueError):
        return FailureType.VALIDATION_ERROR
    return FailureType.UNKNOWN


# Progress marker: FailureType, FailureInfo and classify_error are now defined.
print("✓ Failure classification system ready")

In [None]:
@dataclass
class RetryConfig:
    """Retry settings consumed by RetryStrategy."""

    max_attempts: int = 3  # should_retry refuses once FailureInfo.retry_count reaches this value
    base_delay: float = 1.0  # delay multiplier before backoff (the demo prints delays in seconds)
    max_delay: float = 60.0  # upper bound applied to the backoff delay before jitter
    exponential_base: float = 2.0  # raised to the attempt number in calculate_delay
    jitter: bool = True  # when True, the delay is scaled by a random factor between 0.5 and 1.0
    # Failure types worth retrying; built by a factory so every config gets its own list.
    retryable_errors: List[FailureType] = field(
        default_factory=lambda: [
            FailureType.TIMEOUT,
            FailureType.RATE_LIMIT,
            FailureType.NETWORK_ERROR,
        ]
    )


class RetryStrategy:
    """Decide whether and when a failed call should be retried.

    Args:
        config: Retry settings (a RetryConfig).
        max_history: Maximum number of failure records kept in
            ``failure_history``; the oldest are dropped first. Defaults to
            100. Negative values are treated as 0.
    """

    def __init__(self, config: "RetryConfig", max_history: int = 100):
        self.config = config
        self.max_history = max(0, max_history)
        self.failure_history: List["FailureInfo"] = []

    def should_retry(self, failure: "FailureInfo") -> bool:
        """Return True when ``failure`` may be retried.

        A failure is retryable while its ``retry_count`` is below
        ``config.max_attempts`` and its type is listed in
        ``config.retryable_errors``.
        """
        if failure.retry_count >= self.config.max_attempts:
            return False
        return failure.failure_type in self.config.retryable_errors

    def calculate_delay(self, attempt: int) -> float:
        """Return the backoff delay for ``attempt`` (exponential backoff).

        The delay is ``base_delay * exponential_base ** attempt`` capped at
        ``max_delay``. With jitter enabled the capped value is then scaled by
        a random factor between 0.5 and 1.0.
        """
        try:
            delay = self.config.base_delay * (self.config.exponential_base**attempt)
        except OverflowError:
            # The power no longer fits in a float for very large attempt
            # numbers; the cap is the only sensible answer there.
            delay = self.config.max_delay
        delay = min(delay, self.config.max_delay)

        if self.config.jitter:
            # Add random jitter to avoid thundering herd
            delay *= 0.5 + random.random() * 0.5

        return delay

    def record_failure(self, failure: "FailureInfo"):
        """Append ``failure`` to the history, keeping only the newest ``max_history`` entries."""
        self.failure_history.append(failure)
        excess = len(self.failure_history) - self.max_history
        if excess > 0:
            self.failure_history = self.failure_history[excess:]


# Test retry strategy
# Build a strategy allowing 3 attempts with a 0.5 base delay.
config = RetryConfig(max_attempts=3, base_delay=0.5)
retry_strategy = RetryStrategy(config)

# Sample timeout failure; retry_count keeps its default of 0.
failure = FailureInfo(
    failure_type=FailureType.TIMEOUT,
    error_message="Connection timeout",
    timestamp=datetime.now(),
)

# Jitter is enabled by default, so the two printed delays vary from run to run.
print(f"Should retry: {retry_strategy.should_retry(failure)}")
print(f"Delay for attempt 1: {retry_strategy.calculate_delay(1):.2f}s")
print(f"Delay for attempt 2: {retry_strategy.calculate_delay(2):.2f}s")

In [None]:
class CircuitState(Enum):
    """States of the circuit breaker."""

    CLOSED = "closed"  # normal state
    OPEN = "open"  # tripped state
    HALF_OPEN = "half_open"  # half-open state


@dataclass
class CircuitBreakerConfig:
    """Settings for CircuitBreaker."""

    failure_threshold: int = 5  # failure threshold
    recovery_timeout: float = 30.0  # recovery timeout (seconds)
    success_threshold: int = 2  # success threshold (half-open -> closed)
    # monitoring window (seconds)
    # NOTE(review): this field is not read by the CircuitBreaker class in this file.
    monitoring_window: float = 60.0


class CircuitBreaker:
    """Circuit breaker that guards a callable with CLOSED / OPEN / HALF_OPEN states.

    While CLOSED, calls pass through and failures are counted; reaching
    ``config.failure_threshold`` opens the breaker. While OPEN, calls are
    rejected until ``config.recovery_timeout`` seconds have passed since the
    last failure, after which the breaker turns HALF_OPEN. While HALF_OPEN,
    ``config.success_threshold`` successes close it again and any failure
    re-opens it.
    """

    def __init__(self, name: str, config: CircuitBreakerConfig):
        self.name = name
        self.config = config
        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.success_count = 0
        self.last_failure_time = None
        self.state_change_time = datetime.now()

    def call(self, func: Callable, *args, **kwargs):
        """Invoke ``func(*args, **kwargs)`` under breaker protection.

        Returns whatever ``func`` returns. Raises RuntimeError without calling
        ``func`` when the breaker is OPEN and not yet due for a retry; any
        exception raised by ``func`` is recorded as a failure and re-raised.
        """
        if self.state == CircuitState.OPEN:
            if not self._should_attempt_reset():
                raise RuntimeError(f"Circuit breaker '{self.name}' is OPEN")
            self._transition_to_half_open()

        try:
            outcome = func(*args, **kwargs)
            self._on_success()
        except Exception:
            self._on_failure()
            raise
        return outcome

    def _should_attempt_reset(self) -> bool:
        """Tell whether enough time has passed since the last failure to probe again."""
        last_failure = self.last_failure_time
        if last_failure is None:
            return True
        elapsed = (datetime.now() - last_failure).total_seconds()
        return elapsed > self.config.recovery_timeout

    def _transition_to_half_open(self):
        """Switch to HALF_OPEN and restart the success counter."""
        self.state = CircuitState.HALF_OPEN
        self.success_count = 0
        self.state_change_time = datetime.now()
        print(f"Circuit breaker '{self.name}' -> HALF_OPEN")

    def _on_success(self):
        """Bookkeeping after a successful call."""
        if self.state == CircuitState.HALF_OPEN:
            self.success_count += 1
            if self.success_count >= self.config.success_threshold:
                self._transition_to_closed()
            return
        if self.state == CircuitState.CLOSED:
            # A success while closed wipes the failure streak.
            self.failure_count = 0

    def _on_failure(self):
        """Bookkeeping after a failed call."""
        self.failure_count += 1
        self.last_failure_time = datetime.now()

        if self.state == CircuitState.HALF_OPEN:
            # A single failure during probing re-opens the breaker.
            self._transition_to_open()
            return
        threshold_hit = self.failure_count >= self.config.failure_threshold
        if self.state == CircuitState.CLOSED and threshold_hit:
            self._transition_to_open()

    def _transition_to_open(self):
        """Switch to OPEN."""
        self.state = CircuitState.OPEN
        self.state_change_time = datetime.now()
        print(f"Circuit breaker '{self.name}' -> OPEN (failures: {self.failure_count})")

    def _transition_to_closed(self):
        """Switch to CLOSED and clear both counters."""
        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.success_count = 0
        self.state_change_time = datetime.now()
        print(f"Circuit breaker '{self.name}' -> CLOSED")

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot: name, state value, both counters and seconds spent in the current state."""
        snapshot = {
            "name": self.name,
            "state": self.state.value,
            "failure_count": self.failure_count,
            "success_count": self.success_count,
        }
        time_in_state = datetime.now() - self.state_change_time
        snapshot["state_duration"] = time_in_state.total_seconds()
        return snapshot


# Test circuit breaker
# Only constructs a breaker (threshold 3, recovery timeout 5.0); no call is routed through it here.
# NOTE: this rebinds the module-level name `config`, which the retry-strategy demo bound to a RetryConfig.
config = CircuitBreakerConfig(failure_threshold=3, recovery_timeout=5.0)
circuit_breaker = CircuitBreaker("test_llm", config)

print("✓ Circuit breaker implemented")

In [None]:
class ResilientExecutor:
    """Run callables through a circuit breaker, retrying transient failures.

    Builds on definitions from earlier cells: RetryStrategy, CircuitBreaker,
    FailureInfo and classify_error.

    Args:
        retry_config: RetryConfig controlling attempts and backoff.
        circuit_config: CircuitBreakerConfig for the internal breaker.
        name: Label used for the breaker and in the statistics.
        max_records: How many recent executions are kept for statistics.

    Attributes:
        retry_strategy: The RetryStrategy consulted after each failure.
        circuit_breaker: The CircuitBreaker every call is routed through.
        execution_stats: One dict per ``execute`` call (newest last).
    """

    def __init__(
        self,
        retry_config: "RetryConfig",
        circuit_config: "CircuitBreakerConfig",
        name: str = "default",
        max_records: int = 100,
    ):
        self.name = name
        self.retry_strategy = RetryStrategy(retry_config)
        self.circuit_breaker = CircuitBreaker(name, circuit_config)
        self.max_records = max(0, max_records)
        self.execution_stats: List[Dict[str, Any]] = []

    def execute(self, func: Callable, *args, **kwargs):
        """Call ``func(*args, **kwargs)`` with breaker protection and retries.

        Returns:
            The value returned by ``func`` on the first successful attempt.

        Raises:
            Exception: The last exception raised by ``func`` (or the breaker's
                RuntimeError when it is open) once no further retry is allowed.
        """
        started = time.time()
        attempts = 0
        while True:
            attempts += 1
            try:
                result = self.circuit_breaker.call(func, *args, **kwargs)
            except Exception as exc:
                failure = FailureInfo(
                    failure_type=classify_error(exc),
                    error_message=str(exc),
                    timestamp=datetime.now(),
                    retry_count=attempts,  # attempts used so far, so max_attempts bounds the total tries
                    context={"executor": self.name},
                )
                self.retry_strategy.record_failure(failure)

                # Once the breaker is open every further call is rejected
                # immediately, so retrying would only burn the backoff delay.
                breaker_open = self.circuit_breaker.get_stats()["state"] == "open"
                if breaker_open or not self.retry_strategy.should_retry(failure):
                    self._record(False, attempts, started, failure.failure_type.value)
                    raise

                # Zero-based retry index: the first backoff is about base_delay.
                time.sleep(self.retry_strategy.calculate_delay(attempts - 1))
            else:
                self._record(True, attempts, started, None)
                return result

    def _record(self, success: bool, attempts: int, started: float, failure_type: Optional[str]):
        """Store the outcome of one ``execute`` call, trimming to ``max_records`` entries."""
        self.execution_stats.append(
            {
                "success": success,
                "attempts": attempts,
                "duration": time.time() - started,
                "failure_type": failure_type,
                "timestamp": datetime.now().isoformat(),
            }
        )
        excess = len(self.execution_stats) - self.max_records
        if excess > 0:
            del self.execution_stats[:excess]

    def get_stats(self) -> Dict[str, Any]:
        """Summarise the recorded executions.

        Returns:
            ``{"no_data": True}`` before the first execution; otherwise a dict
            with ``name``, ``total_executions``, ``successful_executions``,
            ``failed_executions``, ``success_rate`` (0.0-1.0),
            ``average_attempts``, ``average_duration`` (seconds) and
            ``circuit_stats`` (the breaker's own ``get_stats()``).
        """
        if not self.execution_stats:
            return {"no_data": True}

        total = len(self.execution_stats)
        successes = sum(1 for record in self.execution_stats if record["success"])
        return {
            "name": self.name,
            "total_executions": total,
            "successful_executions": successes,
            "failed_executions": total - successes,
            "success_rate": successes / total,
            "average_attempts": sum(r["attempts"] for r in self.execution_stats) / total,
            "average_duration": sum(r["duration"] for r in self.execution_stats) / total,
            "circuit_stats": self.circuit_breaker.get_stats(),
        }


print("✓ Resilient executor implemented")

In [None]:
class ResilientAgentRole:
    """Agent role whose LLM calls run through a resilient executor.

    Each agent owns a ResilientExecutor configured with 3 retry attempts
    (base delay 1.0) and a circuit breaker that trips after 5 failures and
    uses a 30.0 recovery timeout. Every processed task is appended to
    ``execution_log``.
    """

    def __init__(self, name: str, llm_adapter, role_prompt: str):
        self.name = name
        self.llm_adapter = llm_adapter
        self.role_prompt = role_prompt

        # Resilience settings are fixed per agent; the executor is labelled "agent_<name>".
        self.executor = ResilientExecutor(
            RetryConfig(max_attempts=3, base_delay=1.0),
            CircuitBreakerConfig(failure_threshold=5, recovery_timeout=30.0),
            name=f"agent_{name}",
        )
        self.execution_log: List[Dict[str, Any]] = []

    def process(self, task: str, context: Dict[str, Any] = None) -> Dict[str, Any]:
        """Run ``task`` through the LLM under resilience protection.

        ``context`` is accepted but not used when building the prompt.

        Returns a dict with ``success``, ``agent`` and ``timestamp`` plus
        either ``response`` (on success) or ``error`` (the exception text).
        Exceptions are reported in the dict rather than propagated.
        """

        def _ask_model():
            # Rebuilt on every attempt so each retry gets a fresh message list.
            return self.llm_adapter.generate(
                [
                    {"role": "system", "content": self.role_prompt},
                    {"role": "user", "content": task},
                ]
            )

        try:
            reply = self.executor.execute(_ask_model)
            entry = {
                "success": True,
                "response": reply,
                "agent": self.name,
                "timestamp": datetime.now().isoformat(),
            }
            self.execution_log.append(entry)
        except Exception as exc:
            entry = {
                "success": False,
                "error": str(exc),
                "agent": self.name,
                "timestamp": datetime.now().isoformat(),
            }
            self.execution_log.append(entry)
        return entry

    def get_health_status(self) -> Dict[str, Any]:
        """Derive a health verdict from the executor statistics.

        Returns ``{"status": "unknown", ...}`` when the executor has no data;
        otherwise ``status`` is "unhealthy" (breaker open), "degraded"
        (success rate below 0.5), "warning" (below 0.8) or "healthy", together
        with a ``reason`` string and the raw ``stats``.
        """
        stats = self.executor.get_stats()
        if "no_data" in stats:
            return {"status": "unknown", "reason": "no_execution_data"}

        rate = stats["success_rate"]
        breaker_state = stats["circuit_stats"]["state"]

        if breaker_state == "open":
            verdict = ("unhealthy", "circuit_breaker_open")
        elif rate < 0.5:
            verdict = ("degraded", f"low_success_rate_{rate:.2f}")
        elif rate < 0.8:
            verdict = ("warning", f"moderate_success_rate_{rate:.2f}")
        else:
            verdict = ("healthy", "normal_operation")

        status, reason = verdict
        return {"status": status, "reason": reason, "stats": stats}


# Test resilient agent
# Only constructs the agent; no task is processed in this cell.
llm = MockLLMAdapter(failure_rate=0.4)  # 40% failure rate for testing

researcher = ResilientAgentRole(
    name="researcher",
    llm_adapter=llm,
    role_prompt="You are a research assistant. Provide factual information.",
)

print("✓ Resilient agent role created")

In [None]:
class ResilientOrchestrator:
    """Multi-agent coordinator with health checks and time budgets.

    Agents run one after another. Each successful response is stored on a
    shared ``blackboard`` dict, which is handed to every agent as its context.

    Args:
        agents: Mapping of agent name to agent object. Each agent must offer
            ``process(task, context)`` and ``get_health_status()``.
    """

    # Ranking used to merge health states: a higher number is a worse state.
    # Statuses missing from this table (e.g. "unknown") rank like "healthy".
    _SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3}

    def __init__(self, agents: Dict[str, "ResilientAgentRole"]):
        self.agents = agents
        self.blackboard = {}
        self.execution_history = []
        self.health_monitor = {}  # NOTE(review): never read or written elsewhere in this class

        # Time budget for one execute_task() call, in seconds
        self.global_timeout = 300  # 5 minutes
        self.max_agent_failures = 3  # NOTE(review): not consulted anywhere in this class

    def execute_task(
        self, task: str, required_agents: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Run ``task`` on the requested agents and collect their outputs.

        Args:
            task: Task text handed to every agent.
            required_agents: Names of the agents to run, in order. ``None``
                means all registered agents.

        Returns:
            A dict with ``success`` (True when at least one agent produced a
            result), ``results``, ``errors``, ``execution_time`` (seconds),
            ``health_status`` and ``timestamp``. When the global time budget
            ran out, the remaining agents are listed in ``errors`` and an
            extra ``error`` key holds ``"global_timeout_exceeded"``.
        """
        start_time = datetime.now()
        results = {}
        errors = {}
        timed_out = False

        # Use all agents if none specified
        if required_agents is None:
            required_agents = list(self.agents.keys())

        for agent_name in required_agents:
            # The budget covers this call only, so it is checked against this
            # call's own start time before each agent is started.
            if self._is_global_timeout(start_time):
                timed_out = True
                errors[agent_name] = "global_timeout_exceeded"
                continue

            if agent_name not in self.agents:
                errors[agent_name] = f"Agent '{agent_name}' not found"
                continue

            agent = self.agents[agent_name]

            # Check agent health before execution
            health = agent.get_health_status()
            if health["status"] == "unhealthy":
                errors[agent_name] = f"Agent unhealthy: {health['reason']}"
                continue

            # Execute with timeout
            try:
                result = self._execute_with_timeout(
                    agent.process,
                    args=(task, self.blackboard),
                    timeout=60,  # 1 minute per agent
                )

                if result["success"]:
                    results[agent_name] = result["response"]
                    # Update blackboard with successful results
                    self.blackboard[f"{agent_name}_output"] = result["response"]
                else:
                    errors[agent_name] = result["error"]

            except TimeoutError:
                errors[agent_name] = "agent_timeout"
            except Exception as e:
                errors[agent_name] = str(e)

        # Compile final result; the same keys are present on every path so
        # callers can always read "errors" and "execution_time".
        execution_result = {
            "success": len(results) > 0,
            "results": results,
            "errors": errors,
            "execution_time": (datetime.now() - start_time).total_seconds(),
            "health_status": self.get_system_health(),
            "timestamp": datetime.now().isoformat(),
        }
        if timed_out:
            execution_result["error"] = "global_timeout_exceeded"

        self.execution_history.append(execution_result)
        return execution_result

    def _execute_with_timeout(self, func, args=(), kwargs=None, timeout=60):
        """Call ``func`` and raise TimeoutError if it took longer than ``timeout`` seconds.

        NOTE: the call is not interrupted. The elapsed time is only checked
        after ``func`` has returned, and a result that arrived late is
        discarded in favour of the TimeoutError.
        """
        if kwargs is None:
            kwargs = {}

        start_time = time.time()
        result = func(*args, **kwargs)

        # Simple timeout simulation (in real scenario, use threading/asyncio)
        if time.time() - start_time > timeout:
            raise TimeoutError(f"Execution exceeded {timeout} seconds")

        return result

    def _is_global_timeout(self, start_time: Optional[datetime] = None) -> bool:
        """Tell whether the run that began at ``start_time`` exceeded ``global_timeout``.

        Returns False when no ``start_time`` is given (no run in progress).
        Measuring from the current run's start keeps an old entry in
        ``execution_history`` from blocking every later task.
        """
        if start_time is None:
            return False
        return (datetime.now() - start_time).total_seconds() > self.global_timeout

    def _worse(self, current: str, candidate: str) -> str:
        """Return whichever of the two statuses ranks as more severe."""
        if self._SEVERITY.get(candidate, 0) > self._SEVERITY.get(current, 0):
            return candidate
        return current

    def get_system_health(self) -> Dict[str, Any]:
        """Combine per-agent health and recent task outcomes into one report.

        ``overall_status`` is the most severe agent status, raised to at
        least "degraded" when fewer than 3 of the last 5 recorded executions
        succeeded. A more severe status is never downgraded.
        """
        agent_health = {}
        overall_status = "healthy"

        for name, agent in self.agents.items():
            health = agent.get_health_status()
            agent_health[name] = health
            overall_status = self._worse(overall_status, health["status"])

        # Check recent execution success rate
        if len(self.execution_history) >= 5:
            recent_successes = sum(
                1 for ex in self.execution_history[-5:] if ex["success"]
            )
            if recent_successes < 3:  # Less than 60% success rate
                overall_status = self._worse(overall_status, "degraded")

        return {
            "overall_status": overall_status,
            "agent_health": agent_health,
            "total_executions": len(self.execution_history),
            "uptime": "simulated",  # In real scenario, track actual uptime
            "last_check": datetime.now().isoformat(),
        }

    def reset_failed_agents(self):
        """Close the circuit breaker of every agent reporting "unhealthy"; return how many were reset."""
        reset_count = 0
        for name, agent in self.agents.items():
            health = agent.get_health_status()
            if health["status"] == "unhealthy":
                # Reset circuit breaker
                agent.executor.circuit_breaker._transition_to_closed()
                reset_count += 1
                print(f"Reset agent '{name}' circuit breaker")

        return reset_count


# Progress marker: the ResilientOrchestrator class is now defined.
print("✓ Resilient orchestrator implemented")

In [None]:
# Smoke test: Create a resilient multi-agent system and test failure scenarios

print("=== Smoke Test: Resilient Multi-Agent System ===")

# Create agents with varying failure rates
agents = {
    "researcher": ResilientAgentRole(
        "researcher", MockLLMAdapter(failure_rate=0.3), "You are a research assistant."
    ),
    "planner": ResilientAgentRole(
        "planner", MockLLMAdapter(failure_rate=0.2), "You are a planning assistant."
    ),
    "writer": ResilientAgentRole(
        "writer", MockLLMAdapter(failure_rate=0.4), "You are a writing assistant."
    ),
}

# Create orchestrator
orchestrator = ResilientOrchestrator(agents)

# Test 1: Normal execution
print("\n1. Testing normal execution...")
result = orchestrator.execute_task("Create a report about AI safety")
print(f"Success: {result['success']}")
print(f"Results count: {len(result['results'])}")
print(f"Errors: {result['errors']}")

# Test 2: Check system health
print("\n2. Checking system health...")
health = orchestrator.get_system_health()
print(f"Overall status: {health['overall_status']}")
for agent_name, agent_health in health["agent_health"].items():
    print(f"  {agent_name}: {agent_health['status']} ({agent_health['reason']})")

# Test 3: Trigger circuit breaker (simulate repeated failures)
print("\n3. Testing circuit breaker...")
high_failure_agent = ResilientAgentRole(
    "test_agent", MockLLMAdapter(failure_rate=0.9), "Test agent"  # 90% failure rate
)

# Trigger multiple failures to open circuit breaker
for i in range(6):
    try:
        high_failure_agent.process(f"Test task {i}")
    except Exception as exc:
        # process() reports failures in its return value, so reaching this
        # branch is unexpected. Catching Exception rather than everything
        # keeps Ctrl-C (KeyboardInterrupt) able to stop the loop.
        print(f"  Test task {i} raised unexpectedly: {exc}")

health = high_failure_agent.get_health_status()
print(f"High-failure agent status: {health['status']} ({health['reason']})")

# Test 4: Reset and recovery
print("\n4. Testing reset and recovery...")
orchestrator.agents["test_agent"] = high_failure_agent
reset_count = orchestrator.reset_failed_agents()
print(f"Reset {reset_count} agents")

final_health = orchestrator.get_system_health()
print(f"Final system status: {final_health['overall_status']}")

print("\n✓ Smoke test completed - Resilient system working!")

In [None]:
# Integration example: Using resilient agents in the previous orchestrator


def create_resilient_four_agent_system():
    """Build an orchestrator wired with researcher, planner, writer and reviewer agents.

    Every agent gets its own MockLLMAdapter with a role-specific failure rate.
    """
    # role -> (mock failure rate, system prompt); the researcher mock is the most reliable
    role_specs = {
        "researcher": (
            0.2,
            "You are a research assistant. Find and verify information using available sources.",
        ),
        "planner": (
            0.3,
            "You are a planning assistant. Create structured outlines and plans.",
        ),
        "writer": (
            0.25,
            "You are a writing assistant. Create clear, well-structured content.",
        ),
        "reviewer": (
            0.35,
            "You are a review assistant. Check for accuracy and consistency.",
        ),
    }

    # Create all mock adapters first, then wrap each one in its agent role.
    adapters = {
        role: MockLLMAdapter(failure_rate=rate)
        for role, (rate, _prompt) in role_specs.items()
    }
    team = {
        role: ResilientAgentRole(role, adapters[role], prompt)
        for role, (_rate, prompt) in role_specs.items()
    }

    return ResilientOrchestrator(team)


# Example workflow with resilience
resilient_system = create_resilient_four_agent_system()

# Run one task across all four roles, in pipeline order.
workflow_result = resilient_system.execute_task(
    "Research and write a brief report on the importance of failure handling in AI systems",
    required_agents=["researcher", "planner", "writer", "reviewer"],
)

print("=== Resilient Workflow Result ===")
print(f"Workflow success: {workflow_result['success']}")
print(f"Execution time: {workflow_result['execution_time']:.2f}s")
print(f"Successful agents: {list(workflow_result['results'])}")
print(f"Failed agents: {list(workflow_result['errors'])}")

# Export configuration for reuse
# (the settings are only collected in this dict; nothing is written to disk here)
config_export = dict(
    retry_config=dict(max_attempts=3, base_delay=1.0, max_delay=60.0),
    circuit_config=dict(
        failure_threshold=5,
        recovery_timeout=30.0,
        success_threshold=2,
    ),
    orchestrator_config=dict(
        global_timeout=300,
        agent_timeout=60,
        max_agent_failures=3,
    ),
)

print("\n✓ Configuration exported for shared_utils/agents/resilience.py")
print("✓ Integration with nb30-nb34 ready")

In [None]:
# Low-VRAM friendly settings
RETRY_CONFIG = {
    "max_attempts": 3,  # maximum number of retry attempts
    "base_delay": 1.0,  # base delay (seconds)
    "exponential_base": 2.0,  # exponential backoff multiplier
    "jitter": True,  # random jitter
}

CIRCUIT_CONFIG = {
    "failure_threshold": 5,  # failure threshold
    "recovery_timeout": 30.0,  # recovery timeout (seconds)
    "success_threshold": 2,  # success threshold
}

ORCHESTRATOR_CONFIG = {
    "global_timeout": 300,  # global timeout (seconds)
    "agent_timeout": 60,  # per-agent timeout (seconds)
    "max_agent_failures": 3,  # maximum number of agent failures
}