<a href="https://colab.research.google.com/github/ImaginationX4/HybridZero/blob/main/BFS_Bagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium



### 1. Learn Bagging

In [None]:
import gymnasium as gym
import math
import numpy as np
import torch
import torch.nn.functional as F
from dataclasses import dataclass
from typing import List, Optional, Dict
from queue import PriorityQueue
from typing import List, Tuple, Optional
from queue import PriorityQueue
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from dataclasses import dataclass, field
from functools import lru_cache

class ValueNetwork(nn.Module):
    """Map a discrete grid-state index to a scalar value in (0, 1).

    The state index is looked up in a learned 32-dimensional embedding and
    pushed through a small MLP whose final activation is a sigmoid.

    Args:
        env_size: Side length of the square grid; the embedding table holds
            ``env_size**2`` entries.
    """

    def __init__(self, env_size):
        super().__init__()
        embed_dim, hidden_dim = 32, 64
        self.embedding = nn.Embedding(env_size**2, embed_dim)
        # Layers are created in the same order as they are applied so that
        # parameter initialisation consumes the RNG deterministically.
        mlp = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, 1),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*mlp)

    def forward(self, state):
        """Return values of shape ``(batch_size, 1)`` for integer states of shape ``(batch_size,)``."""
        embedded = self.embedding(state)
        value = self.fc(embedded)
        return value

class Ensemble_Network(nn.Module):
    """Bagged ensemble of independently initialised ``ValueNetwork`` members.

    Each member is meant to be trained on its own bootstrap subset of the
    experiences; ``masks[j, i] == 1`` means experience slot ``j`` is used to
    train member ``i``.

    Args:
        env_size: Forwarded to every ``ValueNetwork``.
        n_networks: Number of ensemble members.
        mask_capacity: Number of experience slots covered by the bootstrap
            mask table (defaults to the previously hard-coded 32).

    NOTE(review): ``train_step`` reads ``self.buffer``, ``self.online_net`` and
    ``self.target_net``, none of which are created here. It looks like it was
    lifted from an agent class; the caller must attach those attributes before
    calling it.
    """

    def __init__(self, env_size, n_networks=5, mask_capacity=32):
        super().__init__()
        self.networks = nn.ModuleList([
            ValueNetwork(env_size)
            for _ in range(n_networks)
        ])
        # One bootstrap mask row per experience slot: every experience is
        # selected for a given member with probability 0.8.
        self.masks = np.random.binomial(
            n=1, p=0.8,
            size=(mask_capacity, n_networks)
        )

    def forward(self, state):
        """Return per-member predictions of shape ``(batch_size, n_networks, 1)``."""
        # state shape: (batch_size,)
        values = torch.stack([net(state) for net in self.networks])
        # (n_networks, batch_size, 1) -> (batch_size, n_networks, 1)
        return values.permute(1, 0, 2)

    def train_step(self, batch_size=32):
        """Compute the summed masked TD loss over all ensemble members.

        Expects ``self.buffer.sample(batch_size)`` to return
        ``(indices, (state, action, reward, next_state, done))`` and
        ``self.online_net`` / ``self.target_net`` to expose a ``networks``
        sequence (see the class-level review note).

        Returns:
            The sum of the per-member MSE losses (a tensor), or ``0`` when no
            member had any selected sample. No optimiser step is taken here.
        """
        indices, (state, action, reward, next_state, done) = self.buffer.sample(batch_size)
        total_loss = 0
        for i, (online, target) in enumerate(zip(self.online_net.networks, self.target_net.networks)):
            # The table stores 0/1 integers. Indexing a tensor with integers
            # would *gather* rows 0 and 1 instead of masking, so convert to a
            # boolean mask first.
            keep = np.asarray(self.masks[indices, i], dtype=bool)
            if not keep.any():
                continue
            keep = torch.as_tensor(keep, dtype=torch.bool)

            # Train this member only on its bootstrap subset.
            masked_state = state[keep]
            masked_action = action[keep]
            masked_reward = reward[keep]
            masked_next_state = next_state[keep]
            masked_done = done[keep]

            # Standard one-step Q-learning target with discount 0.99.
            current_q = online(masked_state).gather(1, masked_action)
            with torch.no_grad():
                next_q = target(masked_next_state).max(1)[0].unsqueeze(1)
                target_q = masked_reward + (1 - masked_done) * 0.99 * next_q

            loss = F.mse_loss(current_q, target_q)
            total_loss += loss
        return total_loss
def calculate_uncertainty(values):
    """Return the ensemble disagreement as the standard deviation over members.

    Args:
        values: Tensor of shape ``(batch_size, n_networks, 1)``.

    Returns:
        Tensor of shape ``(batch_size, 1)`` holding the std along the member axis.
    """
    member_axis = 1
    return values.std(dim=member_axis)

def select_action(state, ensemble_net, epsilon=0.1):
    """Score a state optimistically as ensemble mean plus ``epsilon`` times ensemble std.

    Despite the name, this returns a scalar score rather than an action index.

    Args:
        state: State tensor; a 0-d tensor is promoted to a batch of one.
        ensemble_net: Callable returning per-member predictions for ``state``.
        epsilon: Weight of the uncertainty (std) bonus.

    Returns:
        A 0-d tensor holding ``mean + epsilon * std`` over all predictions.
    """
    # Pure inference: no gradients needed.
    with torch.no_grad():
        batched = state.unsqueeze(0) if state.dim() == 0 else state

        # Drop singleton axes so the statistics run over the members.
        member_values = ensemble_net(batched).squeeze()
        optimism_bonus = member_values.std() * epsilon

        return member_values.mean() + optimism_bonus



### 2. Bagging_BFS: FrozenLake

In [None]:
# Neural network model definition
class HeuristicNetwork(nn.Module):
    """Learned heuristic: embeds a grid-state index and outputs a score in (0, 1).

    Args:
        env_size: Side length of the square grid; the embedding table holds
            ``env_size**2`` entries.
    """

    def __init__(self, env_size):
        super().__init__()
        widths = (32, 64, 32)
        self.embedding = nn.Embedding(env_size**2, widths[0])

        # Hidden Linear+ReLU pairs, built in application order.
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages.append(nn.Linear(fan_in, fan_out))
            stages.append(nn.ReLU())
        # Sigmoid keeps the heuristic value between 0 and 1.
        stages.append(nn.Linear(widths[-1], 1))
        stages.append(nn.Sigmoid())
        self.fc = nn.Sequential(*stages)

    def forward(self, state):
        """Return heuristic values of shape ``(..., 1)`` for integer state indices."""
        return self.fc(self.embedding(state))


@dataclass
class Node:
    """Search-tree node for the FrozenLake best-first search."""
    # Discrete environment state index represented by this node.
    state: int
    # Action applied in the parent's state to reach this node (None for the root).
    action_taken: Optional[int] = None
    # Parent node in the search tree (None for the root).
    parent: Optional['Node'] = None
    # Child nodes keyed by the action that leads to them.
    children: dict = field(default_factory=dict)
    # Visit counter; starts at 0.
    visit_count: int = 0
    # Value estimate attached to the node; starts at 0.0.
    value: float = 0.0

class NeuralEnhancedBFS:
    """Best-first search on deterministic FrozenLake guided by a bagged ensemble.

    An ensemble of ``HeuristicNetwork`` members scores states; the search
    expands the most promising state first (ensemble mean plus an uncertainty
    bonus) and trains the ensemble online from a replay buffer while searching.

    Args:
        env_size: Side length of the square map (``"{n}x{n}"`` is passed to
            FrozenLake as ``map_name``).
        num_simulations: Accepted for interface compatibility; currently unused.
        buffer_size: Replay-buffer capacity and number of bootstrap-mask rows.
        batch_size: Number of experiences per training step.
        n_networks: Number of ensemble members.
    """

    def __init__(self, env_size: int = 8, num_simulations: int = 100,
                 buffer_size: int = 1000, batch_size: int = 32, n_networks: int = 5):
        self.env_size = env_size
        self.env = self._create_env()
        self.goal_state = env_size**2 - 1

        # Ensemble of online networks plus one target network per member.
        self.online_nets = nn.ModuleList([
            HeuristicNetwork(env_size) for _ in range(n_networks)
        ])
        self.target_nets = nn.ModuleList([
            HeuristicNetwork(env_size) for _ in range(n_networks)
        ])
        # Start every target network from its online twin. Without this the
        # bootstrap targets come from unrelated random weights and the slow
        # soft update (tau=0.005 every 15 steps) barely ever corrects them.
        for target_net, online_net in zip(self.target_nets, self.online_nets):
            target_net.load_state_dict(online_net.state_dict())

        # One optimiser per ensemble member.
        self.optimizers = [
            optim.Adam(net.parameters(), lr=0.001)
            for net in self.online_nets
        ]

        # Bootstrap mask table: entry (j, i) == 1 lets member i train on row j.
        self.masks = np.random.binomial(
            n=1, p=0.8,
            size=(buffer_size, n_networks)
        )

        # Experience replay buffer of (state, target) pairs.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Target-network synchronisation settings.
        self.target_update_interval = 15
        self.train_step_counter = 0
        self.tau = 0.005  # soft-update coefficient

    def _create_env(self) -> "gym.Env":
        """Create the deterministic (non-slippery) FrozenLake environment."""
        return gym.make('FrozenLake-v1',
                        map_name=f"{self.env_size}x{self.env_size}",
                        is_slippery=False,
                        render_mode=None)

    def close(self) -> None:
        """Release the simulation environment held by this searcher."""
        self.env.close()

    def _get_valid_actions(self, state: int, env_size: int) -> List[int]:
        """Return the actions that do not walk off the grid from ``state``.

        Action codes: 0 = left, 1 = down, 2 = right, 3 = up.
        """
        row, col = divmod(state, env_size)
        valid_actions = []
        if col > 0:
            valid_actions.append(0)    # left
        if row < env_size - 1:
            valid_actions.append(1)    # down
        if col < env_size - 1:
            valid_actions.append(2)    # right
        if row > 0:
            valid_actions.append(3)    # up
        return valid_actions

    def _get_action_path(self, node: "Node") -> List[int]:
        """Return the action sequence leading from the root to ``node``."""
        path = []
        current = node
        while current.parent:
            path.append(current.action_taken)
            current = current.parent
        path.reverse()
        return path

    def _calculate_heuristic(self, state: int) -> float:
        """Score ``state`` as ensemble mean plus 0.2 times ensemble std."""
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            predictions = torch.stack([
                net(state_tensor) for net in self.online_nets
            ])  # shape: (n_networks, 1, 1)

            mean_pred = predictions.mean(dim=0).item()
            # The sample std of a single member is NaN, which would poison the
            # priority queue ordering; treat it as zero uncertainty instead.
            if predictions.shape[0] > 1:
                std_pred = predictions.std(dim=0).item()
            else:
                std_pred = 0.0

            return mean_pred + 0.2 * std_pred

    def _update_network(self, states, targets):
        """Run one bagged training step and return the mean loss over members.

        Args:
            states: Sequence of integer state indices.
            targets: Sequence of float regression targets, aligned with ``states``.

        Returns:
            Sum of the per-member losses divided by the ensemble size.

        NOTE(review): mask rows are drawn at random per batch rather than tied
        to a fixed buffer slot, so a given experience is not consistently
        in/out of a member's bootstrap sample across batches.
        """
        states = torch.LongTensor(states)
        targets = torch.FloatTensor(targets)

        batch_indices = np.random.randint(0, len(self.masks), size=len(states))
        total_loss = 0.0

        for i, (online_net, optimizer) in enumerate(zip(self.online_nets, self.optimizers)):
            # The table holds 0/1 integers; indexing a tensor with them would
            # gather rows 0 and 1 instead of masking, so build a boolean mask.
            keep = self.masks[batch_indices, i].astype(bool)
            if not keep.any():
                continue
            keep = torch.as_tensor(keep, dtype=torch.bool)

            masked_states = states[keep]
            masked_targets = targets[keep]

            optimizer.zero_grad()

            # squeeze(-1) keeps a 1-D batch axis even when only one sample
            # survives the mask, so predictions and targets always align.
            predictions = online_net(masked_states).squeeze(-1)
            loss = F.mse_loss(predictions, masked_targets)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Periodic soft update of the target networks.
        self.train_step_counter += 1
        if self.train_step_counter % self.target_update_interval == 0:
            self._soft_update_target_networks()

        return total_loss / len(self.online_nets)

    def _soft_update_target_networks(self):
        """Blend every target network toward its online twin with weight ``tau``."""
        with torch.no_grad():
            for target_net, online_net in zip(self.target_nets, self.online_nets):
                for target_param, online_param in zip(
                        target_net.parameters(), online_net.parameters()):
                    target_param.copy_(
                        target_param * (1.0 - self.tau) + online_param * self.tau
                    )

    def _remember(self, state, target):
        """Append one ``(state, target)`` experience to the replay buffer."""
        self.replay_buffer.append((state, target))

    def _replay(self):
        """Train on a random minibatch once the buffer holds at least one batch."""
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
        states, targets = zip(*[self.replay_buffer[i] for i in batch])

        self._update_network(states, targets)

    def _get_bootstrap_target(self, state):
        """Return the mean target-network prediction for ``state``."""
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            predictions = torch.stack([
                net(state_tensor) for net in self.target_nets
            ])
            return predictions.mean(dim=0).item()

    def bfs_search(self, start_state: int) -> Tuple[Optional[List[int]], int, "Node"]:
        """Run a heuristic-ordered search from ``start_state``.

        Returns:
            ``(path, found, root)`` where ``path`` is the action list to the
            goal (``None`` if the goal was not reached), ``found`` is 1 or 0,
            and ``root`` is the root of the search tree that was built.
        """
        visited = set()
        queue = PriorityQueue()
        root_node = Node(state=start_state)
        # PriorityQueue pops the smallest item, so negate the score. id() is a
        # unique tie-breaker that prevents Node objects from being compared.
        queue.put((-self._calculate_heuristic(start_state), id(root_node), root_node))
        found_goal = False
        goal_node = None

        while not queue.empty():
            _, _, current_node = queue.get()

            if current_node.state in visited:
                continue
            visited.add(current_node.state)

            # Collect training data: the parent learns the bootstrapped value
            # of the child that was actually reached.
            if current_node.parent is not None:
                target = self._get_bootstrap_target(current_node.state)
                self._remember(current_node.parent.state, target)

            if current_node.state == self.goal_state:
                found_goal = True
                goal_node = current_node
                # Propagate the success signal.
                self._remember(current_node.state, 1.0)
                break

            for action in self._get_valid_actions(current_node.state, self.env_size):
                # Reset, then overwrite the internal state to simulate one
                # step from an arbitrary cell.
                self.env.reset()
                self.env.unwrapped.s = current_node.state
                next_state, _, terminated, _, _ = self.env.step(action)

                if terminated and next_state != self.goal_state:
                    self._remember(next_state, 0.0)  # record the failure state
                    continue

                if next_state not in visited:
                    next_node = Node(
                        state=next_state,
                        action_taken=action,
                        parent=current_node,
                        value=self._calculate_heuristic(next_state)
                    )
                    current_node.children[action] = next_node
                    queue.put((-next_node.value, id(next_node), next_node))

            # Experience replay after every expansion.
            self._replay()

        if found_goal:
            # Back up a bonus along the successful path so that tree-based
            # action selection prefers it.
            current = goal_node
            while current.parent:
                current.value += 1.0
                current = current.parent

            return self._get_action_path(goal_node), 1, root_node
        return None, 0, root_node

    def get_best_action_from_tree(self, root_node: "Node") -> Optional[int]:
        """Pick the root action whose subtree evaluates highest.

        Falls back to a one-step heuristic lookahead when the root has no
        children (for example when every neighbour is a hole), instead of
        returning ``None``.
        """
        best_action = None
        best_value = float('-inf')

        for action, child in root_node.children.items():
            value = self._evaluate_subtree(child)
            if value > best_value:
                best_value = value
                best_action = action

        if best_action is None:
            best_action = self._get_best_heuristic_action(root_node.state)
        return best_action

    def _get_best_heuristic_action(self, state: int) -> Optional[int]:
        """Return the valid action whose simulated successor scores highest."""
        valid_actions = self._get_valid_actions(state, self.env_size)
        if not valid_actions:
            return None
        return max(
            valid_actions,
            key=lambda a: self._calculate_heuristic(self._simulate_step(state, a)),
        )

    def _evaluate_subtree(self, node: "Node") -> float:
        """Recursively score a subtree: own value plus half of the best child score."""
        # Reaching the goal dominates everything else.
        if node.state == self.goal_state:
            return float('inf')

        heuristic_value = node.value
        if node.children:
            children_value = max(
                self._evaluate_subtree(child) for child in node.children.values()
            )
        else:
            children_value = 0
        return heuristic_value + 0.5 * children_value

    def search(self, root_state: int) -> Optional[int]:
        """Search from ``root_state`` and return the action to take next."""
        _, _, root_node = self.bfs_search(root_state)
        return self.get_best_action_from_tree(root_node)

    def _simulate_step(self, state, action):
        """Return the successor of ``state`` under ``action`` using the internal env."""
        self.env.reset()
        self.env.unwrapped.s = state
        next_state, _, _, _, _ = self.env.step(action)
        return next_state

In [None]:
import gymnasium as gym
import torch
import numpy as np
from collections import defaultdict
from queue import PriorityQueue
from typing import List, Tuple, Dict, Optional, Set
from dataclasses import dataclass

def test_enhanced_bfs():
    """Run the neural best-first search on 8x8 deterministic FrozenLake and log each step.

    Plays ``num_episodes`` episodes (at most 100 steps each), asking the
    searcher for an action at every step and printing the transition, then
    reports the average episode reward.
    """
    # Environment the agent acts in, plus the search-based action selector.
    env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=False)
    bfs = NeuralEnhancedBFS()

    num_episodes, total_reward = 1, 0
    max_steps = 100

    print("\n开始测试Enhanced BFS...")

    for episode in range(num_episodes):
        state, _ = env.reset()
        steps, episode_reward, done = 0, 0, False

        print(f"\n回合 {episode + 1}:")
        print(f"起始状态: {state}")

        while steps < max_steps and not done:
            # Ask the search for the next action.
            action = bfs.search(state)
            print(f"Steps {steps}: 在状态 {state} 选择动作 {action}")

            # Apply it to the real environment.
            state, reward, terminated, truncated, _ = env.step(action)
            steps += 1
            episode_reward += reward
            done = terminated or truncated

            print(f"-> 新状态: {state}, 奖励: {reward}")

            if not done:
                continue
            # Episode finished: a positive reward means the goal was reached.
            print("成功到达目标！" if reward > 0 else "失败（掉入陷阱或超时）")

        total_reward += episode_reward
        print(f"回合 {episode + 1} 结束 - 总步数: {steps}, 总奖励: {episode_reward}")

    print(f"\n测试完成 - 平均奖励: {total_reward/num_episodes}")

# Run the FrozenLake demo only when this code is executed directly.
if __name__ == "__main__":
    test_enhanced_bfs()


开始测试Enhanced BFS...

回合 1:
起始状态: 0
Steps 0: 在状态 0 选择动作 1
-> 新状态: 8, 奖励: 0.0
Steps 1: 在状态 8 选择动作 2
-> 新状态: 9, 奖励: 0.0
Steps 2: 在状态 9 选择动作 2
-> 新状态: 10, 奖励: 0.0
Steps 3: 在状态 10 选择动作 2
-> 新状态: 11, 奖励: 0.0
Steps 4: 在状态 11 选择动作 2
-> 新状态: 12, 奖励: 0.0
Steps 5: 在状态 12 选择动作 1
-> 新状态: 20, 奖励: 0.0
Steps 6: 在状态 20 选择动作 1
-> 新状态: 28, 奖励: 0.0
Steps 7: 在状态 28 选择动作 1
-> 新状态: 36, 奖励: 0.0
Steps 8: 在状态 36 选择动作 2
-> 新状态: 37, 奖励: 0.0
Steps 9: 在状态 37 选择动作 2
-> 新状态: 38, 奖励: 0.0
Steps 10: 在状态 38 选择动作 2
-> 新状态: 39, 奖励: 0.0
Steps 11: 在状态 39 选择动作 1
-> 新状态: 47, 奖励: 0.0
Steps 12: 在状态 47 选择动作 1
-> 新状态: 55, 奖励: 0.0
Steps 13: 在状态 55 选择动作 1
-> 新状态: 63, 奖励: 1.0
成功到达目标！
回合 1 结束 - 总步数: 14, 总奖励: 1.0

测试完成 - 平均奖励: 1.0


### 3. Bagging_BFS: CartPole


In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from queue import PriorityQueue
from collections import deque
from dataclasses import dataclass, field
from typing import Optional, List, Tuple

class ValueNetwork(nn.Module):
    """State-value MLP for 4-dimensional observations.

    Two hidden blocks of Linear(256) + LayerNorm + ReLU feed a single linear
    output unit (no output activation).
    """

    def __init__(self):
        super().__init__()
        obs_dim, hidden = 4, 256
        # Modules are created in application order so parameter
        # initialisation consumes the RNG deterministically.
        trunk = [
            nn.Linear(obs_dim, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
        ]
        head = nn.Linear(hidden, 1)
        self.layers = nn.Sequential(*trunk, head)
        # Re-draw the output weights uniformly from [-0.1, 0.1].
        nn.init.uniform_(head.weight, -0.1, 0.1)

    def forward(self, x):
        """Return value estimates of shape ``(..., 1)`` for inputs of shape ``(..., 4)``."""
        value = self.layers(x)
        return value

@dataclass
class Node:
    """Search-tree node holding one environment state and its search statistics.

    ``depth`` is derived automatically: a node created with a parent sits one
    level below that parent, otherwise the supplied/default depth is kept.
    """
    state: np.ndarray
    action_taken: Optional[int] = None
    parent: Optional["Node"] = None
    children: dict = field(default_factory=dict)
    value: float = 0.0
    depth: int = 0
    done: bool = False
    reward: float = 0.0
    visit_count: int = 1

    def __post_init__(self):
        # Root nodes (no parent) keep whatever depth they were given.
        parent_node = self.parent
        if parent_node is None:
            return
        self.depth = 1 + parent_node.depth

class BaggingBFS:
    """Best-first look-ahead search for CartPole guided by a bagged value ensemble.

    Each call to ``bfs_search`` builds a small search tree from the current
    observation, expanding the most promising node first (value plus an
    uncertainty bonus from ensemble disagreement), while training the ensemble
    online from a replay buffer.

    Args:
        num_simulations: Maximum number of nodes popped per search.
        buffer_size: Replay-buffer capacity and number of bootstrap-mask rows.
        batch_size: Number of experiences per training step.
        gamma: Discount factor for the one-step target.
        exploration_weight: Stored but not used by the code in this class.
        n_networks: Number of ensemble members.
    """

    def __init__(self,
                 num_simulations: int = 40,
                 buffer_size: int = 1000,
                 batch_size: int = 64,
                 gamma: float = 0.99,
                 exploration_weight: float = 1.5,
                 n_networks: int = 5):
        self.env = gym.make('CartPole-v1')
        self.num_simulations = num_simulations
        self.gamma = gamma
        self.exploration_weight = exploration_weight
        self.initial_epsilon = 0.2

        # Ensemble of online networks plus one target network per member.
        self.online_nets = nn.ModuleList([
            ValueNetwork() for _ in range(n_networks)
        ])
        self.target_nets = nn.ModuleList([
            ValueNetwork() for _ in range(n_networks)
        ])
        # Start each target network from its online twin instead of from
        # unrelated random weights.
        for target_net, online_net in zip(self.target_nets, self.online_nets):
            target_net.load_state_dict(online_net.state_dict())

        # One optimiser per ensemble member.
        self.optimizers = [
            optim.Adam(net.parameters(), lr=0.001)
            for net in self.online_nets
        ]

        # Bootstrap mask table used for bagging.
        self.masks = np.random.binomial(
            n=1, p=0.8,
            size=(buffer_size, n_networks)
        )

        # Experience replay buffer of (state, target_value) pairs.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Target-network update settings.
        self.train_step_counter = 0
        self.target_update_interval = 15
        self.tau = 0.005  # soft-update coefficient

    def close(self) -> None:
        """Release the environment held by this agent."""
        self.env.close()

    def _calculate_value(self, state: np.ndarray) -> Tuple[float, float]:
        """Return ``(mean, std)`` of the online ensemble's predictions for ``state``."""
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state)
            predictions = torch.stack([
                net(state_tensor) for net in self.online_nets
            ])

            mean_pred = predictions.mean().item()
            # The sample std of a single prediction is NaN; report zero
            # uncertainty instead so priorities stay comparable.
            std_pred = predictions.std().item() if predictions.numel() > 1 else 0.0

            return mean_pred, std_pred

    def bfs_search(self, current_state: np.ndarray) -> Tuple[int, "Node"]:
        """Run one best-first search and return ``(best_action, root_node)``.

        Up to ``num_simulations`` nodes are popped from the priority queue.
        Every popped non-terminal node is expanded with all actions; every
        popped non-root node contributes one training experience for its
        parent state.
        """
        root = Node(current_state)
        queue = PriorityQueue()

        # PriorityQueue pops the smallest item, so scores are negated. The
        # middle element is a unique tie-breaker that keeps Node objects from
        # being compared.
        initial_value, initial_std = self._calculate_value(current_state)
        queue.put((-(initial_value + 0.2 * initial_std), 0, root))

        temp_env = gym.make('CartPole-v1')
        try:
            for _ in range(self.num_simulations):
                if queue.empty():
                    break

                _, _, current_node = queue.get()

                # A terminal state has no successors worth simulating.
                if not current_node.done:
                    self._expand(temp_env, current_node, queue)

                current_node.visit_count += 1

                # One-step target for the parent state. The terminal flag must
                # be the popped node's own, not the flag left over from the
                # last child simulated during expansion.
                if current_node.parent is not None:
                    bootstrap = 0.0 if current_node.done else self.gamma * current_node.value
                    self._remember(current_node.parent.state,
                                   current_node.reward + bootstrap)

                if len(self.replay_buffer) >= self.batch_size:
                    self._replay()
        finally:
            # The scratch environment exists only for this search.
            temp_env.close()

        if not root.children:
            # Only reachable when no simulation ran (num_simulations <= 0).
            return self.env.action_space.sample(), root

        best_action = max(
            root.children,
            key=lambda a: root.children[a].value
            + 0.2 * self._calculate_value(root.children[a].state)[1]
        )

        return best_action, root

    def _expand(self, sim_env, node: "Node", queue: PriorityQueue) -> None:
        """Simulate every action from ``node``, attach the children and enqueue them."""
        for action in range(self.env.action_space.n):
            # Reset, then overwrite the physics state to step from ``node``.
            sim_env.reset()
            sim_env.unwrapped.state = node.state.copy()
            next_state, reward, terminated, _, _ = sim_env.step(action)

            child = Node(
                state=next_state,
                action_taken=action,
                parent=node,
                reward=reward,
                done=terminated
            )

            next_value, next_std = self._calculate_value(next_state)

            # Terminal children take the immediate reward as their value.
            child.value = reward if terminated else next_value

            node.children[action] = child
            self._backpropagate(child, next_value)

            # Priority: node value plus an ensemble-uncertainty bonus.
            priority_score = child.value + 0.2 * next_std
            queue.put((-priority_score, id(child), child))

    def _remember(self, state: np.ndarray, target_value: float):
        """Append one ``(state, target_value)`` experience to the replay buffer."""
        self.replay_buffer.append((state, target_value))

    def _backpropagate(self, node: "Node", value: float):
        """Raise the value of ``node`` and all ancestors to at least ``value``; bump visit counts."""
        while node is not None:
            node.value = max(node.value, value)
            node.visit_count += 1
            node = node.parent

    def _replay(self):
        """Train every ensemble member on its bootstrap subset of a random minibatch.

        NOTE(review): mask rows are drawn at random per batch rather than tied
        to a fixed buffer slot, so an experience is not consistently in/out of
        a member's bootstrap sample across batches.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
        states, target_values = zip(*[self.replay_buffer[i] for i in batch])

        states = torch.FloatTensor(np.array(states))
        target_values = torch.FloatTensor(target_values)

        batch_indices = np.random.randint(0, len(self.masks), size=len(states))

        for i, (online_net, optimizer) in enumerate(zip(self.online_nets, self.optimizers)):
            # The table holds 0/1 integers; indexing a tensor with them would
            # gather rows 0 and 1 instead of masking, so build a boolean mask.
            keep = self.masks[batch_indices, i].astype(bool)
            if not keep.any():
                continue
            keep = torch.as_tensor(keep, dtype=torch.bool)

            masked_states = states[keep]
            masked_targets = target_values[keep]

            optimizer.zero_grad()
            # squeeze(-1) keeps the batch axis even for a single sample.
            predictions = online_net(masked_states).squeeze(-1)
            loss = nn.MSELoss()(predictions, masked_targets)
            loss.backward()
            optimizer.step()

        # Periodic soft update of the target networks.
        self.train_step_counter += 1
        if self.train_step_counter % self.target_update_interval == 0:
            self._soft_update_target_networks()

    def _soft_update_target_networks(self):
        """Blend every target network toward its online twin with weight ``tau``."""
        with torch.no_grad():
            for target_net, online_net in zip(self.target_nets, self.online_nets):
                for target_param, online_param in zip(
                        target_net.parameters(), online_net.parameters()):
                    target_param.copy_(
                        target_param * (1.0 - self.tau) + online_param * self.tau
                    )

def train_agent(num_episodes: int = 2, num_simulations: int = 100):
    """Play CartPole episodes with a ``BaggingBFS`` agent, printing the running reward.

    The agent trains its ensemble as a side effect of searching, so simply
    acting in the environment also trains it.

    Args:
        num_episodes: Number of episodes to play.
        num_simulations: Search budget handed to ``BaggingBFS``.
    """
    env = gym.make('CartPole-v1')
    agent = BaggingBFS(num_simulations=num_simulations)

    try:
        for episode in range(num_episodes):
            state = env.reset()[0]
            total_reward = 0
            done = False

            while not done:
                action, _ = agent.bfs_search(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # An episode ends on failure (terminated) *or* on the time
                # limit (truncated); ignoring the latter keeps stepping an
                # environment that has already finished.
                done = terminated or truncated
                total_reward += reward
                state = next_state
                print(f"Episode {episode+1}, Total Reward: {total_reward}")

            print(f"Episode {episode+1}, Total Reward: {total_reward}")
    finally:
        # Release both the acting environment and the agent's own one.
        env.close()
        agent.env.close()

# Run the CartPole demo only when this code is executed directly.
if __name__ == "__main__":
    train_agent()

Episode 1, Total Reward: 1.0
Episode 1, Total Reward: 2.0
Episode 1, Total Reward: 3.0
Episode 1, Total Reward: 4.0
Episode 1, Total Reward: 5.0
Episode 1, Total Reward: 6.0
Episode 1, Total Reward: 7.0
Episode 1, Total Reward: 8.0
Episode 1, Total Reward: 9.0
Episode 1, Total Reward: 10.0
Episode 1, Total Reward: 11.0
Episode 1, Total Reward: 12.0
Episode 1, Total Reward: 13.0
Episode 1, Total Reward: 14.0
Episode 1, Total Reward: 15.0
Episode 1, Total Reward: 16.0
Episode 1, Total Reward: 17.0
Episode 1, Total Reward: 18.0
Episode 1, Total Reward: 19.0
Episode 1, Total Reward: 20.0
Episode 1, Total Reward: 21.0
Episode 1, Total Reward: 22.0
Episode 1, Total Reward: 22.0
Episode 2, Total Reward: 1.0
Episode 2, Total Reward: 2.0
Episode 2, Total Reward: 3.0
Episode 2, Total Reward: 4.0
Episode 2, Total Reward: 5.0
Episode 2, Total Reward: 6.0
Episode 2, Total Reward: 7.0
Episode 2, Total Reward: 8.0
Episode 2, Total Reward: 9.0
Episode 2, Total Reward: 10.0
Episode 2, Total Reward: 11.