<a href="https://colab.research.google.com/github/ImaginationX4/HybridZero/blob/main/RDN_BFS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium



### 1. Learn RND

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import matplotlib.pyplot as plt

# Hyperparameters
GAMMA = 0.99           # discount factor
EPSILON_START = 1.0    # initial ε (exploration rate)
EPSILON_END = 0.01     # final ε
EPSILON_DECAY = 500    # ε decay scale in steps (used as exp(-step / EPSILON_DECAY))
BATCH_SIZE = 32        # mini-batch size
MEMORY_SIZE = 10000    # replay buffer capacity
TARGET_UPDATE = 10     # target-network update frequency
LEARNING_RATE = 0.001  # learning rate
EPISODES = 500         # number of training episodes
RND_SCALE = 1.0        # scaling factor for the RND intrinsic reward

# Device: use CUDA when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DQN 网络
class DQN(nn.Module):
    """Fully connected Q-network with two 64-unit ReLU hidden layers.

    Maps state vectors of width ``state_size`` to ``action_size`` outputs
    (one Q-value per action).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden = 64
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, action_size)

    def forward(self, x):
        """Return the output of the linear head applied after both hidden layers."""
        for hidden_layer in (self.fc1, self.fc2):
            x = torch.relu(hidden_layer(x))
        return self.fc3(x)

# RND 网络
class RNDNetwork(nn.Module):
    """MLP used for Random Network Distillation.

    Embeds a state vector of width ``state_size`` into ``output_size``
    features through two 64-unit ReLU hidden layers.
    """

    def __init__(self, state_size, output_size=32):
        super().__init__()
        hidden = 64
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, output_size)

    def forward(self, x):
        """Return the feature vector produced for ``x``."""
        first = torch.relu(self.fc1(x))
        second = torch.relu(self.fc2(first))
        return self.fc3(second)

# 智能体
class DQNAgent:
    """DQN agent paired with a Random Network Distillation (RND) novelty model.

    Holds a policy/target Q-network pair trained from a replay buffer, plus a
    fixed random network and a trainable predictor. The predictor's squared
    error against the random network is exposed as an intrinsic reward.
    """

    def __init__(self, state_size, action_size):
        """Build the networks, optimizers and replay buffer.

        Args:
            state_size: Width of the state vectors fed to the networks.
            action_size: Number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.epsilon = EPSILON_START

        # DQN networks: the target net starts as an exact copy of the policy net.
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)

        # RND networks: the random net is a fixed target, so freeze it
        # explicitly instead of relying on it merely not having an optimizer.
        self.random_net = RNDNetwork(state_size).to(device)
        self.random_net.eval()
        for param in self.random_net.parameters():
            param.requires_grad_(False)
        self.predictor_net = RNDNetwork(state_size).to(device)  # trainable predictor
        self.rnd_optimizer = optim.Adam(self.predictor_net.parameters(), lr=LEARNING_RATE)

    def select_action(self, state, step):
        """Pick an action ε-greedily; ε decays exponentially with ``step``.

        Args:
            state: Current state vector.
            step: Step counter driving the ε schedule.

        Returns:
            The chosen action index.
        """
        self.epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * np.exp(-step / EPSILON_DECAY)
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        state = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        with torch.no_grad():
            q_values = self.policy_net(state)
        return q_values.argmax().item()

    def compute_rnd_reward(self, next_state):
        """Return the scaled RND prediction error for ``next_state``.

        The value is only read out as a float, so both forward passes run
        without autograd bookkeeping.
        """
        next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)
        with torch.no_grad():
            target = self.random_net(next_state)
            prediction = self.predictor_net(next_state)
            rnd_reward = (target - prediction).pow(2).mean()
        return rnd_reward.item() * RND_SCALE

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        """Run one DQN update and one RND predictor update on a sampled batch.

        Does nothing until the buffer holds at least ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)

        def to_tensor(values, np_dtype):
            # Stack into one ndarray first; building a tensor directly from a
            # sequence of ndarrays takes PyTorch's slow element-wise path.
            return torch.as_tensor(np.asarray(values, dtype=np_dtype), device=device)

        states = to_tensor(states, np.float32)
        actions = to_tensor(actions, np.int64)
        rewards = to_tensor(rewards, np.float32)
        next_states = to_tensor(next_states, np.float32)
        dones = to_tensor(dones, np.float32)

        # DQN update: one-step TD target from the target network (no gradients).
        q_values = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1)[0]
            targets = rewards + (1 - dones) * GAMMA * next_q_values

        dqn_loss = nn.MSELoss()(q_values, targets)
        self.optimizer.zero_grad()
        dqn_loss.backward()
        self.optimizer.step()

        # RND update: fit the predictor to the fixed random network's features.
        predictor_outputs = self.predictor_net(next_states)
        with torch.no_grad():
            random_outputs = self.random_net(next_states)
        rnd_loss = (predictor_outputs - random_outputs).pow(2).mean()
        self.rnd_optimizer.zero_grad()
        rnd_loss.backward()
        self.rnd_optimizer.step()

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())



## 2. RND with BFS

### 2.1 FROZEN-LAKE

In [None]:
from typing import List, Tuple, Optional
from queue import PriorityQueue
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from dataclasses import dataclass, field
from functools import lru_cache

# 神经网络模型定义
class HeuristicNetwork(nn.Module):
    """Heuristic estimator over the ``env_size**2`` discrete grid states.

    Each state index is embedded into 32 dimensions and passed through an
    MLP ending in a sigmoid, so the output lies between 0 and 1.
    """

    def __init__(self, env_size):
        super().__init__()
        num_states = env_size ** 2
        embed_dim = 32
        self.embedding = nn.Embedding(num_states, embed_dim)
        head = [
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),  # squash the heuristic value into the 0-1 range
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, state):
        """Return the heuristic for a tensor of integer state indices."""
        embedded = self.embedding(state)
        return self.fc(embedded)

# RND 网络定义
class RNDNetwork(nn.Module):
    """RND feature network over the ``env_size**2`` discrete grid states.

    A state index is embedded into 32 dimensions and mapped by a small MLP
    to ``output_size`` features.
    """

    def __init__(self, env_size, output_size=32):
        super().__init__()
        embed_dim = 32
        self.embedding = nn.Embedding(env_size ** 2, embed_dim)
        mlp = [
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_size),
        ]
        self.fc = nn.Sequential(*mlp)

    def forward(self, state):
        """Return the feature vectors for a tensor of integer state indices."""
        return self.fc(self.embedding(state))

@dataclass
class Node:
    """Search-tree node holding an integer state plus links to parent and children."""
    state: int  # discrete state index this node represents
    action_taken: Optional[int] = None  # action that led from the parent to this node (None for a root)
    parent: Optional['Node'] = None  # predecessor in the search tree (None for a root)
    children: dict = field(default_factory=dict)  # maps action -> child Node; fresh dict per instance
    visit_count: int = 0  # visit counter, starts at 0
    value: float = 0.0  # score assigned to this node by the search

class NeuralEnhancedBFSwithRND:
    def __init__(self, env_size: int = 8, num_simulations: int = 100,
                 buffer_size: int = 10000, batch_size: int = 32, rnd_scale: float = 0.40):
        """Set up the environment, networks, optimizers and replay buffer.

        Args:
            env_size: Side length of the square FrozenLake map.
            num_simulations: Stored on the instance; the search code visible
                here does not consume it.
            buffer_size: Maximum number of entries kept in the replay buffer.
            batch_size: Number of entries sampled per training step.
            rnd_scale: Scaling factor stored for the RND intrinsic reward.
        """
        self.env_size = env_size
        self.env = self._create_env()
        self.goal_state = env_size**2 - 1
        self.num_simulations = num_simulations

        # Heuristic network and its target copy.
        self.model = HeuristicNetwork(env_size)
        self.target_model = HeuristicNetwork(env_size)
        # Start the target as an exact copy of the model; otherwise bootstrap
        # targets come from an unrelated random network until the first
        # periodic sync.
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.MSELoss()

        # RND networks: fixed random target and trainable predictor.
        self.random_net = RNDNetwork(env_size).eval()
        for param in self.random_net.parameters():
            param.requires_grad_(False)  # the random target is never trained
        self.predictor_net = RNDNetwork(env_size)
        self.rnd_optimizer = optim.Adam(self.predictor_net.parameters(), lr=0.001)
        self.rnd_scale = rnd_scale

        # Experience replay buffer.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Target-network sync interval, counted in training steps.
        self.target_update_interval = 15
        self.train_step_counter = 0

        # Running statistics intended for normalising the RND reward
        # (Welford-style mean / variance accumulators).
        self.rnd_mean = 0.0
        self.rnd_std = 1.0
        self.rnd_count = 0
        self.rnd_m2 = 0.0

    def _create_env(self) -> gym.Env:
        """Create a non-slippery FrozenLake-v1 env on an env_size x env_size map."""
        side = self.env_size
        env_kwargs = {
            "map_name": f"{side}x{side}",
            "is_slippery": False,
            "render_mode": None,
        }
        return gym.make('FrozenLake-v1', **env_kwargs)

    def _get_valid_actions(self, state: int, env_size: int) -> List[int]:
        row, col = state // env_size, state % env_size
        valid_actions = []
        if col > 0: valid_actions.append(0)    # 左
        if row < env_size - 1: valid_actions.append(1)    # 下
        if col < env_size - 1: valid_actions.append(2)    # 右
        if row > 0: valid_actions.append(3)    # 上
        return valid_actions

    def _get_action_path(self, node: Node) -> List[int]:
        path = []
        current = node
        while current.parent:
            path.append(current.action_taken)
            current = current.parent
        return list(reversed(path))

    def _calculate_heuristic(self, state: int) -> float:
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            return self.model(state_tensor).item()

    def _calculate_rnd_reward(self, state: int) -> float:
        """计算 RND 内部奖励"""
        state_tensor = torch.LongTensor([state])
        with torch.no_grad():
            target = self.random_net(state_tensor)
            prediction = self.predictor_net(state_tensor)
        raw_rnd = (target - prediction).pow(2).mean().item()

        '''# Welford 在线算法更新均值和标准差
        self.rnd_count += 1
        delta = raw_rnd - self.rnd_mean
        self.rnd_mean += delta / self.rnd_count
        delta2 = raw_rnd - self.rnd_mean
        self.rnd_m2 += delta * delta2
        if self.rnd_count > 1:
            self.rnd_std = np.sqrt(self.rnd_m2 / (self.rnd_count - 1))

        # 归一化并缩放
        normalized_rnd = (raw_rnd - self.rnd_mean) / (self.rnd_std + 1e-5)  # 避免除零
        scaled_rnd = self.rnd_scale * normalized_rnd

        # 裁剪到合理范围'''
        return raw_rnd * 0.2 #根据论文建议，限制最大值

    def _update_network(self, states, targets):
        states = torch.LongTensor(states)
        targets = torch.FloatTensor(targets)

        predictions = self.model(states).squeeze()
        loss = self.loss_fn(predictions, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.train_step_counter % self.target_update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

    def _update_rnd_network(self, states):
        """更新 RND 的预测网络"""
        states = torch.LongTensor(states)
        predictions = self.predictor_net(states)
        with torch.no_grad():
            targets = self.random_net(states)
        rnd_loss = (predictions - targets).pow(2).mean()

        self.rnd_optimizer.zero_grad()
        rnd_loss.backward()
        self.rnd_optimizer.step()

    def _remember(self, state, target, next_state=None):
        """存储经验，包括 RND 的状态"""
        self.replay_buffer.append((state, target, next_state if next_state is not None else state))

    def _replay(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
        states, targets, next_states = zip(*[self.replay_buffer[i] for i in batch])

        # 更新启发式网络
        self.train_step_counter += 1
        self._update_network(states, targets)

        # 更新 RND 网络
        self._update_rnd_network(next_states)

    def _get_bootstrap_target(self, state):
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            return self.target_model(state_tensor).item()

    def bfs_search(self, start_state: int) -> Tuple[Optional[List[int]], int, Node]:
        """Best-first search from ``start_state`` guided by heuristic + RND bonus.

        Nodes are expanded in order of decreasing priority. While searching,
        training samples are pushed to the replay buffer and one replay step
        is run after each expansion.

        Args:
            start_state: State index to start the search from.

        Returns:
            A tuple ``(path, found, root)``: ``path`` is the list of actions
            from the root to the goal when the goal was reached, else
            ``None``; ``found`` is 1 or 0 accordingly; ``root`` is the root
            of the search tree that was built.
        """
        visited = set()
        queue = PriorityQueue()
        root_node = Node(state=start_state)
        # Priorities are negated because PriorityQueue pops the smallest
        # entry first; id(node) breaks ties before Node objects get compared.
        queue.put((-self._calculate_heuristic(start_state), id(root_node), root_node))
        goal_node = None

        while not queue.empty():
            _, _, current_node = queue.get()

            if current_node.state in visited:
                continue
            visited.add(current_node.state)

            if current_node.state == self.goal_state:
                goal_node = current_node
                self._remember(current_node.state, 1.0, current_node.state)
                break

            # Collect a training sample for the parent state: bootstrapped
            # value of this state plus its RND bonus, capped at 1.
            if current_node.parent is not None:
                target = self._get_bootstrap_target(current_node.state)
                rnd_reward = self._calculate_rnd_reward(current_node.state)
                total_target = min(target + rnd_reward, 1.0)
                self._remember(current_node.parent.state, total_target, current_node.state)

            for action in self._get_valid_actions(current_node.state, self.env_size):
                # Force the simulator into the node's state before stepping.
                self.env.reset()
                self.env.unwrapped.s = current_node.state
                next_state, _, terminated, _, _ = self.env.step(action)

                # Terminal non-goal states get a zero target and are not expanded.
                if terminated and next_state != self.goal_state:
                    self._remember(next_state, 0.0, next_state)
                    continue

                if next_state not in visited:
                    next_node = Node(
                        state=next_state,
                        action_taken=action,
                        parent=current_node,
                        value=self._calculate_heuristic(next_state) + self._calculate_rnd_reward(next_state)
                    )
                    current_node.children[action] = next_node
                    priority = -next_node.value
                    queue.put((priority, id(next_node), next_node))

            self._replay()

        if goal_node is None:
            return None, 0, root_node

        # Boost every node on the successful path (excluding the root) so the
        # tree evaluation favours it.
        current = goal_node
        while current.parent:
            current.value += 1.0
            current = current.parent

        # Report the path and success flag instead of always returning
        # (None, 0, root), which hid a successful search from callers.
        return self._get_action_path(goal_node), 1, root_node

    def get_best_action_from_tree(self, root_node: Node) -> int:
        best_action = None
        best_value = float('-inf')

        for action, child in root_node.children.items():
            value = self._evaluate_subtree(child)
            if value > best_value:
                best_value = value
                best_action = action

        return best_action #if best_action is not None else random.choice(self._get_valid_actions(root_node.state, self.env_size))

    def _evaluate_subtree(self, node: Node) -> float:
        if node.state == self.goal_state:
            return float('inf')

        heuristic_value = self._calculate_heuristic(node.state)
        rnd_reward = self._calculate_rnd_reward(node.state)
        children_value = max([self._evaluate_subtree(child) for child in node.children.values()]) if node.children else 0
        return node.value + 0.5 * children_value

    def search(self, root_state: int) -> int:
        _, _, root_node = self.bfs_search(root_state)
        best_action = self.get_best_action_from_tree(root_node)
        return best_action

    def _simulate_step(self, state, action):
        self.env.reset()
        self.env.unwrapped.s = state
        next_state, _, _, _, _ = self.env.step(action)
        return next_state


In [None]:
import gymnasium as gym
import torch
import numpy as np
from collections import defaultdict
from queue import PriorityQueue
from typing import List, Tuple, Dict, Optional, Set
from dataclasses import dataclass

def test_enhanced_bfs():
    """Run NeuralEnhancedBFSwithRND on deterministic 8x8 FrozenLake and print a trace.

    Each step asks the planner for an action, applies it to the real
    environment and prints the transition; an episode stops on termination,
    truncation or after 100 steps.
    """
    # Environment and planner (the planner is built with its defaults).
    env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=False)
    bfs = NeuralEnhancedBFSwithRND()

    num_episodes = 1
    max_steps = 100
    total_reward = 0

    print("\n开始测试NeuralEnhancedBFSwithRND...")

    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_reward = 0
        steps = 0
        done = False

        print(f"\n回合 {episode + 1}:")
        print(f"起始状态: {state}")

        while not done and steps < max_steps:
            # Let the search pick the next action.
            action = bfs.search(state)
            print(f"Steps {steps}: 在状态 {state} 选择动作 {action}")

            # Apply it to the real environment.
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            steps += 1

            print(f"-> 新状态: {state}, 奖励: {reward}")

            if not done:
                continue
            print("成功到达目标！" if reward > 0 else "失败（掉入陷阱或超时）")

        total_reward += episode_reward
        print(f"回合 {episode + 1} 结束 - 总步数: {steps}, 总奖励: {episode_reward}")

    print(f"\n测试完成 - 平均奖励: {total_reward/num_episodes}")

# Run the FrozenLake demo when this cell/module is executed directly.
if __name__ == "__main__":
    test_enhanced_bfs()


开始测试NeuralEnhancedBFSwithRND...

回合 1:
起始状态: 0
Steps 0: 在状态 0 选择动作 2
-> 新状态: 1, 奖励: 0.0
Steps 1: 在状态 1 选择动作 1
-> 新状态: 9, 奖励: 0.0
Steps 2: 在状态 9 选择动作 1
-> 新状态: 17, 奖励: 0.0
Steps 3: 在状态 17 选择动作 2
-> 新状态: 18, 奖励: 0.0
Steps 4: 在状态 18 选择动作 1
-> 新状态: 26, 奖励: 0.0
Steps 5: 在状态 26 选择动作 2
-> 新状态: 27, 奖励: 0.0
Steps 6: 在状态 27 选择动作 2
-> 新状态: 28, 奖励: 0.0
Steps 7: 在状态 28 选择动作 3
-> 新状态: 20, 奖励: 0.0
Steps 8: 在状态 20 选择动作 2
-> 新状态: 21, 奖励: 0.0
Steps 9: 在状态 21 选择动作 2
-> 新状态: 22, 奖励: 0.0
Steps 10: 在状态 22 选择动作 1
-> 新状态: 30, 奖励: 0.0
Steps 11: 在状态 30 选择动作 2
-> 新状态: 31, 奖励: 0.0
Steps 12: 在状态 31 选择动作 1
-> 新状态: 39, 奖励: 0.0
Steps 13: 在状态 39 选择动作 1
-> 新状态: 47, 奖励: 0.0
Steps 14: 在状态 47 选择动作 1
-> 新状态: 55, 奖励: 0.0
Steps 15: 在状态 55 选择动作 1
-> 新状态: 63, 奖励: 1.0
成功到达目标！
回合 1 结束 - 总步数: 16, 总奖励: 1.0

测试完成 - 平均奖励: 1.0


### 2.2 CARTPOLE

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from queue import PriorityQueue
from collections import deque
from dataclasses import dataclass, field
from typing import Optional, List, Tuple

class ValueNetwork(nn.Module):
    """State-value MLP for 4-dimensional inputs.

    Two 256-unit Linear + LayerNorm + ReLU stages followed by a scalar
    linear output layer.
    """

    def __init__(self):
        super().__init__()
        width = 256
        self.layers = nn.Sequential(
            nn.Linear(4, width),
            nn.LayerNorm(width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.LayerNorm(width),
            nn.ReLU(),
            nn.Linear(width, 1),
        )
        # Re-draw the output layer's weights from a small range around zero.
        output_layer = self.layers[-1]
        nn.init.uniform_(output_layer.weight, -0.1, 0.1)

    def forward(self, x):
        """Return the network output for ``x``."""
        return self.layers(x)

class RNDNetwork(nn.Module):
    """Two-layer MLP mapping a state vector to RND features."""

    def __init__(self, state_size=4, output_size=32):
        super().__init__()
        hidden = 64
        mlp = [
            nn.Linear(state_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, output_size),
        ]
        self.fc = nn.Sequential(*mlp)

    def forward(self, state):
        """Return the feature vector(s) for ``state``."""
        return self.fc(state)
@dataclass
class Node:
    """Search-tree node for the CartPole search.

    ``depth`` is derived automatically: a node created with a parent sits
    one level below that parent, while a parentless node keeps the value
    passed in (0 by default).
    """
    state: np.ndarray
    action_taken: Optional[int] = None
    parent: Optional['Node'] = None
    children: dict = field(default_factory=dict)
    value: float = 0.0
    depth: int = 0
    done: bool = False
    reward: float = 0.0
    visit_count: int = 1

    def __post_init__(self):
        """Derive ``depth`` from the parent, if there is one."""
        parent = self.parent
        if parent is None:
            return
        self.depth = parent.depth + 1

class BFS:
    def __init__(self,
                 num_simulations: int = 40,
                 buffer_size: int = 1000,
                 batch_size: int = 64,
                 gamma: float = 0.99,
                 rnd_scale: float = 0.5):
        """Set up the CartPole environment, networks, optimizers and replay buffer.

        Args:
            num_simulations: Maximum number of node expansions per search.
            buffer_size: Maximum number of entries kept in the replay buffer.
            batch_size: Number of entries sampled per training step.
            gamma: Discount factor used when forming value targets.
            rnd_scale: Multiplier applied to the raw RND prediction error.
        """
        self.env = gym.make('CartPole-v1')
        self.num_simulations = num_simulations
        self.gamma = gamma
        self.exploration_weight = 1.5
        self.initial_epsilon = 0.2

        # Value network and its target copy.
        self.model = ValueNetwork()
        self.target_model = ValueNetwork()
        # Start the target as an exact copy of the model; otherwise bootstrap
        # targets come from an unrelated random network until the first
        # periodic sync.
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.MSELoss()

        # RND networks: fixed random target and trainable predictor.
        self.random_net = RNDNetwork().eval()
        for param in self.random_net.parameters():
            param.requires_grad_(False)  # the random target is never trained
        self.predictor_net = RNDNetwork()
        self.rnd_optimizer = optim.Adam(self.predictor_net.parameters(), lr=0.001)
        self.rnd_scale = rnd_scale

        # Experience replay.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Target-network update parameters.
        self.train_step_counter = 0
        self.target_update_interval = 15
        self.tau = 0.005  # soft-update coefficient

        # Running statistics intended for normalising the RND reward
        # (Welford-style mean / variance accumulators).
        self.rnd_mean = 0.0
        self.rnd_std = 1.0
        self.rnd_count = 0
        self.rnd_m2 = 0.0

    def _calculate_value(self, state: np.ndarray) -> Tuple[float, float]:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state)
            return self.model(state_tensor).item()
    def _calculate_rnd_reward(self, state: np.array) -> float:
      state_tensor = torch.FloatTensor(state)
      with torch.no_grad():
          target = self.random_net(state_tensor)
          prediction = self.predictor_net(state_tensor)
      raw_rnd = (target - prediction).pow(2).mean().item()
      return self.rnd_scale * raw_rnd

    def bfs_search1(self, current_state: np.ndarray) -> Tuple[int, Node]:
        """Search variant that folds the RND bonus into node values (no training).

        Expands up to ``num_simulations`` nodes in best-first order. Each
        child's value is ``reward + (1 - done) * gamma * V(next) + rnd`` and
        is backed up through its ancestors; samples are stored in the replay
        buffer but no replay step is run here.

        Args:
            current_state: Environment state to plan from.

        Returns:
            ``(best_action, root)``: the root action whose child has the
            highest value, and the root of the tree that was built.
        """
        root = Node(current_state)
        queue = PriorityQueue()
        queue.put((0, 0, root))
        n_actions = self.env.action_space.n

        temp_env = gym.make('CartPole-v1')
        try:
            temp_env.reset()
            temp_env.unwrapped.state = current_state.copy()

            for _ in range(self.num_simulations):
                if queue.empty():
                    break
                _, _, current_node = queue.get()
                for action in range(n_actions):
                    # Force the simulator into the node's state before stepping.
                    temp_env.unwrapped.state = current_node.state.copy()
                    next_state, reward, done, _, _ = temp_env.step(action)
                    next_node = Node(state=next_state, action_taken=action,
                                     parent=current_node, reward=reward, done=done)
                    next_value = self._calculate_value(next_state)
                    rnd_reward = self._calculate_rnd_reward(next_state)
                    next_node.value = reward + (1 - done) * self.gamma * next_value + rnd_reward
                    current_node.children[action] = next_node
                    self._backpropagate(next_node, next_node.value)
                    uct = next_node.value + self.exploration_weight * rnd_reward
                    # Negated because PriorityQueue pops the smallest entry
                    # first; id() breaks ties before Nodes get compared.
                    queue.put((-uct, id(next_node), next_node))
                current_node.visit_count += 1
                if current_node.parent is not None:
                    self._remember(current_node.parent.state, current_node.value)
        finally:
            # Release the per-search simulator instead of leaking one per call.
            temp_env.close()

        best_action = max(range(n_actions), key=lambda a: root.children[a].value)
        return best_action, root

    def bfs_search(self, current_state: np.ndarray) -> Tuple[int, Node]:
        """Run one best-first search from ``current_state`` and pick an action.

        Expands up to ``num_simulations`` nodes. Children are prioritised by
        their value plus an RND novelty bonus. After each expansion a TD
        target for the expanded node's parent state is stored, and a replay
        training step runs once the buffer holds a full batch.

        Args:
            current_state: Environment state to plan from.

        Returns:
            ``(best_action, root)``: the root action whose child has the
            highest value, and the root of the tree that was built.
        """
        root = Node(current_state)
        queue = PriorityQueue()
        queue.put((0, 0, root))
        n_actions = self.env.action_space.n

        temp_env = gym.make('CartPole-v1')
        try:
            for _ in range(self.num_simulations):
                if queue.empty():
                    break

                _, _, current_node = queue.get()

                # Expand one child per action.
                for action in range(n_actions):
                    # Force the simulator into the node's state before stepping.
                    temp_env.reset()
                    temp_env.unwrapped.state = current_node.state.copy()
                    next_state, reward, done, _, _ = temp_env.step(action)

                    next_node = Node(
                        state=next_state,
                        action_taken=action,
                        parent=current_node,
                        reward=reward,
                        done=done
                    )

                    next_value = self._calculate_value(next_state)

                    # Terminal children take the immediate reward, others the
                    # network estimate.
                    next_node.value = reward if done else next_value

                    current_node.children[action] = next_node
                    # NOTE(review): the network estimate is backed up even for
                    # terminal children, so it can overwrite the reward set
                    # above when it is larger - confirm this is intended.
                    self._backpropagate(next_node, next_value)

                    # Priority = node value + novelty bonus; negated because
                    # PriorityQueue pops the smallest entry first.
                    uct = next_node.value + self._calculate_rnd_reward(next_state)
                    queue.put((-uct, id(next_node), next_node))

                current_node.visit_count += 1

                # Store a TD target for the parent state of the expanded node.
                if current_node.parent is not None:
                    # Use the expanded node's own terminal flag. The loop
                    # variable `done` only reflects the last child generated
                    # above, which is unrelated to this transition.
                    if current_node.done:
                        total_target = 1
                    else:
                        with torch.no_grad():
                            state_tensor = torch.FloatTensor(current_node.state).unsqueeze(0)
                            target_value = self.target_model(state_tensor).item()
                        total_target = self.gamma * target_value + current_node.reward
                    self._remember(current_node.parent.state, total_target)

                # Train once enough samples are available.
                if len(self.replay_buffer) >= self.batch_size:
                    self._replay()
        finally:
            # Release the per-search simulator instead of leaking one per call.
            temp_env.close()

        # Choose the root action with the highest child value.
        best_action = max(range(n_actions), key=lambda a: root.children[a].value)

        return best_action, root

    def _backpropagate(self, node: Node, value: float):
        """回溯更新节点价值"""
        while node is not None:
            node.value = max(node.value, value)
            node = node.parent

    def _remember(self, state: np.ndarray, target_value: float):
        """存储经验"""
        self.replay_buffer.append((state, target_value))

    def _update_network(self, states, targets):
        states = torch.FloatTensor(states)
        targets = torch.FloatTensor(targets)

        predictions = self.model(states).squeeze()
        loss = self.loss_fn(predictions, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.train_step_counter % self.target_update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())
    def _update_rnd_network(self, states):
        """更新 RND 的预测网络"""
        states = torch.FloatTensor(states)
        predictions = self.predictor_net(states)
        with torch.no_grad():
            targets = self.random_net(states)
        rnd_loss = (predictions - targets).pow(2).mean()

        self.rnd_optimizer.zero_grad()
        rnd_loss.backward()
        self.rnd_optimizer.step()


    def _replay(self):
        """经验回放训练"""
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
        states, targets = zip(*[self.replay_buffer[i] for i in batch])

        # 更新启发式网络
        self.train_step_counter += 1
        self._update_network(states, targets)

        # 更新 RND 网络
        self._update_rnd_network(states)


def train_agent():
    """Play three CartPole episodes, choosing every action with ``BFS.bfs_search``.

    The running reward is printed after every step and once more when the
    episode ends. An episode ends on termination or truncation.
    """
    env = gym.make('CartPole-v1')
    agent = BFS(num_simulations=100)

    try:
        for episode in range(3):
            state = env.reset()[0]
            total_reward = 0
            done = False

            while not done:
                action, _ = agent.bfs_search(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # Honour truncation as well: ignoring it keeps stepping an
                # episode that already hit the time limit, so the loop would
                # never end for a policy that keeps the pole balanced.
                done = terminated or truncated
                total_reward += reward
                state = next_state
                print(f"Episode {episode+1}, Total Reward: {total_reward}")

            print(f"Episode {episode+1}, Total Reward: {total_reward}")
    finally:
        env.close()

# Run the CartPole demo when this cell/module is executed directly.
if __name__ == "__main__":
    train_agent()

Episode 1, Total Reward: 1.0
Episode 1, Total Reward: 2.0
Episode 1, Total Reward: 3.0
Episode 1, Total Reward: 4.0
Episode 1, Total Reward: 5.0
Episode 1, Total Reward: 6.0
Episode 1, Total Reward: 7.0
Episode 1, Total Reward: 8.0
Episode 1, Total Reward: 9.0
Episode 1, Total Reward: 10.0
Episode 1, Total Reward: 11.0
Episode 1, Total Reward: 12.0
Episode 1, Total Reward: 13.0
Episode 1, Total Reward: 14.0
Episode 1, Total Reward: 15.0
Episode 1, Total Reward: 16.0
Episode 1, Total Reward: 17.0
Episode 1, Total Reward: 18.0
Episode 1, Total Reward: 19.0
Episode 1, Total Reward: 20.0
Episode 1, Total Reward: 21.0
Episode 1, Total Reward: 22.0
Episode 1, Total Reward: 23.0
Episode 1, Total Reward: 24.0
Episode 1, Total Reward: 25.0
Episode 1, Total Reward: 26.0
Episode 1, Total Reward: 27.0
Episode 1, Total Reward: 28.0
Episode 1, Total Reward: 29.0
Episode 1, Total Reward: 30.0
Episode 1, Total Reward: 31.0
Episode 1, Total Reward: 32.0
Episode 1, Total Reward: 33.0
Episode 1, Total Re

## 3. Roll-out

### 3.1 Frozen-lake

In [None]:
from typing import List, Tuple, Optional
from queue import PriorityQueue
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from dataclasses import dataclass, field
from functools import lru_cache

# 神经网络模型定义
class HeuristicNetwork(nn.Module):
    """Heuristic estimator over the ``env_size**2`` discrete grid states.

    Each state index is embedded into 32 dimensions and passed through an
    MLP ending in a sigmoid, so the output lies between 0 and 1.
    """

    def __init__(self, env_size):
        super().__init__()
        num_states = env_size ** 2
        embed_dim = 32
        self.embedding = nn.Embedding(num_states, embed_dim)
        head = [
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),  # squash the heuristic value into the 0-1 range
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, state):
        """Return the heuristic for a tensor of integer state indices."""
        embedded = self.embedding(state)
        return self.fc(embedded)

# RND 网络定义
class RNDNetwork(nn.Module):
    """RND feature network over the ``env_size**2`` discrete grid states.

    A state index is embedded into 32 dimensions and mapped by a small MLP
    to ``output_size`` features.
    """

    def __init__(self, env_size, output_size=32):
        super().__init__()
        embed_dim = 32
        self.embedding = nn.Embedding(env_size ** 2, embed_dim)
        mlp = [
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_size),
        ]
        self.fc = nn.Sequential(*mlp)

    def forward(self, state):
        """Return the feature vectors for a tensor of integer state indices."""
        return self.fc(self.embedding(state))

@dataclass
class Node:
    """Search-tree node holding an integer state plus links to parent and children."""
    state: int  # discrete state index this node represents
    action_taken: Optional[int] = None  # action that led from the parent to this node (None for a root)
    parent: Optional['Node'] = None  # predecessor in the search tree (None for a root)
    children: dict = field(default_factory=dict)  # presumably maps action -> child Node; fresh dict per instance
    visit_count: int = 0  # visit counter, starts at 0
    value: float = 0.0  # score assigned to this node by the search

class NeuralEnhancedBFSwithRND:
    def __init__(self, env_size: int = 8, num_simulations: int = 100,
                 buffer_size: int = 10000, batch_size: int = 32, rnd_scale: float = 0.40):
        """Create the environment, networks, optimizers and replay buffer.

        Args:
            env_size: Side length of the square FrozenLake map.
            num_simulations: Stored on the instance; the search methods of
                this class do not read it.
            buffer_size: Maximum number of experiences kept for replay.
            batch_size: Number of experiences sampled per training step.
            rnd_scale: Scaling factor for the RND intrinsic reward.
        """
        self.env_size = env_size
        self.num_simulations = num_simulations
        self.env = self._create_env()
        self.goal_state = env_size**2 - 1

        # Heuristic (value) network and its target copy.
        self.model = HeuristicNetwork(env_size)
        self.target_model = HeuristicNetwork(env_size)
        # Start the target network from the online weights; otherwise the
        # first bootstrap targets come from an unrelated random network.
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.MSELoss()

        # RND networks: a fixed random network and a trainable predictor.
        self.random_net = RNDNetwork(env_size).eval()
        self.predictor_net = RNDNetwork(env_size)
        self.rnd_optimizer = optim.Adam(self.predictor_net.parameters(), lr=0.001)
        self.rnd_scale = rnd_scale

        # Experience replay.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Target-network synchronisation schedule.
        self.target_update_interval = 15
        self.train_step_counter = 0

        # Running statistics for RND normalisation (Welford accumulators).
        self.rnd_mean = 0.0
        self.rnd_std = 1.0
        self.rnd_count = 0
        self.rnd_m2 = 0.0

        self.rollout_cache = {}

    def _create_env(self) -> gym.Env:
        """Build the non-slippery FrozenLake environment for the configured map size."""
        side = self.env_size
        env_options = {
            "map_name": f"{side}x{side}",
            "is_slippery": False,
            "render_mode": None,
        }
        return gym.make('FrozenLake-v1', **env_options)

    def _get_valid_actions(self, state: int, env_size: int) -> List[int]:
        row, col = state // env_size, state % env_size
        valid_actions = []
        if col > 0: valid_actions.append(0)    # 左
        if row < env_size - 1: valid_actions.append(1)    # 下
        if col < env_size - 1: valid_actions.append(2)    # 右
        if row > 0: valid_actions.append(3)    # 上
        return valid_actions

    def _get_action_path(self, node: Node) -> List[int]:
        """Return the actions leading from the tree root down to ``node``, in order."""
        actions = []
        cursor = node
        # Walk up the parent links; the root has no parent and contributes no action.
        while cursor.parent:
            actions.append(cursor.action_taken)
            cursor = cursor.parent
        actions.reverse()
        return actions

    def _calculate_heuristic(self, state: int) -> float:
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            return self.model(state_tensor).item()

    def _calculate_rnd_reward(self, state: int) -> float:
        """计算 RND 内部奖励"""
        state_tensor = torch.LongTensor([state])
        with torch.no_grad():
            target = self.random_net(state_tensor)
            prediction = self.predictor_net(state_tensor)
        raw_rnd = (target - prediction).pow(2).mean().item()

        '''# Welford 在线算法更新均值和标准差
        self.rnd_count += 1
        delta = raw_rnd - self.rnd_mean
        self.rnd_mean += delta / self.rnd_count
        delta2 = raw_rnd - self.rnd_mean
        self.rnd_m2 += delta * delta2
        if self.rnd_count > 1:
            self.rnd_std = np.sqrt(self.rnd_m2 / (self.rnd_count - 1))

        # 归一化并缩放
        normalized_rnd = (raw_rnd - self.rnd_mean) / (self.rnd_std + 1e-5)  # 避免除零
        scaled_rnd = self.rnd_scale * normalized_rnd

        # 裁剪到合理范围'''
        return raw_rnd * 0.2 #根据论文建议，限制最大值

    def _update_network(self, states, targets):
        states = torch.LongTensor(states)
        targets = torch.FloatTensor(targets)

        predictions = self.model(states).squeeze()
        loss = self.loss_fn(predictions, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.train_step_counter % self.target_update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

    def _update_rnd_network(self, states):
        """更新 RND 的预测网络"""
        states = torch.LongTensor(states)
        predictions = self.predictor_net(states)
        with torch.no_grad():
            targets = self.random_net(states)
        rnd_loss = (predictions - targets).pow(2).mean()

        self.rnd_optimizer.zero_grad()
        rnd_loss.backward()
        self.rnd_optimizer.step()

    def _remember(self, state, target, next_state=None):
        """存储经验，包括 RND 的状态"""
        self.replay_buffer.append((state, target, next_state if next_state is not None else state))

    def _replay(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
        states, targets, next_states = zip(*[self.replay_buffer[i] for i in batch])

        # 更新启发式网络
        self.train_step_counter += 1
        self._update_network(states, targets)

        # 更新 RND 网络
        self._update_rnd_network(next_states)

    def _get_bootstrap_target(self, state):
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            return self.target_model(state_tensor).item()

    def _perform_rollout(self, state: int, num_rollouts: int = 3, rollout_depth: int = 3) -> float:
      """
      Perform multiple rollouts from a given state to estimate its value.

      Args:
          state: The starting state
          num_rollouts: Number of simulation sequences to run
          rollout_depth: Steps to look ahead in each rollout

      Returns:
          Average value from the rollouts
      """
      total_value = 0.0

      for _ in range(num_rollouts):
          current_state = state

          # If already at goal, return maximum value
          if current_state == self.goal_state:
              return 1.0

          for depth in range(rollout_depth):
              # If reached goal during simulation, add high value
              if current_state == self.goal_state:
                  total_value += 1.0
                  break

              # Get valid actions
              valid_actions = self._get_valid_actions(current_state, self.env_size)

              # Choose action - 80% based on heuristic, 20% random exploration
              if np.random.random() < 0.8:
                  # Use heuristic to choose best action
                  best_action = None
                  best_value = float('-inf')
                  for action in valid_actions:
                      next_state = self._simulate_step(current_state, action)
                      value = self._calculate_heuristic(next_state)
                      if value > best_value:
                          best_value = value
                          best_action = action
                  action = best_action
              else:
                  # Random exploration
                  action = np.random.choice(valid_actions)

              # Simulate the action
              next_state = self._simulate_step(current_state, action)

              # Check if terminal state reached
              self.env.reset()
              self.env.unwrapped.s = current_state
              _, reward, terminated, _, _ = self.env.step(action)

              if terminated:
                  if next_state == self.goal_state:
                      total_value += 1.0  # Bonus for reaching goal
                  else:
                      total_value -= 0.2  # Penalty for holes
                  break

              # Move to next state
              current_state = next_state

          # If didn't terminate, add final state heuristic value
          if current_state != self.goal_state:
              total_value += 0.5 * self._calculate_heuristic(current_state)

      # Return average value across all rollouts
      return total_value / num_rollouts

    def _evaluate_state_with_rollout(self, state: int) -> float:
      """
      Evaluate a state using both neural network heuristic and rollouts

      Args:
          state: The state to evaluate

      Returns:
          Combined value estimate
      """
      # Neural network heuristic
      heuristic = self._calculate_heuristic(state)

      # Rollout value
      rollout = self._perform_rollout(state, num_rollouts=3, rollout_depth=3)

      # RND exploration bonus
      rnd_bonus = self._calculate_rnd_reward(state)

      # Combine values (weights can be adjusted)
      combined_value = 0.6 * heuristic + 0.3 * rollout + 0.3 * rnd_bonus

      # Limit to range [0, 1]
      return min(max(combined_value, 0.0), 1.0)
    def bfs_search(self, start_state: int) -> Tuple[Optional[List[int]], int, Node]:
        """Best-first search from ``start_state`` guided by rollout-enhanced values.

        Nodes are expanded in order of decreasing combined value
        (``_evaluate_state_with_rollout``). While searching, training targets
        are pushed into the replay buffer and ``_replay`` is called after every
        expansion.

        Args:
            start_state: State index to search from.

        Returns:
            A tuple ``(path, found, root)``: ``path`` is the action sequence
            from ``start_state`` to the goal (``None`` if the goal was not
            reached), ``found`` is 1 if the goal was reached and 0 otherwise,
            and ``root`` is the root node of the search tree that was built.
        """
        # Reset the per-search cache attribute (nothing in this method reads it).
        self.rollout_cache = {}

        visited = set()
        queue = PriorityQueue()
        root_node = Node(state=start_state)
        root_node.value = self._evaluate_state_with_rollout(start_state)
        # Priorities are negated values because PriorityQueue pops the smallest
        # entry; id() breaks ties so Node objects are never compared.
        queue.put((-root_node.value, id(root_node), root_node))

        goal_node = None

        while not queue.empty():
            _, _, current_node = queue.get()

            if current_node.state in visited:
                continue
            visited.add(current_node.state)

            if current_node.state == self.goal_state:
                goal_node = current_node
                self._remember(current_node.state, 1.0, current_node.state)
                break

            # Training sample for the parent: the target network's value of
            # this state plus its RND bonus, capped at 1.
            if current_node.parent is not None:
                target = self._get_bootstrap_target(current_node.state)
                rnd_reward = self._calculate_rnd_reward(current_node.state)
                total_target = min(target + rnd_reward, 1.0)
                self._remember(current_node.parent.state, total_target, current_node.state)

            for action in self._get_valid_actions(current_node.state, self.env_size):
                self.env.reset()
                self.env.unwrapped.s = current_node.state
                next_state, _, terminated, _, _ = self.env.step(action)

                # A terminal non-goal state is stored with target 0 and never expanded.
                if terminated and next_state != self.goal_state:
                    self._remember(next_state, 0.0, next_state)
                    continue

                if next_state not in visited:
                    next_node = Node(
                        state=next_state,
                        action_taken=action,
                        parent=current_node,
                        value=self._evaluate_state_with_rollout(next_state),
                    )
                    current_node.children[action] = next_node
                    queue.put((-next_node.value, id(next_node), next_node))

            self._replay()

        if goal_node is None:
            return None, 0, root_node

        # Raise the value of every non-root node on the path to the goal.
        current = goal_node
        while current.parent:
            current.value += 1.0
            current = current.parent

        return self._get_action_path(goal_node), 1, root_node
    def bfs_search1(self, start_state: int) -> Tuple[Optional[List[int]], int, Node]:
        """Best-first search from ``start_state`` guided by heuristic + RND values.

        Variant of the search that scores successors with the network
        heuristic plus the RND bonus instead of the rollout-based estimate.
        Training targets are pushed into the replay buffer during the search
        and ``_replay`` is called after every expansion.

        Args:
            start_state: State index to search from.

        Returns:
            A tuple ``(path, found, root)``: ``path`` is the action sequence
            from ``start_state`` to the goal (``None`` if the goal was not
            reached), ``found`` is 1 if the goal was reached and 0 otherwise,
            and ``root`` is the root node of the search tree that was built.
        """
        visited = set()
        queue = PriorityQueue()
        root_node = Node(state=start_state)
        # Priorities are negated values because PriorityQueue pops the smallest
        # entry; id() breaks ties so Node objects are never compared.
        queue.put((-self._calculate_heuristic(start_state), id(root_node), root_node))

        goal_node = None

        while not queue.empty():
            _, _, current_node = queue.get()

            if current_node.state in visited:
                continue
            visited.add(current_node.state)

            if current_node.state == self.goal_state:
                goal_node = current_node
                self._remember(current_node.state, 1.0, current_node.state)
                break

            # Training sample for the parent: the target network's value of
            # this state plus its RND bonus, capped at 1.
            if current_node.parent is not None:
                target = self._get_bootstrap_target(current_node.state)
                rnd_reward = self._calculate_rnd_reward(current_node.state)
                total_target = min(target + rnd_reward, 1.0)
                self._remember(current_node.parent.state, total_target, current_node.state)

            for action in self._get_valid_actions(current_node.state, self.env_size):
                self.env.reset()
                self.env.unwrapped.s = current_node.state
                next_state, _, terminated, _, _ = self.env.step(action)

                # A terminal non-goal state is stored with target 0 and never expanded.
                if terminated and next_state != self.goal_state:
                    self._remember(next_state, 0.0, next_state)
                    continue

                if next_state not in visited:
                    next_node = Node(
                        state=next_state,
                        action_taken=action,
                        parent=current_node,
                        value=self._calculate_heuristic(next_state) + self._calculate_rnd_reward(next_state),
                    )
                    current_node.children[action] = next_node
                    queue.put((-next_node.value, id(next_node), next_node))

            self._replay()

        if goal_node is None:
            return None, 0, root_node

        # Raise the value of every non-root node on the path to the goal.
        current = goal_node
        while current.parent:
            current.value += 1.0
            current = current.parent

        return self._get_action_path(goal_node), 1, root_node

    def get_best_action_from_tree(self, root_node: Node) -> int:
        """Pick the root action whose subtree scores highest.

        Each child of ``root_node`` is scored with ``_evaluate_subtree``; the
        first action with the strictly highest score wins. If the root has no
        children (or no child produced a comparable score), a random valid
        action for the root state is returned instead of ``None``.

        Args:
            root_node: Root of a search tree.

        Returns:
            The selected action, or ``None`` only when the root state has no
            valid action at all.
        """
        best_action = None
        best_value = float('-inf')

        for action, child in root_node.children.items():
            value = self._evaluate_subtree(child)
            if value > best_value:
                best_value = value
                best_action = action

        if best_action is None:
            # Nothing to rank: fall back to a random in-grid move so callers
            # never pass None to env.step().
            valid_actions = self._get_valid_actions(root_node.state, self.env_size)
            if valid_actions:
                best_action = int(np.random.choice(valid_actions))

        return best_action

    def _evaluate_subtree(self, node: Node) -> float:
        """Recursively score the subtree rooted at ``node``.

        A goal node scores ``+inf``. Any other node scores its freshly computed
        combined value (``_evaluate_state_with_rollout``) plus half of its best
        child's score; a node without children adds nothing.

        NOTE(review): because the goal score is infinite, every ancestor of a
        goal node also scores ``+inf``, so subtrees that reach the goal at
        different depths are indistinguishable here.
        """
        if node.state == self.goal_state:
            return float('inf')

        own_score = self._evaluate_state_with_rollout(node.state)

        if node.children:
            best_child = max(self._evaluate_subtree(kid) for kid in node.children.values())
        else:
            best_child = 0

        return own_score + 0.5 * best_child
    def search(self, root_state: int) -> int:
        """Run ``bfs_search`` from ``root_state`` and return the action chosen from the resulting tree."""
        _path, _found, tree_root = self.bfs_search(root_state)
        return self.get_best_action_from_tree(tree_root)

    def _simulate_step(self, state, action):
        self.env.reset()
        self.env.unwrapped.s = state
        next_state, _, _, _, _ = self.env.step(action)
        return next_state


In [None]:
import gymnasium as gym
import torch
import numpy as np
from collections import defaultdict
from queue import PriorityQueue
from typing import List, Tuple, Dict, Optional, Set
from dataclasses import dataclass

def test_enhanced_bfs():
    """Run NeuralEnhancedBFSwithRND on non-slippery 8x8 FrozenLake and print a step trace.

    A fresh search is performed at every step to choose the action. Each
    episode is capped at 100 steps. The environment is closed on exit, even
    if a step raises.
    """
    env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=False)
    bfs = NeuralEnhancedBFSwithRND()

    num_episodes = 1
    total_reward = 0

    print("\n开始测试NeuralEnhancedBFSwithRND...")

    try:
        for episode in range(num_episodes):
            state, _ = env.reset()
            done = False
            episode_reward = 0
            steps = 0

            print(f"\n回合 {episode + 1}:")
            print(f"起始状态: {state}")

            while not done and steps < 100:
                # Choose the action with the search algorithm.
                action = bfs.search(state)
                print(f"Steps {steps}: 在状态 {state} 选择动作 {action}")

                # Apply it to the real environment.
                state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                steps += 1

                print(f"-> 新状态: {state}, 奖励: {reward}")

                if done:
                    if reward > 0:
                        print("成功到达目标！")
                    else:
                        print("失败（掉入陷阱或超时）")

            total_reward += episode_reward
            print(f"回合 {episode + 1} 结束 - 总步数: {steps}, 总奖励: {episode_reward}")
    finally:
        env.close()

    print(f"\n测试完成 - 平均奖励: {total_reward/num_episodes}")

if __name__ == "__main__":
    test_enhanced_bfs()


开始测试NeuralEnhancedBFSwithRND...

回合 1:
起始状态: 0
Steps 0: 在状态 0 选择动作 1
-> 新状态: 8, 奖励: 0.0
Steps 1: 在状态 8 选择动作 2
-> 新状态: 9, 奖励: 0.0
Steps 2: 在状态 9 选择动作 2
-> 新状态: 10, 奖励: 0.0
Steps 3: 在状态 10 选择动作 2
-> 新状态: 11, 奖励: 0.0
Steps 4: 在状态 11 选择动作 2
-> 新状态: 12, 奖励: 0.0
Steps 5: 在状态 12 选择动作 2
-> 新状态: 13, 奖励: 0.0
Steps 6: 在状态 13 选择动作 2
-> 新状态: 14, 奖励: 0.0
Steps 7: 在状态 14 选择动作 2
-> 新状态: 15, 奖励: 0.0
Steps 8: 在状态 15 选择动作 1
-> 新状态: 23, 奖励: 0.0
Steps 9: 在状态 23 选择动作 1
-> 新状态: 31, 奖励: 0.0
Steps 10: 在状态 31 选择动作 1
-> 新状态: 39, 奖励: 0.0
Steps 11: 在状态 39 选择动作 0
-> 新状态: 38, 奖励: 0.0
Steps 12: 在状态 38 选择动作 2
-> 新状态: 39, 奖励: 0.0
Steps 13: 在状态 39 选择动作 3
-> 新状态: 31, 奖励: 0.0
Steps 14: 在状态 31 选择动作 1
-> 新状态: 39, 奖励: 0.0
Steps 15: 在状态 39 选择动作 0
-> 新状态: 38, 奖励: 0.0
Steps 16: 在状态 38 选择动作 0
-> 新状态: 37, 奖励: 0.0
Steps 17: 在状态 37 选择动作 1
-> 新状态: 45, 奖励: 0.0
Steps 18: 在状态 45 选择动作 3
-> 新状态: 37, 奖励: 0.0
Steps 19: 在状态 37 选择动作 2
-> 新状态: 38, 奖励: 0.0
Steps 20: 在状态 38 选择动作 0
-> 新状态: 37, 奖励: 0.0
Steps 21: 在状态 37 选择动作 1
-> 新状态: 45, 奖励: 0.0

### 3.2 CartPole

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from queue import PriorityQueue
from collections import deque
from dataclasses import dataclass, field
from typing import Optional, List, Tuple

class ValueNetwork(nn.Module):
    """MLP mapping a 4-dimensional input to a single scalar value estimate.

    Two hidden blocks of Linear -> LayerNorm -> ReLU (256 units each) are
    followed by a linear output layer.
    """

    def __init__(self):
        super().__init__()
        hidden = 256
        self.layers = nn.Sequential(
            nn.Linear(4, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )
        # Re-draw the output layer's weights uniformly from [-0.1, 0.1] so the
        # initial weights are small.
        output_layer = self.layers[-1]
        nn.init.uniform_(output_layer.weight, -0.1, 0.1)

    def forward(self, x):
        """Return the value estimate(s) for input ``x``."""
        return self.layers(x)

class RNDNetwork(nn.Module):
    """Two-layer MLP projecting a state vector to ``output_size`` RND features."""

    def __init__(self, state_size=4, output_size=32):
        super().__init__()

        stack = [
            nn.Linear(state_size, 64),
            nn.ReLU(),
            nn.Linear(64, output_size),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, state):
        """Return the feature vector(s) for ``state``."""
        return self.fc(state)
@dataclass
class Node:
    """Search-tree node holding one simulated observation and its bookkeeping."""

    state: np.ndarray
    action_taken: Optional[int] = None            # action that produced this node (None by default)
    parent: Optional["Node"] = None               # predecessor in the tree (None by default)
    children: dict = field(default_factory=dict)  # fresh dict per instance
    value: float = 0.0
    depth: int = 0
    done: bool = False
    reward: float = 0.0
    visit_count: int = 1

    def __post_init__(self):
        """Set ``depth`` to one more than the parent's; without a parent it is left as given."""
        if self.parent is None:
            return
        self.depth = self.parent.depth + 1

class BFS:
    def __init__(self,
                 num_simulations: int = 40,
                 buffer_size: int = 1000,
                 batch_size: int = 64,
                 gamma: float = 0.99,
                 rnd_scale: float = 0.5):
        """Create the environment, networks, optimizers and replay buffer.

        Args:
            num_simulations: Maximum number of node expansions per search.
            buffer_size: Maximum number of experiences kept for replay.
            batch_size: Number of experiences sampled per training step.
            gamma: Discount factor.
            rnd_scale: Scaling factor for the RND intrinsic reward.
        """
        self.env = gym.make('CartPole-v1')
        self.num_simulations = num_simulations
        self.gamma = gamma
        self.exploration_weight = 1.5
        self.initial_epsilon = 0.2

        # Value network and its target copy.
        self.model = ValueNetwork()
        self.target_model = ValueNetwork()
        # Start the target network from the online weights rather than from
        # an independent random initialisation.
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.MSELoss()

        # RND networks: a fixed random network and a trainable predictor.
        self.random_net = RNDNetwork().eval()
        self.predictor_net = RNDNetwork()
        self.rnd_optimizer = optim.Adam(self.predictor_net.parameters(), lr=0.001)
        self.rnd_scale = rnd_scale

        # Experience replay.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Target-network update parameters.
        self.train_step_counter = 0
        self.target_update_interval = 15
        self.tau = 0.005  # soft-update coefficient

        # Running statistics for RND normalisation (Welford accumulators).
        self.rnd_mean = 0.0
        self.rnd_std = 1.0
        self.rnd_count = 0
        self.rnd_m2 = 0.0

    def _calculate_value(self, state: np.ndarray) -> Tuple[float, float]:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state)
            return self.model(state_tensor).item()
    def _calculate_rnd_reward(self, state: np.array) -> float:
      state_tensor = torch.FloatTensor(state)
      with torch.no_grad():
          target = self.random_net(state_tensor)
          prediction = self.predictor_net(state_tensor)
      raw_rnd = (target - prediction).pow(2).mean().item()
      return self.rnd_scale * raw_rnd

    def _perform_rollout(self, state: np.ndarray, num_rollouts: int = 3, rollout_depth: int = 5) -> float:
        """Estimate the value of ``state`` by averaging simulated rollouts.

        Each rollout starts from ``state`` and runs for at most
        ``rollout_depth`` steps. With probability 0.7 the action is the one
        whose one-step successor has the highest network value, otherwise it
        is sampled at random. Rewards are discounted by ``self.gamma``. A
        rollout that did not terminate adds the discounted network value of
        its last state.

        Args:
            state: Observation to start every rollout from.
            num_rollouts: Number of rollouts to average over.
            rollout_depth: Maximum number of steps per rollout.

        Returns:
            The mean discounted return over all rollouts.
        """
        total_value = 0.0

        # One simulator for the rollout itself and one for the one-step
        # look-ahead. Both are reused for every step and closed on exit,
        # instead of creating a new environment per candidate action.
        rollout_env = gym.make('CartPole-v1')
        probe_env = gym.make('CartPole-v1')
        try:
            num_actions = rollout_env.action_space.n

            for _ in range(num_rollouts):
                # Every rollout restarts from the given state.
                rollout_env.reset()
                rollout_env.unwrapped.state = state.copy()

                cumulative_reward = 0.0
                discount = 1.0
                terminal = False
                next_state = state  # bootstrap source when rollout_depth is 0

                for _ in range(rollout_depth):
                    if np.random.random() < 0.7:
                        # Greedy: score the one-step successor of every action.
                        action_values = []
                        for candidate in range(num_actions):
                            probe_env.reset()
                            probe_env.unwrapped.state = rollout_env.unwrapped.state.copy()
                            probed_state, _, _, _, _ = probe_env.step(candidate)
                            action_values.append(self._calculate_value(probed_state))
                        action = int(np.argmax(action_values))
                    else:
                        # Random exploration.
                        action = rollout_env.action_space.sample()

                    next_state, reward, done, _, _ = rollout_env.step(action)

                    cumulative_reward += discount * reward
                    discount *= self.gamma

                    if done:
                        terminal = True
                        break

                # Bootstrap with the network value if the rollout was cut off.
                if not terminal:
                    cumulative_reward += discount * self._calculate_value(next_state)

                total_value += cumulative_reward
        finally:
            rollout_env.close()
            probe_env.close()

        return total_value / num_rollouts

    def _evaluate_state_with_rollout(self, state: np.ndarray) -> float:
        """结合神经网络和rollout评估状态价值"""
        # 神经网络预测
        network_value = self._calculate_value(state)

        # Rollout模拟价值
        rollout_value = self._perform_rollout(state, num_rollouts=3, rollout_depth=10)

        # RND探索奖励
        rnd_value = self._calculate_rnd_reward(state)

        # 组合价值（权重可以调整）
        combined_value = 0.5 * network_value + 0.4 * rollout_value + 0.1 * rnd_value

        return combined_value
    def bfs_search(self, current_state: np.ndarray) -> Tuple[int, Node]:
        """Run one best-first search from ``current_state`` and pick an action.

        Up to ``self.num_simulations`` nodes are popped from a priority queue
        ordered by decreasing node value. Each popped non-terminal node is
        expanded with every action. A terminal child takes its step reward as
        value; any other child takes the combined rollout estimate. Child
        values are propagated up the tree, a training target for the parent
        state is stored, and the networks are trained once the replay buffer
        holds a full batch.

        Args:
            current_state: Observation to search from.

        Returns:
            A tuple ``(best_action, root)``: the root action whose child has
            the highest value, and the root node of the tree that was built.
        """
        root = Node(current_state)
        queue = PriorityQueue()
        queue.put((0, 0, root))

        temp_env = gym.make('CartPole-v1')
        num_actions = self.env.action_space.n

        try:
            for _ in range(self.num_simulations):
                if queue.empty():
                    break

                _, _, current_node = queue.get()

                # Terminal nodes have no successors, so they are not stepped
                # any further.
                if not current_node.done:
                    for action in range(num_actions):
                        temp_env.reset()
                        temp_env.unwrapped.state = current_node.state.copy()
                        next_state, reward, done, _, _ = temp_env.step(action)

                        next_node = Node(
                            state=next_state,
                            action_taken=action,
                            parent=current_node,
                            reward=reward,
                            done=done,
                        )

                        # A terminal child is worth exactly its step reward.
                        if done:
                            next_node.value = reward
                        else:
                            next_node.value = self._evaluate_state_with_rollout(next_state)

                        current_node.children[action] = next_node
                        self._backpropagate(next_node, next_node.value)

                        # Negated value because PriorityQueue pops the smallest
                        # entry; id() breaks ties so Nodes are never compared.
                        queue.put((-next_node.value, id(next_node), next_node))

                current_node.visit_count += 1

                # Store a training target for the parent state. The terminal
                # check uses this node's own flag, not a leftover from the
                # expansion loop above.
                if current_node.parent is not None:
                    if current_node.done:
                        total_target = current_node.reward
                    else:
                        target_value = self._evaluate_state_with_rollout(current_node.state)
                        total_target = self.gamma * target_value + current_node.reward
                    self._remember(current_node.parent.state, total_target)

                if len(self.replay_buffer) >= self.batch_size:
                    self._replay()
        finally:
            temp_env.close()

        if root.children:
            best_action = max(root.children, key=lambda a: root.children[a].value)
        else:
            # Nothing was expanded (e.g. num_simulations == 0): act randomly.
            best_action = self.env.action_space.sample()

        return best_action, root

    def _backpropagate(self, node: Node, value: float):
        """Raise the value of ``node`` and of every ancestor to at least ``value``."""
        current = node
        while current is not None:
            if value > current.value:
                current.value = value
            current = current.parent

    def _remember(self, state: np.ndarray, target_value: float):
        """存储经验"""
        self.replay_buffer.append((state, target_value))

    def _update_network(self, states, targets):
        states = torch.FloatTensor(states)
        targets = torch.FloatTensor(targets)

        predictions = self.model(states).squeeze()
        loss = self.loss_fn(predictions, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.train_step_counter % self.target_update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())
    def _update_rnd_network(self, states):
        """更新 RND 的预测网络"""
        states = torch.FloatTensor(states)
        predictions = self.predictor_net(states)
        with torch.no_grad():
            targets = self.random_net(states)
        rnd_loss = (predictions - targets).pow(2).mean()

        self.rnd_optimizer.zero_grad()
        rnd_loss.backward()
        self.rnd_optimizer.step()


    def _replay(self):
        """经验回放训练"""
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
        states, targets = zip(*[self.replay_buffer[i] for i in batch])

        # 更新启发式网络
        self.train_step_counter += 1
        self._update_network(states, targets)

        # 更新 RND 网络
        self._update_rnd_network(states)


def train_agent():
    """Play three CartPole-v1 episodes, choosing every action with a fresh BFS search.

    The running total reward is printed after every step and once more at the
    end of each episode. The environment is closed on exit, even if a step
    raises.
    """
    env = gym.make('CartPole-v1')
    agent = BFS(num_simulations=100)

    try:
        for episode in range(3):
            state, _ = env.reset()
            total_reward = 0
            done = False

            while not done:
                action, _ = agent.bfs_search(state)
                state, reward, terminated, truncated, _ = env.step(action)
                # An episode ends on truncation as well as on termination;
                # checking only ``terminated`` would keep stepping a
                # truncated episode.
                done = terminated or truncated
                total_reward += reward
                print(f"Episode {episode+1}, Total Reward: {total_reward}")

            print(f"Episode {episode+1}, Total Reward: {total_reward}")
    finally:
        env.close()

if __name__ == "__main__":
    train_agent()

Episode 1, Total Reward: 1.0
Episode 1, Total Reward: 2.0
Episode 1, Total Reward: 3.0
Episode 1, Total Reward: 4.0
Episode 1, Total Reward: 5.0
Episode 1, Total Reward: 6.0
Episode 1, Total Reward: 7.0
Episode 1, Total Reward: 8.0
Episode 1, Total Reward: 9.0
Episode 1, Total Reward: 10.0
Episode 1, Total Reward: 11.0
Episode 1, Total Reward: 12.0
Episode 1, Total Reward: 13.0
Episode 1, Total Reward: 14.0
Episode 1, Total Reward: 15.0
Episode 1, Total Reward: 16.0
Episode 1, Total Reward: 17.0
Episode 1, Total Reward: 18.0
Episode 1, Total Reward: 19.0


KeyboardInterrupt: 

## 4. Backpropagate

### 4.1 Frozen-lake

In [None]:
from typing import List, Tuple, Optional
from queue import PriorityQueue
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from dataclasses import dataclass, field
from functools import lru_cache

# Heuristic value network
class HeuristicNetwork(nn.Module):
    """Map a discrete grid-state index to a heuristic score between 0 and 1.

    The index is embedded into a 32-dimensional vector and fed through a
    small MLP whose last layer is a sigmoid.
    """

    def __init__(self, env_size):
        super().__init__()
        num_states = env_size ** 2  # one embedding row per grid cell
        self.embedding = nn.Embedding(num_states, 32)
        stack = [
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),  # squashes the output into the 0-1 range
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, state):
        """Return the heuristic score(s) for a tensor of state indices."""
        embedded = self.embedding(state)
        return self.fc(embedded)

# RND (random network distillation) feature network
class RNDNetwork(nn.Module):
    """Embed a discrete state index and project it to ``output_size`` features."""

    def __init__(self, env_size, output_size=32):
        super().__init__()
        num_states = env_size ** 2  # one embedding row per grid cell
        self.embedding = nn.Embedding(num_states, 32)
        stack = [
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, output_size),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, state):
        """Return the feature vector(s) for a tensor of state indices."""
        embedded = self.embedding(state)
        return self.fc(embedded)

@dataclass
class Node:
    """Search-tree node for the grid-world search."""

    state: int                                    # discrete state index of this node
    action_taken: Optional[int] = None            # action that produced this node (None by default)
    parent: Optional["Node"] = None               # predecessor in the tree (None by default)
    children: dict = field(default_factory=dict)  # fresh dict per instance
    visit_count: int = 0
    value: float = 0.0

class NeuralEnhancedBFSwithRND:
    def __init__(self, env_size: int = 8, num_simulations: int = 100,
                 buffer_size: int = 10000, batch_size: int = 32, rnd_scale: float = 0.40):
        """Set up the environment, the heuristic networks, RND and the replay buffer.

        Args:
            env_size: Side length of the square FrozenLake map. The goal state
                is taken to be the last cell, ``env_size**2 - 1``.
            num_simulations: Accepted for interface compatibility; not used by
                this constructor.
            buffer_size: Maximum number of transitions kept in the replay buffer.
            batch_size: Number of transitions sampled per replay step.
            rnd_scale: Scaling factor stored as ``self.rnd_scale``.
        """
        self.env_size = env_size
        self.env = self._create_env()
        self.goal_state = env_size**2 - 1

        # Heuristic (value) network and its target copy.
        self.model = HeuristicNetwork(env_size)
        self.target_model = HeuristicNetwork(env_size)
        # Start the target network as an exact copy of the online network;
        # otherwise bootstrap targets come from unrelated random weights until
        # the first periodic sync in `_update_network`.
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.MSELoss()

        # RND: a fixed random target network and a trainable predictor.
        self.random_net = RNDNetwork(env_size).eval()
        for param in self.random_net.parameters():
            # The target network is never trained; make that explicit.
            param.requires_grad_(False)
        self.predictor_net = RNDNetwork(env_size)
        self.rnd_optimizer = optim.Adam(self.predictor_net.parameters(), lr=0.001)
        self.rnd_scale = rnd_scale

        # Experience replay.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Target-network sync interval, counted in training steps.
        self.target_update_interval = 15
        self.train_step_counter = 0

        # Running statistics for RND reward normalisation (Welford's method).
        self.rnd_mean = 0.0
        self.rnd_std = 1.0
        self.rnd_count = 0
        self.rnd_m2 = 0.0

    def _create_env(self) -> gym.Env:
        """Build the non-slippery FrozenLake environment used for simulation.

        The map name is derived from ``self.env_size`` (e.g. ``"8x8"``) and no
        render mode is requested.
        """
        side = self.env_size
        env_kwargs = dict(
            map_name=f"{side}x{side}",
            is_slippery=False,
            render_mode=None,
        )
        return gym.make('FrozenLake-v1', **env_kwargs)

    def _get_valid_actions(self, state: int, env_size: int) -> List[int]:
        row, col = state // env_size, state % env_size
        valid_actions = []
        if col > 0: valid_actions.append(0)    # 左
        if row < env_size - 1: valid_actions.append(1)    # 下
        if col < env_size - 1: valid_actions.append(2)    # 右
        if row > 0: valid_actions.append(3)    # 上
        return valid_actions

    def _get_action_path(self, node: Node) -> List[int]:
        path = []
        current = node
        while current.parent:
            path.append(current.action_taken)
            current = current.parent
        return list(reversed(path))

    def _calculate_heuristic(self, state: int) -> float:
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            return self.model(state_tensor).item()

    def _calculate_rnd_reward(self, state: int) -> float:
        """计算 RND 内部奖励"""
        state_tensor = torch.LongTensor([state])
        with torch.no_grad():
            target = self.random_net(state_tensor)
            prediction = self.predictor_net(state_tensor)
        raw_rnd = (target - prediction).pow(2).mean().item()

        '''# Welford 在线算法更新均值和标准差
        self.rnd_count += 1
        delta = raw_rnd - self.rnd_mean
        self.rnd_mean += delta / self.rnd_count
        delta2 = raw_rnd - self.rnd_mean
        self.rnd_m2 += delta * delta2
        if self.rnd_count > 1:
            self.rnd_std = np.sqrt(self.rnd_m2 / (self.rnd_count - 1))

        # 归一化并缩放
        normalized_rnd = (raw_rnd - self.rnd_mean) / (self.rnd_std + 1e-5)  # 避免除零
        scaled_rnd = self.rnd_scale * normalized_rnd

        # 裁剪到合理范围'''
        return raw_rnd * 0.2 #根据论文建议，限制最大值

    def _update_network(self, states, targets):
        states = torch.LongTensor(states)
        targets = torch.FloatTensor(targets)

        predictions = self.model(states).squeeze()
        loss = self.loss_fn(predictions, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.train_step_counter % self.target_update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

    def _update_rnd_network(self, states):
        """更新 RND 的预测网络"""
        states = torch.LongTensor(states)
        predictions = self.predictor_net(states)
        with torch.no_grad():
            targets = self.random_net(states)
        rnd_loss = (predictions - targets).pow(2).mean()

        self.rnd_optimizer.zero_grad()
        rnd_loss.backward()
        self.rnd_optimizer.step()

    def _remember(self, state, target, next_state=None):
        """存储经验，包括 RND 的状态"""
        self.replay_buffer.append((state, target, next_state if next_state is not None else state))

    def _replay(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
        states, targets, next_states = zip(*[self.replay_buffer[i] for i in batch])

        # 更新启发式网络
        self.train_step_counter += 1
        self._update_network(states, targets)

        # 更新 RND 网络
        self._update_rnd_network(next_states)

    def _get_bootstrap_target(self, state):
        with torch.no_grad():
            state_tensor = torch.LongTensor([state])
            return self.target_model(state_tensor).item()

    def bfs_search2(self, start_state: int) -> Tuple[Optional[List[int]], int, Node]:
        """Best-first search from ``start_state`` with a single value back-propagation at the end.

        Nodes are expanded in order of decreasing value (heuristic plus RND
        bonus). While searching, training tuples are pushed to the replay
        buffer and ``_replay`` is called once per expanded node. When the
        frontier is exhausted or the goal is popped, ``_backpropagate_values``
        is run over the tree.

        Returns:
            Always ``(None, 0, root_node)``; the first two entries are
            placeholders and only the tree root carries information.
        """
        visited = set()
        queue = PriorityQueue()
        root_node = Node(state=start_state)
        # PriorityQueue pops the smallest entry, so values are negated to expand
        # the highest-valued node first. id(node) is a unique tie-breaker, so
        # tuples with equal priority never fall through to comparing Node objects.
        queue.put((-self._calculate_heuristic(start_state), id(root_node), root_node))
        found_goal = False
        goal_node = None

        while not queue.empty():
            _, _, current_node = queue.get()

            # A state can be queued several times by different parents; only
            # the first pop of each state is expanded.
            if current_node.state in visited:
                continue
            visited.add(current_node.state)

            if current_node.state == self.goal_state:
                found_goal = True
                goal_node = current_node
                # The goal state is stored with the maximum target of 1.0.
                self._remember(current_node.state, 1.0, current_node.state)
                break
            # Collect training data, including the RND bonus: the parent's
            # target is the target-network value of this node plus its bonus.
            if current_node.parent is not None:
                target = self._get_bootstrap_target(current_node.state)
                rnd_reward = self._calculate_rnd_reward(current_node.state)
                total_target = min(target + rnd_reward, 1.0)  # cap the target at 1
                self._remember(current_node.parent.state, total_target, current_node.state)



            for action in self._get_valid_actions(current_node.state, self.env_size):
                # Simulate one step from current_node.state by resetting the env
                # and overwriting its state.
                # NOTE(review): assumes the unwrapped env keeps its current
                # state in the attribute `s` - confirm against gymnasium.
                self.env.reset()
                self.env.unwrapped.s = current_node.state
                next_state, _, terminated, _, _ = self.env.step(action)

                # Terminal but not the goal (presumably a hole): store it with
                # target 0.0 and do not expand it.
                if terminated and next_state != self.goal_state:
                    self._remember(next_state, 0.0, next_state)
                    continue

                if next_state not in visited:
                    next_node = Node(
                        state=next_state,
                        action_taken=action,
                        parent=current_node,
                        value=self._calculate_heuristic(next_state) + self._calculate_rnd_reward(next_state)
                    )
                    current_node.children[action] = next_node
                    priority = -next_node.value
                    queue.put((priority, id(next_node), next_node))

            # One replay/training step per expanded node.
            self._replay()

        self._backpropagate_values(root_node, goal_node if found_goal else None)
        return None, 0, root_node

    def _backup(self, new_node, root_node):
        """回溯更新节点价值"""
        discount_factor = 0.95  # γ - 未来奖励的折扣因子
        lambda_param = 0.8      # λ - 资格迹衰减参数
        base_learning_rate = 0.1  # α - 基础学习率

        traces = {}
        path_nodes = []
        current = new_node
        while current:
          traces[current.state] = 0.0  # 先清零，稍后递增
          path_nodes.append(current)
          current = current.parent
        path_nodes = list(reversed(path_nodes))
        for i, node in enumerate(path_nodes):
          if i == len(path_nodes) - 1:
            continue
          traces[node.state] = traces.get(node.state, 0.0) + 1.0
          current_reward = 1.0 if new_node.state == self.goal_state else 0.0

          # 可以加入RND奖励
          current_reward += self._calculate_rnd_reward(path_nodes[i+1].state)

          # 使用启发式估计作为未来价值的估计
          bootstrap_value = self._calculate_heuristic(path_nodes[i+1].state)

          # TD目标 = 当前奖励 + 折扣系数 * 未来价值估计
          td_target = current_reward + discount_factor * bootstrap_value

          td_error = td_target - node.value

          for state in traces:
            node = self._find_node_by_state(state, root_node)  # 假设有方法找到对应节点
            node.value += base_learning_rate * td_error * traces[state]

          # 衰减所有资格迹
          for state in traces:
            traces[state] *= discount_factor * lambda_param
    def _backpropagate_values(self, root_node: Node, goal_node: Optional[Node] = None):
      """
      Propagate values through the search tree using TD(λ) principles.

      Args:
          root_node: The root node of the search tree
          goal_node: The goal node if found, None otherwise
      """
      discount_factor = 0.95  # γ - 未来奖励的折扣因子
      lambda_param = 0.8      # λ - 资格迹衰减参数
      base_learning_rate = 0.1  # α - 基础学习率

      traces = {}

      if goal_node:
          print('find goal')
          goal_node.value = 1.0  # 目标节点奖励
          current = goal_node

          # 初始化路径上的资格迹
          while current:
              traces[current.state] = 0.0  # 先清零，稍后递增
              current = current.parent

          # 从目标向上传播
          current = goal_node
          while current:
              # 资格迹递增（当前状态）
              traces[current.state] = traces.get(current.state, 0.0) + 1.0

              # 计算 TD 目标和误差
              r = 1.0 if current == goal_node else 0.0  # 仅目标节点有奖励
              next_value = current.value
              td_target = r + discount_factor * next_value
              td_error = td_target - current.value

              # 更新所有有资格迹的状态的价值
              for state in traces:
                  node = self._find_node_by_state(state,root_node)  # 假设有方法找到对应节点
                  node.value += base_learning_rate * td_error * traces[state]

              # 衰减所有资格迹
              for state in traces:
                  traces[state] *= discount_factor * lambda_param

              current = current.parent
      else:
          print('xxxx')  # 未找到目标，待补充逻辑



    def bfs_search(self, start_state: int) -> Tuple[Optional[List[int]], int, Node]:
        """Best-first search from ``start_state`` with an incremental backup per new node.

        Nodes are expanded in order of decreasing value (heuristic plus RND
        bonus). Every newly created child triggers ``_backup`` along its path
        to the root, training tuples are pushed to the replay buffer, and
        ``_replay`` is called once per expanded node.

        Returns:
            Always ``(None, 0, root_node)``; the first two entries are
            placeholders and only the tree root carries information.
        """
        visited = set()
        queue = PriorityQueue()
        root_node = Node(state=start_state)
        # PriorityQueue pops the smallest entry, so values are negated to expand
        # the highest-valued node first. id(node) is a unique tie-breaker, so
        # tuples with equal priority never fall through to comparing Node objects.
        queue.put((-self._calculate_heuristic(start_state), id(root_node), root_node))
        # found_goal / goal_node are only consumed by the disabled
        # back-propagation call at the end of this method.
        found_goal = False
        goal_node = None

        while not queue.empty():
            _, _, current_node = queue.get()

            # A state can be queued several times by different parents; only
            # the first pop of each state is expanded.
            if current_node.state in visited:
                continue
            visited.add(current_node.state)

            if current_node.state == self.goal_state:
                found_goal = True
                goal_node = current_node
                # The goal state is stored with the maximum target of 1.0.
                self._remember(current_node.state, 1.0, current_node.state)
                break

            # Collect training data, including the RND bonus: the parent's
            # target is the target-network value of this node plus its bonus.
            if current_node.parent is not None:
                target = self._get_bootstrap_target(current_node.state)
                rnd_reward = self._calculate_rnd_reward(current_node.state)
                total_target = min(target + rnd_reward, 1.0)  # cap the target at 1
                self._remember(current_node.parent.state, total_target, current_node.state)

            for action in self._get_valid_actions(current_node.state, self.env_size):
                # Simulate one step from current_node.state by resetting the env
                # and overwriting its state.
                # NOTE(review): assumes the unwrapped env keeps its current
                # state in the attribute `s` - confirm against gymnasium.
                self.env.reset()
                self.env.unwrapped.s = current_node.state
                next_state, _, terminated, _, _ = self.env.step(action)

                # Terminal but not the goal (presumably a hole): store it with
                # target 0.0 and do not expand it.
                if terminated and next_state != self.goal_state:
                    self._remember(next_state, 0.0, next_state)
                    continue

                if next_state not in visited:
                    next_node = Node(
                        state=next_state,
                        action_taken=action,
                        parent=current_node,
                        value=self._calculate_heuristic(next_state) + self._calculate_rnd_reward(next_state)
                    )
                    current_node.children[action] = next_node
                    priority = -next_node.value
                    queue.put((priority, id(next_node), next_node))
                    # Update values along the path root -> next_node right away.
                    # The queue entry above keeps the priority computed before
                    # this backup.
                    self._backup(next_node,root_node)

            # One replay/training step per expanded node.
            self._replay()

        # Whole-tree value back-propagation after the search (currently disabled)
        #self._backpropagate_values(root_node, goal_node if found_goal else None)



        return None, 0, root_node
    def _find_node_by_state(self, state, start_node):
      """递归搜索树以找到具有特定状态的节点"""
      # 基本情况：当前节点具有目标状态
      if start_node.state == state:
          return start_node

      # 递归搜索每个子节点
      for child in start_node.children.values():
          found = self._find_node_by_state(state, child)
          if found:
              return found

      # 如果在当前子树中没有找到，返回None
      return None
    def get_best_action_from_tree(self, root_node: Node) -> int:
        best_action = None
        best_value = float('-inf')

        for action, child in root_node.children.items():
            value = self._evaluate_subtree(child)
            if value > best_value:
                best_value = value
                best_action = action

        return best_action #if best_action is not None else random.choice(self._get_valid_actions(root_node.state, self.env_size))

    def _evaluate_subtree(self, node: Node) -> float:
        if node.state == self.goal_state:
            return float('inf')

        heuristic_value = self._calculate_heuristic(node.state)
        rnd_reward = self._calculate_rnd_reward(node.state)
        children_value = max([self._evaluate_subtree(child) for child in node.children.values()]) if node.children else 0
        return  node.value +0.5 * children_value

    def search(self, root_state: int) -> int:
        _, _, root_node = self.bfs_search(root_state)
        best_action = self.get_best_action_from_tree(root_node)
        return best_action

    def _simulate_step(self, state, action):
        self.env.reset()
        self.env.unwrapped.s = state
        next_state, _, _, _, _ = self.env.step(action)
        return next_state


In [None]:
import gymnasium as gym
import torch
import numpy as np
from collections import defaultdict
from queue import PriorityQueue
from typing import List, Tuple, Dict, Optional, Set
from dataclasses import dataclass

def test_enhanced_bfs():
    """Play FrozenLake 8x8 with NeuralEnhancedBFSwithRND and print a step-by-step log.

    A single planner instance is reused for all episodes. Each episode runs
    until the environment reports termination/truncation or 100 steps have
    been taken; the average episode reward is printed at the end.
    """
    max_steps = 100
    num_episodes = 1

    # Environment the agent acts in, and the planner that picks each action.
    env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=False)
    bfs = NeuralEnhancedBFSwithRND()

    print("\n开始测试NeuralEnhancedBFSwithRND...")

    total_reward = 0
    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_reward = 0
        steps = 0
        done = False

        print(f"\n回合 {episode + 1}:")
        print(f"起始状态: {state}")

        while steps < max_steps and not done:
            # Plan from the current state and pick an action.
            action = bfs.search(state)
            print(f"Steps {steps}: 在状态 {state} 选择动作 {action}")

            # Apply the action in the real environment.
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            steps += 1
            done = terminated or truncated

            print(f"-> 新状态: {state}, 奖励: {reward}")

            if not done:
                continue
            # Episode over: a positive final reward means the goal was reached.
            print("成功到达目标！" if reward > 0 else "失败（掉入陷阱或超时）")

        total_reward += episode_reward
        print(f"回合 {episode + 1} 结束 - 总步数: {steps}, 总奖励: {episode_reward}")

    print(f"\n测试完成 - 平均奖励: {total_reward/num_episodes}")

if __name__ == "__main__":
    test_enhanced_bfs()


开始测试NeuralEnhancedBFSwithRND...

回合 1:
起始状态: 0
Steps 0: 在状态 0 选择动作 2
-> 新状态: 1, 奖励: 0.0
Steps 1: 在状态 1 选择动作 1
-> 新状态: 9, 奖励: 0.0
Steps 2: 在状态 9 选择动作 1
-> 新状态: 17, 奖励: 0.0
Steps 3: 在状态 17 选择动作 1
-> 新状态: 25, 奖励: 0.0
Steps 4: 在状态 25 选择动作 2
-> 新状态: 26, 奖励: 0.0
Steps 5: 在状态 26 选择动作 2
-> 新状态: 27, 奖励: 0.0
Steps 6: 在状态 27 选择动作 2
-> 新状态: 28, 奖励: 0.0
Steps 7: 在状态 28 选择动作 1
-> 新状态: 36, 奖励: 0.0
Steps 8: 在状态 36 选择动作 1
-> 新状态: 44, 奖励: 0.0
Steps 9: 在状态 44 选择动作 2
-> 新状态: 45, 奖励: 0.0
Steps 10: 在状态 45 选择动作 1
-> 新状态: 53, 奖励: 0.0
Steps 11: 在状态 53 选择动作 1
-> 新状态: 61, 奖励: 0.0
Steps 12: 在状态 61 选择动作 2
-> 新状态: 62, 奖励: 0.0
Steps 13: 在状态 62 选择动作 2
-> 新状态: 63, 奖励: 1.0
成功到达目标！
回合 1 结束 - 总步数: 14, 总奖励: 1.0

测试完成 - 平均奖励: 1.0


#### 4.2 Cartpole