<a href="https://colab.research.google.com/github/ImaginationX4/HybridZero/blob/main/A0_Greddy_BFS_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [None]:
from math import pi
import gymnasium as gym
import numpy as np
from queue import PriorityQueue
import time
import random

import torch
import torch.nn as nn
import torch.optim as optim
from IPython import display

class ValueNetwork(nn.Module):
    """Small MLP that maps a one-hot encoded state to a scalar value.

    Args:
        state_dim: Length of the one-hot state vector.
        action_dim: Accepted for backward compatibility only; the network
            has a single value head and never reads this argument.
        hidden_dim: Width of each of the two hidden layers.
    """

    def __init__(self, state_dim=16, action_dim=4, hidden_dim=64):
        super().__init__()
        # Remembered so that forward() can one-hot encode integer states with
        # the same width the first linear layer expects.
        self.state_dim = state_dim
        self.shared_layers = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.value_head = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Return the estimated value of ``state``.

        Args:
            state: Either an integer state index (Python ``int`` or NumPy
                integer), which is one-hot encoded to ``state_dim`` entries,
                or a tensor that is passed to the layers unchanged.

        Returns:
            Tensor whose last dimension has size 1.
        """
        if isinstance(state, (int, np.integer)):
            # Build the one-hot vector next to the parameters so the network
            # keeps working after being moved to another device or dtype.
            reference = self.value_head.weight
            one_hot = torch.zeros(
                self.state_dim, dtype=reference.dtype, device=reference.device
            )
            one_hot[state] = 1.0
            state = one_hot

        features = self.shared_layers(state)
        return self.value_head(features)

class AlphaZeroGreedyBFS:

  def __init__(self, learning_rate=0.001, gamma=0.99, tau=0.005):
      """Create the grid model, the value networks and the optimizer.

      Args:
          learning_rate: Learning rate handed to the Adam optimizer.
          gamma: Stored as ``self.gamma``; not read by the methods of this
              class shown here (presumably a discount factor).
          tau: Stored as ``self.tau``; not read by the methods of this class
              shown here (presumably a target-network update rate).
      """
      self.rows = 4
      self.cols = 4
      self.actions = 4

      self.value_net = ValueNetwork()
      self.target_net = ValueNetwork()
      # Initialize the target network with the value network's weights.
      self.target_net.load_state_dict(self.value_net.state_dict())

      # `network` used to be a third, independent model whose optimizer was
      # immediately overwritten, so nothing ever trained it. Keep the
      # attribute for existing callers but point it at the trained model.
      self.network = self.value_net

      self.optimizer = torch.optim.Adam(self.value_net.parameters(), lr=learning_rate)
      self.gamma = gamma
      self.tau = tau

      self.replay_buffer = []
      self.buffer_size = 1000
      self.batch_size = 32

  def state_to_tensor(self, state):

    state_tensor = torch.zeros(16)
    state_tensor[state] = 1.0
    return state_tensor

  def get_next_state(self, state, action):
      row = state // self.cols
      col = state % self.cols

      if action == 0:  # LEFT
          col = max(0, col - 1)
      elif action == 1:  # DOWN
          row = min(self.rows - 1, row + 1)
      elif action == 2:  # RIGHT
          col = min(self.cols - 1, col + 1)
      elif action == 3:  # UP
          row = max(0, row - 1)

      return row * self.cols + col

  def manhattan_distance(self, state, goal_state):

      current_row = state // self.cols
      current_col = state % self.cols
      goal_row = goal_state // self.cols
      goal_col = goal_state % self.cols
      return abs(current_row - goal_row) + abs(current_col - goal_col)
  def calculate_alpha(self,v_network, v_simulation, beta=1.0):
    """Squash the disagreement between two value estimates with a sigmoid.

    Args:
        v_network: Value estimate from the network.
        v_simulation: Value estimate from simulation.
        beta: Slope applied to the absolute difference before the sigmoid.

    Returns:
        ``sigmoid(beta * |v_network - v_simulation|)``; this is 0.5 when the
        two estimates agree and grows with their gap for positive ``beta``.
    """
    gap = abs(v_network - v_simulation)
    return 1 / (1 + np.exp(-beta * gap))

  def alphazero_heuristic(self, state, action):
      next_state = self.get_next_state(state, action)
      #base_h = self.manhattan_distance(next_state, 15)


      state_tensor = torch.zeros(16)
      state_tensor[next_state] = 1.0
      value = self.value_net(state_tensor).item()
      if next_state == 15:
        return value + 5

      #simulation!!!!!
      env = gym.make('FrozenLake-v1', is_slippery=False)
      env.reset()
      env.unwrapped.s = next_state
      simulated_values = []

      for _ in range(4):
        done = False
        curr_state = next_state
        gamma = 0.95
        step = 0
        cumulative_reward = 0

        while not done and step<10:
          # 加入exploration

          with torch.no_grad():
            action_values = [self.value_net(self.state_to_tensor(
                self.get_next_state(curr_state, a))).detach().item()
                for a in range(4)]
          action = np.argmax(action_values)

          next_state, reward, done, truncated, _ = env.step(action)
          curr_state = next_state

          if done and reward == 0:
              reward = -1
          elif reward == 1:
              reward = 10
          else:
              reward = -0.1

          cumulative_reward += gamma ** step * reward
          step += 1
          done = done or truncated

        simulated_values.append(cumulative_reward)

      simulation_value = np.mean(simulated_values)
      #print(value, simulation_value)

      return 0.65 * value + 0.75 * simulation_value
  def collect_episode(self, episode, episodes):
    """Play one epsilon-greedy episode and label every visited state.

    Exploration decays linearly with ``episode / episodes`` and is floored
    at 0.2. The episode stops when the environment reports it is done or
    after 20 steps. The outcome is turned into a single final reward
    (10 success, -3 timeout, -5 failure) that is propagated backwards with a
    discount of 0.95, so the last state gets the full final reward.

    Args:
        episode: Zero-based index of the current episode.
        episodes: Total number of episodes in the training run.

    Returns:
        Dict with keys ``'states'``, ``'actions'``, ``'values'`` (one
        discounted target per state) and ``'steps_to_goal'`` (the 1-based
        position of each state within the episode).
    """
    trajectory = {
        'states': [],
        'actions': [],
        'values': [],  # Will store discounted final rewards
        'steps_to_goal': []
    }

    max_steps = 20  # Step limit per episode
    # Constant for the whole episode, so compute it once.
    epsilon = max(0.2, 1 - episode/episodes)

    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
    try:
        state = env.reset()[0]
        done = False
        steps = 0
        reward = 0

        while not done and steps < max_steps:
            trajectory['states'].append(state)
            trajectory['steps_to_goal'].append(len(trajectory['states']))

            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action_values = [self.alphazero_heuristic(state, a)
                                 for a in range(4)]
                action = np.argmax(action_values)
            trajectory['actions'].append(action)

            next_state, reward, done, truncated, _ = env.step(action)
            done = done or truncated
            state = next_state
            steps += 1
    finally:
        # Release the environment even if action selection raises.
        env.close()

    # Determine final reward based on outcome
    if done and reward == 1:
        final_reward = 10  # Success
        print("Success")
    elif not done:
        # The loop can only exit with done == False by hitting the step
        # limit. Testing `done` (instead of the step count) keeps an episode
        # that ends in failure exactly on the last step labelled a failure.
        final_reward = -3  # Timeout penalty
    else:
        final_reward = -5  # Failure

    # Calculate values by propagating final reward backwards
    gamma = 0.95
    values = []
    current_value = final_reward
    for _ in trajectory['states']:
        values.append(current_value)
        current_value *= gamma
    values.reverse()  # earliest state first, final state last

    trajectory['values'] = values

    return trajectory





  def update_network(self, trajectory):


    # 将所有状态转成batch
    states = torch.stack([self.state_to_tensor(s) for s in trajectory['states']])
    targets = torch.tensor(trajectory['values']).float().unsqueeze(1)



    # 一次性计算所有预测
    values = self.value_net(states)

    # 计算损失
    value_loss = nn.MSELoss()(values, targets)

    self.optimizer.zero_grad()
    value_loss.backward()
    self.optimizer.step()
    '''if trajectory['values'][0] > 0:  # 如果是成功的轨迹
        # 多训练几次
        for _ in range(3):
            values = self.network(states)
            value_loss = nn.MSELoss()(values, targets)

            self.optimizer.zero_grad()
            value_loss.backward()
            self.optimizer.step()
    else:
        values = self.network(states)
        value_loss = nn.MSELoss()(values, targets)

        self.optimizer.zero_grad()
        value_loss.backward()
        self.optimizer.step()'''
    return value_loss.item()



  def train(self, episodes=1000):
    #self.TD_Training(episodes)
    for episode in range(episodes):
        trajectory = self.collect_episode(episode, episodes)
        loss = self.update_network(trajectory)
        print(f"Episode {episode + 1}/{episodes}, Loss: {loss}")
        display.clear_output(wait=True)


In [None]:
# Build a solver with the default hyperparameters and train it for the
# default number of episodes (1000).
a = AlphaZeroGreedyBFS()
a.train()

Success
Episode 1000/1000, Loss: 6.190967559814453


In [None]:
def aaa(solver):
  """Print, for each state 0-15, the value-net estimate of its four successors.

  Args:
      solver: Object providing ``get_next_state``, ``state_to_tensor`` and
          ``value_net``.
  """
  for i in range(16):
    # Lazily walk the successors in action order 0..3.
    successors = (solver.get_next_state(i, a) for a in range(4))
    b = [solver.value_net(solver.state_to_tensor(cell)).detach().item()
         for cell in successors]
    print(f'{i}',b)
# Dump the learned successor values of the trained solver.
aaa(a)

0 [2.9760804176330566, 1.3963580131530762, 3.871591329574585, 2.9760804176330566]
1 [2.9760804176330566, 4.170629501342773, 5.4293718338012695, 3.871591329574585]
2 [3.871591329574585, 5.931889057159424, 3.3932957649230957, 5.4293718338012695]
3 [5.4293718338012695, 3.9126687049865723, 3.3932957649230957, 3.3932957649230957]
4 [1.3963580131530762, 3.911417007446289, 4.170629501342773, 2.9760804176330566]
5 [1.3963580131530762, 5.825281143188477, 5.931889057159424, 3.871591329574585]
6 [4.170629501342773, 8.195137023925781, 3.9126687049865723, 5.4293718338012695]
7 [5.931889057159424, 3.7212297916412354, 3.9126687049865723, 3.3932957649230957]
8 [3.911417007446289, 4.479738235473633, 5.825281143188477, 1.3963580131530762]
9 [3.911417007446289, 6.550695419311523, 8.195137023925781, 4.170629501342773]
10 [5.825281143188477, 9.750393867492676, 3.7212297916412354, 5.931889057159424]
11 [8.195137023925781, 4.119339942932129, 3.7212297916412354, 3.9126687049865723]
12 [4.479738235473633, 4.47

In [None]:
def test(solver):
    """Run one greedy episode with ``solver`` and print every transition.

    Actions are chosen by the argmax of ``solver.alphazero_heuristic``. The
    episode ends when the environment reports termination or truncation.

    Args:
        solver: Object providing ``alphazero_heuristic``, ``value_net``,
            ``state_to_tensor`` and ``get_next_state``.
    """
    env = gym.make('FrozenLake-v1', is_slippery=False)
    try:
        state = env.reset()[0]
        done = False

        while not done:
            action_values = [solver.alphazero_heuristic(state, a)
                             for a in range(4)]
            # Raw value-net estimates of the four successors; these (not the
            # heuristic scores) are what gets printed for inspection.
            b = [solver.value_net(solver.state_to_tensor(
                    solver.get_next_state(state, a))).detach().item()
                 for a in range(4)]
            print('action_values',b)
            action = np.argmax(action_values)
            next_state, reward, done, truncated, _ = env.step(action)

            print('state, action, reward, next_state',(state, action, reward, next_state))
            if done and reward == 0:
                print('fail')
            if reward == 1:
                print('success')

            state = next_state
            done = done or truncated
    finally:
        # Release the environment even if the solver raises mid-episode.
        env.close()


# Replay one greedy episode with the trained solver and print the trace.
test(a)

action_values [2.9760804176330566, 1.3963580131530762, 3.871591329574585, 2.9760804176330566]
state, action, reward, next_state (0, 2, 0.0, 1)
action_values [2.9760804176330566, 4.170629501342773, 5.4293718338012695, 3.871591329574585]
state, action, reward, next_state (1, 2, 0.0, 2)
action_values [3.871591329574585, 5.931889057159424, 3.3932957649230957, 5.4293718338012695]
state, action, reward, next_state (2, 1, 0.0, 6)
action_values [4.170629501342773, 8.195137023925781, 3.9126687049865723, 5.4293718338012695]
state, action, reward, next_state (6, 1, 0.0, 10)
action_values [5.825281143188477, 9.750393867492676, 3.7212297916412354, 5.931889057159424]
state, action, reward, next_state (10, 1, 0.0, 14)
action_values [6.550695419311523, 9.750393867492676, 4.119339942932129, 8.195137023925781]
state, action, reward, next_state (14, 2, 1.0, 15)
success


### Thoughts about training

In [None]:
def collect_episode(self, episode, episodes):
    """Play one epsilon-greedy episode and record only its final reward.

    Exploration decays linearly with ``episode / episodes`` and is floored
    at 0.1. The episode is cut off once more than 10 states have been
    recorded; in that case the final reward is -1. Otherwise the final
    reward is the shaped reward of the last step (10 when the environment
    reward was 1, -1 when the episode ended with reward 0).

    Args:
        episode: Zero-based index of the current episode.
        episodes: Total number of episodes in the training run.

    Returns:
        Dict with keys ``'states'``, ``'actions'``, ``'final_reward'`` and
        ``'steps_to_goal'`` (the 1-based position of each state within the
        episode).
    """
    trajectory = {
        'states': [],
        'actions': [],
        'final_reward': None,  # Only the final reward is recorded
        'steps_to_goal': []    # 1-based position of each recorded state
    }

    max_states = 10
    # Constant for the whole episode, so compute it once.
    epsilon = max(0.1, 1 - episode/episodes)
    timed_out = False
    reward = 0

    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
    try:
        state = env.reset()[0]
        done = False
        with torch.no_grad():
            while not done:
                trajectory['states'].append(state)
                trajectory['steps_to_goal'].append(len(trajectory['states']))

                if len(trajectory['states']) > max_states:
                    timed_out = True
                    break

                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    action_values = [self.alphazero_heuristic(state, a)
                                     for a in range(4)]
                    action = np.argmax(action_values)

                trajectory['actions'].append(action)

                next_state, reward, done, truncated, _ = env.step(action)
                state = next_state
                done = done or truncated

                if done and reward == 0:
                    reward = -1
                if reward == 1:
                    reward = 10
    finally:
        # Release the environment even if action selection raises.
        env.close()

    # The timeout penalty used to be assigned inside the loop and then
    # overwritten by the last step reward right after it; keep it instead.
    trajectory['final_reward'] = -1 if timed_out else reward
    return trajectory

  def update_network(self, trajectory):
      """Take one optimizer step fitting the value net to discounted targets.

      Each state's target is ``final_reward * gamma ** (n - step)``, where
      ``n`` is the number of recorded states and ``step`` the state's entry in
      ``trajectory['steps_to_goal']``, so the last state gets the undiscounted
      final reward.

      Args:
          trajectory: Dict with ``'states'``, ``'steps_to_goal'`` and
              ``'final_reward'``.

      Returns:
          The mean-squared-error loss, as a Python float, measured before the
          parameter update.
      """
      gamma = 0.95  # Discount factor
      final_reward = trajectory['final_reward']

      # Convert all states into a single batch.
      states = torch.stack([self.state_to_tensor(s) for s in trajectory['states']])

      # Compute each state's target value from its step index.
      steps_to_goal = trajectory['steps_to_goal']
      total_steps = len(steps_to_goal)
      discounted_rewards = [final_reward * (gamma ** (total_steps - step))
                            for step in steps_to_goal]
      targets = torch.tensor(discounted_rewards).float().unsqueeze(1)

      # Compute every prediction in one forward pass. This must go through
      # `value_net`: it is the model whose parameters `self.optimizer` holds,
      # so predicting with `self.network` would leave the step with no
      # gradients to apply.
      values = self.value_net(states)

      # Compute the loss.
      value_loss = nn.MSELoss()(values, targets)

      self.optimizer.zero_grad()
      value_loss.backward()
      self.optimizer.step()

      return value_loss.item()