### Deep Q-learning公式（off-policy版本）：

![](assets/253.jpg)

### 例子

![](assets/255.jpg)

![](assets/256.jpg)

### 上述例子代码实现

例子：
使用gym仿真库，gym官网: https://www.gymlibrary.dev/environments/toy_text/frozen_lake/

In [2]:
!pip install gym==0.15.4
!pip install numpy
!pip install torch
!pip install tqdm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [8]:
from copy import deepcopy
from collections import namedtuple

import numpy as np
from tqdm import tqdm

import gym
from gym import spaces
from gym.envs.registration import register

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


one_step_experience = namedtuple('one_step_experience', field_names=['current_observation', 'current_action', 'reward', 'next_observation'])

class CustomGridWorld(gym.Env):
    def __init__(self, grid_size=(5, 5), goal_position=(3, 2), forbidden_grids=None, action_space=5):
        super(CustomGridWorld, self).__init__()
        # Grid size (rows, columns)
        self.grid_size = grid_size
        self.goal_position = goal_position
        # Define action space: up, right, down, left, unchanged (5 actions)
        self.action_space = spaces.Discrete(action_space)
        # Observation space: grid positions, represented as a flat space
        self.observation_space = spaces.Discrete(grid_size[0] * grid_size[1])
        # Initialize agent's starting position (top-left corner)
        self.state = (0, 0)
        self.done = False
        # Set the forbidden grids (if not specified, use a default list)
        if forbidden_grids is None:
            forbidden_grids = [(1, 1), (1, 2), (2, 2), (3, 1), (3, 3), (4, 1)]  # Given forbidden grids
        self.forbidden_grids = set(forbidden_grids)

    def reset(self):
        """Resets the environment to the initial state"""
        self.state = (0, 0)  # Reset the agent to the top-left corner
        self.done = False
        return self._get_observation()

    def step(self, action):
        """Executes one step in the environment"""
        # if self.done:
        #     return self._get_observation(), 1, True, {}

        x, y = self.state
        # Define movement based on action
        if action == 0:  # Up
            new_x = max(0, x - 1)
            new_y = y
        elif action == 1:  # Right
            new_x = x
            new_y = min(self.grid_size[1] - 1, y + 1)
        elif action == 2:  # Down
            new_x = min(self.grid_size[0] - 1, x + 1)
            new_y = y
        elif action == 3:  # Left
            new_x = x
            new_y = max(0, y - 1)
        elif action == 4:  # Unchanged (stay in the same position)
            new_x = x
            new_y = y

        # Check if the new position is out of bounds
        if new_x < 0 or new_x >= self.grid_size[0] or new_y < 0 or new_y >= self.grid_size[1]:
            reward = -1  # Penalty for trying to go out of bounds
            self.state = (x, y)  # Keep the agent at the same position
        else:
            self.state = (new_x, new_y)
            # Check if the agent reached the goal
            if self.state == self.goal_position:
                reward = 1
                self.done = True
            # Check if the agent stepped into a forbidden grid
            elif self.state in self.forbidden_grids:
                reward = -1  # Penalty for entering a forbidden grid
            else:
                reward = 0  # No penalty for regular move

        return self._get_observation(), reward, self.done, {}

    def render(self, mode='human'):
        """Renders the environment (prints the grid)"""
        grid = np.full(self.grid_size, 'F', dtype=object)  # Default is frozen
        # Set goal, forbidden grids, and agent position
        grid[self.goal_position] = 'G'
        for f in self.forbidden_grids:
            grid[f] = 'H'  # H for hole (forbidden grid)
        # Print the grid with agent position
        grid[self.state] = 'A'
        for row in grid:
            print(' '.join(row))

    def _get_observation(self):
        """Returns the current state as a flat index"""
        return self.state[0] * self.grid_size[1] + self.state[1]

    def close(self):
        """Close the environment"""
        pass
    
    def vis_policy(self, q_table):
        self.reset()
        self.render()
        action_maps = {0: '↑', 1: '→', 2: '↓', 3: '←', 4: '⊙'}
        policy = np.full(self.grid_size, '⊙', dtype=object)
        for row in range(self.grid_size[0]):
            for col in range(self.grid_size[1]):
                index = row * self.grid_size[0] + col
                action = q_table[index].argmax()
                policy[row, col] = action_maps[action]
        print(policy)
        
        
class DQNModel(nn.Module):
    def __init__(self, input_dim: int, hidden_layers: list[int], output_dim: int):
        super().__init__()
        layers = nn.ModuleList()
        layer_dims = [input_dim] + hidden_layers

        for index in range(len(layer_dims) - 1):
            linear = nn.Linear(layer_dims[index], layer_dims[index+1], bias=True)
            activation = nn.ReLU(inplace=True)
            layers.extend([linear, activation])
        out_linear = nn.Linear(layer_dims[-1], output_dim)
        layers.append(out_linear)
        
        self.dqn = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.dqn(x)
    

class DQNDataset(Dataset):
    def __init__(self, replay_buffer, n_observations):
        super().__init__()
        self.replay_buffer = replay_buffer
        self.n_observations = n_observations
    
    def __len__(self):
        return len(self.replay_buffer)
    
    def __getitem__(self, index):
        state = torch.tensor(self.replay_buffer[index].current_observation, dtype=torch.int64)
        state = F.one_hot(state, self.n_observations).to(torch.float32)
        action = torch.tensor(self.replay_buffer[index].current_action, dtype=torch.int)
        reward = torch.tensor(self.replay_buffer[index].reward, dtype=torch.float32)[None]
        state_prime = torch.tensor(self.replay_buffer[index].next_observation, dtype=torch.int64)
        state_prime = F.one_hot(state_prime, self.n_observations).to(torch.float32)
        return state, action, reward, state_prime
    
        
class DQNSolver:
    def __init__(
        self, grid_size: tuple, goal_position: tuple, forbidden_grids: list[tuple], action_space: int,
        hidden_layers: list[int], device: torch.device, batch_size: int, lr: float=1e-3
    ):
        self.device = device
        self.batch_size = batch_size
        self.grid_size = grid_size
        self.action_space = action_space
        self._init_env(grid_size, goal_position, forbidden_grids)
        self._init_model(hidden_layers)
        self._init_trainer(lr)
    
    def _init_env(self, grid_size: tuple, goal_position: tuple, forbidden_grids: list[tuple], action_space: int=5):
        self.env = CustomGridWorld(grid_size=grid_size, goal_position=goal_position, forbidden_grids=forbidden_grids, action_space=action_space)
        self.env.render()
        self.n_observations = self.env.observation_space.n
    
    def _init_model(self, hidden_layers: list[int]):
        self.model = DQNModel(self.n_observations, hidden_layers, self.action_space).to(self.device)
        self.model_y = deepcopy(self.model)
        self._disable_grad(self.model_y)
    
    def _init_trainer(self, lr):
        self.loss_fn = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr)
    
    def _disable_grad(self, model):
        for param in model.parameters():
            param.requires_grad_(False)
        
    def _generate_episode(self, episode_one: list, n_steps: int):
        current_observation = self.env.reset()
        for _ in range(n_steps):
            current_action = self.env.action_space.sample()
            next_observation, reward, _, _ = self.env.step(current_action)
            ose = one_step_experience(current_observation=current_observation, current_action=current_action, reward=reward, next_observation=next_observation)
            current_observation = next_observation
            episode_one.append(ose)
    
    def _create_dataloader(self, replay_buffer):
        dataset = DQNDataset(replay_buffer, self.n_observations)
        return DataLoader(dataset, self.batch_size, shuffle=True, pin_memory=self.device is torch.device('cuda'), drop_last=True)
    
    def solve(self, n_steps: int, n_episodes: int, n_epochs: int, C_iters: int, log_iters: int, gamma: float, vis_policy: bool=True):
        replay_buffer = []
        print(f'start generating {n_episodes} episodes with {n_steps} steps......')
        for _ in range(n_episodes):
            episode_one = []
            self._generate_episode(episode_one, n_steps)
            replay_buffer += episode_one
        print('done')
        
        dataloader = self._create_dataloader(replay_buffer)
        
        gamma = torch.tensor(gamma, dtype=torch.float32, device=self.device)
        total_iters = 0
        total_loss = 0
        self.model.train()
        print(f'start training for {n_epochs} epochs')
        pbar = tqdm(range(n_epochs))
        for epoch in pbar:
            for states, actions, rewards, states_prime in dataloader:
                total_iters += 1
                states = states.to(self.device)
                actions = actions.to(self.device)
                rewards = rewards.to(self.device)
                states_prime = states_prime.to(self.device)
                # compute yT
                y_T = rewards + gamma * torch.max(self.model_y(states_prime), dim=-1, keepdim=True)[0]
                y_pred = self.model(states)[range(self.batch_size), actions][..., None]
                # compute loss
                loss = self.loss_fn(y_pred, y_T)
                total_loss += loss.item()
                # zero grad
                self.optimizer.zero_grad()
                # backward
                loss.backward()
                # step
                self.optimizer.step()
                
                if total_iters % C_iters:
                    self.model_y = deepcopy(self.model)
                    self._disable_grad(self.model_y)
                
                if total_iters % log_iters == 0:
                    desc = f'Epoch: {epoch + 1} / {n_epochs}, average loss: {total_loss / total_iters:.3f}'
                    pbar.set_description(desc)
        print('done')
        
        if vis_policy:
            print('render final policy...')
            self.vis_policy()
        print('All done!')
    
    def create_fake_qtable(self):
        self.model.eval()
        fake_q_table = torch.zeros([self.n_observations, self.action_space], device=self.device)
        for state in range(self.n_observations):
            state_input = torch.tensor(state, dtype=torch.int64, device=self.device)
            state_input = F.one_hot(state_input, self.n_observations).to(torch.float32)
            with torch.no_grad():
                q_values = self.model(state_input).squeeze(0).detach().cpu().numpy()
                optimal_action = q_values.argmax(0)
                fake_q_table[state, optimal_action] = 1
        return fake_q_table
                
    
    def vis_policy(self):
        fake_q_table = self.create_fake_qtable()
        self.env.vis_policy(fake_q_table.cpu().numpy())
        

if __name__ == '__main__':
    grid_size = (5, 5)
    goal_position = (3, 2)
    forbidden_grids = [(1, 1), (1, 2), (2, 2), (3, 1), (3, 3), (4, 1)]
    action_space = 5
    hidden_layers = [100]
    output_dim = action_space
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    batch_size = 128
    lr = 1e-3
    solver = DQNSolver(
        grid_size, goal_position, forbidden_grids, action_space,
        hidden_layers, device, batch_size, lr 
    )
    n_steps = 1000
    n_episodes = 1
    n_epochs = 200
    C_iters = 10
    log_iters = 200
    gamma = 0.99
    vis_policy = True
    solver.solve(n_steps, n_episodes, n_epochs, C_iters, log_iters, gamma, vis_policy)

A F F F F
F H H F F
F F H F F
F H G H F
F H F F F
start generating 1 episodes with 1000 steps......
done
start training for 200 epochs


Epoch: 200 / 200, average loss: 0.051: 100%|████████████████████████████████████████████| 200/200 [00:14<00:00, 13.45it/s]

done
render final policy...
A F F F F
F H H F F
F F H F F
F H G H F
F H F F F
[['↓' '→' '↓' '↓' '←']
 ['↓' '↓' '↓' '↓' '←']
 ['→' '→' '↓' '↓' '←']
 ['→' '→' '⊙' '←' '←']
 ['↑' '→' '↑' '←' '←']]
All done!



