## DQN for continuous action spaces: Normalized Advantage Function (NAF)

In [None]:
%%capture

!apt-get update && apt-get install -y xvfb
!pip install swig
!pip install gym[box2d]==0.23.1 pytorch-lightning==1.6.0 pyvirtualdisplay

#### Setup virtual display

In [None]:
from pyvirtualdisplay import Display
Display(visible=False, size=(1400, 900)).start()

#### Import the necessary code libraries

In [1]:
import copy
import gym
import random
import torch

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from gym.wrappers import RecordVideo, RecordEpisodeStatistics


device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
num_gpus = torch.cuda.device_count()

  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  import cgi
  from urllib3.contr

In [3]:
def display_video(episode=0):
  video_file = open(f'/content/videos1/rl-video-episode-{episode}.mp4', "r+b").read()
  video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

In [5]:
action_dims_1=2
print(action_dims_1 + 1)
print(action_dims_1 * (action_dims_1 + 1) / 2)
print(int(action_dims_1 * (action_dims_1 + 1) / 2))

3
3.0
3


#### Create the Deep Q-Network

In [6]:
class NafDQN(nn.Module):
    
  def __init__(self, hidden_size, obs_size, action_dims, max_action):
    super().__init__()
    self.action_dims = action_dims
    print(f"This is the action_dims {self.action_dims}")
    self.max_action = torch.from_numpy(max_action).to(device)
    self.net = nn.Sequential(
      nn.Linear(obs_size, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, hidden_size),
      nn.ReLU(),   
    )
    self.linear_mu = nn.Linear(hidden_size, action_dims)#see the change here, this is not number of actions, number of actions are infinite
    self.linear_value = nn.Linear(hidden_size, 1)
    self.linear_matrix = nn.Linear(hidden_size, int(action_dims * (action_dims + 1) / 2))

  @torch.no_grad()
  def mu(self, x):
    x = self.net(x)
    x = self.linear_mu(x)
    x = torch.tanh(x) * self.max_action
    return x
  
  @torch.no_grad()
  def value(self, x):
    x = self.net(x)
    x = self.linear_value(x)
    return x

  def forward(self, x, a):
    x = self.net(x)
    mu = torch.tanh(self.linear_mu(x)) * self.max_action
    value = self.linear_value(x)
    matrix = torch.tanh(self.linear_matrix(x))
#     But as you can imagine right now, this matrix variable is actually a vector with values because the

# entries for that matrix have been produced by a linear layer.

# Now it's time for us to put these entries in an actual matrix.

# So what we are going to do is create an empty matrix that will called L because this is a lower triangular

# matrix and we will fill this matrix with the value zero.

# And how many entries should this matrix have?

# Well, it should have action dims by action dimes entries.
#[action_dims*actions_dims]

# But remember that in the forward pass, we are working with a bunch of observations, which means that

# instead of working with a single state, we might be working with eight, 16, 32, etc. So actually

# our matrix should hold batch_size * action_dims * action_dims

# That is one matrix of this size for each of the states in this batch.

# And to do it, we are going to take the states Tensor X and find its butt size, which is in the 0th

# position of its shape property.

# And then the other dimensions should be the action dims, the number of dimensions of each action.
    
    L = torch.zeros((x.shape[0], self.action_dims, self.action_dims)).to(device)
    tril_indices = torch.tril_indices(row=self.action_dims, col=self.action_dims, offset=0).to(device)
    #tril_indices will give us the index values to index a lower triangular matrix

    L[:, tril_indices[0], tril_indices[1]] = matrix
    L.diagonal(dim1=1,dim2=2).exp_()#This will ensure that the values in the diagonal are positive
    #Though I can't understand what the arguments dim1=1 and dim2=2 are doing
    #oh, maybe it's denoting the dimension of which matrices
    # dimension 0 denotes just batch size, the actual matrices dimension inside the 3-D matrix is in dimension 1 and dimension 2
    P = L * L.transpose(2, 1)
    #and here, we want to take transpose of matrices
    u_mu = (a-mu).unsqueeze(dim=1)
    u_mu_t = u_mu.transpose(1, 2)
    #because dimension 0 is the batch size
    
    adv = - 1/2 * u_mu @ P @ u_mu_t
    #after this operation, the dimension is [[[]]] but we need [[],[],]
    adv = adv.squeeze(dim=-1)#This line of code removes the extra dimension
    return value + adv


#### Create the policy

In [7]:
def noisy_policy(state, env, net, epsilon=0.0):
  state = torch.tensor([state]).to(device)#first, we are going to take a state, and create a tensor from it, we'll put it inside a list so that pytorch knows that we are working with a batch with a single item and then we'll make sure that it's on the right device
  amin = torch.from_numpy(env.action_space.low).to(device)#minimum value that each dimension of the action can take(this will be an array, try to imagine and understand)
  amax = torch.from_numpy(env.action_space.high).to(device)#maximum value that each dimension of the action can take(this will be an array, try to imagine and understand)
  mu = net.mu(state)#this will give us the action in the state that our neural estimates has the highest q-value
  mu = mu + torch.normal(0, epsilon, mu.size(), device=device)#we are adding some noise to this action to promote exploration
  action = mu.clamp(amin, amax)
  action = action.squeeze().cpu().numpy()
  return action

#### Create the replay buffer

In [8]:
class ReplayBuffer:

  def __init__(self, capacity):
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)
  
  def append(self, experience):
    self.buffer.append(experience)
  
  def sample(self, batch_size):
    return random.sample(self.buffer, batch_size)

In [9]:
class RLDataset(IterableDataset):

  def __init__(self, buffer, sample_size=400):
    self.buffer = buffer
    self.sample_size = sample_size
  
  def __iter__(self):
    for experience in self.buffer.sample(self.sample_size):
      yield experience

#### Create the environment

In [10]:
#we will apply the same action several times to keep the agent a bit shaky
#especially in robotics task, where we want out actions to be precise and consistent, if we modify
#our actions too quickly in the environment, our robot can end up having unpredictable behaviour
#to mitigate this problem, we create a new class
class RepeatActionWrapper(gym.Wrapper):#that extends over gym.Wrapper
  #and apply the same action on the following states
  def __init__(self, env, n):
    super().__init__(env)
    self.env = env
    self.n = n
      
  def step(self, action):
    done = False
    total_reward = 0.0
    for _ in range(self.n):
      next_state, reward, done, info = self.env.step(action)
      total_reward += reward
      if done:
        break
    return next_state, total_reward, done, info

In [13]:
def create_environment(name):
  env = gym.make(name)
  env = RecordVideo(env, video_folder='./videos1', episode_trigger=lambda x: x % 50 == 0)
  env = RepeatActionWrapper(env, n=8)
  env = RecordEpisodeStatistics(env)
  return env

#### Update the target network

In [12]:
def polyak_average(net, target_net, tau=0.01):
    for qp, tp in zip(net.parameters(), target_net.parameters()):
        tp.data.copy_(tau * qp.data + (1 - tau) * tp.data)

#### Create the Deep Q-Learning algorithm

In [14]:
class NAFDeepQLearning(LightningModule):
                             
  def __init__(self, env_name, policy=noisy_policy, capacity=100_000, 
               batch_size=256, lr=1e-4, hidden_size=512, gamma=0.99, 
               loss_fn=F.smooth_l1_loss, optim=AdamW, eps_start=2.0, eps_end=0.2, 
               eps_last_episode=1_000, samples_per_epoch=1_000, tau=0.01):

    super().__init__()
    self.env = create_environment(env_name)

    obs_size = self.env.observation_space.shape[0]
    action_dims = self.env.action_space.shape[0]
    max_action = self.env.action_space.high

    self.q_net = NafDQN(hidden_size, obs_size, action_dims, max_action).to(device)
    self.target_q_net = copy.deepcopy(self.q_net)
    self.policy = policy

    self.buffer = ReplayBuffer(capacity=capacity)

    self.save_hyperparameters()#save the hyperaparameters so that they are accessible
    #everywhere in the class

    while len(self.buffer) < self.hparams.samples_per_epoch:

      print(f"{len(self.buffer)} samples in experience buffer. Filling...")
      self.play_episode(epsilon=self.hparams.eps_start)
  
  @torch.no_grad()
  def play_episode(self, policy=None, epsilon=0.):
    obs = self.env.reset()
    done = False

    while not done:
      if policy:
        action = policy(obs, self.env, self.q_net, epsilon=epsilon)
      else:
        action = self.env.action_space.sample()
        
      next_obs, reward, done, info = self.env.step(action)
      exp = (obs, action, reward, done, next_obs)
      self.buffer.append(exp)
      obs = next_obs
  
  def forward(self, x):
    output = self.q_net.mu(x)
    return output

  def configure_optimizers(self):
    q_net_optimizer = self.hparams.optim(self.q_net.parameters(), lr=self.hparams.lr)
    return [q_net_optimizer]

  def train_dataloader(self):
    dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=self.hparams.batch_size,
    )
    return dataloader

  def training_step(self, batch, batch_idx):
    states, actions, rewards, dones, next_states = batch
    rewards = rewards.unsqueeze(1)
    dones = dones.unsqueeze(1)

    action_values = self.q_net(states, actions)

    next_state_values = self.target_q_net.value(next_states)
    next_state_values[dones] = 0.0
    
    target = rewards + self.hparams.gamma * next_state_values

    loss = self.hparams.loss_fn(action_values, target)
    self.log('episode/MSE Loss', loss)
    return loss

  def on_train_epoch_end(self):

    epsilon = max(
        self.hparams.eps_end,
        self.hparams.eps_start - self.current_epoch / self.hparams.eps_last_episode
    )

    self.play_episode(policy=self.policy, epsilon=epsilon)
    
    polyak_average(self.q_net, self.target_q_net, tau=self.hparams.tau)
    
    self.log("episode/Return", self.env.return_queue[-1])

#### Purge logs and run the visualization tool (Tensorboard)

In [23]:
# Start tensorboard.
# !rm -r /content/lightning_logs/
# !rm -r /content/videos/
# %load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir D:/Reinforcement Learning/advanced_rl_dqn_to_sac_complete-main/lightning_logs

ERROR: Failed to launch TensorBoard (exited with 2).
Contents of stderr:
2024-07-03 12:21:09.657781: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-03 12:21:10.737411: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_creds_type {local,ssl,ssl_dev}]

: 

#### Train the policy

In [17]:
algo = NAFDeepQLearning('LunarLanderContinuous-v2')

trainer = Trainer(
    # gpus=num_gpus, 
    max_epochs=10_000
)

trainer.fit(algo)

  logger.warn(


This is the action_dims 2
0 samples in experience buffer. Filling...
14 samples in experience buffer. Filling...
29 samples in experience buffer. Filling...
41 samples in experience buffer. Filling...
63 samples in experience buffer. Filling...
71 samples in experience buffer. Filling...
85 samples in experience buffer. Filling...
99 samples in experience buffer. Filling...
120 samples in experience buffer. Filling...
137 samples in experience buffer. Filling...
147 samples in experience buffer. Filling...
160 samples in experience buffer. Filling...
170 samples in experience buffer. Filling...
182 samples in experience buffer. Filling...
192 samples in experience buffer. Filling...
201 samples in experience buffer. Filling...
212 samples in experience buffer. Filling...
223 samples in experience buffer. Filling...
231 samples in experience buffer. Filling...
244 samples in experience buffer. Filling...
255 samples in experience buffer. Filling...
265 samples in experience buffer. Fill

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  if not hasattr(tensorboard, "__version__") or LooseVersion(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type   | Params | Mode 
------------------------------------------------
0 | q_net        | NafDQN | 270 K  | train
1 | target_q_net | NafDQN | 270 K  | train
------------------------------------------------
540 K     Trainable params
0         Non-trainable params
540 K     Total params
2.163     Total estimated model params size (MB)
d:\anaconda3\Lib\site-packages\pytorch_lightnin

Training: |          | 0/? [00:00<?, ?it/s]

  state = torch.tensor([state]).to(device)#first, we are going to take a state, and create a tensor from it, we'll put it inside a list so that pytorch knows that we are working with a batch with a single item and then we'll make sure that it's on the right device
`Trainer.fit` stopped: `max_epochs=10000` reached.


#### Check the resulting policy

In [None]:
display_video(episode=4300)