<a href="https://colab.research.google.com/github/Mayu21ad/CART/blob/main/DuelingDQN_TF_Pong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Run this asap since it takes 30 seconds.
# NOTE: the %%capture cell magic has to be the very first line of the cell
# (before any comment), otherwise IPython does not treat it as a cell magic
# and the install output below is not captured.
!apt-get update
!pip install pyglet==1.3.2
!pip install gym[atari] pyvirtualdisplay
!apt-get install -y xvfb python-opengl ffmpeg
!pip install tensorflow==2.1.*
!pip install box2d-py
!pip install gast==0.2.2
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor, AtariPreprocessing, FrameStack
import tensorflow as tf
import numpy as np
import random
import glob
import io
import time
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Start a non-visible (visible=0) 1400x900 virtual display for the session.
# NOTE(review): presumably this is what lets env.render() / Monitor draw frames
# on a headless machine (xvfb is installed above) — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

In [None]:
# Ask TensorFlow for the physical GPU devices it can see and report the count.
gpu_list = tf.config.experimental.list_physical_devices('GPU')
print('Number of GPUS available is {}'.format(len(gpu_list)))

Number of GPUS available is 1


In [None]:
# Build the training environment as a chain of wrappers:
# raw Pong -> Atari preprocessing (grayscale frames) -> stack of the last 4 frames.
env = FrameStack(
    AtariPreprocessing(
        gym.make('PongNoFrameskip-v4'),
        grayscale_obs=True,
        scale_obs=False,  # pixel values are left unscaled; the training loop divides by 255
        terminal_on_life_loss=True,
    ),
    num_stack=4,
)

# Read the action / observation space descriptions off the wrapped environment.
num_actions = env.action_space.n
num_state_feats = env.observation_space.shape
max_observation_values = env.observation_space.high
print('Number of state features: {}'.format(num_state_feats))
print('Number of possible actions: {}'.format(num_actions))

Number of state features: (4, 84, 84)
Number of possible actions: 6


In [None]:
layers = tf.keras.layers


class DuelingDQN(tf.keras.Model):
  """Dueling Q-network: a convolutional trunk with separate V and A heads.

  The three convolutions use data_format="channels_first", so inputs are
  expected as (batch, channels, height, width).
  """

  def __init__(self, num_actions):
    """Creates the layers.

    Args:
      num_actions: Number of units of the advantage head (one per action).
    """
    super().__init__()
    # Convolutional trunk; the layers are created in the order they are applied.
    self.conv1 = self._relu_conv(filters=32, kernel_size=8, strides=4)
    self.conv2 = self._relu_conv(filters=64, kernel_size=4, strides=2)
    self.conv3 = self._relu_conv(filters=64, kernel_size=3, strides=1)
    self.flatten = layers.Flatten()
    self.dense1 = layers.Dense(
        units=512,
        activation="relu",
        kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    # Dueling heads: a single state value and one advantage per action.
    self.V = layers.Dense(1)
    self.A = layers.Dense(num_actions)

  @staticmethod
  def _relu_conv(filters, kernel_size, strides):
    """Builds one channels-first ReLU Conv2D with the shared initializers."""
    return layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        activation="relu",
        kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
        bias_initializer=tf.keras.initializers.Zeros(),
        data_format="channels_first",
    )

  @tf.function
  def call(self, states):
    """Forward pass: returns Q = V + (A - mean(A)) for each input state."""
    features = states
    for layer in (self.conv1, self.conv2, self.conv3, self.flatten,
                  self.dense1):
      features = layer(features)
    value = self.V(features)
    advantage = self.A(features)
    # Centre the advantages across the action axis before adding the value.
    mean_advantage = tf.reduce_mean(advantage, axis=1, keepdims=True)
    return value + tf.subtract(advantage, mean_advantage)


# Create main and target neural networks. Both start from independent random
# initialisations; the training loop copies main_nn's weights into target_nn.
main_nn = DuelingDQN(num_actions)
target_nn = DuelingDQN(num_actions)

# Loss function and optimizer.
loss_fn = tf.keras.losses.Huber()
# `learning_rate` is the supported argument name; `lr` is only a deprecated
# alias for it, so the value used by the optimizer is unchanged.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=10)

In [None]:
def select_epsilon_greedy_action(state, epsilon):
  """Epsilon-greedy policy on top of `main_nn`.

  Args:
    state: Input passed straight to `main_nn`.
      NOTE(review): callers pass a batch holding a single state — confirm,
      since the argmax below is taken over the whole (flattened) output.
    epsilon: Probability of taking a random action.

  Returns:
    A sample from `env.action_space` with probability `epsilon`, otherwise
    the index of the largest Q-value predicted by `main_nn`.
  """
  if np.random.uniform() < epsilon:
    return env.action_space.sample()  # Explore: random action.
  # Exploit: pick the action with the highest predicted Q-value.
  q_values = main_nn(state).numpy()
  return np.argmax(q_values)

In [None]:
class ReplayBuffer(object):
  """Fixed-capacity experience replay buffer that samples uniformly.

  Transitions are kept in a list used as a ring: once `size` transitions have
  been stored, each new one overwrites the oldest.
  """

  def __init__(self, size):
    """Creates an empty buffer holding at most `size` transitions."""
    self._size = size
    self.buffer = []
    self._next_idx = 0  # Slot the next transition is written to.

  def add(self, state, action, reward, next_state, done):
    """Stores one transition, overwriting the oldest one when full."""
    transition = (state, action, reward, next_state, done)
    if self._next_idx >= len(self.buffer):
      self.buffer.append(transition)
    else:
      self.buffer[self._next_idx] = transition
    self._next_idx = (self._next_idx + 1) % self._size

  def __len__(self):
    """Number of transitions currently stored."""
    return len(self.buffer)

  def sample(self, num_samples):
    """Draws `num_samples` transitions uniformly at random, with replacement.

    Returns:
      A tuple `(states, actions, rewards, next_states, dones)` of numpy
      arrays stacked along a new leading axis; `rewards` and `dones` are
      float32.
    """
    states, actions, rewards, next_states, dones = [], [], [], [], []
    idx = np.random.choice(len(self.buffer), num_samples)
    for i in idx:
      state, action, reward, next_state, done = self.buffer[i]
      # np.asarray converts without copying when it can and copies when it
      # must. np.array(..., copy=False) raises on NumPy >= 2 whenever a copy
      # is required (e.g. for Python ints, lists or lazily stacked frames).
      states.append(np.asarray(state))
      actions.append(np.asarray(action))
      rewards.append(reward)
      next_states.append(np.asarray(next_state))
      dones.append(done)
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards, dtype=np.float32)
    next_states = np.array(next_states)
    dones = np.array(dones, dtype=np.float32)
    return states, actions, rewards, next_states, dones

In [None]:
@tf.function
def train_step(states, actions, rewards, next_states, dones):
  """Runs one gradient update of `main_nn` on a batch of transitions.

  The bootstrap target uses the Double-DQN rule: `main_nn` picks the best next
  action and `target_nn` supplies the Q-value of that action. The target is
  computed outside the gradient tape, so no gradient flows through it.

  Returns:
    The value of `loss_fn` for this batch.
  """
  # Action selection with the main network, evaluation with the target network.
  greedy_next_actions = tf.argmax(main_nn(next_states), axis=-1)
  greedy_next_mask = tf.one_hot(greedy_next_actions, num_actions)
  target_next_qs = target_nn(next_states)
  bootstrap_qs = tf.reduce_sum(greedy_next_mask * target_next_qs, axis=-1)
  # (1 - dones) zeroes the bootstrap term for terminal transitions.
  target = rewards + (1. - dones) * discount * bootstrap_qs

  with tf.GradientTape() as tape:
    predicted_qs = main_nn(states)
    # Keep only the Q-value of the action that was actually taken.
    taken_mask = tf.one_hot(actions, num_actions)
    taken_qs = tf.reduce_sum(taken_mask * predicted_qs, axis=-1)
    loss = loss_fn(target, taken_qs)

  variables = main_nn.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss

In [None]:
# Hyperparameters.
num_episodes = 1_000    # Index of the last training episode.
buffer_size = 200_000   # Capacity of the replay buffer, in transitions.
batch_size = 32         # Transitions sampled per training step.
discount = 0.99         # Discount factor applied to the bootstrap term.
epsilon = 1.0           # Initial exploration rate; decayed during training.

In [None]:
# Replay memory shared by the whole training run.
buffer = ReplayBuffer(size=buffer_size)

# Start training. Every environment step is stored in the replay buffer and,
# once the buffer holds at least one batch, followed by one training step.
# cur_frame counts environment steps across all episodes; last_100_ep_rewards
# keeps the returns of the most recent (up to 100) episodes for reporting.
cur_frame, last_100_ep_rewards = 0, []
# Note: range(num_episodes+1) runs episodes 0..num_episodes inclusive.
for episode in range(num_episodes+1):
  state = env.reset()
  ep_reward, done = 0, False
  while not done:
    # Scale pixels by 1/255 and add a leading batch axis for the network.
    state_in = np.expand_dims(np.array(state) / 255., axis=0)
    action = select_epsilon_greedy_action(state_in, epsilon)
    next_state, reward, done, info = env.step(action)
    # The episode return uses the raw reward; the stored reward is its sign.
    ep_reward += reward
    reward = np.sign(reward)

    # Save to experience replay (the unscaled observations are stored).
    buffer.add(state, action, reward, next_state, done)
    state = next_state
    cur_frame += 1
    # Linear epsilon decay per environment step, stopping near 0.01.
    if epsilon > 0.01:
      epsilon -= 1.1e-6

    # Train on a random batch, applying the same 1/255 scaling as for acting.
    if len(buffer) >= batch_size:
      states, actions, rewards, next_states, dones = buffer.sample(batch_size)
      states = states / 255.
      next_states = next_states / 255.
      loss = train_step(states, actions, rewards, next_states, dones)

    # Copy main_nn weights to target_nn every 10000 environment steps.
    # NOTE(review): the first copy happens at step 10000, so until then
    # target_nn keeps its own random initialisation — confirm this is intended.
    if cur_frame % 10000 == 0:
      target_nn.set_weights(main_nn.get_weights())

  # Keep a sliding window of at most 100 episode returns.
  if len(last_100_ep_rewards) == 100:
    last_100_ep_rewards = last_100_ep_rewards[1:]
  last_100_ep_rewards.append(ep_reward)

  # Progress report every 25 episodes; `loss` is the most recent training loss.
  if episode % 25 == 0:
    print(f'Episode: {episode}/{num_episodes}, Epsilon: {epsilon:.3f}, '\
          f'Loss: {loss:.4f}, Return: {np.mean(last_100_ep_rewards):.2f}')

env.close()

Episode: 0/1000, Epsilon: 0.999, Loss: 0.0296, Return: -21.00
Episode: 25/1000, Epsilon: 0.974, Loss: 0.0240, Return: -20.23
Episode: 50/1000, Epsilon: 0.948, Loss: 0.0014, Return: -20.18
Episode: 75/1000, Epsilon: 0.923, Loss: 0.0016, Return: -20.22
Episode: 100/1000, Epsilon: 0.897, Loss: 0.0008, Return: -20.21
Episode: 125/1000, Epsilon: 0.872, Loss: 0.0008, Return: -20.25
Episode: 150/1000, Epsilon: 0.845, Loss: 0.0033, Return: -20.21
Episode: 175/1000, Epsilon: 0.817, Loss: 0.0014, Return: -20.09
Episode: 200/1000, Epsilon: 0.787, Loss: 0.0031, Return: -19.93
Episode: 225/1000, Epsilon: 0.756, Loss: 0.0093, Return: -19.60
Episode: 250/1000, Epsilon: 0.726, Loss: 0.0023, Return: -19.52
Episode: 275/1000, Epsilon: 0.691, Loss: 0.0020, Return: -19.26
Episode: 300/1000, Epsilon: 0.655, Loss: 0.0025, Return: -18.96
Episode: 325/1000, Epsilon: 0.618, Loss: 0.0027, Return: -18.77
Episode: 350/1000, Epsilon: 0.579, Loss: 0.0031, Return: -18.32
Episode: 375/1000, Epsilon: 0.539, Loss: 0.01

In [None]:
def show_video():
  """Displays the first recorded episode video inline in the notebook.

  Looks for `video/*.mp4`, embeds the first match as a base64 data URI in an
  HTML <video> element and displays it. Prints "Video not found" when no mp4
  file exists. This function only displays a video; the recording itself is
  set up elsewhere (see `wrap_env`).
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Video not found")
    return

  # Open read-only and let the context manager close the handle; the file is
  # only read, so write access ('r+b') is not needed.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def wrap_env(env):
  """Returns `env` wrapped in a gym `Monitor` writing to './video' (force=True)."""
  return Monitor(env, './video', force=True)

In [None]:
# Evaluation environment: same wrapper chain as for training, but with
# scale_obs=True, terminal_on_life_loss=False, and a Monitor on the outside
# (see wrap_env) so the episode is written under ./video.
env = wrap_env(
    FrameStack(
        AtariPreprocessing(
            gym.make('PongNoFrameskip-v4'),
            grayscale_obs=True,
            scale_obs=True,
            terminal_on_life_loss=False,
        ),
        num_stack=4,
    )
)

# Play a single episode with a nearly greedy policy (epsilon=0.01).
ep_rew = 0
done = False
state = env.reset()
while not done:
  env.render()
  # Add a leading batch axis before handing the observation to the policy.
  action = select_epsilon_greedy_action(
      np.expand_dims(np.array(state), axis=0), epsilon=0.01)
  state, reward, done, info = env.step(action)
  ep_rew += reward
print(f'Total Return: {ep_rew}')
env.close()
show_video()



Total Return: 21.0
