<a href="https://colab.research.google.com/github/KornelWitkowski/Deep-Q-Learning-with-Tensorflow/blob/main/ContinuousLunarLander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Discretization of continuous action space

In [None]:
%%bash

# swig is installed before pybox2d is built from source below
# (presumably required by that build - confirm).
# -y answers apt's confirmation prompt, which cannot be typed into a notebook cell.
apt-get install -y swig

# Build and install pybox2d from its git repository.
git clone https://github.com/pybox2d/pybox2d
cd pybox2d
python setup.py build
python setup.py install

apt-get install -y xvfb

# Requirement specifiers are quoted so the shell never glob-expands "[box2d]".
pip install \
    "gym==0.21" \
    "gym[box2d]==0.21" \
    "pyglet==1.5.27" \
    pyvirtualdisplay

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam


class DeepQLearningModel:
  """Feed-forward Q-network producing one Q-value per discrete action.

  Args:
    observation_size: Number of units in the first hidden layer.
    hidden_size: Number of units in each of the two following hidden layers.
    action_size: Number of outputs (one per discrete action).
    learning_rate: Learning rate handed to the Adam optimizer.
  """

  def __init__(self, observation_size, hidden_size, action_size=25, learning_rate=0.001):
    self.model = Sequential([Dense(observation_size, activation="relu"),
                             Dense(hidden_size, activation="relu"),
                             Dense(hidden_size, activation="relu"),
                             Dense(action_size)])
    # Pass the requested rate through; calling compile() without it silently
    # fell back to compile()'s own default of 0.001.
    self.compile(learning_rate)

  def compile(self, learning_rate=0.001):
    """Compile the network with MSE loss and an Adam optimizer."""
    self.model.compile(loss="mse",
                       optimizer=Adam(learning_rate=learning_rate))

  def fit(self, x, y):
    """Run one training epoch on (x, y) and return that epoch's loss."""
    history = self.model.fit(x, y, epochs=1, verbose=0)
    return history.history["loss"][0]


In [None]:
# Candidate values for the first and second component of an action.
actions1 = [-1.0, -0.5, 0.0, 0.5, 1]
actions2 = [-1.0, -0.5, 0.0, 0.5, 1]

# Every (first, second) combination, with the second component varying fastest.
act = [[first, second] for first in actions1 for second in actions2]

In [None]:
import numpy as np

def epsilon_greedy_policy(state, environment, model, epsilon=0.0):
  """Choose a discrete action index with an epsilon-greedy rule.

  Args:
    state: A single observation; a batch axis is added before calling `model`.
    environment: Not used by this policy.
    model: Callable returning Q-values for a batch of states.
    epsilon: Probability of picking a uniformly random index into the
      module-level `act` list instead of the greedy action.

  Returns:
    int: The chosen action index.
  """
  if np.random.random() < epsilon:
    # Use NumPy for the random pick as well: a single np.random.seed() then
    # reproduces both decisions, and this cell no longer depends on an
    # `import random` executed in some other cell.
    return int(np.random.randint(len(act)))

  q_values = model(tf.expand_dims(state, axis=0))
  # argmax over the action axis yields one index per batch row; the batch has
  # exactly one row, so take it explicitly before converting to int.
  return int(tf.math.argmax(q_values, axis=1)[0])

In [None]:
import numpy as np

def epsilon_greedy_policy(state, environment, model, epsilon=0.0):
  """Choose a discrete action index with an epsilon-greedy rule.

  Args:
    state: A single observation; a batch axis is added before calling `model`.
    environment: Not used by this policy.
    model: Callable returning Q-values for a batch of states.
    epsilon: Probability of picking a uniformly random index into the
      module-level `act` list instead of the greedy action.

  Returns:
    int: The chosen action index.
  """
  if np.random.random() < epsilon:
    # Use NumPy for the random pick as well: a single np.random.seed() then
    # reproduces both decisions, and this cell no longer depends on an
    # `import random` executed in some other cell.
    return int(np.random.randint(len(act)))

  q_values = model(tf.expand_dims(state, axis=0))
  # argmax over the action axis yields one index per batch row; the batch has
  # exactly one row, so take it explicitly before converting to int.
  return int(tf.math.argmax(q_values, axis=1)[0])

In [None]:
import random
from collections import deque

class ReplayBuffer:
  """Fixed-capacity experience store; the oldest entries are evicted first."""

  def __init__(self, capacity):
    # A deque with maxlen drops items from the left once it is full.
    self.buffer = deque([], capacity)

  def __len__(self):
    """Return the number of experiences currently stored."""
    stored = self.buffer
    return len(stored)

  def append(self, experience):
    """Store one experience, discarding the oldest one when at capacity."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Return `batch_size` stored experiences drawn without replacement."""
    population = self.buffer
    return random.sample(population, k=batch_size)

In [None]:
from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

def create_environment(name, n=8):
  """Create the gym environment `name` and wrap it for training.

  Args:
    name: Environment id passed to `gym.make`.
    n: Repeat count handed to `RepeatActionWrapper`.

  Returns:
    The environment wrapped, innermost first, in `RecordVideo`,
    `RepeatActionWrapper` and `RecordEpisodeStatistics`.
  """
  def record_this_episode(x):
    # The trigger is true for 0, 50, 100, ...
    return x % 50 == 0

  recorded = RecordVideo(gym.make(name), video_folder='./recored_episodes', episode_trigger=record_this_episode)
  repeated = RepeatActionWrapper(recorded, n)
  return RecordEpisodeStatistics(repeated)

In [None]:
from pyvirtualdisplay import Display
# Start a 1400x900 virtual display that is not shown on screen
# (presumably needed so the environment can render video frames - confirm).
Display(visible=False, size=(1400, 900)).start()

<pyvirtualdisplay.display.Display at 0x7ff593a645e0>

In [None]:
import copy
import pandas as pd

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

import gym

class DeepQLearningAlgorithm:
  """Deep Q-learning over the discretized action list `act`.

  The agent keeps an online Q-network, a periodically refreshed target
  network and a replay buffer. One "epoch" consists of
  `samples_per_epoch // batch_size` gradient steps followed by one episode
  played with the exploration policy.

  Args:
    environment_name: Environment id passed to `create_environment`.
    policy: Callable `(state, environment, model, epsilon=...)` returning an
      index into `act`; used for the episode played after each epoch.
    capacity: Maximum number of transitions kept in the replay buffer.
    batch_size: Number of transitions sampled per gradient step.
    learning_rate: Learning rate of the Q-network optimizer.
    hidden_size: Width of the Q-network's hidden layers.
    gamma: Discount factor applied to the bootstrapped next-state value.
    epsilon_start: Exploration rate at epoch 0.
    epsilon_end: Exploration rate from `epsilon_last_episode` onwards.
    epsilon_last_episode: Epoch at which the linear decay reaches `epsilon_end`.
    samples_per_epoch: Minimum buffer size collected before training; also
      determines the number of gradient steps per epoch.
    q_net_update_rate: The target network is refreshed every this many epochs.
    repeat_action: Repeat count passed to `create_environment`.
  """

  def __init__(self, environment_name="LunarLanderContinuous-v2", policy=epsilon_greedy_policy, capacity=100_000,
               batch_size=256, learning_rate=1e-3, hidden_size=128, gamma=0.99, epsilon_start=1.0, epsilon_end=0.15,
               epsilon_last_episode=600, samples_per_epoch=2_048//8, q_net_update_rate=10, repeat_action=8):

    self.environment = create_environment(environment_name, repeat_action)
    observation_size = self.environment.observation_space.shape[0]
    actions_size = len(act)

    self.q_net = DeepQLearningModel(observation_size, hidden_size, actions_size, learning_rate)
    self.target_q_net = copy.deepcopy(self.q_net.model)

    self.policy = policy
    self.buffer = ReplayBuffer(capacity=capacity)

    self.current_epoch = 0

    # hyperparameters
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    # gamma and q_net_update_rate were previously overwritten with the
    # literals 0.99 and 10, so the constructor arguments had no effect.
    self.gamma = gamma
    self.epsilon_start = epsilon_start
    self.epsilon_end = epsilon_end
    self.epsilon_last_episode = epsilon_last_episode
    self.samples_per_epoch = samples_per_epoch
    self.q_net_update_rate = q_net_update_rate

    # Fill the buffer before training; with no policy given, play_episode
    # acts uniformly at random.
    while len(self.buffer) < self.samples_per_epoch:
      self.play_episode(epsilon=0)

  def play_episode(self, policy=None, epsilon=0.):
    """Play one episode and append every transition to the replay buffer.

    Args:
      policy: Action-selection callable. When omitted, actions are drawn by
        `epsilon_greedy_policy` with epsilon fixed to 1.0 and the `epsilon`
        argument is ignored.
      epsilon: Exploration rate forwarded to `policy`.
    """
    state = self.environment.reset()
    done = False

    while not done:
      if policy:
        action = policy(state, self.environment, self.q_net.model, epsilon=epsilon)
      else:
        action = epsilon_greedy_policy(state, self.environment, self.q_net.model, epsilon=1.0)

      # `action` is an index; the environment receives the matching entry of `act`.
      next_state, reward, done, info = self.environment.step(act[action])
      self.buffer.append((state, action, reward, done, next_state))
      state = next_state

  def get_batch(self):
    """Sample `batch_size` transitions and return them as stacked arrays.

    Returns:
      Tuple `(state, action, reward, done, next_state)` of NumPy arrays whose
      first axis indexes the sampled transitions.
    """
    # zip(*...) turns the list of 5-tuples into five per-field columns.
    columns = zip(*self.buffer.sample(self.batch_size))
    state, action, reward, done, next_state = (np.stack(column) for column in columns)
    return state, action, reward, done, next_state

  def train_step(self):
    """Fit the Q-network on one sampled batch and return the loss."""
    states, actions, rewards, dones, next_states = self.get_batch()

    # The network's own predictions serve as regression targets, so only the
    # output of the action actually taken contributes to the MSE loss.
    # np.array() yields a writable copy of the tensor.
    q_net_predictions = np.array(self.q_net.model(states))

    next_action_values = np.array(
        tf.math.reduce_max(self.target_q_net(next_states), axis=1))
    # Nothing is bootstrapped from transitions that ended the episode.
    next_action_values = np.where(dones, 0.0, next_action_values)

    expected_state_action_values = rewards + self.gamma * next_action_values
    q_net_predictions[np.arange(len(actions)), actions] = expected_state_action_values

    return self.q_net.fit(states, q_net_predictions)

  def _current_epsilon(self):
    """Return epsilon for the current epoch.

    Epsilon is interpolated linearly from `epsilon_start` at epoch 0 to
    `epsilon_end` at `epsilon_last_episode` and held there afterwards. The
    former expression decreased by a fixed 1 / epsilon_last_episode per epoch,
    which reaches `epsilon_end` at that epoch only if start - end equals 1.
    """
    if self.epsilon_last_episode <= 0:
      return self.epsilon_end
    progress = min(1.0, self.current_epoch / self.epsilon_last_episode)
    return self.epsilon_start + progress * (self.epsilon_end - self.epsilon_start)

  def training_epoch_end(self):
    """Collect one fresh episode and refresh the target network when due."""
    self.play_episode(policy=self.policy, epsilon=self._current_epsilon())

    if self.current_epoch % self.q_net_update_rate == 0:
      self.target_q_net = copy.deepcopy(self.q_net.model)

  def train(self, epochs):
    """Run `epochs` training epochs, printing progress every 50th epoch."""
    steps_per_epoch = self.samples_per_epoch // self.batch_size

    for _ in range(epochs):
      # The reported loss is the sum over this epoch's gradient steps.
      loss = sum(self.train_step() for _ in range(steps_per_epoch))
      self.training_epoch_end()

      if self.current_epoch % 50 == 0:
        # Return of the most recently finished episode.
        returns = list(self.environment.return_queue)[-1]
        print(f"Epoch: {self.current_epoch}, loss: {loss}, hp_metric: {tf.math.reduce_mean(returns)}")

      self.current_epoch += 1


In [None]:
# Delete the video folder written by create_environment during an earlier run.
!rm -r /content/recored_episodes

# Build the discrete-action agent with custom hyperparameters.
alg =  DeepQLearningAlgorithm(
                              hidden_size=256, gamma=0.96, epsilon_start=1.0, epsilon_end=0.05,
                              epsilon_last_episode=1000, samples_per_epoch=2_048//2, repeat_action=4)

# Train for 1000 epochs; progress is printed every 50th epoch.
alg.train(1000)

Epoch: 0, loss: 102.16668319702148, hp_metric: -562.28662109375
Epoch: 50, loss: 21.141337394714355, hp_metric: -76.96858978271484
Epoch: 100, loss: 16.298704862594604, hp_metric: 35.937355041503906
Epoch: 150, loss: 23.156526803970337, hp_metric: -322.248291015625
Epoch: 200, loss: 16.39613103866577, hp_metric: -95.52153778076172
Epoch: 250, loss: 16.127256870269775, hp_metric: -136.18785095214844
Epoch: 300, loss: 22.193091869354248, hp_metric: -10.861907958984375
Epoch: 350, loss: 19.891846656799316, hp_metric: -30.140968322753906
Epoch: 400, loss: 19.16852378845215, hp_metric: -8.832000732421875
Epoch: 450, loss: 30.884401321411133, hp_metric: -34.16162872314453
Epoch: 500, loss: 25.083611965179443, hp_metric: -142.4987335205078
Epoch: 550, loss: 23.650888442993164, hp_metric: -40.66557693481445
Epoch: 600, loss: 20.624852180480957, hp_metric: 11.072304725646973
Epoch: 650, loss: 22.892570734024048, hp_metric: 62.71717071533203
Epoch: 700, loss: 26.163174152374268, hp_metric: 0.171

# Normalized advantage function Deep Q-learning

In [None]:
%%bash

# swig is installed before pybox2d is built from source below
# (presumably required by that build - confirm).
# -y answers apt's confirmation prompt, which cannot be typed into a notebook cell.
apt-get install -y swig

# Build and install pybox2d from its git repository.
git clone https://github.com/pybox2d/pybox2d
cd pybox2d
python setup.py build
python setup.py install

apt-get install -y xvfb

# Requirement specifiers are quoted so the shell never glob-expands "[box2d]".
pip install \
    "gym==0.21" \
    "gym[box2d]==0.21" \
    "pyglet==1.5.27" \
    pyvirtualdisplay

In [None]:
from pyvirtualdisplay import Display
# Start a 1400x900 virtual display that is not shown on screen
# (presumably needed so the environment can render video frames - confirm).
Display(visible=False, size=(1400, 900)).start()

<pyvirtualdisplay.display.Display at 0x7f132fc3c100>

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Reshape, Add, Lambda
from tensorflow.keras.optimizers import Adam


class TrainingModel(tf.keras.Model):
  """Combine the three NAF heads into a trainable Q(x, a) estimator.

  For a state batch `x` and an action batch `a` the model returns

      Q(x, a) = V(x) - 0.5 * (a - mu(x)) P(x) (a - mu(x))^T,  P(x) = p(x) p(x)^T

  Args:
    model_mu: Model mapping states to actions, expected shape (batch, action).
    model_v: Model mapping states to state values, expected shape (batch, 1).
    model_p: Model mapping states to square matrices, expected shape
      (batch, action, action).
  """

  def __init__(self, model_mu, model_v, model_p):
    super().__init__()
    self.model_mu = model_mu
    self.model_v = model_v
    self.model_p = model_p

  def call(self, inputs):
    """Return one Q-value per row of the `(states, actions)` input pair."""
    x, a = inputs

    # Plain tensor ops replace the Lambda/Add layers that used to be
    # re-instantiated on every forward pass.
    mu = self.model_mu(x)
    v = self.model_v(x)
    p = self.model_p(x)

    # p p^T is symmetric positive semi-definite, so the advantage is <= 0 and
    # vanishes at a == mu.
    P = tf.matmul(p, p, transpose_b=True)

    # Row vector (a - mu) per sample: (batch, 1, action).
    u_mu = tf.expand_dims(tf.cast(a, mu.dtype) - mu, axis=1)
    adv = -0.5 * tf.matmul(tf.matmul(u_mu, P), u_mu, transpose_b=True)

    # Squeeze only the singleton matrix/value axes. An axis-less squeeze would
    # also drop the batch axis whenever the batch holds a single sample.
    return tf.squeeze(adv, axis=[1, 2]) + tf.squeeze(v, axis=-1)


class NafDeepQLearningModel:
  """Bundle of the NAF networks: a shared trunk with mu, V and P heads.

  Args:
    observation_size: Size passed to the trunk's `Input`.
    hidden_size: Width of the trunk's two ReLU layers.
    action_size: Output size of the mu head; the P head emits an
      action_size x action_size matrix.
    learning_rate: Learning rate of the Adam optimizer.
  """

  def __init__(self, observation_size=8, hidden_size=128, action_size=2, learning_rate=0.001):
    # Trunk shared by all three heads.
    trunk = Sequential([Input(observation_size),
                        Dense(hidden_size, activation="relu"),
                        Dense(hidden_size, activation="relu")])
    self.common_layer = trunk

    # tanh keeps the proposed action inside [-1, 1].
    mu_head = Dense(action_size, activation="tanh")
    self.model_mu = Sequential([trunk, mu_head])

    value_head = Dense(1)
    self.model_v = Sequential([trunk, value_head])

    matrix_head = Dense(action_size*action_size, activation="sigmoid")
    self.model_p = Sequential([trunk,
                               matrix_head,
                               Reshape((action_size, action_size))])

    self.model = TrainingModel(self.model_mu, self.model_v, self.model_p)
    self.compile(learning_rate)

  def compile(self, learning_rate=0.001):
    """Compile the combined model with MSE loss, Adam and an MSE metric."""
    optimizer = Adam(learning_rate=learning_rate)
    self.model.compile(loss="mse", optimizer=optimizer, metrics=["mse"])

  def fit(self, x, y):
    """Run one training epoch on (x, y) and return that epoch's loss."""
    losses = self.model.fit(x, y, epochs=1, verbose=0).history["loss"]
    return losses[0]

In [None]:
import gym
from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

def create_enviorment(name, n=4):
  """Create the gym environment `name` and wrap it for training.

  Args:
    name: Environment id passed to `gym.make`.
    n: Repeat count handed to `RepeatActionWrapper`. Defaults to 4, the value
      that used to be hard-coded here.

  Returns:
    The environment wrapped, innermost first, in `RecordVideo`,
    `RepeatActionWrapper` and `RecordEpisodeStatistics`.
  """
  env = gym.make(name)
  # The trigger is true for 0, 50, 100, ...
  env = RecordVideo(env, video_folder="./videos", episode_trigger=lambda x: x % 50 == 0)
  env = RepeatActionWrapper(env, n)
  env = RecordEpisodeStatistics(env)
  return env

In [None]:
# A throw-away environment is created only to read the action bounds.
environment = gym.make("LunarLanderContinuous-v2")

a_min = environment.action_space.low
a_max = environment.action_space.high

# Close the environment explicitly before dropping the reference; `del` alone
# leaves cleanup to garbage collection.
environment.close()
del environment

def noisy_policy(state, env, model, epsilon=0.0):
  """Return the model's action for `state`, perturbed by Gaussian noise.

  Args:
    state: A single observation; a batch axis is added before calling `model`.
    env: Not used by this policy.
    model: Callable mapping a batch of states to a batch of actions.
    epsilon: Standard deviation of the zero-mean noise added to the action.

  Returns:
    The noisy action clipped to the module-level bounds `a_min`/`a_max`,
    with size-1 axes removed.
  """
  batched_state = tf.expand_dims(state, axis=0)
  mean_action = model(batched_state)
  noise = tf.random.normal(mean_action.shape, 0, epsilon)
  clipped = tf.clip_by_value(mean_action + noise, a_min, a_max)
  return tf.squeeze(clipped)

In [None]:
class RepeatActionWrapper(gym.Wrapper):
  """Apply every action up to `n` consecutive times and sum the rewards.

  Args:
    env: Environment to wrap.
    n: Maximum number of times each action is applied; must be at least 1.

  Raises:
    ValueError: If `n` is smaller than 1.
  """

  def __init__(self, env, n):
    # With n < 1 the loop in step() would never run and its return statement
    # would hit unbound locals, so reject the value up front.
    if n < 1:
      raise ValueError(f"n must be at least 1, got {n}")
    # gym.Wrapper.__init__ already stores the wrapped environment as self.env.
    super().__init__(env)
    self.n = n

  def step(self, action):
    """Step the wrapped environment with `action` until `n` steps or `done`.

    Returns:
      Tuple `(next_state, total_reward, done, info)` where `next_state`,
      `done` and `info` come from the last inner step and `total_reward` is
      the sum of the rewards of all inner steps taken.
    """
    total_reward = 0

    for _ in range(self.n):
      next_state, reward, done, info = self.env.step(action)
      total_reward += reward
      if done:
        # Never step an environment that has already finished its episode.
        break

    return next_state, total_reward, done, info

In [None]:
import gym
from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

def create_enviorment(name, n=8):
  """Create the gym environment `name` and wrap it for training.

  Args:
    name: Environment id passed to `gym.make`.
    n: Repeat count handed to `RepeatActionWrapper`.

  Returns:
    The environment wrapped, innermost first, in `RecordVideo`,
    `RepeatActionWrapper` and `RecordEpisodeStatistics`.
  """
  def record_this_episode(x):
    # The trigger is true for 0, 50, 100, ...
    return x % 50 == 0

  recorded = RecordVideo(gym.make(name), video_folder="./videos", episode_trigger=record_this_episode)
  return RecordEpisodeStatistics(RepeatActionWrapper(recorded, n))

In [None]:
import random
from collections import deque

class ReplayBuffer:
  """Bounded first-in-first-out memory of experiences with random sampling."""

  def __init__(self, capacity):
    # Once `capacity` items are stored, each append evicts the oldest item.
    storage = deque(maxlen=capacity)
    self.buffer = storage

  def __len__(self):
    """Return how many experiences are stored right now."""
    return self.buffer.__len__()

  def append(self, experience):
    """Add one experience at the newest end of the memory."""
    memory = self.buffer
    memory.append(experience)

  def sample(self, batch_size):
    """Draw `batch_size` stored experiences without replacement."""
    chosen = random.sample(self.buffer, batch_size)
    return chosen

In [None]:
import copy
import numpy as np
import pandas as pd

class DeepQLearningAlgorithm:
  """Deep Q-learning with a normalized advantage function (NAF) model.

  The agent trains `NafDeepQLearningModel` on replayed transitions and
  bootstraps from a periodically synchronized copy of its V-network. One
  "epoch" consists of `samples_per_epoch // batch_size` gradient steps
  followed by one episode played with the exploration policy.

  Args:
    environment_name: Environment id passed to `create_enviorment`.
    policy: Callable `(state, environment, model, epsilon=...)` returning an
      action; used for the episode played after each epoch.
    capacity: Maximum number of transitions kept in the replay buffer.
    batch_size: Number of transitions sampled per gradient step.
    learning_rate: Learning rate of the model's optimizer.
    hidden_size: Width of the model's hidden layers.
    gamma: Discount factor applied to the bootstrapped next-state value.
    epsilon_start: Exploration parameter at epoch 0.
    epsilon_end: Exploration parameter from `epsilon_last_episode` onwards.
    epsilon_last_episode: Epoch at which the linear decay reaches `epsilon_end`.
    samples_per_epoch: Minimum buffer size collected before training; also
      determines the number of gradient steps per epoch.
    q_net_update_rate: The target network is synchronized every this many epochs.
    n: Repeat count passed to `create_enviorment`.
  """

  def __init__(self, environment_name="LunarLanderContinuous-v2", policy=noisy_policy, capacity=100_000,
               batch_size=256, learning_rate=1e-5, hidden_size=512, gamma=0.99, epsilon_start=2.0, epsilon_end=0.02,
               epsilon_last_episode=800, samples_per_epoch=1_024, q_net_update_rate=10, n=8):

    self.environment = create_enviorment(environment_name, n)
    observation_size = self.environment.observation_space.shape[0]
    actions_size = self.environment.action_space.shape[0]

    self.q_net = NafDeepQLearningModel(observation_size, hidden_size, actions_size, learning_rate)

    self.policy = policy
    self.buffer = ReplayBuffer(capacity=capacity)

    self.current_epoch = 0

    self.batch_size = batch_size
    self.learning_rate = learning_rate
    # gamma and q_net_update_rate were previously overwritten with the
    # literals 0.99 and 10, so the constructor arguments had no effect.
    self.gamma = gamma
    self.epsilon_start = epsilon_start
    self.epsilon_end = epsilon_end
    self.epsilon_last_episode = epsilon_last_episode
    self.samples_per_epoch = samples_per_epoch
    self.q_net_update_rate = q_net_update_rate

    # Fill the buffer before training; with no policy given, play_episode
    # samples actions from the environment's action space.
    while len(self.buffer) < self.samples_per_epoch:
      self.play_episode(epsilon=self.epsilon_start)

    self._sync_target_network()

  def _sync_target_network(self):
    """Replace the target V-network with a weight-identical copy of the online one.

    `clone_model` only reproduces the architecture and creates newly
    initialized weights, so the weights are copied explicitly. Without this
    step the bootstrap targets come from an untrained network.
    """
    source = self.q_net.model_v
    target = tf.keras.models.clone_model(source)

    # Call both networks once so every layer has created its variables before
    # the weights are transferred.
    probe = tf.zeros((1,) + tuple(self.environment.observation_space.shape))
    source(probe)
    target(probe)

    target.set_weights(source.get_weights())
    self.target_v_model = target

  def play_episode(self, policy=None, epsilon=0.):
    """Play one episode and append every transition to the replay buffer.

    Args:
      policy: Action-selection callable. When omitted, actions are sampled
        from the environment's action space and `epsilon` is ignored.
      epsilon: Exploration parameter forwarded to `policy`.
    """
    state = self.environment.reset()
    done = False

    while not done:
      if policy:
        action = policy(state, self.environment, self.q_net.model_mu, epsilon=epsilon)
      else:
        action = self.environment.action_space.sample()

      # The policy may return a tensor; store and step with a float32 array so
      # the buffer holds one uniform type.
      action = np.asarray(action, dtype=np.float32)

      next_state, reward, done, info = self.environment.step(action)
      self.buffer.append((state, action, reward, done, next_state))
      state = next_state

  def get_batch(self):
    """Sample `batch_size` transitions and return them as stacked arrays.

    Returns:
      Tuple `(state, action, reward, done, next_state)` of NumPy arrays whose
      first axis indexes the sampled transitions.
    """
    # zip(*...) turns the list of 5-tuples into five per-field columns.
    columns = zip(*self.buffer.sample(self.batch_size))
    state, action, reward, done, next_state = (np.stack(column) for column in columns)
    return state, action, reward, done, next_state

  def train_step(self):
    """Fit the model on one sampled batch and return the loss."""
    states, actions, rewards, dones, next_states = self.get_batch()

    # The target network returns one value per sample in a trailing size-1 axis.
    next_state_values = np.squeeze(np.array(self.target_v_model(next_states)), axis=-1)
    # Nothing is bootstrapped from transitions that ended the episode.
    next_state_values = np.where(dones, 0.0, next_state_values)

    target = rewards + self.gamma * next_state_values

    return self.q_net.fit((states, actions), target)

  def _current_epsilon(self):
    """Return epsilon for the current epoch.

    Epsilon is interpolated linearly from `epsilon_start` at epoch 0 to
    `epsilon_end` at `epsilon_last_episode` and held there afterwards. The
    former expression decreased by a fixed 1 / epsilon_last_episode per epoch,
    so with the default start of 2.0 it was still 1.0 at `epsilon_last_episode`.
    """
    if self.epsilon_last_episode <= 0:
      return self.epsilon_end
    progress = min(1.0, self.current_epoch / self.epsilon_last_episode)
    return self.epsilon_start + progress * (self.epsilon_end - self.epsilon_start)

  def training_epoch_end(self):
    """Collect one fresh episode, sync the target network when due, advance the epoch."""
    self.play_episode(policy=self.policy, epsilon=self._current_epsilon())

    if (self.current_epoch + 1) % self.q_net_update_rate == 0:
      self._sync_target_network()

    self.current_epoch += 1

  def train(self, epochs):
    """Run `epochs` training epochs, printing progress every 50th epoch."""
    steps_per_epoch = self.samples_per_epoch // self.batch_size

    for _ in range(epochs):
      # The reported loss is the mean over this epoch's gradient steps.
      step_losses = [self.train_step() for _ in range(steps_per_epoch)]
      loss = sum(step_losses) / steps_per_epoch if steps_per_epoch else 0

      self.training_epoch_end()

      if self.current_epoch % 50 == 0:
        # Return of the most recently finished episode.
        returns = list(self.environment.return_queue)[-1]
        print(f"Epoch: {self.current_epoch}, loss: {loss}, hp_metric: {tf.math.reduce_mean(returns)}")


In [None]:
# Build the NAF agent with an action-repeat count of 12 and a learning rate of 1e-3.
alg = DeepQLearningAlgorithm(n=12, learning_rate=1e-3)
# Train for 1000 epochs; progress is printed every 50th epoch.
alg.train(1000)

  logger.warn(


Epoch: 50, loss: 896.4072113037109, hp_metric: -370.2440490722656
Epoch: 100, loss: 874.0405426025391, hp_metric: -1256.202392578125
Epoch: 150, loss: 795.625732421875, hp_metric: -586.153076171875
Epoch: 200, loss: 835.6949462890625, hp_metric: -666.8700561523438
Epoch: 250, loss: 811.6567230224609, hp_metric: -675.03076171875
Epoch: 300, loss: 772.5878143310547, hp_metric: -651.790771484375
Epoch: 350, loss: 708.3697509765625, hp_metric: -550.282470703125
Epoch: 400, loss: 638.1910095214844, hp_metric: -215.37681579589844
Epoch: 450, loss: 819.2611846923828, hp_metric: -421.616455078125
Epoch: 500, loss: 669.8406219482422, hp_metric: -12.224639892578125
Epoch: 550, loss: 656.6982421875, hp_metric: -219.802490234375
Epoch: 600, loss: 683.8597717285156, hp_metric: -176.8139190673828
Epoch: 650, loss: 650.2520599365234, hp_metric: -360.3421325683594
Epoch: 700, loss: 622.5941162109375, hp_metric: -138.72171020507812
Epoch: 750, loss: 615.2307281494141, hp_metric: -81.72682189941406
Epoc

In [None]:
# Continue training the same agent for another 400 epochs.
alg.train(400)

Epoch: 1050, loss: 568.7523193359375, hp_metric: -156.8037567138672
Epoch: 1100, loss: 591.5465545654297, hp_metric: -215.9573516845703
Epoch: 1150, loss: 538.0778350830078, hp_metric: -124.85720825195312
Epoch: 1200, loss: 545.7513580322266, hp_metric: -95.98601531982422
Epoch: 1250, loss: 517.0646286010742, hp_metric: -14.822734832763672
Epoch: 1300, loss: 551.4833908081055, hp_metric: -184.640625
Epoch: 1350, loss: 497.8832092285156, hp_metric: 99.13664245605469
Epoch: 1400, loss: 511.9227294921875, hp_metric: -33.31315231323242


There is probably an error in the code, or the hyperparameters need to be tuned better. The network evidently improves over time; however, it does not achieve very good results.