<a href="https://colab.research.google.com/github/KornelWitkowski/Deep-Q-Learning-with-Tensorflow/blob/main/LunarLander_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install -y xvfb

!pip install \
    gym==0.21 \
    gym[box2d] \
    pyvirtualdisplay

In [1]:
from pyvirtualdisplay import Display
Display(visible=False, size=(1400, 900)).start();

In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam


class DeepQLearningModel:
  def __init__(self, observation_size, hidden_size, action_size, learning_rate=0.001):

    self.model = Sequential([Dense(observation_size, activation="relu"),
                             Dense(hidden_size, activation="relu"),
                             Dense(hidden_size, activation="relu"),
                             Dense(action_size)])
    self.compile()
    
  def compile(self, learning_rate=0.001):  
    self.model.compile(loss="mse",
                       optimizer=Adam(learning_rate=learning_rate))
    
  def fit(self, x, y):
    history = self.model.fit(x, y, epochs=1, verbose=0)
    loss =  history.history["loss"][0]
    return loss


In [3]:
import numpy as np

def epsilon_greedy_policy(state, environment, model, epsilon=0.0):

  if np.random.random() < epsilon:
    action = environment.action_space.sample()
  else:
    q_values = model(tf.expand_dims(state, axis=0))
    action = tf.math.argmax(q_values, axis=1)
    action = int(action)
  return action

In [4]:
from collections import deque, namedtuple
import random

class ReplayBuffer:

  def __init__(self, capacity):
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)

  def append(self, experience):
    self.buffer.append(experience)

  def sample(self, batch_size):
    return random.sample(self.buffer, batch_size)

In [5]:
import random
from collections import deque

class ReplayBuffer:

  def __init__(self, capacity):
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)

  def append(self, experience):
    self.buffer.append(experience)

  def sample(self, batch_size):
    return random.sample(self.buffer, batch_size)

In [6]:
import gym
from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

def create_gym_environment(name):
  environment = gym.make(name)
  environment = TimeLimit(environment, max_episode_steps=400)
  environment = RecordVideo(environment, video_folder='./recored_episodes', episode_trigger=lambda x: x % 50 == 0)
  environment = RecordEpisodeStatistics(environment)

  return environment

In [7]:
import copy
import pandas as pd

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

import gym

class DeepQLearningAlgorithm:

    def __init__(self, environment_name='LunarLander-v2', policy=epsilon_greedy_policy, capacity=100_000,
                batch_size=256, learning_rate=1e-3, hidden_size=128, gamma=0.99, epsilon_start=1.0, epsilon_end=0.15,
                epsilon_last_episode=600, samples_per_epoch=2_048, q_net_update_rate=10):
    
      self.environment = create_gym_environment(environment_name)
      observation_size = self.environment.observation_space.shape[0]
      actions_size = self.environment.action_space.n

      self.q_net = DeepQLearningModel(observation_size, hidden_size, actions_size, learning_rate)
      self.target_q_net = copy.deepcopy(self.q_net.model)

      self.policy = policy
      self.buffer = ReplayBuffer(capacity=capacity)

       
      self.current_epoch = 0


      # hyperparameters
      self.batch_size = batch_size
      self.learning_rate = learning_rate
      self.gamma = 0.99
      self.epsilon_start = epsilon_start
      self.epsilon_end = epsilon_end
      self.epsilon_last_episode = epsilon_last_episode
      self.samples_per_epoch = samples_per_epoch
      self.q_net_update_rate = 10


      while len(self.buffer) < self.samples_per_epoch:
        self.play_episode(epsilon=0)
        
    def play_episode(self, policy=None, epsilon=0.):
      state = self.environment.reset()
      done = False

      while not done:
        if policy:
          action = policy(state, self.environment, self.q_net.model, epsilon=epsilon)
        else:
          action = self.environment.action_space.sample()

        next_state, reward, done, info = self.environment.step(action)
        exp = (state, action, reward, done, next_state)
        self.buffer.append(exp)
        state = next_state

    def get_batch(self):
      sample = pd.DataFrame(self.buffer.sample(self.batch_size))

      state = np.stack(sample[0].values)
      action = np.stack(sample[1].values)
      reward = np.stack(sample[2].values)
      done = np.stack(sample[3].values)
      next_state = np.stack(sample[4].values)

      return state, action, reward, done, next_state


    def train_step(self):
      states, actions, rewards, dones, next_states = self.get_batch()

      state_action_values = tf.gather(self.q_net.model(states), actions, axis=1, batch_dims=1)

      next_action_values = tf.math.reduce_max(self.target_q_net(next_states), axis=1)
      next_action_values = next_action_values.numpy()
      next_action_values[dones] = 0.0

      expected_state_action_values = rewards + self.gamma * next_action_values

      q_net_predictions = self.q_net.model(states).numpy()
      q_net_predictions[range(self.batch_size), actions] = expected_state_action_values

      loss = self.q_net.fit(states, q_net_predictions)

      return loss

      
    def training_epoch_end(self):

        epsilon = max(self.epsilon_end,
                      self.epsilon_start - self.current_epoch / self.epsilon_last_episode)

        self.play_episode(policy=self.policy, epsilon=epsilon)

        if self.current_epoch % self.q_net_update_rate == 0:
          self.target_q_net = copy.deepcopy(self.q_net.model)

        return
        
    def train(self, epochs):

      for i in range(epochs):

        loss = 0

        for _ in range(self.samples_per_epoch//self.batch_size):  
          loss += self.train_step()

        self.training_epoch_end()

        if self.current_epoch % 50 == 0:
          returns = list(self.environment.return_queue)[-1]
          print(f"Epoch: {self.current_epoch}, loss: {loss}, hp_metric: {tf.math.reduce_mean(returns)}")

        self.current_epoch += 1


In [8]:
!rm -r /content/recored_episodes

algorithm =  DeepQLearningAlgorithm()

In [9]:
algorithm.train(2000)

Epoch: 0, loss: 268.12224674224854, hp_metric: -181.4905548095703
Epoch: 50, loss: 65.59887909889221, hp_metric: -153.10211181640625
Epoch: 100, loss: 40.14419388771057, hp_metric: -134.60256958007812
Epoch: 150, loss: 36.258490562438965, hp_metric: -120.94712829589844
Epoch: 200, loss: 51.76798403263092, hp_metric: -59.935302734375
Epoch: 250, loss: 43.43266558647156, hp_metric: -51.29109191894531
Epoch: 300, loss: 25.66166865825653, hp_metric: -27.810462951660156
Epoch: 350, loss: 48.81535291671753, hp_metric: -23.567724227905273
Epoch: 400, loss: 31.40342652797699, hp_metric: -12.756509780883789
Epoch: 450, loss: 32.64399337768555, hp_metric: 46.689064025878906
Epoch: 500, loss: 28.33835005760193, hp_metric: 53.12044906616211
Epoch: 550, loss: 27.020818173885345, hp_metric: 79.65638732910156
Epoch: 600, loss: 17.04546356201172, hp_metric: 76.03502655029297
Epoch: 650, loss: 18.183525025844574, hp_metric: 77.77593994140625
Epoch: 700, loss: 5.392167240381241, hp_metric: 74.3745574951

In [34]:
print(f"Average return for epsilon=0: {tf.math.reduce_mean(list(algorithm.environment.return_queue))}")

Average return for epsilon=0: 213.4485321044922
