In this notebook, we'll train an agent using double deep Q learning. We will use OpenAI's gym to provide us with a game our agent can learn.

We will also install a couple of utilities we can use to create a virtual screen so we can record what our agent is doing.

In [None]:
# Python packages: gym provides the game, pyvirtualdisplay drives the virtual screen.
# Both stdout and stderr of pip are discarded by the redirection below.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# System packages installed through apt for the virtual screen and video recording.
!apt-get install -y xvfb python-opengl ffmpeg x11-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.6-0ubuntu0.18.04.1).
Suggested packages:
  libgle3 mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 python-opengl x11-utils xvfb
0 upgraded, 4 newly installed, 0 to remove and 25 not upgraded.
Need to get 1,490 kB of archives.
After this operation, 8,393 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+dfsg-1 [496 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.4 [784 kB]
Fetched 1,490 kB in 1s (1,147 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 144568 files and direc

In [None]:
%tensorflow_version 2.x

import gym
from gym.wrappers import Monitor
import numpy as np
import random
import glob
import base64
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from collections import deque
from IPython.display import HTML, display as ipythondisplay
from pyvirtualdisplay import Display
from tqdm.auto import tqdm

Next, we create a class representing our agent. Note that there are a lot of hyperparameters we could tune here, such as the size of our replay memory, the discount factor gamma, the exploration/exploitation rate epsilon, the architecture of the model, and so on.

Note the modifications compared to the non-double deep Q agent. In particular, the `tau` parameter is a new addition, as is the `target_train` method.

In [None]:
class DDQN:
  """Double deep Q-learning agent for a gym environment with discrete actions.

  Two networks with the same architecture are kept:

  * ``model`` is the online network, trained on every call to ``replay``.
  * ``target_model`` is a slowly moving copy, blended towards the online
    network by ``target_train`` using the factor ``tau``.

  ``env`` must expose ``observation_space.shape``, ``action_space.n`` and
  ``action_space.sample()``.
  """

  def __init__(self, env):
    self.env = env
    # Replay buffer; once full, appending evicts the oldest transition.
    self.memory        = deque(maxlen=4096)
    self.gamma         = 0.95   # discount factor applied to future Q-values
    self.epsilon       = 1.0    # current probability of taking a random action
    self.epsilon_min   = 0.01   # floor for epsilon
    self.epsilon_decay = 0.995  # multiplicative decay applied on every act()
    self.learning_rate = 0.005
    self.tau           = 0.1    # share of the online weights mixed into the target per update
    self.batch_size    = 512    # transitions sampled per replay() call
    self.model         = self.create_model()
    self.target_model  = self.create_model()
    # Start both networks from the same weights so the very first bootstrap
    # targets are not produced by an unrelated random initialisation.
    self.target_model.set_weights(self.model.get_weights())

  def create_model(self):
    """Build and compile a Q-network with one linear output per action."""
    n_inputs  = self.env.observation_space.shape[0]
    n_actions = self.env.action_space.n
    model = Sequential()
    model.add(Dense(24, input_dim=n_inputs, activation="relu"))
    model.add(Dense(24, activation="relu"))
    model.add(Dense(n_actions, activation="linear"))
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
    return model

  def act(self, state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    Epsilon is decayed (down to ``epsilon_min``) on every call. With
    probability epsilon a random action is sampled from the environment;
    otherwise the action with the highest predicted value under the target
    network is returned. ``state`` is passed to ``predict`` unchanged, so it
    must already carry a leading batch dimension.
    """
    self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    if np.random.random() < self.epsilon:
      return self.env.action_space.sample()
    return np.argmax(self.target_model.predict(state, verbose=0)[0])

  def remember(self, state, action, reward, new_state, done):
    """Append one transition to the replay buffer."""
    self.memory.append([state, action, reward, new_state, done])

  def replay(self, high=False):
    """Train the online network on one random batch from the replay buffer.

    Does nothing until at least ``batch_size`` transitions are stored.
    Targets follow the Double DQN rule: the online network selects the best
    next action and the target network supplies that action's value::

        y = r                                             if done
        y = r + gamma * Q_target(s', argmax_a Q_online(s', a))   otherwise

    ``high`` is unused and only kept so existing callers keep working.
    """
    if len(self.memory) < self.batch_size:
      return
    batch      = random.sample(self.memory, self.batch_size)
    # States are stored with a leading batch axis of 1; [0] strips it.
    states     = np.array([t[0][0] for t in batch])
    new_states = np.array([t[3][0] for t in batch])
    actions    = np.array([t[1] for t in batch], dtype=int)
    rewards    = np.array([t[2] for t in batch], dtype=float)
    terminal   = np.array([t[4] for t in batch], dtype=bool)

    # Baseline is the online network's own prediction, so only the output of
    # the action actually taken contributes to the loss.
    targets     = self.model.predict(states, verbose=0)
    next_online = self.model.predict(new_states, verbose=0)
    next_target = self.target_model.predict(new_states, verbose=0)

    rows         = np.arange(len(batch))
    best_actions = np.argmax(next_online, axis=1)
    # No bootstrapping from the next state when the episode ended there.
    q_future     = np.where(terminal, 0.0, next_target[rows, best_actions])
    targets[rows, actions] = rewards + self.gamma * q_future
    self.model.train_on_batch(states, targets)

  def target_train(self):
    """Move the target weights a fraction ``tau`` towards the online weights."""
    blended = [
        online * self.tau + target * (1 - self.tau)
        for online, target in zip(self.model.get_weights(),
                                  self.target_model.get_weights())
    ]
    self.target_model.set_weights(blended)


Let's now train our agent. Note that we make a small modification: the game is considered solved if we can beat it three times in a row. The fewer steps, the better.

We'll also print out some information regarding where our agent got to from time to time.

In [None]:
env = gym.make("MountainCar-v0")

# State described as
# 0	position	-1.2	0.6
# 1	velocity	-0.07	0.07

nb_states = env.observation_space.shape[0]
trials    = 1000 # Number of episodes or "trials"
trial_len =  200 # How many steps at most in each trial?

dqn_agent = DDQN(env=env)

# Index of the last step taken in each finished trial (lower is better).
history = []

for trial in tqdm(range(trials)):
  trial_history = []
  cur_state     = env.reset().reshape(1, nb_states)
  for step in range(trial_len):
    trial_history.append(cur_state[0])
    action                     = dqn_agent.act(cur_state)
    new_state, reward, done, _ = env.step(action)
    # The agent expects states with a leading batch axis of 1.
    new_state                  = new_state.reshape(1, nb_states)
    dqn_agent.remember(cur_state, action, reward, new_state, done)
    # Learn from a replayed batch, then nudge the target network, every step.
    dqn_agent.replay()
    dqn_agent.target_train()
    cur_state = new_state
    if done:
      break

  trial_history = np.array(trial_history)
  if trial % 10 == 0:
    print('Trial {} position min/max = {}, {} velocity min/max = {}, {}'.format(trial,
          np.min(trial_history[:, 0]), np.max(trial_history[:, 0]),
          np.min(trial_history[:, 1]), np.max(trial_history[:, 1]),
    ))

  # "Better" means fewer steps than the best trial so far, so compare against
  # the minimum of the history, not the maximum.
  if not history or step < min(history):
    print('Trial {} obtained better result of {} steps'.format(trial, step))
  history.append(step)

  # A trial that ends before the last allowed step reached the goal. Require
  # three recorded trials so that a lucky first trial cannot stop training.
  if len(history) >= 3 and all(h < (trial_len - 1) for h in history[-3:]):
    print("Reached goal 3 times in a row, stopping")
    break

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Trial 0 position min/max = -0.71322888303345, -0.41997695716455036 velocity min/max = -0.015366817516443294, 0.013534377927970664
Trial 0 obtained better result of 199 steps
Trial 10 position min/max = -1.2, -0.0037183784640356034 velocity min/max = -0.053025668253096216, 0.04872523198891794
Trial 16 obtained better result of 170 steps
Trial 18 obtained better result of 150 steps
Trial 20 position min/max = -1.1287499915450159, 0.2648587881212022 velocity min/max = -0.04172226373030535, 0.0582026212615617
Trial 22 obtained better result of 183 steps
Trial 23 obtained better result of 162 steps
Trial 24 obtained better result of 183 steps
Reached goal 3 times in a row, stopping


Note that there is definitely some luck involved here. Early good explorations can greatly improve the ability of the agent to find a good solution:

In [None]:
def show_video():
  """Embed the first MP4 found under ``./video/`` in the notebook output.

  The file is base64-encoded into an inline HTML ``<video>`` element. If no
  ``.mp4`` file matches, a message is printed and nothing is displayed.
  """
  mp4list = glob.glob('./video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return
  # Context manager so the file handle is closed even if reading fails.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read()).decode('ascii')
  ipythondisplay(HTML(
      '<video autoplay controls style="height: 400px;"><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'.format(encoded)
  ))

def wrap_env(env):
  """Return ``env`` wrapped in gym's ``Monitor``, using ``./video/`` as its directory."""
  monitored_env = Monitor(env, './video/', force=True)
  return monitored_env

In [None]:
# Virtual screen so that env.render() has something to draw on.
display = Display(visible=0, size=(900, 400))
display.start()

try:
  env = wrap_env(gym.make("MountainCar-v0"))
  try:
    # The agent expects states with a leading batch axis of 1.
    state = np.reshape(env.reset(), (1, nb_states))
    done  = False
    # Play a single episode with the trained agent.
    while not done:
      env.render()
      action = dqn_agent.act(state)
      next_state, _, done, _ = env.step(action)
      next_state = np.reshape(next_state, (1, nb_states))
      state = next_state
  finally:
    # Close the environment even if the episode was interrupted.
    env.close()
finally:
  # Always shut the virtual display down so no Xvfb process is left behind.
  display.stop()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '900x400x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '900x400x24', ':1001'] oserror=None return_code=0 stdout="" stderr="" timeout_happened=False>

In [None]:
show_video()