In this notebook, we'll train an agent using deep Q learning. We will use OpenAI's gym to provide us with a game our agent can learn.

We will also install a couple of utilities we can use to create a virtual screen so we can record what our agent is doing.

In [None]:
# Install the Python packages used below (gym and pyvirtualdisplay); pip's stdout and stderr are discarded.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# Install the system packages (xvfb, python-opengl, ffmpeg, x11-utils) — presumably what the virtual screen and video recording rely on.
!apt-get install -y xvfb python-opengl ffmpeg x11-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
x11-utils is already the newest version (7.7+3build1).
python-opengl is already the newest version (3.1.0+dfsg-1).
ffmpeg is already the newest version (7:3.4.6-0ubuntu0.18.04.1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.


In [None]:
%tensorflow_version 2.x

import gym
from gym.wrappers import Monitor
import numpy as np
import random
import glob
import base64
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from collections import deque
from IPython.display import HTML, display as ipythondisplay
from pyvirtualdisplay import Display
from tqdm.auto import tqdm

Next, we create a class representing our agent. Note that there are a lot of hyperparameters we could tune here, such as the size of our replay memory, the discount factor gamma, the exploration/exploitation rate epsilon, the architecture of the model, and so on.

In [None]:
class DQN:
  """Deep Q-learning agent with experience replay and an epsilon-greedy policy.

  The agent approximates Q-values with a small fully connected network,
  stores transitions in a bounded replay memory, and fits the network on
  random minibatches drawn from that memory.

  Args:
    env: Environment exposing `observation_space.shape` and an `action_space`
      that provides `n` and `sample()`, as used by the methods below.
  """

  def __init__(self, env):
    self.env           = env
    self.memory        = deque(maxlen=2048)  # Replay memory; oldest transitions are dropped once full
    self.gamma         = 0.95   # Discount factor applied to the best next-state Q value
    self.epsilon       = 1.0    # Current probability of taking a random action
    self.epsilon_min   = 0.01   # Lower bound for epsilon
    self.epsilon_decay = 0.900  # Multiplicative decay applied on every call to act()
    self.learning_rate = 0.005  # Learning rate handed to the Adam optimizer
    self.batch_size    = 256    # Number of transitions sampled per replay() call
    self.model = self.create_model()

  def create_model(self):
    """Build and compile the Q-network (one output per action, MSE loss)."""
    model = Sequential()
    model.add(Dense(24, input_dim=self.env.observation_space.shape[0], activation="relu"))
    model.add(Dense(24, activation="relu"))
    # Linear output: Q values are unbounded regression targets
    model.add(Dense(self.env.action_space.n, activation='linear'))
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
    return model

  def act(self, state):
    """Return an epsilon-greedy action for `state`.

    Epsilon is decayed (down to `epsilon_min`) on every call, before the
    action is chosen. `state` must be a batch of one observation, i.e.
    shaped (1, n_features), because the first row of the prediction is used.
    """
    self.epsilon *= self.epsilon_decay
    self.epsilon  = max(self.epsilon_min, self.epsilon)
    if np.random.random() < self.epsilon:
      return self.env.action_space.sample()
    # verbose=0 keeps Keras from printing a progress bar on every step
    return np.argmax(self.model.predict(state, verbose=0)[0])

  def remember(self, state, action, reward, new_state, done):
    """Append one transition to the replay memory."""
    self.memory.append([state, action, reward, new_state, done])

  def replay(self):
    """Fit the network on a random minibatch drawn from the replay memory.

    Does nothing until at least `batch_size` transitions have been stored.
    """
    if len(self.memory) < self.batch_size:
      return
    samples    = random.sample(self.memory, self.batch_size)
    # States are stored with shape (1, n_features); [0] drops the batch axis
    states     = np.array([sample[0][0] for sample in samples])
    new_states = np.array([sample[3][0] for sample in samples])
    actions    = np.array([sample[1] for sample in samples])
    rewards    = np.array([sample[2] for sample in samples], dtype=float)
    dones      = np.array([sample[4] for sample in samples], dtype=bool)

    targets = self.model.predict(states, verbose=0)
    q_news  = self.model.predict(new_states, verbose=0)
    # Bellman equation: the predicted Q value should be equal to the reward + the
    # discounted best Q value in the next state. Terminal transitions have no next
    # state to bootstrap from, so their target is the reward alone.
    future = np.where(dones, 0.0, self.gamma * q_news.max(axis=1))
    # Only the Q value of the action actually taken is updated; the other outputs
    # keep their current predictions and therefore contribute zero loss.
    targets[np.arange(len(samples)), actions] = rewards + future
    self.model.fit(states, targets, epochs=1, verbose=0, batch_size=len(states))

We can now create our environment and train our agent.

In [None]:
# Train the agent on CartPole, one episode ("trial") at a time, stopping early
# once it survives the maximum number of steps three trials in a row.
env = gym.make("CartPole-v1")

nb_states = env.observation_space.shape[0]
trials    = 300 # Number of episodes or "trials"
trial_len = 500 # How many steps at most in each trial?

dqn_agent = DQN(env=env)
history   = []  # Zero-based index of the last step reached in each trial

for trial in tqdm(range(trials)):
  # Reset the environment and get the first state
  cur_state = env.reset().reshape(1, nb_states)
  for step in range(trial_len):
    # Take an epsilon-greedy action: random with probability epsilon, otherwise the network's best guess
    action                     = dqn_agent.act(cur_state)
    new_state, reward, done, _ = env.step(action)
    new_state                  = new_state.reshape(1, nb_states)
    memory_tuple               = (cur_state, action, reward, new_state, done)
    # Remember this step in our memory
    dqn_agent.remember(*memory_tuple)
    # Train using our memory so far
    dqn_agent.replay()
    # Go to the next state
    cur_state = new_state
    # If the game is over, stop this trial
    if done: break
  
  # `step` is the index of the last step taken, so a full-length trial reports trial_len - 1
  if not history or step > max(history) :
    print('Trial {} obtained better result of {} steps'.format(trial, step))
  history.append(step)

  # If the last three trials all ran for their maximum length, we assume the agent to be good enough
  # The game is "won". Require three recorded trials first: history[-3:] is shorter than
  # three early on, and the check would otherwise pass after a single full-length trial.
  if len(history) >= 3 and all(h == (trial_len - 1) for h in history[-3:]):
    print("Max number of steps 3 times in a row, stopping")
    break

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))

Trial 0 obtained better result of 26 steps
Trial 11 obtained better result of 32 steps
Trial 35 obtained better result of 50 steps
Trial 36 obtained better result of 299 steps
Trial 41 obtained better result of 355 steps
Trial 43 obtained better result of 426 steps
Trial 51 obtained better result of 430 steps
Trial 111 obtained better result of 499 steps
Max number of steps 3 times in a row, stopping


Let's take a look at what our agent is doing. The following functions are just wrappers so `gym` outputs to a video file which we can show.

In [None]:
def show_video():
  """Embed a recorded episode from ./video/ in the notebook output.

  Uses the first .mp4 path returned by glob and inlines it as a base64
  data URI inside an HTML <video> element. Prints "Could not find video"
  and returns without displaying anything when no .mp4 file is present.
  """
  mp4list = glob.glob('./video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return
  mp4 = mp4list[0]
  # Context manager so the file handle is closed as soon as the bytes are read
  with open(mp4, 'rb') as video_file:
    video = video_file.read()
  encoded = base64.b64encode(video).decode('ascii')
  ipythondisplay(HTML(
      '<video autoplay controls style="height: 400px;"><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'.format(encoded)
  ))

def wrap_env(env):
  """Wrap `env` in a gym Monitor that writes its output under ./video/.

  `force=True` is passed through to Monitor unchanged.
  """
  monitored_env = Monitor(env, './video/', force=True)
  return monitored_env

In [None]:
# Create a 900x400 virtual display (visible=0) so the environment can be rendered and recorded.
display = Display(visible=0, size=(900, 400))
# Kept as a bare expression so the notebook echoes the started Display object.
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '900x400x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '900x400x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [None]:
# Run one episode with the trained agent in a monitored environment so it gets recorded.
env   = wrap_env(gym.make("CartPole-v1"))

try:
  state = env.reset()
  state = np.reshape(state, (1, nb_states))

  while True:
    env.render()
    # act() is still epsilon-greedy, so a small fraction of actions may be random
    action = dqn_agent.act(state)
    next_state, _, done, _ = env.step(action)
    state = np.reshape(next_state, (1, nb_states))
    if done:
      break
finally:
  # Always release the environment and the virtual display, even if rendering
  # or stepping fails part-way through the episode.
  try:
    env.close()
  finally:
    display.stop()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '900x400x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '900x400x24', ':1001'] oserror=None return_code=0 stdout="" stderr="" timeout_happened=False>

In [None]:
show_video()