Credits: Matthias Plappert

Source: https://github.com/matthiasplappert/keras-rl


In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [2]:
ENV_NAME = 'CartPole-v0'
# Single seed shared by NumPy and the environment so the two values cannot drift apart.
SEED = 123


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
# Seed NumPy's global RNG and the environment with the same value.
# NOTE(review): no Keras/TensorFlow seed is set in the cells shown, so network
# initialisation may still differ between runs -- confirm if exact reproducibility matters.
np.random.seed(SEED)
env.seed(SEED)
# Number of discrete actions; used as the width of the network's output layer.
nb_actions = env.action_space.n

[2018-01-29 19:46:56,631] Making new env: CartPole-v0


In [3]:
# Next, we build a very simple model: the observation is flattened, passed through
# three 16-unit ReLU layers, and mapped to one linear output per action.
model = Sequential()
# The leading 1 in the input shape is presumably the observation window length
# (the memory is configured with window_length=1) -- confirm against keras-rl.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so call it directly instead of
# wrapping it in print(), which would append a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [4]:
# Configure the agent (memory, action-selection policy, DQN settings) and compile it.
# Any built-in Keras optimizer can be passed to compile(), and Keras metrics work too.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [5]:
# Train the agent on the environment. Rendering the environment (visualize=True) is
# only for show and slows training down considerably. Training can always be aborted
# early and safely with Ctrl + C.
dqn.fit(
    env,
    nb_steps=50000,
    visualize=True,
    verbose=2,
)

Training for 50000 steps ...




    31/50000: episode: 1, duration: 2.973s, episode steps: 31, steps per second: 10, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.419 [0.000, 1.000], mean observation: 0.013 [-1.185, 1.776], loss: 0.459659, mean_absolute_error: 0.519437, mean_q: 0.096432




    44/50000: episode: 2, duration: 0.221s, episode steps: 13, steps per second: 59, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.069 [-1.867, 1.224], loss: 0.350367, mean_absolute_error: 0.542432, mean_q: 0.291314
    64/50000: episode: 3, duration: 0.330s, episode steps: 20, steps per second: 61, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.070 [-1.375, 0.833], loss: 0.232356, mean_absolute_error: 0.555896, mean_q: 0.490309
    83/50000: episode: 4, duration: 0.317s, episode steps: 19, steps per second: 60, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.421 [0.000, 1.000], mean observation: 0.074 [-0.817, 1.505], loss: 0.119146, mean_absolute_error: 0.605534, mean_q: 0.818067
    98/50000: episode: 5, duration: 0.246s, episode steps: 15, steps per second: 61, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], me

   633/50000: episode: 31, duration: 0.381s, episode steps: 23, steps per second: 60, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.391 [0.000, 1.000], mean observation: 0.027 [-1.198, 1.655], loss: 0.256799, mean_absolute_error: 2.855434, mean_q: 5.454647
   654/50000: episode: 32, duration: 0.351s, episode steps: 21, steps per second: 60, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.074 [-0.653, 1.449], loss: 0.257734, mean_absolute_error: 2.938653, mean_q: 5.580149
   670/50000: episode: 33, duration: 0.268s, episode steps: 16, steps per second: 60, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.312 [0.000, 1.000], mean observation: 0.088 [-1.156, 1.974], loss: 0.212216, mean_absolute_error: 2.985630, mean_q: 5.764056
   682/50000: episode: 34, duration: 0.198s, episode steps: 12, steps per second: 61, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], 

  2259/50000: episode: 60, duration: 0.998s, episode steps: 200, steps per second: 200, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.139 [-0.885, 1.248], loss: 0.724206, mean_absolute_error: 9.454741, mean_q: 19.211803
  2459/50000: episode: 61, duration: 0.991s, episode steps: 200, steps per second: 202, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.144 [-1.494, 0.834], loss: 0.734959, mean_absolute_error: 10.487144, mean_q: 21.314400
  2639/50000: episode: 62, duration: 0.969s, episode steps: 180, steps per second: 186, episode reward: 180.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.249 [-2.008, 1.010], loss: 1.012814, mean_absolute_error: 11.366416, mean_q: 23.099739
  2789/50000: episode: 63, duration: 0.758s, episode steps: 150, steps per second: 198, episode reward: 150.000, mean reward: 1.0

<keras.callbacks.History at 0x123cc1668>

In [6]:
# Once training has finished, write the final weights to disk; overwrite=True replaces
# any weights file with the same name.
dqn.save_weights(
    'dqn_{}_weights.h5f'.format(ENV_NAME),
    overwrite=True,
)

In [7]:
# Last step: run the trained agent for 5 evaluation episodes, rendering each one.
dqn.test(
    env,
    nb_episodes=5,
    visualize=True,
)

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 192.000, steps: 192
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0x12ca1ccf8>