In [3]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [4]:
# importing our evnvironment from file
import balance_bot

In [3]:
# env.close()

In [5]:
ENV_NAME = 'balancebot-v0'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Environment '<class 'balance_bot.envs.balancebot_env.BalancebotEnv'>' has deprecated methods '_step' and '_reset' rather than 'step' and 'reset'. Compatibility code invoked. Set _gym_disable_underscore_compat = True to disable this behavior.[0m


In [6]:
# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                64        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [7]:
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [69]:
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=20000, visualize=True, verbose=2)

Training for 20000 steps ...
   130/20000: episode: 1, duration: 1.781s, episode steps: 130, steps per second: 73, episode reward: 4.521, mean reward: 0.035 [-0.153, 0.100], mean action: 4.354 [0.000, 8.000], mean observation: 0.193 [-4.781, 11.950], loss: 0.070520, mean_absolute_error: 12.357471, mean_q: 14.170400
   213/20000: episode: 2, duration: 1.262s, episode steps: 83, steps per second: 66, episode reward: 2.661, mean reward: 0.032 [-0.059, 0.093], mean action: 4.120 [0.000, 8.000], mean observation: -0.838 [-5.866, 4.670], loss: 0.090053, mean_absolute_error: 12.267007, mean_q: 14.045307
   351/20000: episode: 3, duration: 1.942s, episode steps: 138, steps per second: 71, episode reward: 3.037, mean reward: 0.022 [-0.156, 0.098], mean action: 3.616 [0.000, 8.000], mean observation: -0.330 [-14.230, 4.857], loss: 0.078964, mean_absolute_error: 12.291951, mean_q: 14.091850
   527/20000: episode: 4, duration: 2.384s, episode steps: 176, steps per second: 74, episode reward: -4.33

  3846/20000: episode: 30, duration: 1.336s, episode steps: 87, steps per second: 65, episode reward: -0.402, mean reward: -0.005 [-0.149, 0.091], mean action: 3.506 [0.000, 8.000], mean observation: -0.249 [-12.930, 4.920], loss: 0.084048, mean_absolute_error: 11.788267, mean_q: 13.484810
  3968/20000: episode: 31, duration: 1.752s, episode steps: 122, steps per second: 70, episode reward: 5.857, mean reward: 0.048 [-0.093, 0.099], mean action: 4.270 [0.000, 8.000], mean observation: 0.022 [-5.659, 6.220], loss: 0.070025, mean_absolute_error: 12.004807, mean_q: 13.732387
  4103/20000: episode: 32, duration: 2.033s, episode steps: 135, steps per second: 66, episode reward: 6.885, mean reward: 0.051 [-0.092, 0.099], mean action: 4.148 [0.000, 8.000], mean observation: -0.043 [-5.409, 6.680], loss: 0.070174, mean_absolute_error: 11.906301, mean_q: 13.610381
  4193/20000: episode: 33, duration: 1.387s, episode steps: 90, steps per second: 65, episode reward: 3.044, mean reward: 0.034 [-0.

  7508/20000: episode: 59, duration: 1.481s, episode steps: 92, steps per second: 62, episode reward: -1.905, mean reward: -0.021 [-0.165, 0.090], mean action: 3.315 [0.000, 8.000], mean observation: -0.822 [-14.960, 5.198], loss: 0.133133, mean_absolute_error: 13.394708, mean_q: 15.536340
  7626/20000: episode: 60, duration: 1.752s, episode steps: 118, steps per second: 67, episode reward: -4.995, mean reward: -0.042 [-0.274, 0.095], mean action: 3.051 [0.000, 8.000], mean observation: -1.983 [-23.720, 4.115], loss: 0.109172, mean_absolute_error: 13.319695, mean_q: 15.432909
  7821/20000: episode: 61, duration: 2.731s, episode steps: 195, steps per second: 71, episode reward: 1.749, mean reward: 0.009 [-0.233, 0.100], mean action: 4.472 [0.000, 8.000], mean observation: 1.538 [-4.407, 19.860], loss: 0.144706, mean_absolute_error: 13.443761, mean_q: 15.588103
  7896/20000: episode: 62, duration: 1.185s, episode steps: 75, steps per second: 63, episode reward: -1.229, mean reward: -0.01

 10902/20000: episode: 88, duration: 1.685s, episode steps: 120, steps per second: 71, episode reward: 1.219, mean reward: 0.010 [-0.158, 0.100], mean action: 3.517 [0.000, 8.000], mean observation: -0.248 [-14.140, 5.690], loss: 0.076722, mean_absolute_error: 13.215894, mean_q: 15.224345
 11005/20000: episode: 89, duration: 1.494s, episode steps: 103, steps per second: 69, episode reward: -1.678, mean reward: -0.016 [-0.180, 0.099], mean action: 3.359 [0.000, 8.000], mean observation: -1.038 [-17.680, 4.336], loss: 0.104927, mean_absolute_error: 13.370761, mean_q: 15.398659
 11156/20000: episode: 90, duration: 2.078s, episode steps: 151, steps per second: 73, episode reward: 6.203, mean reward: 0.041 [-0.117, 0.100], mean action: 4.245 [0.000, 8.000], mean observation: 0.108 [-5.077, 11.420], loss: 0.101644, mean_absolute_error: 13.391790, mean_q: 15.417027
 11228/20000: episode: 91, duration: 1.153s, episode steps: 72, steps per second: 62, episode reward: 1.594, mean reward: 0.022 [

 14256/20000: episode: 117, duration: 1.744s, episode steps: 117, steps per second: 67, episode reward: 1.402, mean reward: 0.012 [-0.146, 0.098], mean action: 4.308 [0.000, 8.000], mean observation: 0.834 [-5.017, 13.790], loss: 0.152751, mean_absolute_error: 12.993554, mean_q: 15.059550
 14383/20000: episode: 118, duration: 1.765s, episode steps: 127, steps per second: 72, episode reward: 7.484, mean reward: 0.059 [-0.052, 0.098], mean action: 4.008 [0.000, 8.000], mean observation: 0.863 [-1.900, 5.930], loss: 0.150656, mean_absolute_error: 13.120815, mean_q: 15.199865
 14537/20000: episode: 119, duration: 2.182s, episode steps: 154, steps per second: 71, episode reward: 1.924, mean reward: 0.012 [-0.167, 0.100], mean action: 4.338 [0.000, 8.000], mean observation: 1.108 [-5.432, 16.900], loss: 0.124025, mean_absolute_error: 12.879298, mean_q: 14.932262
 14646/20000: episode: 120, duration: 1.583s, episode steps: 109, steps per second: 69, episode reward: -0.745, mean reward: -0.007

 19726/20000: episode: 145, duration: 3.103s, episode steps: 199, steps per second: 64, episode reward: 6.570, mean reward: 0.033 [-0.114, 0.098], mean action: 4.126 [0.000, 8.000], mean observation: 1.030 [-6.218, 10.950], loss: 0.266119, mean_absolute_error: 20.520422, mean_q: 23.672594
 19872/20000: episode: 146, duration: 2.235s, episode steps: 146, steps per second: 65, episode reward: 6.152, mean reward: 0.042 [-0.127, 0.100], mean action: 4.315 [0.000, 8.000], mean observation: 0.417 [-5.295, 9.090], loss: 0.262292, mean_absolute_error: 20.484308, mean_q: 23.605713
done, took 292.503 seconds


<keras.callbacks.History at 0x7ff5336a03c8>

In [11]:
# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights_solved{}.h5f'.format(ENV_NAME , 'stable3.0_stand'), overwrite=True)

In [71]:
# Finally, evaluate our algorithm for 5 episodes.
for _ in range(5):
#     env.reset()
#     env.step(env.action_space.sample())
    dqn.test(env, nb_episodes=1, visualize=True)

Testing for 1 episodes ...
Episode 1: reward: 142.531, steps: 1501
Testing for 1 episodes ...
Episode 1: reward: 142.531, steps: 1501
Testing for 1 episodes ...
Episode 1: reward: 142.531, steps: 1501
Testing for 1 episodes ...
Episode 1: reward: 142.531, steps: 1501
Testing for 1 episodes ...
Episode 1: reward: 142.531, steps: 1501


In [8]:
dqn.load_weights('dqn_balancebot-v0_weights_solvedstable2.0_stand.h5f')


In [None]:
dqn.test(env, nb_episodes=20, visualize=True)


Testing for 20 episodes ...
Episode 1: reward: 138.638, steps: 1501
Episode 2: reward: 136.682, steps: 1501
Episode 3: reward: -270.090, steps: 697
Episode 4: reward: -22.058, steps: 954
Episode 5: reward: 13.751, steps: 343
Episode 6: reward: 114.217, steps: 1501
Episode 7: reward: 37.194, steps: 585
Episode 8: reward: 74.122, steps: 1501
Episode 9: reward: 138.638, steps: 1501
Episode 10: reward: 138.638, steps: 1501
Episode 11: reward: 138.638, steps: 1501
Episode 12: reward: 138.638, steps: 1501
Episode 13: reward: 138.638, steps: 1501
Episode 14: reward: 138.638, steps: 1501
Episode 15: reward: 138.638, steps: 1501
