COPYRIGHT © 2018 Kiran Arun <kironni@gmail.com>

### Setup

In [1]:
!pip3 install gym
!pip3 install keras-rl



# RL Challenge

In [2]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import gym

import keras
import keras.layers as layers
from rl import agents,memory,policy

Using TensorFlow backend.


### Hyperparameters

In [0]:
# Tunable hyperparameters (every later cell reads these names)

# capacity of the experience buffer, counted in environment steps
memory_limit = 400

# discount factor applied to future rewards
gamma = 0.99

# how strongly the target network is moved towards the online network on each
# update (presumably a soft-update rate because it is below 1 -- confirm
# against the keras-rl DQNAgent documentation)
target_model_update = 0.01

# step size used by the optimizer
learning_rate = 0.01

# number of stored steps sampled from the buffer for each training update
batch_size = 32

In [4]:
# DO NOT EDIT

# setting environment
# Creates the Gym 'CartPole-v1' environment; the same `env` object is used
# later for both training and testing.
env = gym.make('CartPole-v1')
# Size of the action space; it sets the width of the network's output layer
# and is passed to the agent as nb_actions.
num_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### Neural Network

In [5]:
# reset Keras' global state before (re)building the network
keras.backend.clear_session()

# create neural net: layers are listed in order, from input to output
nn = keras.models.Sequential([
    # keep this as input layer
    # (flattens the (1,) + observation-shape input into a single vector)
    layers.Flatten(input_shape=(1,) + env.observation_space.shape),

    # keep this as output layer (one output unit per action)
    layers.Dense(num_actions),
])

# print the layer-by-layer summary of the network
nn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________


### Optimizer

Choose an optimizer from: https://keras.io/optimizers/

In [0]:
# set optimizer: plain SGD using the learning rate from the hyperparameter cell
optimizer = keras.optimizers.SGD(
    lr=learning_rate,
)

### Memory

In [0]:
# DO NOT EDIT

# set experience buffer
# The buffer is capped at `memory_limit` entries (see the hyperparameter cell).
# window_length=1 corresponds to the leading 1 in the network's
# input_shape=(1,) + observation shape.
mem = memory.SequentialMemory(limit=memory_limit,
                                    window_length=1)

### Policy

Choose a policy from: https://github.com/keras-rl/keras-rl/blob/master/rl/policy.py

[Background on choosing an action-selection strategy](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-7-action-selection-strategies-for-exploration-d3a97b7cceaf)

In [0]:
# set policy
# The policy object is handed to the agent below (policy=pol).
# It is constructed with the library's default arguments.
pol = policy.BoltzmannQPolicy()

### Agent

Choose an agent from: https://github.com/keras-rl/keras-rl/tree/master/rl/agents

and see the examples for how to use it: https://github.com/keras-rl/keras-rl/tree/master/examples

In [0]:
# create agent
# NOTE: despite its name, `model` is the DQN agent; the Keras network is `nn`.
model = agents.dqn.DQNAgent(
    # components built in the cells above
    model=nn,
    policy=pol,
    memory=mem,
    nb_actions=num_actions,
    # values from the hyperparameter cell
    gamma=gamma,
    batch_size=batch_size,
    target_model_update=target_model_update,
    # warm-up length in steps (presumably steps collected before learning
    # starts -- confirm against the keras-rl DQNAgent documentation)
    nb_steps_warmup=100,
)

In [0]:
# DO NOT EDIT

# compile model
# Compiles the agent with the optimizer chosen above; the 'mae' metric is what
# shows up as mean_absolute_error in the training log.
model.compile(optimizer,
            metrics=['mae'])

### Training

In [11]:
# DO NOT EDIT

# train
# Trains the agent on `env` for 1000 environment steps. With verbose=2 the log
# contains one line per finished episode. The return value is kept in
# `history` (presumably a Keras-style History object -- TODO confirm).
history = model.fit(env,
                  nb_steps=1000,
                  verbose=2)

Training for 1000 steps ...
  56/1000: episode: 1, duration: 0.075s, episode steps: 56, steps per second: 751, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.554 [0.000, 1.000], mean observation: -0.003 [-2.012, 1.342], loss: --, mean_absolute_error: --, mean_q: --
 104/1000: episode: 2, duration: 0.283s, episode steps: 48, steps per second: 170, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.604 [0.000, 1.000], mean observation: 0.145 [-2.212, 2.289], loss: 0.511626, mean_absolute_error: 0.709396, mean_q: -0.128323
 116/1000: episode: 3, duration: 0.042s, episode steps: 12, steps per second: 286, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.104 [-1.855, 1.148], loss: 0.527065, mean_absolute_error: 0.690373, mean_q: -0.051019
 162/1000: episode: 4, duration: 0.154s, episode steps: 46, steps per second: 299, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000]

done, took 3.560 seconds


### Testing

In [12]:
# DO NOT EDIT

# test
# Runs the trained agent for 50 test episodes without rendering.
hist = model.test(env, nb_episodes=50, visualize=False)
# Report the mean episode reward across those test episodes.
print('mean:', np.mean(hist.history['episode_reward']))

Testing for 50 episodes ...
Episode 1: reward: 11.000, steps: 11
Episode 2: reward: 9.000, steps: 9
Episode 3: reward: 10.000, steps: 10
Episode 4: reward: 9.000, steps: 9
Episode 5: reward: 10.000, steps: 10
Episode 6: reward: 10.000, steps: 10
Episode 7: reward: 10.000, steps: 10
Episode 8: reward: 9.000, steps: 9
Episode 9: reward: 10.000, steps: 10
Episode 10: reward: 11.000, steps: 11
Episode 11: reward: 9.000, steps: 9
Episode 12: reward: 10.000, steps: 10
Episode 13: reward: 9.000, steps: 9
Episode 14: reward: 10.000, steps: 10
Episode 15: reward: 9.000, steps: 9
Episode 16: reward: 8.000, steps: 8
Episode 17: reward: 9.000, steps: 9
Episode 18: reward: 10.000, steps: 10
Episode 19: reward: 9.000, steps: 9
Episode 20: reward: 10.000, steps: 10
Episode 21: reward: 10.000, steps: 10
Episode 22: reward: 9.000, steps: 9
Episode 23: reward: 10.000, steps: 10
Episode 24: reward: 9.000, steps: 9
Episode 25: reward: 10.000, steps: 10
Episode 26: reward: 9.000, steps: 9
Episode 27: rewar