COPYRIGHT © 2018 Kiran Arun <kironni@gmail.com>

### Setup

In [22]:
!pip3 install gym
!pip3 install keras-rl



# RL Challenge

In [0]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import gym

import keras
import keras.layers as layers
from rl import agents,memory,policy

### Hyperparameters

In [0]:
# Hyperparams

# capacity of the experience replay buffer, in steps
memory_limit = 400

# reward discount factor handed to the agent
gamma = 0.99

# target-network update setting, forwarded to the agent as `target_model_update`
# (NOTE: keras-rl documents values < 1 as a soft-update rate -- verify for your version)
target_model_update = 0.01

# step size used by the optimizer
learning_rate = 0.01

# how many stored steps are drawn from the buffer for each training update
batch_size = 32

In [25]:
# DO NOT EDIT

# setting environment
# Look up and instantiate the CartPole-v1 environment through gym.
env = gym.make('CartPole-v1')
# Number of actions reported by the environment's action space; reused as the
# network's output width and as the agent's `nb_actions`.
num_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### Neural Network

In [26]:
# Reset Keras' global state so that re-running this cell starts from scratch.
keras.backend.clear_session()

# create neural net
# Layers are listed in order; any additional layers belong between the two
# fixed ones below.
nn = keras.models.Sequential([
    # keep this as input layer
    # (flattens the (1,) + observation-shape input into a single vector)
    layers.Flatten(input_shape=(1,) + env.observation_space.shape),

    # keep this as output layer
    # (one output unit per available action)
    layers.Dense(num_actions),
])

# Print the layer-by-layer overview of the model.
nn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________


### Optimizer

https://keras.io/optimizers/

In [0]:
# set optimizer
# Stochastic gradient descent whose step size is the `learning_rate`
# hyperparameter; this object is later passed to `model.compile`.
optimizer = keras.optimizers.SGD(lr=learning_rate)

### Memory

In [0]:
# DO NOT EDIT

# set experience buffer
# Replay memory capped at `memory_limit` entries. window_length=1 corresponds
# to the leading (1,) axis of the network's input shape.
mem = memory.SequentialMemory(limit=memory_limit,
                                    window_length=1)

### Policy

Choose a policy from: https://github.com/keras-rl/keras-rl/blob/master/rl/policy.py

[info on choosing](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-7-action-selection-strategies-for-exploration-d3a97b7cceaf)

In [0]:
# set policy
# Action-selection (exploration) policy handed to the agent as `policy`.
# Constructed with the library's default arguments.
pol = policy.BoltzmannQPolicy()

### Agent

Choose an agent from: https://github.com/keras-rl/keras-rl/tree/master/rl/agents

See the usage examples at: https://github.com/keras-rl/keras-rl/tree/master/examples

In [0]:
# create agent
# DQN agent that ties together the Q-network, the replay memory, the
# exploration policy and the hyperparameters chosen earlier in the notebook.
model = agents.dqn.DQNAgent(
    model=nn,
    nb_actions=num_actions,
    memory=mem,
    policy=pol,
    gamma=gamma,
    batch_size=batch_size,
    # number of initial steps configured as warm-up
    nb_steps_warmup=100,
    target_model_update=target_model_update,
)

In [0]:
# DO NOT EDIT

# compile model
# Attach the optimizer chosen above to the agent; 'mae' is the extra metric
# that shows up as mean_absolute_error in the training log.
model.compile(optimizer,
            metrics=['mae'])

### Training

In [32]:
# DO NOT EDIT

# train
# Run the agent in the environment for 1000 steps. With verbose=2 the log
# contains one line per finished episode (see the output of this cell).
# The returned object is kept in `history`.
history = model.fit(env,
                  nb_steps=1000,
                  verbose=2)

Training for 1000 steps ...
  10/1000: episode: 1, duration: 0.042s, episode steps: 10, steps per second: 236, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.118 [-2.990, 1.983], loss: --, mean_absolute_error: --, mean_q: --
  35/1000: episode: 2, duration: 0.016s, episode steps: 25, steps per second: 1550, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.280 [0.000, 1.000], mean observation: 0.024 [-2.133, 3.061], loss: --, mean_absolute_error: --, mean_q: --
  66/1000: episode: 3, duration: 0.019s, episode steps: 31, steps per second: 1641, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.613 [0.000, 1.000], mean observation: -0.015 [-2.130, 1.414], loss: --, mean_absolute_error: --, mean_q: --
  80/1000: episode: 4, duration: 0.009s, episode steps: 14, steps per second: 1573, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.714 [0.000, 1.000]

 883/1000: episode: 52, duration: 0.174s, episode steps: 51, steps per second: 293, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.569 [0.000, 1.000], mean observation: -0.039 [-2.349, 1.396], loss: 0.514085, mean_absolute_error: 1.942288, mean_q: 3.229285
 904/1000: episode: 53, duration: 0.069s, episode steps: 21, steps per second: 304, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.381 [0.000, 1.000], mean observation: 0.092 [-1.017, 1.972], loss: 0.477412, mean_absolute_error: 1.983689, mean_q: 3.291059
 917/1000: episode: 54, duration: 0.043s, episode steps: 13, steps per second: 301, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.846 [0.000, 1.000], mean observation: -0.099 [-2.803, 1.738], loss: 0.424549, mean_absolute_error: 2.006532, mean_q: 3.374639
 932/1000: episode: 55, duration: 0.050s, episode steps: 15, steps per second: 301, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean a

### Testing

In [34]:
# DO NOT EDIT

# test
# Evaluate the trained agent over 50 episodes without rendering.
hist = model.test(env, nb_episodes=50, visualize=False)
# Report the average of the per-episode rewards recorded during evaluation.
print('mean:', np.mean(hist.history['episode_reward']))

Testing for 50 episodes ...
Episode 1: reward: 10.000, steps: 10
Episode 2: reward: 10.000, steps: 10
Episode 3: reward: 9.000, steps: 9
Episode 4: reward: 10.000, steps: 10
Episode 5: reward: 10.000, steps: 10
Episode 6: reward: 11.000, steps: 11
Episode 7: reward: 9.000, steps: 9
Episode 8: reward: 10.000, steps: 10
Episode 9: reward: 10.000, steps: 10
Episode 10: reward: 9.000, steps: 9
Episode 11: reward: 9.000, steps: 9
Episode 12: reward: 9.000, steps: 9
Episode 13: reward: 10.000, steps: 10
Episode 14: reward: 10.000, steps: 10
Episode 15: reward: 9.000, steps: 9
Episode 16: reward: 11.000, steps: 11
Episode 17: reward: 9.000, steps: 9
Episode 18: reward: 9.000, steps: 9
Episode 19: reward: 11.000, steps: 11
Episode 20: reward: 10.000, steps: 10
Episode 21: reward: 10.000, steps: 10
Episode 22: reward: 10.000, steps: 10
Episode 23: reward: 9.000, steps: 9
Episode 24: reward: 10.000, steps: 10
Episode 25: reward: 8.000, steps: 8
Episode 26: reward: 10.000, steps: 10
Episode 27: r