COPYRIGHT © 2018 Kiran Arun <kironni@gmail.com>

### Setup

In [1]:
!pip3 install gym
!pip3 install keras-rl

Collecting keras-rl
  Downloading https://files.pythonhosted.org/packages/fc/a3/d39bd04750e9acf2205827e9331a5a01e45a618ad0fd00a0210d70b68025/keras-rl-0.4.0.tar.gz
Building wheels for collected packages: keras-rl
  Running setup.py bdist_wheel for keras-rl ... [?25l- \ done
[?25h  Stored in directory: /content/.cache/pip/wheels/35/e9/0e/2eefa4b6383571cbeac2e615ff6d3cdffe32d0e4268e19d17d
Successfully built keras-rl
Installing collected packages: keras-rl
Successfully installed keras-rl-0.4.0


# RL Challenge

In [2]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import gym

import keras
import keras.layers as layers
import rl.memory
import rl.policy
from rl import agents,memory,policy

Using TensorFlow backend.


### Hyperparameters

In [0]:
# Hyperparams -- tunable values read by the memory, optimizer and agent cells

# --- experience replay ---
# capacity of the experience buffer, in steps
memory_limit = 400
# how many stored steps are sampled from the buffer for each training batch
batch_size = 32

# --- Q-learning ---
# discount applied to future reward
gamma = 0.99
# how strongly the target graph is updated
# (presumably a soft-update rate because it is below 1 -- confirm in the
# keras-rl DQNAgent docs)
target_model_update = 0.01

# --- optimizer ---
# learning rate handed to the optimizer
learning_rate = 0.01

In [4]:
# DO NOT EDIT

# Build the Gym environment the agent is trained and tested on.
# NOTE(review): no random seed is set in the visible cells, so reward figures
# will differ from run to run.
env = gym.make('CartPole-v1')
# Number of actions reported by the environment's action space; used below as
# the width of the network's output layer and as the agent's nb_actions.
num_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### Neural Network

In [5]:
# start from a clean Keras graph so layer names restart at *_1 on every run
keras.backend.clear_session()

# create neural net
# The layers are handed to Sequential as one list; hidden layers belong
# between the two entries below.
model = keras.models.Sequential([
    # keep this as input layer
    # (the leading 1 presumably matches window_length=1 of the replay memory)
    layers.Flatten(input_shape=(1,) + env.observation_space.shape),

    # keep this as output layer: one unit per action, no activation passed
    layers.Dense(num_actions),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________


### Optimizer

choose from: https://keras.io/optimizers/

In [0]:
# set optimizer
# Plain SGD whose step size comes from the `learning_rate` hyperparameter.
# NOTE(review): `lr` is the keyword this notebook ran with; newer Keras
# releases may spell it `learning_rate` -- confirm before upgrading.
optimizer = keras.optimizers.SGD(
    lr=learning_rate,
)

### Memory

In [0]:
# DO NOT EDIT

# set experience buffer
# The class is reached through the `rl` package rather than the bare `memory`
# name: this cell binds its result to `memory`, which shadows the module
# imported at the top, so `memory.SequentialMemory` would raise AttributeError
# the second time the cell is run (e.g. after changing `memory_limit`).
# The buffer keeps up to `memory_limit` steps, with window_length=1.
memory = rl.memory.SequentialMemory(limit=memory_limit,
                                    window_length=1)

### Policy

choose from: https://github.com/keras-rl/keras-rl/blob/master/rl/policy.py

[info on choosing](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-7-action-selection-strategies-for-exploration-d3a97b7cceaf)

In [0]:
# set policy
# The class is reached through the `rl` package rather than the bare `policy`
# name: this cell binds its result to `policy`, which shadows the module
# imported at the top, so `policy.BoltzmannQPolicy` would raise AttributeError
# the second time the cell is run. To try another policy, swap the class name
# here, still reached through `rl.policy`.
policy = rl.policy.BoltzmannQPolicy()

### Agent

choose from: https://github.com/keras-rl/keras-rl/tree/master/rl/agents

and see the usage examples: https://github.com/keras-rl/keras-rl/tree/master/examples

In [0]:
# create agent
# NOTE: `model` goes in as the Keras network and comes out rebound to the
# DQNAgent that wraps it; the compile/train/test cells below call methods on
# that agent. Re-run the network cell before re-running this one, otherwise
# the agent itself would be passed in as `model`.
model = agents.dqn.DQNAgent(
    # what the agent acts with
    model=model,
    policy=policy,
    nb_actions=num_actions,
    # experience replay
    memory=memory,
    batch_size=batch_size,
    # presumably the number of steps taken before learning updates begin --
    # confirm in the keras-rl DQNAgent docs
    nb_steps_warmup=100,
    # Q-learning settings
    gamma=gamma,
    target_model_update=target_model_update,
)

In [0]:
# DO NOT EDIT

# compile model
# `model` is the DQNAgent created in the agent cell (not the bare Keras
# network), so this is the agent's compile(). 'mae' is tracked as an extra
# metric and appears as mean_absolute_error in the training log.
model.compile(optimizer,
            metrics=['mae'])

### Training

In [13]:
# DO NOT EDIT

# train
# Runs the agent in `env` for 1000 environment steps in total (steps, not
# episodes); with verbose=2 the log shows one line per finished episode.
# The returned object is kept in `history` but is not used again in the
# visible cells.
history = model.fit(env,
                  nb_steps=1000,
                  verbose=2)

Training for 1000 steps ...
  12/1000: episode: 1, duration: 0.045s, episode steps: 12, steps per second: 269, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.124 [-0.766, 1.459], loss: --, mean_absolute_error: --, mean_q: --
  32/1000: episode: 2, duration: 0.013s, episode steps: 20, steps per second: 1498, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.066 [-0.784, 1.224], loss: --, mean_absolute_error: --, mean_q: --
  44/1000: episode: 3, duration: 0.008s, episode steps: 12, steps per second: 1430, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.114 [-0.955, 1.634], loss: --, mean_absolute_error: --, mean_q: --
  61/1000: episode: 4, duration: 0.013s, episode steps: 17, steps per second: 1274, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.412 [0.000, 1.000], 

 821/1000: episode: 46, duration: 0.061s, episode steps: 13, steps per second: 214, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.154 [0.000, 1.000], mean observation: 0.096 [-1.768, 2.823], loss: 0.889339, mean_absolute_error: 2.162183, mean_q: 3.790550
 832/1000: episode: 47, duration: 0.045s, episode steps: 11, steps per second: 242, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.133 [-1.143, 1.925], loss: 0.741677, mean_absolute_error: 2.164570, mean_q: 3.833999
 855/1000: episode: 48, duration: 0.100s, episode steps: 23, steps per second: 229, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.304 [0.000, 1.000], mean observation: 0.039 [-1.916, 2.807], loss: 0.877889, mean_absolute_error: 2.238765, mean_q: 3.939010
 871/1000: episode: 49, duration: 0.073s, episode steps: 16, steps per second: 220, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean act

### Testing

In [14]:
# DO NOT EDIT

# test
# Evaluate the trained agent for 50 episodes without rendering, then report
# the average total reward per episode as the score.
hist = model.test(env, nb_episodes=50, visualize=False)
print('mean:', np.mean(hist.history['episode_reward']))

Testing for 50 episodes ...
Episode 1: reward: 10.000, steps: 10
Episode 2: reward: 8.000, steps: 8
Episode 3: reward: 9.000, steps: 9
Episode 4: reward: 8.000, steps: 8
Episode 5: reward: 9.000, steps: 9
Episode 6: reward: 10.000, steps: 10
Episode 7: reward: 8.000, steps: 8
Episode 8: reward: 10.000, steps: 10
Episode 9: reward: 9.000, steps: 9
Episode 10: reward: 8.000, steps: 8
Episode 11: reward: 9.000, steps: 9
Episode 12: reward: 8.000, steps: 8
Episode 13: reward: 10.000, steps: 10
Episode 14: reward: 10.000, steps: 10
Episode 15: reward: 9.000, steps: 9
Episode 16: reward: 10.000, steps: 10
Episode 17: reward: 10.000, steps: 10
Episode 18: reward: 10.000, steps: 10
Episode 19: reward: 8.000, steps: 8
Episode 20: reward: 10.000, steps: 10
Episode 21: reward: 9.000, steps: 9
Episode 22: reward: 9.000, steps: 9
Episode 23: reward: 9.000, steps: 9
Episode 24: reward: 9.000, steps: 9
Episode 25: reward: 9.000, steps: 9
Episode 26: reward: 9.000, steps: 9
Episode 27: reward: 10.000,