COPYRIGHT © 2018 Kiran Arun <kironni@gmail.com>

### Setup

In [13]:
# Install the third-party packages this notebook imports: `gym` and keras-rl
# (the latter is imported as `rl`).
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# that differ from the ones that produced the saved outputs -- consider pinning.
!pip3 install gym
!pip3 install keras-rl



# RL Challenge

In [0]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import gym

import keras
import keras.layers as layers
from rl import agents,memory,policy

### Hyperparameters

In [0]:
# Hyperparams
# Every tunable value of the experiment lives in this cell; the cells below
# only read these names.

# capacity of the experience (replay) buffer, counted in environment steps
memory_limit = 400

# discount factor applied to future rewards
gamma = 0.99

# target-network update setting handed to the agent
# (keras-rl treats a value below 1 as a soft-update coefficient)
target_model_update = 0.01

# step size used by the optimizer
learning_rate = 0.01

# how many stored steps are drawn from the buffer for each training update
batch_size = 32

In [16]:
# DO NOT EDIT

# setting environment
# Build the CartPole-v1 task from the gym registry; the training and testing
# cells further down both run against this `env` object.
env = gym.make('CartPole-v1')
# Number of actions the environment offers; reused as the width of the
# network's output layer and as `nb_actions` for the agent.
num_actions = env.action_space.n

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


### Neural Network

In [17]:
# reset Keras' global state before (re)building the network
keras.backend.clear_session()

# create neural net: the whole stack is declared in one place, top to bottom
model = keras.models.Sequential([
    # keep this as input layer
    # (flattens a window of one observation into a flat feature vector;
    #  the leading 1 presumably mirrors window_length=1 of the replay memory)
    layers.Flatten(input_shape=(1,) + env.observation_space.shape),

    # hidden layer(s) -- this is the part to experiment with
    layers.Dense(4, activation='relu'),

    # keep this as output layer
    # (one linear output per available action)
    layers.Dense(num_actions),
])

# print the layer table and parameter counts
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 10        
Total params: 30
Trainable params: 30
Non-trainable params: 0
_________________________________________________________________


### Optimizer

https://keras.io/optimizers/

In [0]:
# set optimizer
# Plain stochastic gradient descent using the step size from the
# hyperparameter cell; this object is handed to `model.compile(...)` later on.
optimizer = keras.optimizers.SGD(lr=learning_rate)

### Memory

In [0]:
# DO NOT EDIT

# set experience buffer
# The class is imported directly rather than reached through the `memory`
# module name: this cell rebinds `memory` to the buffer instance, so looking up
# `memory.SequentialMemory` on a second run of the cell would raise
# AttributeError. With the local import the cell can be re-run safely, and
# `memory` still ends up being the buffer object the agent cell expects.
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=memory_limit,
                          window_length=1)

### Policy

choose from: https://github.com/keras-rl/keras-rl/blob/master/rl/policy.py

[info on choosing](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-7-action-selection-strategies-for-exploration-d3a97b7cceaf)

In [0]:
# set policy
# The class is imported directly rather than reached through the `policy`
# module name: this cell rebinds `policy` to the policy instance, so looking up
# `policy.BoltzmannQPolicy` on a second run of the cell would raise
# AttributeError. With the local import the cell can be re-run safely, and
# `policy` still ends up being the policy object the agent cell expects.
# To try a different strategy, import another class from `rl.policy` here.
from rl.policy import BoltzmannQPolicy

policy = BoltzmannQPolicy()

### Agent

choose from: https://github.com/keras-rl/keras-rl/tree/master/rl/agents

and use these examples as a reference for how to configure it: https://github.com/keras-rl/keras-rl/tree/master/examples

In [0]:
# create agent
# This cell rebinds `model` from the Keras network to the agent that wraps it
# (the compile / train / test cells below call methods on `model`). On a
# second run `model` is therefore already a DQNAgent; in that case reuse the
# network it stores instead of wrapping an agent inside another agent.
q_network = model.model if isinstance(model, agents.dqn.DQNAgent) else model

model = agents.dqn.DQNAgent(model=q_network,
                            nb_actions=num_actions,
                            memory=memory,
                            gamma=gamma,
                            batch_size=batch_size,
                            # steps collected before training updates start
                            nb_steps_warmup=100,
                            target_model_update=target_model_update,
                            policy=policy)

In [0]:
# DO NOT EDIT

# compile model
# `model` is the DQN agent at this point (the agent cell rebinds the name),
# so this compiles the agent with the chosen optimizer and tracks mean
# absolute error as an extra metric during training.
model.compile(optimizer,
            metrics=['mae'])

In [23]:
# DO NOT EDIT

# train
# Run the agent against `env` for a budget of 1000 environment steps.
# verbose=2 produces the one-line-per-episode log shown in the cell output.
# The returned object is kept in `history` for later inspection.
history = model.fit(env,
                  nb_steps=1000,
                  verbose=2)

Training for 1000 steps ...
  13/1000: episode: 1, duration: 0.053s, episode steps: 13, steps per second: 243, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.086 [-1.949, 1.205], loss: --, mean_absolute_error: --, mean_q: --
  33/1000: episode: 2, duration: 0.013s, episode steps: 20, steps per second: 1485, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.124 [-0.937, 0.565], loss: --, mean_absolute_error: --, mean_q: --
  58/1000: episode: 3, duration: 0.017s, episode steps: 25, steps per second: 1501, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.105 [-1.379, 0.430], loss: --, mean_absolute_error: --, mean_q: --
 111/1000: episode: 4, duration: 0.331s, episode steps: 53, steps per second: 160, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000]

done, took 3.561 seconds


In [24]:
# DO NOT EDIT

# test
# Evaluate the trained agent for 25 episodes with on-screen rendering disabled.
hist = model.test(env, nb_episodes=25, visualize=False)
# Score for the challenge: average total reward over the evaluation episodes.
print('mean:', np.mean(hist.history['episode_reward']))

Testing for 25 episodes ...
Episode 1: reward: 39.000, steps: 39
Episode 2: reward: 30.000, steps: 30
Episode 3: reward: 36.000, steps: 36
Episode 4: reward: 29.000, steps: 29
Episode 5: reward: 34.000, steps: 34
Episode 6: reward: 35.000, steps: 35
Episode 7: reward: 11.000, steps: 11
Episode 8: reward: 32.000, steps: 32
Episode 9: reward: 48.000, steps: 48
Episode 10: reward: 14.000, steps: 14
Episode 11: reward: 37.000, steps: 37
Episode 12: reward: 12.000, steps: 12
Episode 13: reward: 33.000, steps: 33
Episode 14: reward: 35.000, steps: 35
Episode 15: reward: 14.000, steps: 14
Episode 16: reward: 31.000, steps: 31
Episode 17: reward: 12.000, steps: 12
Episode 18: reward: 34.000, steps: 34
Episode 19: reward: 40.000, steps: 40
Episode 20: reward: 32.000, steps: 32
Episode 21: reward: 28.000, steps: 28
Episode 22: reward: 10.000, steps: 10
Episode 23: reward: 35.000, steps: 35
Episode 24: reward: 13.000, steps: 13
Episode 25: reward: 32.000, steps: 32
mean: 28.24
