# Imports and setup

In [1]:
from collections import deque, namedtuple
import random

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [2]:
SEED = 42

# Seed every random number generator the notebook draws from, so that a
# "Restart & Run All" replays the same run.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = gym.make('CartPole-v1')
# env.action_space.sample() is used for exploration and draws from the action
# space's own generator, so seed that one explicitly as well
# (NOTE(review): env.seed() is not expected to cover it -- confirm for the installed gym version).
env.action_space.seed(SEED)
env.seed(SEED)

[42]

# CartPole environment description

In [3]:
# Show the two spaces that define how an agent interacts with the environment.
print(f'Action space: {env.action_space}')
print(f'Observation (or state) space: {env.observation_space}')

Action space: Discrete(2)
Observation (or state) space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)


According to [OpenAI gym wiki](https://github.com/openai/gym/wiki/CartPole-v0):

There are 2 actions we can take:
- Move to the left (denoted as `0`).
- Move to the right (denoted as `1`).

An observation is an array of 4 floating-point numbers. They are, respectively:
- Cart position.
- Cart velocity.
- Pole angle.
- Pole velocity at tip.

Common APIs of the game:
- env.seed(seed: int) -> list of the seeds used. Set the random seed.
- env.reset() -> the initial observation. Reset the game; used to start a new game after a game-over.
- env.action_space.sample() -> int (an action id). Get a random valid action.
- env.step(action: int) -> (new observation, reward, done, info). Simulate this action.

# Hyper-parameters

In [4]:
# --- Q-value targets ---
GAMMA = 0.8  # discount factor applied to future reward
TARGET_LEARNING_RATE = 0.01  # weight given to the new estimate when a Q-value is moved towards its target

# --- Experience replay ---
MIN_MEMORY_SIZE = 1_000  # experiences gathered before the training rounds start
MAX_MEMORY_SIZE = 5_000  # capacity of the experience memory
SIMULATIONS_PER_TRAINING = 4  # experiences simulated at each step before a minibatch is trained

# --- Optimisation ---
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
TRAIN_ROUNDS = 200  # a round means a game

def get_epsilon(curr_round: int, train_rounds: int):
    """Return the exploration rate (epsilon) to use in round ``curr_round``.

    A higher epsilon means more exploration. The value falls linearly with the
    share of rounds already played and reaches 0 when ``curr_round`` equals
    ``train_rounds``.
    """
    share_played = curr_round / train_rounds
    return 1 - share_played

# Class and function definitions

In [5]:
# One simulated step: the observation the action was chosen from, the action,
# the reward received, the resulting observation and the game-over flag.
Experience = namedtuple(
    'Experience',
    ['observation', 'action', 'reward', 'new_observation', 'done'],
)


class ExperienceMemory:
    """Experience-replay buffer that keeps at most ``memory_capacity`` experiences.

    Once the buffer is full, every newly pushed experience evicts the oldest one.
    """

    def __init__(self, memory_capacity):
        # A bounded deque discards entries from the far end when it overflows.
        self.memory = deque(maxlen=memory_capacity)

    def __len__(self):
        """Return how many experiences are currently stored."""
        return len(self.memory)

    def push(self, e: Experience):
        """Store ``e`` as the most recent experience."""
        self.memory.append(e)

    def sample(self, size):
        """Return a list of ``size`` stored experiences drawn at random without replacement."""
        return random.sample(self.memory, k=size)

In [6]:
class Model():
    """Dueling Q-network: Q(s, a) = V(s) + (A(s, a) - mean over a of A(s, a)).

    Two separate stacks of dense layers read the observation. One ends in a
    single state-value unit ``V``, the other in one advantage unit per action.
    """

    def __init__(self, observation_shape, action_shape, lr):
        """Build and compile the network.

        Args:
            observation_shape: shape of a single observation (no batch axis).
            action_shape: number of actions, i.e. the width of the output.
            lr: learning rate handed to the Adam optimizer.
        """
        inputs = layers.Input(shape=observation_shape)

        # State-value stream -> [batch size, 1]
        h1 = layers.Dense(20, activation='relu')(inputs)
        h1 = layers.Dense(10, activation='relu')(h1)
        V = layers.Dense(1)(h1)

        # Advantage stream -> [batch size, action size]
        h2 = layers.Dense(20, activation='relu')(inputs)
        h2 = layers.Dense(10, activation='relu')(h2)
        advantage = layers.Dense(action_shape)(h2)
        # keepdims=True keeps the mean as [batch size, 1], so the subtraction
        # broadcasts over the action axis explicitly instead of depending on
        # how the merge layer aligns tensors of different rank.
        mean_adv = layers.Lambda(
            lambda x: tf.keras.backend.mean(x, axis=1, keepdims=True)
        )(advantage)

        outputs = layers.Add()([V, layers.Subtract()([advantage, mean_adv])])

        self.model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

        # No 'accuracy' metric: the outputs are real-valued action qualities
        # (a regression target), so a classification accuracy carries no meaning.
        self.model.compile(
            loss=tf.keras.losses.Huber(),
            optimizer=tf.optimizers.Adam(learning_rate=lr),
        )

    def set_weights(self, weights):
        """Overwrite the network weights with ``weights``."""
        self.model.set_weights(weights)

    def get_weights(self):
        """Return the current network weights."""
        return self.model.get_weights()

    def forward(self, batch_observations):
        """Return the action qualities for a batch of observations.

        Args:
            batch_observations: array of shape [batch size, observation shape].

        Returns:
            A fresh, writable NumPy array of shape [batch size, action size].
        """
        # predict() is built for large datasets and is costly when called once
        # per environment step; predict_on_batch() evaluates the single batch
        # directly. np.array() guarantees the caller gets its own NumPy copy.
        return np.array(self.model.predict_on_batch(batch_observations))

    def train(self, batch_x, batch_y):
        """Run one gradient update on a single batch and return the loss.

        Args:
            batch_x: observations, shape [batch size, observation shape].
            batch_y: desired action qualities, shape [batch size, action size].
        """
        return self.model.train_on_batch(batch_x, batch_y)

In [7]:
def simulate_experience(env, model, eps, curr_observation, render=True):
    """Play one epsilon-greedy step in ``env`` and return it as an ``Experience``.

    Args:
        env: environment providing ``action_space.sample()``, ``step(action)``
            and ``render()``.
        model: object whose ``forward(batch)`` returns the action scores for a
            batch of observations.
        eps: probability of taking a random action instead of the best-scoring one.
        curr_observation: observation the action is chosen from.
        render: whether to draw the environment after the step. Defaults to
            ``True`` (the previous behaviour); pass ``False`` to skip drawing,
            e.g. when no display is available.

    Returns:
        ``Experience(curr_observation, action, reward, new_observation, done)``.
    """
    # select the next action (from the current state)
    if random.random() < eps:
        # explore: use a random action
        action = env.action_space.sample()
    else:
        # exploit: use the best action (according to the current model)
        action_scores = model.forward(np.expand_dims(curr_observation, 0)).flatten()
        # store a plain Python int rather than a NumPy integer scalar
        action = int(np.argmax(action_scores))

    # simulate this action; the returned info is not needed here
    new_observation, reward, done, _info = env.step(action)
    if render:
        env.render()
    return Experience(curr_observation, action, reward, new_observation, done)

In [8]:
def train(model, minibatch, gamma=None, target_learning_rate=None):
    """Run one Q-learning update of ``model`` on ``minibatch``.

    For every experience, the quality of the action that was taken is moved a
    fraction ``target_learning_rate`` of the way towards
    ``reward + gamma * max(new qualities)`` (just ``reward`` when the
    experience ended the game). The qualities of the other actions keep the
    model's current predictions, so they add nothing to the loss.

    Args:
        model: object with ``forward(batch_observations)`` returning an array of
            shape [batch size, action size] and ``train(batch_x, batch_y)``.
        minibatch: sequence of ``Experience`` tuples.
        gamma: discount factor; defaults to the module-level ``GAMMA``.
        target_learning_rate: blending weight; defaults to the module-level
            ``TARGET_LEARNING_RATE``.
    """
    if gamma is None:
        gamma = GAMMA
    if target_learning_rate is None:
        target_learning_rate = TARGET_LEARNING_RATE

    # shapes: [batch size, observation shape]
    batch_x = np.array([e.observation for e in minibatch])
    batch_new_observations = np.array([e.new_observation for e in minibatch])
    # shapes: [batch size]
    actions = np.array([e.action for e in minibatch], dtype=np.intp)
    rewards = np.array([e.reward for e in minibatch])
    dones = np.array([e.done for e in minibatch], dtype=bool)

    # shapes: [batch size, action size]
    batch_curr_qualities = model.forward(batch_x)
    batch_new_qualities = model.forward(batch_new_observations)

    # No future reward is added for experiences that ended the game.
    updated_qualities = np.where(
        dones,
        rewards,
        rewards + gamma * np.max(batch_new_qualities, axis=1),
    )

    # Work on a copy so the array returned by model.forward is left untouched.
    batch_y = np.array(batch_curr_qualities, copy=True)
    rows = np.arange(len(minibatch))
    batch_y[rows, actions] = (
        (1 - target_learning_rate) * batch_y[rows, actions]
        + target_learning_rate * updated_qualities
    )

    model.train(batch_x, batch_y)

# Model training

In [9]:
# Replay buffer that the simulation loop fills and the training step samples from.
experience_memory = ExperienceMemory(memory_capacity=MAX_MEMORY_SIZE)

# Network sized from the environment: the input takes one observation, the
# output holds one quality score per discrete action.
model = Model(
    observation_shape=env.observation_space.shape,
    action_shape=env.action_space.n,
    lr=LEARNING_RATE,
)

In [10]:
%%time

train_step_count = 0  # number of minibatch updates performed so far
eps = 1
done = True  # forces an env.reset() on the first warm-up iteration

try:
    # Fill the experience_memory with the minimum number of experiences.
    # eps is 1 here, so every action is random and the model is never consulted.
    while len(experience_memory) < MIN_MEMORY_SIZE:
        if done:
            observation = env.reset()
            done = False

        e = simulate_experience(env, model, eps, observation)
        experience_memory.push(e)

        observation = e.new_observation
        done = e.done

    # train the deep-q-model
    for curr_round in range(1, TRAIN_ROUNDS + 1):
        observation = env.reset()
        eps = get_epsilon(curr_round, TRAIN_ROUNDS)
        cumulative_reward = 0
        done = False

        while not done:
            # simulation to get new experiences
            for sim_id in range(SIMULATIONS_PER_TRAINING):
                e = simulate_experience(env, model, eps, observation)
                experience_memory.push(e)

                observation = e.new_observation
                cumulative_reward += e.reward
                done = e.done

                # stop simulating as soon as the game is over
                if done:
                    break

            # train the main model
            if len(experience_memory) >= MIN_MEMORY_SIZE:
                train_step_count += 1
                minibatch = experience_memory.sample(BATCH_SIZE)
                train(model, minibatch)

        print(f'round: {curr_round:3}, reward: {int(cumulative_reward):3}, experience size: {len(experience_memory):5}')
finally:
    # Close the environment even when the run is interrupted or raises.
    env.close()

round:   1, reward:  12, experience size:  1012
round:   2, reward:  24, experience size:  1036
round:   3, reward:   9, experience size:  1045
round:   4, reward:  28, experience size:  1073
round:   5, reward:  12, experience size:  1085
round:   6, reward:  34, experience size:  1119
round:   7, reward:  16, experience size:  1135
round:   8, reward:  17, experience size:  1152
round:   9, reward:  28, experience size:  1180
round:  10, reward:  31, experience size:  1211
round:  11, reward:  63, experience size:  1274
round:  12, reward:  13, experience size:  1287
round:  13, reward:  17, experience size:  1304
round:  14, reward:  58, experience size:  1362
round:  15, reward:  11, experience size:  1373
round:  16, reward:  15, experience size:  1388
round:  17, reward:  32, experience size:  1420
round:  18, reward:  42, experience size:  1462
round:  19, reward:  14, experience size:  1476
round:  20, reward:  29, experience size:  1505
round:  21, reward:  15, experience size