In [1]:
import gym
import numpy as np
import time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
import tensorflow as tf

import os
from rl.memory import SequentialMemory

In [2]:
from keras.callbacks import TensorBoard

# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # Overrided, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    # Overrided
    # We train for one batch only, no need to save anything at epoch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overrided, so won't close writer
    def on_train_end(self, _):
        pass

    # Custom method for saving own metrics
    # Creates writer, writes custom metrics and closes writer
    def update_stats(self, **stats):
        with self.writer.as_default():
            for key, value in stats.items():
                tf.summary.scalar(key, value, step=self.step)
                self.writer.flush()

In [3]:
# Hyper params
# replay_memory_size = 50_000
# min_replay_memory_size = 1_000
# model_name="cart_ddqn"
# seed = 0
# learning_rate = 0.001
# gamma = 0.99 # Discount factor for past rewards
# epsilon = 1.0  # Epsilon greedy parameter
# epsilon_min = 0.1  # Minimum epsilon greedy parameter
# epsilon_max = 1.0  # Maximum epsilon greedy parameter
# epsilon_interval = (epsilon_max - epsilon_min)  # Rate at which to reduce chance of random action being taken
# mini_batch_size = 32  # Size of batch taken from replay buffer
# max_steps_per_episode = 10000

DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 100  # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 5  # Terminal states (end of episodes)
MODEL_NAME = 'cart_ddqn'
MIN_REWARD = -200  # For model save
MEMORY_FRACTION = 0.20

# Env settings
EPISODES = 100

# Exploration settings
epsilon = 1
EPSILON_DECAY = 0.99975
EPSILON_MIN = 0.001

# Stats settings
AGGREGATE_STATS_EVERY = 5 #episodes
SHOW_PREVIEW = True

In [23]:
class DDQN_Agent:
    def __init__(self, states, actions, learning_rate):
        self.model = self.create_model(states, actions, learning_rate)
        self.target_model = self.create_model(states, actions, learning_rate)
        self.target_model.set_weights(self.model.get_weights())
        
        self.memory = SequentialMemory(limit=REPLAY_MEMORY_SIZE, window_length=1)

        self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))
        
        self.target_update_counter = 0

    def create_model(self, n_states, n_actions, lr):
        model = Sequential([
            Dense(128, activation='relu', input_shape=(1, n_states)),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(n_actions, activation='linear'),
        ])
        
        model.compile(loss="mse", optimizer=Adam(learning_rate=lr), metrics="accuracy")        
        return model
    
    # Adds step's data to a memory replay array
    # (observation space, action, reward, new observation space, done)
    # ARGUMENT:
        # Experience: (observation, action, reward, terminal)
    def update_replay_memory(self, experience):
        self.memory.append(*experience)
        
    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        return self.model.predict(np.array(state).reshape(-1, *state.shape)/255)[0]
    
    def train(self, terminal_state, step):

        # Start training only if certain number of samples is already saved
        if self.memory.nb_entries < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from memory replay table
        # 
        minibatch = self.memory.sample(MINIBATCH_SIZE)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0] for transition in minibatch])/255
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query NN model for Q values
        # When using target network, query it, otherwise main network should be queried
        new_current_states = np.array([transition[3] for transition in minibatch])/255
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            current_qs = current_qs_list[index][0]
            current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(X)/255, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

In [24]:
env_name = 'CartPole-v1'
env = gym.make(env_name)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

agent = DDQN_Agent(
    n_states,
    n_actions,
    learning_rate=0.0003
)

In [25]:
# For repeatable results
np.random.seed(1)
tf.random.set_seed(1)

# For stats
ep_rewards = [-200]

In [26]:
for episode in range(1, EPISODES + 1):
    # Update tensorboard step every episode
    agent.tensorboard.step = episode
    
    # Restarting episode -> reset ep reward and step
    episode_reward = 0
    step = 1
    
    current_state = env.reset()
    
    done = False
    while not done:
    if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, n_actions)

        new_state, reward, done, _ = env.step(action)
        
        # Update episodic reward
        episode_reward += reward
        
        if SHOW_PREVIEW:
            env.render()
            
        # Every step we update replay memory and train the main network
        agent.update_replay_memory((current_state, action, reward, done))
        agent.train(done, step)
        current_state = new_state
        step += 1
        
    # Append episode reward to a list and log state (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # we would save checkpoints here
        # ...
        
    # Finally, handle the epsilon decay
    if epsilon > EPSILON_MIN:
        epsilon *= EPSILON_DECAY
        epsilon = max(EPSILON_MIN, epsilon)



AttributeError: 'ModifiedTensorBoard' object has no attribute '_train_dir'