# Imports

In [1]:
import gym
import numpy as np
import time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

from rl.memory import SequentialMemory
from rl.policy import MaxBoltzmannQPolicy
from rl.agents.dqn import DQNAgent

# Hyperparams

In [2]:
DISCOUNT = 0.99
REPLAY_MEM_SIZE = 10_000
MIN_MEM_SIZE = 1_000
EPISODES = 100_000
LEARNING_RATE = 0.001
BATCH_SIZE = 64
WINDOW_LENGTH=1
SEED = 0
MODEL_NAME="DDQN_Keras"
UPDATE_TARGET_EVERY=10_000

SHOW_PREVIEW = False

np.random.seed(SEED)

# Modified tensorboard

In [3]:
from keras.callbacks import TensorBoard

# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir
        

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # Overrided, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    # Overrided
    # We train for one batch only, no need to save anything at epoch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overrided, so won't close writer
    def on_train_end(self, _):
        pass

    # Custom method for saving own metrics
    # Creates writer, writes custom metrics and closes writer
    def update_stats(self, **stats):
        with self.writer.as_default():
            for key, value in stats.items():
                tf.summary.scalar(key, value, step=self.step)
                self.writer.flush()

# Our agent

In [4]:
class DDQN_Agent:
    def __init__(self,
                 n_states,
                 n_actions,
                 lr,
                 gamma,
                 mem_size,
                 min_mem_size,
                 model_name,
                 update_target_every=2000,
                 batch_size=64,
                 window_length=1):
        
#         self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format(model_name, int(time.time())))
        
        model = self.create_model(n_states, n_actions, lr)
        policy = MaxBoltzmannQPolicy()
        replay_memory = SequentialMemory(limit=mem_size, window_length=window_length)
        
        self.ddqn = DQNAgent(model=model,
                             gamma=gamma,
                             policy=policy,
                             enable_double_dqn=True,
                             memory = replay_memory,
                             nb_steps_warmup=min_mem_size,
                             batch_size=batch_size,
                             target_model_update=update_target_every,
                             nb_actions=n_actions)
        
        self.ddqn.compile(optimizer=Adam(learning_rate=lr), metrics=["accuracy"])
        
    def fit(self, **kwargs):
        self.ddqn.fit(**kwargs)#, callbacks=[self.tensorboard])
        
    def create_model(self, n_states, n_actions, lr):
        model = Sequential([
            Flatten(input_shape=(1, n_states)),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dropout(0.2),
            Dense(n_actions, activation='linear')
        ])
        model.summary()
        return model

In [5]:
env = gym.make('MountainCar-v0')
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

agent = DDQN_Agent(n_states=n_states,
                   n_actions=n_actions,
                   lr=LEARNING_RATE,
                   gamma=DISCOUNT,
                   mem_size=REPLAY_MEM_SIZE,
                   min_mem_size=MIN_MEM_SIZE,
                   model_name=MODEL_NAME,
                   update_target_every=UPDATE_TARGET_EVERY,
                   window_length=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 2)                 0         
                                                                 
 dense (Dense)               (None, 128)               384       
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 3)                 387       
                                                                 
Total params: 17,283
Trainable params: 17,283
Non-trainable params: 0
_________________________________________________________________


2022-01-16 17:57:29.537433: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
agent.fit(env=env, 
          nb_steps=EPISODES, 
          visualize=SHOW_PREVIEW, 
          verbose=1)

Training for 100000 steps ...
Interval 1 (0 steps performed)
  181/10000 [..............................] - ETA: 5s - reward: -1.0000

  updates=self.state_updates,


50 episodes - episode_reward: -198.780 [-200.000, -139.000] - loss: 0.003 - accuracy: 0.315 - mean_q: -0.968

Interval 2 (10000 steps performed)
50 episodes - episode_reward: -199.000 [-200.000, -150.000] - loss: 0.004 - accuracy: 0.273 - mean_q: -1.927

Interval 3 (20000 steps performed)
50 episodes - episode_reward: -199.140 [-200.000, -157.000] - loss: 0.011 - accuracy: 0.332 - mean_q: -2.902

Interval 4 (30000 steps performed)
52 episodes - episode_reward: -194.000 [-200.000, -110.000] - loss: 0.022 - accuracy: 0.278 - mean_q: -3.857

Interval 5 (40000 steps performed)
50 episodes - episode_reward: -197.640 [-200.000, -146.000] - loss: 0.037 - accuracy: 0.306 - mean_q: -4.792

Interval 6 (50000 steps performed)
50 episodes - episode_reward: -200.000 [-200.000, -200.000] - loss: 0.060 - accuracy: 0.353 - mean_q: -5.729

Interval 7 (60000 steps performed)
51 episodes - episode_reward: -199.667 [-200.000, -183.000] - loss: 0.085 - accuracy: 0.420 - mean_q: -6.631

Interval 8 (70000 st