In [1]:
import numpy as np
import gym
import random
import sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Embedding, Reshape
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory


# Report the interpreter and framework versions this notebook was run with.
print(f"Python: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")


Python: 3.9.6 (tags/v3.9.6:db3ff76, Jun 28 2021, 15:26:21) [MSC v.1929 64 bit (AMD64)]
Tensorflow version: 2.6.2
Keras version: 2.6.0


In [2]:
# Id of the Gym environment used throughout the notebook.
ENV_NAME = "Taxi-v3"
# Instantiate the environment registered under that id.
env = gym.make(ENV_NAME)
# Draw the environment's current state (the text grid shown in the cell output).
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+



In [3]:
# Show how many discrete actions and states the environment exposes.
print("Number of actions: {:d}".format(env.action_space.n))
print("Number of states: {:d}".format(env.observation_space.n))

Number of actions: 6
Number of states: 500


In [4]:
# Cache the sizes of the discrete action and observation spaces for later cells.
action_size, state_size = env.action_space.n, env.observation_space.n

In [5]:
# Seed every random number generator this notebook has in scope so that a
# Restart & Run All is as reproducible as possible. Previously only NumPy and
# the environment were seeded.
SEED = 123
random.seed(SEED)            # Python stdlib RNG (imported above, previously never seeded)
np.random.seed(SEED)         # NumPy global RNG
tf.random.set_seed(SEED)     # TensorFlow global seed (affects Keras weight initialisation)
env.action_space.seed(SEED)  # RNG behind env.action_space.sample()
# Kept as the last expression so the cell still displays the env's seed list.
env.seed(SEED)

[123]

In [6]:
# Start a fresh episode, take one randomly sampled action and display the
# first element of what env.step returns.
env.reset()
sampled_action = env.action_space.sample()
step_result = env.step(sampled_action)
step_result[0]

351

In [12]:
# Q-network: each discrete state id is looked up in a learned embedding,
# flattened to a vector, passed through three ReLU hidden layers and mapped to
# one linear output per action.
embedding_dim = 10  # width of the state embedding; Reshape below must match it

model = Sequential([
    # Vocabulary size comes from the environment instead of a hard-coded 500.
    Embedding(state_size, embedding_dim, input_length=1),
    # (batch, 1, embedding_dim) -> (batch, embedding_dim)
    Reshape((embedding_dim,)),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(action_size, activation='linear'),
])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" line).
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1, 10)             5000      
_________________________________________________________________
reshape_3 (Reshape)          (None, 10)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 50)                550       
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_6 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 306       
Total params: 10,956
Trainable params: 10,956
Non-trainable params: 0
__________________________________________________

In [None]:
# Replay memory and epsilon-greedy policy handed to the agent below.
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()

# Build the DQN agent around `model` and compile it with Adam, tracking MAE.
dqn_only_embedding = DQNAgent(
    model=model,
    nb_actions=action_size,
    memory=memory,
    nb_steps_warmup=500,
    target_model_update=1e-2,
    policy=policy,
)
dqn_only_embedding.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Train on the environment; episodes are capped at 99 steps and progress is
# logged every 100000 steps.
dqn_only_embedding.fit(
    env,
    nb_steps=1000000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=99,
    log_interval=100000,
)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
    67/100000 [..............................] - ETA: 2:33 - reward: -9.1940  



1023 episodes - episode_reward: -128.652 [-927.000, 15.000] - loss: 4.334 - mae: 26.184 - mean_q: -27.211 - prob: 1.000

Interval 2 (100000 steps performed)
5290 episodes - episode_reward: -3.755 [-279.000, 15.000] - loss: 0.624 - mae: 8.384 - mean_q: 6.632 - prob: 1.000

Interval 3 (200000 steps performed)
6768 episodes - episode_reward: 2.301 [-74.000, 15.000] - loss: 0.003 - mae: 7.359 - mean_q: 12.681 - prob: 1.000

Interval 4 (300000 steps performed)
6771 episodes - episode_reward: 2.329 [-112.000, 15.000] - loss: 0.002 - mae: 7.358 - mean_q: 12.682 - prob: 1.000

Interval 5 (400000 steps performed)

In [None]:
# Run the trained agent for five rendered episodes, each capped at 99 steps.
dqn_only_embedding.test(
    env,
    nb_episodes=5,
    visualize=True,
    nb_max_episode_steps=99,
)

In [None]:
# Persist the agent's weights. The file name is derived from ENV_NAME so it
# always matches the environment the agent was trained on (same path as the
# previously hard-coded "Taxi-v3").
dqn_only_embedding.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

In [None]:
# Second DQN agent with a fresh replay memory and epsilon-greedy policy.
# NOTE(review): `model` is the same object already given to
# `dqn_only_embedding`, so this run presumably starts from whatever weights
# that agent left behind rather than from scratch - confirm this is intended.
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=action_size,
    memory=memory,
    nb_steps_warmup=500,
    target_model_update=1e-2,
    policy=policy,
)
# `learning_rate` replaces the deprecated `lr` alias, matching the first agent.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Train on the environment; episodes are capped at 99 steps and progress is
# logged every 100000 steps.
dqn.fit(
    env,
    nb_steps=1000000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=99,
    log_interval=100000,
)

In [None]:
# Run the second agent for five rendered episodes, each capped at 99 steps.
dqn.test(
    env,
    nb_episodes=5,
    visualize=True,
    nb_max_episode_steps=99,
)

In [None]:
# Persist the second agent's weights.
# NOTE(review): the file is labelled "Taxi-v2" although ENV_NAME is "Taxi-v3";
# renaming it to v3 would collide with the file written for
# `dqn_only_embedding`, so pick a distinct name if this label is corrected.
weights_path = 'dqn_{}_weights.h5f'.format("Taxi-v2")
dqn.save_weights(weights_path, overwrite=True)