In [None]:
# Reference:
# https://keras.io/examples/rl/deep_q_network_breakout/
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, ReLU, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from ER import ReplayMemory
from PER import ProportionalPrioritizedMemory
# Restrict TensorFlow to a single virtual device on the first GPU with
# memory_limit=3072, so the process does not claim the whole card.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3 * 1024)],
    )
  except RuntimeError as err:
    # Reported rather than raised so the notebook can keep running.
    print(err)

In [None]:
def get_q_network(observation_space, action_space):
  """Build the convolutional Q-network.

  Args:
    observation_space: Shape of a single observation, channels last.
      The notebook uses (84, 84, 4).
    action_space: Number of discrete actions; sets the width of the output layer.

  Returns:
    A Keras functional ``Model`` mapping an observation batch to one linear
    output per action.
  """
  # (filters, kernel_size, strides) of each convolution stage, in order.
  conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

  inputs = Input(shape=observation_space)
  hidden = inputs
  for n_filters, kernel, stride in conv_specs:
    hidden = Conv2D(filters=n_filters, kernel_size=kernel, strides=stride,
                    padding='valid', activation="relu",
                    data_format='channels_last')(hidden)

  # Fully connected head on top of the flattened feature maps.
  hidden = Flatten()(hidden)
  hidden = Dense(units=512, activation='relu', kernel_initializer='he_uniform')(hidden)
  q_values = Dense(units=action_space, activation='linear', kernel_initializer='he_uniform')(hidden)

  model = Model(inputs=inputs, outputs=q_values)
  model.build(input_shape=observation_space)
  return model

In [None]:
class DQNAgent():
  """DQN agent with an online network, a target network and a replay memory.

  The replay memory is either ``ReplayMemory`` ("ER") or
  ``ProportionalPrioritizedMemory`` ("PER"), selected by
  ``cfg['RL']['ER']['ALGORITHM']``.
  """
  def __init__(self, env, cfg):
    """Create the networks, optimizer and replay memory.

    Args:
      env: Environment exposing ``action_space.n``.
      cfg: Nested configuration dict with the keys ``ENV`` (``IMG_SIZE`` and,
        optionally, ``NAME``) and ``RL`` (``UPDATE_FREQ``, ``TRAIN_FREQ``,
        optionally ``ALGORITHM``, and ``ER`` with ``ALGORITHM``,
        ``TRAIN_START``, ``BATCH_SIZE``, ``MEMORY_SIZE``).

    Raises:
      ValueError: If the replay algorithm is neither "ER" nor "PER".
    """
    self.env_cfg    = cfg['ENV']
    self.rl_cfg     = cfg['RL']
    self.er_cfg     = cfg['RL']['ER']
    self.er_type    = self.er_cfg["ALGORITHM"].upper()
    self.img_size   = self.env_cfg['IMG_SIZE']
    self.state_size = self.env_cfg['IMG_SIZE']
    self.action_size= env.action_space.n
    # Base name of the weight files used by save_model()/load_model().
    self.filename   = "{}_{}".format(self.env_cfg.get('NAME', 'ENV'),
                                     self.rl_cfg.get('ALGORITHM', 'DQN'))

    # Hyper-parameters for learning
    self.discount_factor = 0.99
    self.learning_rate  = 0.005
    self.epsilon        = 1.0
    self.epsilon_decay  = 0.999
    self.epsilon_min    = 0.1
    self.tau            = 0.005   # not referenced anywhere else in this class
    self.start_to_train = self.er_cfg["TRAIN_START"]
    self.batch_size     = self.er_cfg["BATCH_SIZE"]
    self.buffer_size    = self.er_cfg["MEMORY_SIZE"]
    self.update_freq    = self.rl_cfg['UPDATE_FREQ']
    self.train_freq     = self.rl_cfg['TRAIN_FREQ']

    # DQN Architecture
    self.model        = get_q_network(self.state_size, self.action_size)
    self.target_model = get_q_network(self.state_size, self.action_size)
    self.optimizer    = Adam(learning_rate=self.learning_rate, clipnorm=1.0)
    self.model.summary()

    # Experience Replay
    if self.er_type == "ER":
      self.memory = ReplayMemory(capacity=self.buffer_size)
    elif self.er_type == "PER":
      self.memory = ProportionalPrioritizedMemory(capacity=self.buffer_size)
    else:
      # Fail here instead of later with a missing `self.memory` attribute.
      raise ValueError("Unknown replay algorithm '{}' (expected 'ER' or 'PER')".format(self.er_type))

    # Miscellaneous
    self.show_media_info = False
    self.steps = 0

  def get_actions(self, state):
    """Pick an action epsilon-greedily and advance the step counter.

    Args:
      state: A single observation (no batch dimension).

    Returns:
      A random action index with probability ``epsilon``, otherwise the
      argmax of the online network's output for ``state``.
    """
    self.steps += 1
    # Exploration
    if np.random.rand() <= self.epsilon:
      return random.randrange(self.action_size)
    # Exploitation: add the batch dimension the network expects.
    batch = np.expand_dims(np.asarray(state, dtype=np.float32), axis=0)
    return np.argmax(self.model(tf.convert_to_tensor(batch, dtype=tf.float32)))

  def remember(self, state, action, reward, next_state, done):
    """Append one transition to the replay memory.

    Scalars are wrapped in length-1 arrays so that a stacked batch of
    rewards/dones has a trailing axis of size 1.
    """
    state       = np.array(state,       dtype=np.float32)
    action      = np.array([action])
    reward      = np.array([reward],    dtype=np.float32)
    done        = np.array([done],      dtype=np.float32)
    next_state  = np.array(next_state,  dtype=np.float32)
    transition  = (state, action, reward, next_state, done)
    self.memory.append(transition)
    return

  def train(self):
    """Run at most one gradient step on a sampled mini-batch.

    Returns:
      ``0.0`` when no update is performed (still warming up, or the current
      step is not a multiple of ``train_freq``); otherwise the mini-batch
      loss as a Python float.
    """
    # Warm-up: only collect transitions until `start_to_train` steps.
    if self.steps < self.start_to_train:
      return 0.0
    # Learn once every `train_freq` steps (skip all other steps).
    if self.steps % self.train_freq != 0:
      return 0.0
    # Decaying Exploration Ratio, once per update and never below the floor.
    if self.epsilon > self.epsilon_min:
      self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    # Sampling from the memory
    idxs, is_weights = None, None
    if self.er_type == "ER":
      mini_batch = self.memory.sample(self.batch_size)
    elif self.er_type == "PER":
      mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)

    states      = tf.convert_to_tensor(np.array([sample[0] for sample in mini_batch]))
    actions     = tf.convert_to_tensor(np.array([sample[1][0] for sample in mini_batch]))
    rewards     = tf.convert_to_tensor(np.array([sample[2] for sample in mini_batch]))
    next_states = tf.convert_to_tensor(np.array([sample[3] for sample in mini_batch]))
    dones       = tf.convert_to_tensor(np.array([sample[4] for sample in mini_batch]))

    # One-time report of the batch layout.
    if not self.show_media_info:
      self.show_media_info = True
      print('Start to train, check batch shapes')
      # The batch is a list of ragged tuples, so report its length instead of
      # asking NumPy to build an array out of it.
      print('**** shape of mini_batch', len(mini_batch),type(mini_batch))
      print('**** shape of states', np.shape(states),type(states))
      print('**** shape of actions', np.shape(actions),type(actions))
      print('**** shape of rewards', np.shape(rewards),type(rewards))
      print('**** shape of next_states', np.shape(next_states),type(next_states))
      print('**** shape of dones', np.shape(dones),type(dones))

    # Bootstrapped target from the target network; no gradient flows into it.
    target_q = self.target_model(next_states)
    max_q = tf.reduce_max(target_q, axis=1, keepdims=True)
    targets = tf.stop_gradient(rewards + (1 - dones) * self.discount_factor * max_q)

    model_params = self.model.trainable_variables
    with tf.GradientTape() as tape:
      # Q-value of the action that was actually taken, kept as a column.
      q_all = self.model(states)
      one_hot_action = tf.one_hot(actions, self.action_size)
      q = tf.reduce_sum(one_hot_action * q_all, axis=1, keepdims=True)
      td_error = targets - q
      squared_error = tf.square(td_error)
      if is_weights is not None:
        # Force the importance weights into a column so they scale each
        # sample's error instead of broadcasting to a (batch, batch) matrix.
        weights = tf.convert_to_tensor(
            np.asarray(is_weights, dtype=np.float32).reshape(-1, 1))
        loss = tf.reduce_mean(weights * squared_error)
      else:
        loss = tf.reduce_mean(squared_error)

    grads = tape.gradient(loss, model_params)
    self.optimizer.apply_gradients(zip(grads, model_params))

    if self.er_type == "PER":
      # NOTE(review): the signed TD error is passed on unchanged, one
      # length-1 array per sample; presumably the memory turns it into a
      # priority itself -- confirm against PER.py.
      sample_importance = td_error.numpy()
      for idx, importance in zip(idxs, sample_importance):
        self.memory.update(idx, importance)

    # Plain float, consistent with the 0.0 returned when no update happens.
    return float(loss)

  def update_target_net(self):
    """Copy the online weights into the target network every ``update_freq`` steps."""
    if self.steps % self.update_freq == 0:
      self.target_model.set_weights(self.model.get_weights())
    return

  def load_model(self,at):
    """Load the weights stored under ``at + filename + "_TF"`` into both networks."""
    path = at + self.filename + "_TF"
    self.model.load_weights(path)
    self.target_model.load_weights(path)
    return

  def save_model(self,at):
    """Save the online network's weights to ``at + filename + "_TF"``.

    Only the online network is written: saving the target network to the
    same path would replace the online weights in the file.
    """
    self.model.save_weights(at + self.filename + "_TF", save_format="tf")
    return

In [None]:
import cv2
class Featurization():
  """Turns raw RGB frames into a stack of the most recent grayscale frames."""
  def __init__(self, observation_size):
    """
    Args:
      observation_size: Shape of the stacked feature; the first two entries
        are the frame size and the third is the number of stacked frames.
    """
    self.obs_size = observation_size
    self.is_first = True
    self.feature = np.zeros(observation_size)

  def reset(self):
    """Forget all stacked frames; the next frame will fill the whole stack."""
    self.is_first = True
    self.feature = np.zeros(self.obs_size)

  def _push_frame(self, frame):
    """Drop the oldest channel and append `frame` as the newest one."""
    # A single concatenate builds a new array, so stacks returned earlier
    # are never modified.
    self.feature = np.concatenate((self.feature[:, :, 1:], frame), axis=2)

  def preprocessing(self, img):
    """Resize, gray-scale and scale `img`, then push it onto the frame stack.

    Args:
      img: RGB image as accepted by ``cv2.resize`` / ``cv2.cvtColor``.

    Returns:
      The updated frame stack with shape ``obs_size``. On the first call
      (or the first call after ``reset()``) every channel holds this frame.
    """
    # cv2 needs the target size as a tuple.
    img_rgb_resize = cv2.resize(img, tuple(self.obs_size[0:2]), interpolation=cv2.INTER_CUBIC)
    # Swap the two spatial axes.
    img_rgb_resize = np.transpose(img_rgb_resize,axes=(1,0,2))
    img_k_resize = cv2.cvtColor(img_rgb_resize,cv2.COLOR_RGB2GRAY)
    img_k_resize = img_k_resize / 255.0 # scaling 0 ~ 1
    state = np.array(img_k_resize,dtype=np.float32)
    state = np.expand_dims(state,axis=2)
    if self.is_first:
      # Fill every slot of the stack with the first frame.
      for _ in range(self.obs_size[2]):
        self._push_frame(state)
      self.is_first = False
    else:
      self._push_frame(state)
    return self.feature

In [None]:
# Statistics shared by the training loop (which appends to them) and
# save_statistics() (which plots them).
scores_avg = []
scores_raw = []
epsilons = []
losses = []
loss_list = []
score_avg = 0
end = False
FILENAME = "BreakoutDeterministic-v4_DQN"
def save_statistics():
    """Redraw the score, epsilon and loss curves and save them as a JPEG.

    Reads the module-level lists ``scores_avg``, ``scores_raw``, ``epsilons``
    and ``losses`` and writes ``FILENAME + "_TF.jpg"``.
    """
    def decorate(y_label):
        # Axis decoration shared by all three panels.
        plt.xlabel('Episodes')
        plt.ylabel(y_label)
        plt.grid()

    # Start from an empty figure so repeated calls do not overlay old curves.
    plt.clf()

    # Top panel: smoothed score plus the raw per-episode score as a thin line.
    plt.subplot(311)
    plt.plot(scores_avg, 'b')
    plt.plot(scores_raw, 'b', alpha=0.8, linewidth=0.5)
    decorate('average score')
    plt.title(FILENAME)

    # Middle panel: exploration rate.
    plt.subplot(312)
    plt.plot(epsilons, 'b')
    decorate('epsilon')

    # Bottom panel: loss.
    plt.subplot(313)
    plt.plot(losses, 'b')
    decorate('losses')

    plt.savefig(FILENAME + "_TF.jpg", dpi=100)

In [None]:
%matplotlib tk

# Run configuration
EPISODES = 10000              # maximum number of episodes
MAX_STEP_PER_EPISODE = 10000  # an episode is cut off once it exceeds this many steps
END_SCORE = 40                # stop once the smoothed score exceeds this value
SAVE_FREQ = 10                # redraw/save the statistics every SAVE_FREQ episodes
cfg = {
  "ENV":{
    "NAME":"BreakoutDeterministic-v4",
    "IMG_SIZE":(84,84,4)
  },
  "RL":{
    "ALGORITHM":'DQN',
    "ER":{
      "ALGORITHM":'ER',
      "BATCH_SIZE":64,
      "TRAIN_START":20000,
      "MEMORY_SIZE":200000,
    },
    "TRAIN_FREQ":4,
    "UPDATE_FREQ":1000,
  },
}
ENV_NAME = cfg['ENV']['NAME']
if __name__ == "__main__":
  env = gym.make(ENV_NAME)
  print('States ',env.observation_space, env.observation_space.shape,', Actions ', env.action_space, env.action_space.n)
  agent = DQNAgent(env, cfg)
  featurization = Featurization(cfg['ENV']['IMG_SIZE'])
  global_steps = 0
  # Clear the stop flag so re-running this cell does not end after one episode.
  end = False
  try:
    for e in range(EPISODES):
      observe = env.reset()
      # Restart the frame stack so frames of the previous episode do not
      # leak into the first states of this one.
      featurization.is_first = True
      feature = featurization.preprocessing(observe)
      episode_score = 0
      episode_step = 0
      loss_list = []
      while True:
        action = agent.get_actions(feature)
        observe, reward, done, info = env.step(action=action)
        next_feature = featurization.preprocessing(observe)
        agent.remember(feature, action, reward, next_feature, done)
        loss = agent.train()
        agent.update_target_net()

        episode_score += reward
        episode_step += 1
        global_steps += 1
        feature = next_feature
        loss_list.append(loss)

        if done or (episode_step > MAX_STEP_PER_EPISODE):
          # Exponential moving average of the score, seeded with the first score.
          score_avg = 0.9 * score_avg + 0.1 * episode_score if score_avg != 0 else episode_score
          print('{} epi with {} steps, epi score {}, score_avg {}'.format(e+1,global_steps,episode_score, score_avg))
          scores_avg.append(score_avg)
          scores_raw.append(episode_score)
          losses.append(np.mean(loss_list))
          epsilons.append(agent.epsilon)
          if e % SAVE_FREQ == 0:
            save_statistics()
          if score_avg > END_SCORE:
            agent.save_model("")
            save_statistics()
            end = True
          break
      if end:
        print("End")
        break
  finally:
    # Release the environment on every exit path (target reached, all
    # episodes used up, or an interrupt/exception).
    env.close()