In [0]:
import tensorflow as tf
import numpy as np
import random as rd
import gym
from collections import deque

In [0]:
# Environment shared by the training loop and by `bot_play`.
env = gym.make('CartPole-v0')
# Network dimensions are read from the environment's observation/action spaces.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
# Raise the per-episode step cap of the environment's time-limit wrapper.
# The attribute is spelled `_max_episode_steps`; the previous
# `_max_episode_step` only created an unused attribute, so the default cap
# stayed in force and the training loop's own 10000-step guard was unreachable.
env._max_episode_steps = 10001

# Discount factor applied to the bootstrapped next-state value.
DIS = 0.99
# Number of training episodes.
MAX_EPISODE = 5000
# Maximum number of transitions kept in the replay buffer.
REPLAY_MEMORY = 50000
# Number of transitions sampled for each training step.
BATCH_SIZE = 64
# The target network is synchronised every this many environment steps.
TARGET_UPDATE_FREQUENCY = 5

# minimum epsilon for epsilon greedy
MIN_E = 0.0
# epsilon will be `MIN_E` at `EPSILON_DECAYING_EPISODE`
EPSILON_DECAYING_EPISODE = MAX_EPISODE * 0.01

In [0]:
def annealing_epsilon(episode: int, min_e: float, max_e: float, target_episode: int) -> float:
    r"""Return a linearly annealed epsilon.

    Epsilon is `max_e` at episode 0, decreases linearly, and is held at
    `min_e` from `target_episode` onwards:

         (epsilon)
             |
    max_e ---|\
             | \
             |  \
             |   \
    min_e ---|____\_______________(episode)
                  |
                 target_episode

        slope = (min_e - max_e) / target_episode
        intercept = max_e
        e = max(min_e, slope * episode + intercept)

    The docstring is a raw string so the backslashes of the diagram are kept
    literally instead of being parsed as (invalid) escape sequences and line
    continuations.

    Args:
        episode (int): Current episode
        min_e (float): Minimum epsilon
        max_e (float): Maximum epsilon
        target_episode (int): epsilon becomes `min_e` at `target_episode`.
            A value <= 0 means there is nothing left to anneal and `min_e`
            is returned directly (this also avoids a division by zero).

    Returns:
        float: epsilon between `min_e` and `max_e` (for `episode >= 0`)
    """
    if target_episode <= 0:
        # Degenerate schedule: the annealing horizon has already been reached.
        return min_e

    slope = (min_e - max_e) / target_episode
    intercept = max_e

    # Never drop below the floor once the line crosses `min_e`.
    return max(min_e, slope * episode + intercept)

In [0]:
class DQN:
  """Small fully-connected Q-network built with TensorFlow 1.x graph ops.

  The network maps a state of size `input_size` to one Q-value per action
  (`output_size` values) through two ReLU hidden layers, and is trained with
  Adam on a mean-squared-error loss against externally supplied targets.

  Attributes set by `_build_network`:
      _X:     placeholder for a batch of states, shape [None, input_size]
      _Y:     placeholder for target Q-values, shape [None, output_size]
      _Qpred: output tensor of the network
      _loss:  mean squared error between `_Y` and `_Qpred`
      _train: Adam op minimising `_loss`
  """

  def __init__(self, session: tf.Session, input_size: int, output_size: int, name: str="main") -> None:
    """Store the configuration and build the graph.

    Args:
        session (tf.Session): Tensorflow session used by `predict`/`update`
        input_size (int): Input dimension
        output_size (int): Number of discrete actions
        name (str, optional): Variable scope the network is built under
    """
    self.session = session
    self.input_size = input_size
    self.output_size = output_size
    self.net_name = name

    self._build_network()

  def _build_network(self, h_size: int = 16, l_rate: float = 0.001) -> None:
    """DQN Network architecture (simple MLP)

    Args:
        h_size (int, optional): Hidden layer dimension
        l_rate (float, optional): Learning rate
    """
    # No explicit `tf.device` pin: the previous hard-coded '/device:GPU:0'
    # made graph execution fail on hosts without a GPU. TensorFlow's placer
    # still uses a GPU automatically when one is available.
    with tf.variable_scope(self.net_name):
      self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")

      # Xavier/Glorot initialisation is applied to the weight matrices
      # (it was previously passed as `bias_initializer`); biases keep the
      # layer's default initialiser.
      net = tf.layers.dense(self._X, h_size, activation=tf.nn.relu,
                            kernel_initializer=tf.contrib.layers.xavier_initializer())
      net = tf.layers.dense(net, h_size, activation=tf.nn.relu,
                            kernel_initializer=tf.contrib.layers.xavier_initializer())
      # Linear output layer: one Q-value per action.
      net = tf.layers.dense(net, self.output_size)
      self._Qpred = net

      self._Y = tf.placeholder(tf.float32, shape=[None, self.output_size])
      self._loss = tf.losses.mean_squared_error(self._Y, self._Qpred)

      optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
      self._train = optimizer.minimize(self._loss)

  def predict(self, state: np.ndarray) -> np.ndarray:
    """Returns Q(s, a)

    The input is reshaped to (-1, input_size), so a single flat state is
    accepted as well as a batch.

    Args:
        state (np.ndarray): State array, reshapeable to (n, input_dim)

    Returns:
        np.ndarray: Q value array, shape (n, output_dim)
    """
    x = np.reshape(state, [-1, self.input_size])
    return self.session.run(self._Qpred, feed_dict={self._X: x})

  def update(self, x_stack: np.ndarray, y_stack: np.ndarray) -> list:
    """Performs one training step on the given X and y and returns the result

    Args:
        x_stack (np.ndarray): State array, shape (n, input_dim)
        y_stack (np.ndarray): Target Q array, shape (n, output_dim)

    Returns:
        list: First element is loss, second element is a result from train step
    """
    feed = {
        self._X: x_stack,
        self._Y: y_stack
    }
    return self.session.run([self._loss, self._train], feed)

In [0]:
def get_copy_var_ops(*, dest="target", src="main"):
  """Build the ops that copy the trainable variables of `src` into `dest`.

  Variables are looked up in the current default graph and paired in
  collection order, so both scopes are expected to hold networks that were
  built the same way.

  NOTE: `tf.get_collection` filters names with `re.match(scope, name)`, i.e.
  `scope` behaves like a name prefix ("main" also matches "main2/...").

  Args:
      dest (str): Variable scope that receives the values
      src (str): Variable scope the values are read from

  Returns:
      list: One assign op per variable pair; run them with `session.run`

  Raises:
      ValueError: If `src` has no trainable variables or the two scopes hold
          a different number of variables. Previously `zip` silently dropped
          the surplus, turning the copy into a partial or empty no-op.
  """
  src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src)
  dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest)

  if not src_vars or len(src_vars) != len(dest_vars):
    raise ValueError(
        "Cannot copy variables from scope {!r} ({} found) to scope {!r} ({} found)".format(
            src, len(src_vars), dest, len(dest_vars)))

  return [dest_var.assign(src_var.value())
          for src_var, dest_var in zip(src_vars, dest_vars)]

In [0]:
def replay_train(mainDQN, targetDQN, train_batch, discount=None):
  """Run one training step of `mainDQN` on a batch of stored transitions.

  For every transition the target of the action that was taken is
      reward + discount * max_a' Q_target(next_state, a')   (non-terminal)
      reward                                                (terminal)
  while the targets of all other actions are `mainDQN`'s own predictions,
  so they contribute no error.

  Args:
      mainDQN: Network being trained; must provide `predict` and `update`
      targetDQN: Network used for the bootstrapped next-state value;
          must provide `predict`
      train_batch: Iterable of (state, action, reward, next_state, done)
      discount (float, optional): Discount factor. Defaults to the
          module-level `DIS` when omitted.

  Returns:
      Whatever `mainDQN.update` returns for the batch
  """
  if discount is None:
    discount = DIS

  states = np.vstack([transition[0] for transition in train_batch])
  actions = np.array([transition[1] for transition in train_batch])
  rewards = np.array([transition[2] for transition in train_batch])
  next_states = np.vstack([transition[3] for transition in train_batch])
  # Force a boolean dtype: `~` is a logical NOT only on bool arrays. On
  # integer 0/1 flags it is a bitwise NOT (-1/-2) and would corrupt targets.
  done = np.array([transition[4] for transition in train_batch], dtype=bool)

  # Terminal transitions get no bootstrapped value.
  next_q_max = np.max(targetDQN.predict(next_states), axis=1)
  Q_target = rewards + discount * next_q_max * ~done

  # Start from the current predictions and overwrite only the taken action.
  y = mainDQN.predict(states)
  y[np.arange(len(states)), actions] = Q_target

  return mainDQN.update(states, y)

In [0]:
def bot_play(mainDQN):
  """Play a single episode on the global `env`, always taking the action
  with the highest predicted Q-value, and print the accumulated reward.

  Args:
      mainDQN: Object whose `predict(state)` returns Q-values for the state
  """
  observation = env.reset()
  total_reward = 0
  finished = False

  # Act greedily until the environment reports the end of the episode.
  while not finished:
    greedy_action = np.argmax(mainDQN.predict(observation))
    observation, step_reward, finished, _ = env.step(greedy_action)
    total_reward += step_reward

  print("Total score: {}".format(total_reward))

In [0]:
def main():
  """Train a DQN on the global `env`, then play one greedy episode.

  Exploration uses epsilon = 1 / (episode + 11). Every environment step is
  stored in a replay buffer; once the buffer holds more than `BATCH_SIZE`
  transitions a random minibatch is trained on after each step. The target
  network is synchronised every `TARGET_UPDATE_FREQUENCY` steps. Training
  stops after `MAX_EPISODE` episodes or as soon as the mean episode length
  over the last 100 episodes exceeds 199.
  """
  # Reward written into the transition that ends an episode.
  # NOTE(review): `done` presumably is also True when the environment's own
  # step cap ends the episode, so a full-length episode is penalised too.
  terminal_reward = -5
  # Hard per-episode step limit enforced by this loop.
  max_steps = 10000

  # Store the previous observations in replay memory
  replay_buffer = deque(maxlen=REPLAY_MEMORY)
  # Holds episode lengths (steps), which are used as the episode score.
  last_100_game_reward = deque(maxlen=100)

  # Build the model in a private graph instead of the global default graph,
  # so re-running this function does not keep adding nodes and variables
  # from earlier runs to shared state.
  with tf.Graph().as_default(), tf.Session() as sess:
    main_dqn = DQN(sess, input_size, output_size, "main")
    target_dqn = DQN(sess, input_size, output_size, "target")
    sess.run(tf.global_variables_initializer())

    # Start with identical main and target networks.
    copy_ops = get_copy_var_ops(dest="target", src="main")
    sess.run(copy_ops)

    for episode in range(MAX_EPISODE):
      e = 1. / ((episode + 10) + 1)
      done = False
      step_cnt = 0
      state = env.reset()

      while not done:
        if np.random.rand(1) < e:
          action = env.action_space.sample()
        else:
          # Choose an action greedily from the Q-net
          action = np.argmax(main_dqn.predict(state))

        # Get new state and reward from environment
        next_state, reward, done, _ = env.step(action)
        if done:
          reward = terminal_reward

        # Save the experience to the buffer
        replay_buffer.append((state, action, reward, next_state, done))

        if len(replay_buffer) > BATCH_SIZE:
          minibatch = rd.sample(replay_buffer, BATCH_SIZE)
          replay_train(main_dqn, target_dqn, minibatch)

        if step_cnt % TARGET_UPDATE_FREQUENCY == 0:
          sess.run(copy_ops)

        state = next_state
        step_cnt += 1
        if step_cnt > max_steps:
          break

      print("Episode: {}    step: {}".format(episode, step_cnt))
      last_100_game_reward.append(step_cnt)
      if len(last_100_game_reward) == last_100_game_reward.maxlen:
        avg_reward = np.mean(last_100_game_reward)

        if avg_reward > 199:
          print(f"Game Cleared in {episode} episodes with avg reward {avg_reward}")
          break

    # Evaluate while the session (and therefore the weights) is still alive.
    bot_play(main_dqn)

In [9]:
# Start training only when this code is the entry point, not when imported.
if __name__ == "__main__":
  main()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Episode: 0    step: 10
Episode: 1    step: 8
Episode: 2    step: 10
Episode: 3    step: 9
Episode: 4    step: 10
Episode: 5    step: 10
Episode: 6    step: 11
Episode: 7    step: 9
Episode: 8    step: 12
Episode: 9    step: 9
Episode: 10    step: 9
Episode: 11    step: 9
Episode: 12    step: 9
Episode: 13    step: 9
Episode: 14    step: 10
Episode: 15    step: 9
Episode: 16    step: 10
Episode: 17    step: 9
Episode: 18    step: 10
Episode: 19    step: 8
Episode: 20    step: 10
Episode: 21    step: 9
Episode: 22    step: 10
Episode: 23    step: 10
Episode: 24    step: 9
Episode:

In [0]:
def main(max_episodes=None):
  """Train a DQN on the global `env` with linearly annealed exploration,
  then play one greedy episode.

  Epsilon comes from `annealing_epsilon`, going from 1.0 down to `MIN_E` at
  `EPSILON_DECAYING_EPISODE`. Every environment step is stored in a replay
  buffer; once the buffer holds more than `BATCH_SIZE` transitions a random
  minibatch is trained on after each step. The target network is
  synchronised every `TARGET_UPDATE_FREQUENCY` steps. Training stops after
  `max_episodes` episodes or as soon as the mean episode length over the
  last 100 episodes exceeds 199.

  Args:
      max_episodes (int, optional): Number of training episodes. Defaults to
          the shared `MAX_EPISODE` constant (previously a separate
          hard-coded 5000), which is also what `EPSILON_DECAYING_EPISODE`
          is derived from.
  """
  if max_episodes is None:
    max_episodes = MAX_EPISODE

  # Reward written into the transition that ends an episode.
  # NOTE(review): `done` presumably is also True when the environment's own
  # step cap ends the episode, so a full-length episode is penalised too.
  terminal_reward = -5
  # Hard per-episode step limit enforced by this loop.
  max_steps = 10000

  # Store the previous observations in replay memory
  replay_buffer = deque(maxlen=REPLAY_MEMORY)
  # Holds episode lengths (steps), which are used as the episode score.
  last_100_game_reward = deque(maxlen=100)

  # Build the model in a private graph instead of the global default graph,
  # so re-running this function does not keep adding nodes and variables
  # from earlier runs to shared state.
  with tf.Graph().as_default(), tf.Session() as sess:
    main_dqn = DQN(sess, input_size, output_size, "main2")
    target_dqn = DQN(sess, input_size, output_size, "target2")
    sess.run(tf.global_variables_initializer())

    # Start with identical main and target networks.
    copy_ops = get_copy_var_ops(dest="target2", src="main2")
    sess.run(copy_ops)

    for episode in range(max_episodes):
      e = annealing_epsilon(episode, MIN_E, 1.0, EPSILON_DECAYING_EPISODE)
      done = False
      step_cnt = 0
      state = env.reset()

      while not done:
        if np.random.rand(1) < e:
          action = env.action_space.sample()
        else:
          # Choose an action greedily from the Q-net
          action = np.argmax(main_dqn.predict(state))

        # Get new state and reward from environment
        next_state, reward, done, _ = env.step(action)
        if done:
          reward = terminal_reward

        # Save the experience to the buffer
        replay_buffer.append((state, action, reward, next_state, done))

        if len(replay_buffer) > BATCH_SIZE:
          minibatch = rd.sample(replay_buffer, BATCH_SIZE)
          replay_train(main_dqn, target_dqn, minibatch)

        if step_cnt % TARGET_UPDATE_FREQUENCY == 0:
          sess.run(copy_ops)

        state = next_state
        step_cnt += 1
        if step_cnt > max_steps:
          break

      print("Episode: {}    step: {}".format(episode, step_cnt))
      last_100_game_reward.append(step_cnt)
      if len(last_100_game_reward) == last_100_game_reward.maxlen:
        avg_reward = np.mean(last_100_game_reward)

        if avg_reward > 199:
          print(f"Game Cleared in {episode} episodes with avg reward {avg_reward}")
          break

    # Evaluate while the session (and therefore the weights) is still alive.
    bot_play(main_dqn)