In [0]:
import tensorflow as tf
import numpy as np
import random as rd
import gym
from collections import deque

In [0]:
class DQN:
  """Two-layer Q-value network built with the TensorFlow 1.x graph API.

  The graph maps a batch of states (`input_size` features each) through a
  tanh hidden layer to `output_size` Q-values. Neither layer has a bias
  term. All ops are created when the object is constructed.

  Args:
    sess: session object used to run the graph in `predict` / `update`.
    input_size: number of features per state.
    output_size: number of Q-values produced per state.
    name: variable-scope name under which the weights are created.
  """

  def __init__(self, sess, input_size, output_size, name="main"):
    self.sess, self.net_name = sess, name
    self.input_size, self.output_size = input_size, output_size

    self._build_network()

  def _build_network(self, h_size=16, l_rate=1e-3):
    """Create placeholders, weights, loss and the Adam training op.

    Args:
      h_size: width of the hidden layer.
      l_rate: learning rate handed to the Adam optimizer.
    """
    with tf.device('/device:GPU:0'):
      # Only the input placeholder and the weights live inside the named scope.
      with tf.variable_scope(self.net_name):
        self._x = tf.placeholder(
            dtype=tf.float32, shape=[None, self.input_size], name="input_x")

        hidden_weights = tf.get_variable(
            'W1', shape=[self.input_size, h_size],
            initializer=tf.contrib.layers.xavier_initializer())
        hidden = tf.nn.tanh(tf.matmul(self._x, hidden_weights))

        output_weights = tf.get_variable(
            'W2', shape=[h_size, self.output_size],
            initializer=tf.contrib.layers.xavier_initializer())
        self._Qpred = tf.matmul(hidden, output_weights)

      # Target Q-values fed in during training.
      self._y = tf.placeholder(dtype=tf.float32, shape=[None, self.output_size])

      # Mean squared error between targets and predictions.
      squared_error = tf.square(self._y - self._Qpred)
      self._loss = tf.reduce_mean(squared_error)

      optimizer = tf.train.AdamOptimizer(learning_rate=l_rate, epsilon=1e-5)
      self._train = optimizer.minimize(self._loss)

  def predict(self, state):
    """Return the session's Q-value output for `state`.

    `state` is reshaped to rows of `input_size` features before being fed.
    """
    batch = np.reshape(state, [-1, self.input_size])
    feed = {self._x: batch}
    return self.sess.run(self._Qpred, feed_dict=feed)

  def update(self, x_stack, y_stack):
    """Run one training step; returns the session result for [loss, train op]."""
    fetches = [self._loss, self._train]
    feed = {self._x: x_stack, self._y: y_stack}
    return self.sess.run(fetches, feed_dict=feed)

In [0]:
def simple_replay_train(DQN, train_batch, discount=None):
  """Fit the network on one minibatch of replayed experiences.

  For every experience the target for the action that was taken is
  `reward + discount * max_a Q(next_state, a)`, with the bootstrap term
  dropped when the experience is terminal. Targets for the other actions are
  the network's own current predictions, so they contribute no error.

  Args:
    DQN: network object exposing `predict(states)` and `update(x, y)`.
      (The parameter name shadows the `DQN` class inside this function.)
    train_batch: iterable of `(state, action, reward, next_state, done)`.
    discount: discount factor; when None the module-level `dis` is used.

  Returns:
    Whatever `DQN.update` returns for the batch.
  """
  if discount is None:
    discount = dis

  states = np.vstack([exp[0] for exp in train_batch])
  actions = np.array([exp[1] for exp in train_batch])
  rewards = np.array([exp[2] for exp in train_batch])
  next_states = np.vstack([exp[3] for exp in train_batch])

  # Build the mask from explicit booleans: `~` on an integer array is a
  # bitwise NOT (0 -> -1, 1 -> -2) and would silently corrupt the targets
  # if the done flags ever arrive as 0/1 instead of bool.
  terminal = np.array([exp[4] for exp in train_batch], dtype=bool)
  not_terminal = np.logical_not(terminal)

  best_next_q = np.max(DQN.predict(next_states), axis=1)
  q_target = rewards + discount * best_next_q * not_terminal

  # Start from the current predictions and overwrite only the taken actions.
  y = DQN.predict(states)
  y[np.arange(len(states)), actions] = q_target

  # Train our network using target and predicted Q values
  return DQN.update(states, y)

In [0]:
def bot_play(mainDQN, environment=None):
  """Play one rendered episode greedily and print the total reward.

  Args:
    mainDQN: object exposing `predict(state)`; the argmax of its output is
      used as the action at every step.
    environment: object exposing `reset()`, `render()` and `step(action)`.
      When None the module-level `env` is used.
  """
  if environment is None:
    environment = env

  # Keep the observation in a single variable: previously the reset value was
  # stored under one name while the loop read another, unassigned, name.
  state = environment.reset()
  reward_sum = 0

  while True:
    environment.render()
    action = np.argmax(mainDQN.predict(state))

    state, reward, done, _ = environment.step(action)
    reward_sum += reward
    if done:
      print("Total score: {}".format(reward_sum))
      break

In [0]:
env = gym.make('CartPole-v1')

# Raise the per-episode step cap. gym's TimeLimit wrapper reads
# `_max_episode_steps` (plural); assigning the singular spelling only creates
# an unused attribute and leaves the registered cap (500 for CartPole-v1) in
# force, so the `step_cnt > 10000` checks in main() could never trigger.
env._max_episode_steps = 10001
# Misspelled alias retained because a later cell still displays it.
env._max_episode_step = 10001
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

dis = 0.99              # discount factor used by simple_replay_train
REPLAY_MEMORY = 50000   # maximum number of experiences kept in the replay deque
BATCH_SIZE = 64         # NOTE(review): not referenced by any visible cell

In [0]:
def main():
  """Train a DQN on the module-level `env`, then replay one greedy episode.

  Each episode is played with an epsilon-greedy policy and every transition
  is pushed to a bounded replay buffer. On episodes 11, 21, 31, ... the
  network is trained for 50 rounds on random minibatches drawn from the
  buffer. Training stops after `max_episodes` episodes or as soon as an
  episode lasts more than 10000 steps.
  """
  max_episodes = 5000
  # NOTE(review): BATCH_SIZE (64) is defined in the setup cell but was never
  # used here; the literal 100 is kept to preserve training behaviour.
  train_batch_size = 100

  # Store the previous observations in replay memory
  replay_buffer = deque(maxlen=REPLAY_MEMORY)

  # Soft placement lets ops pinned to a GPU fall back to the CPU when no GPU
  # is available instead of failing at graph execution.
  session_config = tf.ConfigProto(allow_soft_placement=True)
  with tf.Session(config=session_config) as sess:
    dqn = DQN(sess, input_size, output_size)
    tf.global_variables_initializer().run()

    for episode in range(max_episodes):
      # Exploration rate, decaying with the episode index.
      # NOTE(review): `(episode + 10) + 1` may be a typo for
      # `(episode / 10) + 1`; left unchanged — confirm the intended schedule.
      e = 1. / ((episode + 10) + 1)
      done = False
      step_cnt = 0

      state = env.reset()

      while not done:
        if np.random.rand(1) < e:
          action = env.action_space.sample()
        else:
          # Choose an action greedily from the Q-net
          action = np.argmax(dqn.predict(state))

        # Get new state and reward from environment
        next_state, reward, done, _ = env.step(action)
        if done:
          # Penalise the transition that ended the episode.
          reward = -100

        # Save the experience to the buffer
        replay_buffer.append((state, action, reward, next_state, done))

        state = next_state
        step_cnt += 1
        if step_cnt > 10000:
          break

      print("Episode: {}    step: {}".format(episode, step_cnt))
      if step_cnt > 10000:
        break

      if episode % 10 == 1 and episode != 1:
        # Snapshot the deque once: same sampling result, cheaper indexing.
        experiences = list(replay_buffer)
        # Early episodes can be short enough that fewer than
        # `train_batch_size` experiences exist; rd.sample would raise then.
        sample_size = min(len(experiences), train_batch_size)
        for _ in range(50):
          # Minibatch works better
          minibatch = rd.sample(experiences, sample_size)
          loss, _ = simple_replay_train(dqn, minibatch)
        print("Loss: ", loss)

    # Must run inside the `with` block: the session is closed on exit and the
    # network can no longer be evaluated afterwards.
    bot_play(dqn)

In [7]:
# Display the value stored on `env` under this attribute name by the setup cell.
# NOTE(review): gym's TimeLimit wrapper appears to read `_max_episode_steps`
# (plural) — confirm that this attribute actually controls the episode cap.
env._max_episode_step

10001

In [8]:
# Entry point: start training when this module is the top-level script
# (in a notebook kernel `__name__` is "__main__", so the cell runs main()).
if __name__ == "__main__":
  main()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Episode: 0    step: 10
Episode: 1    step: 9
Episode: 2    step: 10
Episode: 3    step: 10
Episode: 4    step: 9
Episode: 5    step: 8
Episode: 6    step: 8
Episode: 7    step: 8
Episode: 8    step: 10
Episode: 9    step: 9
Episode: 10    step: 8
Episode: 11    step: 10
Loss:  553.3398
Episode: 12    step: 10
Episode: 13    step: 10
Episode: 14    step: 9
Episode: 15    step: 10
Episode: 16    step: 10
Episode: 17    step: 10
Episode: 18    step: 10
Episode: 19    step: 10
Episode: 20    step: 9
Episode: 21    step: 9
Loss:  493.44067
Episode: 22    step: 49
Episode: 23    step: 44
Episode: 24    step: 59
Episode: 25    step: 81
Episode: 26    step: 47
Episode: 27    step: 61
Episode

KeyboardInterrupt: ignored