In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque

import gym

In [2]:
# Shared CartPole environment; bot_play() and main() read this global directly.
env = gym.make('CartPole-v0')
# NOTE(review): writes a private gym attribute. Presumably this raises the
# per-episode step cap above the 10000-step stop condition used in main() -- confirm
# against the installed gym version.
env._max_episode_steps = 15000
# Network dimensions are taken from the environment's observation/action spaces.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Discount factor applied to the next-state value in simple_replay_train().
dis = 0.99
# Maximum number of transitions main() keeps in its replay buffer.
REPLAY_MEMORY = 50000


In [3]:
class DQN :
    """Two-layer Q-value network built as a TensorFlow 1.x graph.

    The network maps an observation of length ``input_size`` to one Q-value per
    action (``output_size`` values) through a single tanh hidden layer. Neither
    layer has a bias term. The graph is built once, in the constructor.

    Attributes:
        session: the ``tf.Session`` used for every ``run`` call.
        input_size: length of one observation vector.
        output_size: number of discrete actions.
        net_name: variable scope under which the weights are created.
    """

    def __init__(self , session, input_size, output_size , name ='main'):
        """Store the configuration and build the graph.

        Args:
            session: TensorFlow session that will execute the graph.
            input_size: length of one observation vector.
            output_size: number of discrete actions.
            name: variable scope name for this network's weights.
        """
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name

        self._build_network()

    def _build_network(self, h_size=10, l_rate = 1e-1) :
        """Create placeholders, weights, the loss and the training op.

        Args:
            h_size: number of units in the hidden layer.
            l_rate: learning rate passed to the Adam optimizer.
        """
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name ='input_x')

            # Glorot/Xavier uniform initialisation from core TensorFlow; this avoids
            # the deprecated tf.contrib.layers.xavier_initializer.
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.glorot_uniform_initializer())
            layer1 = tf.nn.tanh(tf.matmul(self._X, W1))

            W2 = tf.get_variable("W2", shape=[h_size, self.output_size],
                                 initializer=tf.glorot_uniform_initializer())

            # Linear output layer: one predicted Q-value per action.
            self._Qpred = tf.matmul(layer1, W2)

        # Target Q-values, one row per sample in the batch.
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)

        # Mean squared error between target and predicted Q-values.
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))

        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)

    def predict(self, state) :
        """Return the predicted Q-values for one state or a batch of states.

        Args:
            state: array-like holding a whole number of observations; it is
                reshaped to ``(-1, input_size)``, so a single observation yields
                one row and a stacked batch yields one row per observation.

        Returns:
            The value of the Q-prediction tensor, one row per input row.
        """
        x = np.reshape(state, [-1, self.input_size])
        return self.session.run(self._Qpred , feed_dict={self._X : x})

    def update(self, x_stack, y_stack) :
        """Run one optimizer step on a batch.

        Args:
            x_stack: batch of observations, one row each.
            y_stack: matching batch of target Q-values, one row each.

        Returns:
            The result of running ``[loss, train_op]``; the first element is the
            loss value for this batch.
        """
        return self.session.run([self._loss , self._train] , feed_dict={self._X : x_stack, self._Y : y_stack})
    
    

In [None]:
def simple_replay_train(DQN , train_batch, discount=None) :
    """Build Q-learning targets for a minibatch and run one training step.

    For every transition the network's current prediction for ``state`` is used
    as the target row, with only the entry for the taken ``action`` overwritten:
    ``reward`` for a terminal transition, otherwise
    ``reward + discount * max(Q(next_state))``.

    Args:
        DQN: a network object exposing ``input_size``, ``output_size``,
            ``predict(state)`` and ``update(x_stack, y_stack)``. Note that the
            parameter name shadows the ``DQN`` class inside this function.
        train_batch: iterable of ``(state, action, reward, next_state, done)``.
        discount: discount factor for the next-state value; when ``None`` the
            module-level ``dis`` is used.

    Returns:
        Whatever ``DQN.update`` returns for the assembled batch.
    """
    gamma = dis if discount is None else discount

    states = []
    targets = []
    for state, action, reward, next_state, done in train_batch :
        Q = DQN.predict(state)

        if done :
            # Terminal transition: there is no future value to bootstrap from.
            Q[0, action] = reward
        else :
            Q[0, action] = reward + gamma * np.max(DQN.predict(next_state))

        states.append(state)
        targets.append(Q)

    # Stack once at the end instead of re-copying the arrays on every iteration.
    # The leading empty (0, n) array keeps the float64 dtype and the (0, n) shape
    # for an empty batch.
    x_stack = np.vstack([np.empty((0, DQN.input_size))] + states)
    y_stack = np.vstack([np.empty((0, DQN.output_size))] + targets)

    return DQN.update(x_stack, y_stack)
            

In [None]:
def bot_play(mainDQN) :
    """Play one rendered episode on the global ``env`` with the greedy policy.

    At every step the action with the highest predicted Q-value is taken. When
    the environment reports ``done`` the accumulated reward is printed.

    Args:
        mainDQN: network object whose ``predict(state)`` returns Q-values.
    """
    observation = env.reset()
    total_reward = 0
    finished = False
    while not finished :
        env.render()
        greedy_action = np.argmax(mainDQN.predict(observation))
        observation, step_reward, finished, _ = env.step(greedy_action)
        total_reward += step_reward
    print("Total score : {}".format(total_reward))
        

In [None]:
def main() :
    """Train a DQN on the global ``env`` and then replay the greedy policy once.

    Each episode is played with an epsilon-greedy policy whose epsilon decays as
    ``1 / (episode / 10 + 1)``. Every transition is stored in a bounded replay
    buffer; a terminal step has its reward replaced by -100. On episodes where
    ``episode % 10 == 1`` the network is trained on 50 random minibatches drawn
    from the buffer. Training stops early as soon as one episode lasts more than
    10000 steps, after which ``bot_play`` is called with the trained network.
    """
    max_episodes = 5000
    max_steps = 10000       # an episode longer than this ends training
    train_interval = 10     # replay training runs when episode % train_interval == 1
    train_iterations = 50   # minibatches per training round
    batch_size = 10         # transitions per minibatch

    # maxlen makes the deque discard its oldest transition automatically when full.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess :
        mainDQN = DQN(sess, input_size , output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episodes) :
            # Exploration rate: 1.0 on the first episode, decaying afterwards.
            e = 1. / ((episode / 10 ) + 1 )
            done = False
            step_count = 0
            state = env.reset()

            while not done :
                if np.random.rand(1) < e :
                    action = env.action_space.sample()
                else :
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)

                if done :
                    # Penalise the step that ended the episode.
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                if step_count > max_steps :
                    break

            print("Episode : {}  steps : {}".format(episode, step_count))
            if step_count > max_steps :
                # Long enough to count as solved: stop training.
                break

            if episode % train_interval == 1:
                # Never ask for more samples than the buffer holds; random.sample
                # would raise ValueError otherwise.
                sample_size = min(batch_size, len(replay_buffer))
                for _ in range(train_iterations) :
                    minibatch = random.sample(replay_buffer , sample_size)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print('Loss : ', loss)
        # Still inside the session context, so the trained weights are usable.
        bot_play(mainDQN)
                

In [None]:
# Start training (followed by the greedy replay in main()) only when this code is
# the entry point, not when it is imported as a module.
if __name__ == "__main__" :
    main()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Episode : 0  steps : 68
Episode : 1  steps : 50
Loss :  6.1584926
Episode : 2  steps : 12
Episode : 3  steps : 14
Episode : 4  steps : 17
Episode : 5  steps : 12
Episode : 6  steps : 15
Episode : 7  steps : 17
Episode : 8  steps : 9
Episode : 9  steps : 15
Episode : 10  steps : 9
Episode : 11  steps : 10
Loss :  1001.79285
Episode : 12  steps : 37
Episode : 13  steps : 41
Episode : 14  steps : 28
Episode : 15  steps : 62
Episode : 16  steps : 77
Episode : 17  steps : 69
Episode : 18  steps : 61
Episode : 19  steps : 42
Episode : 20  steps : 43
Episode : 21  steps : 26
Loss :  1.0665042
Episode : 22  steps : 9
Episode : 23  steps : 16
Episode : 24  steps : 9
Episode : 25  steps : 14
E

Episode : 278  steps : 1605
Episode : 279  steps : 182
Episode : 280  steps : 531
Episode : 281  steps : 72
Loss :  481.71527
Episode : 282  steps : 1560
Episode : 283  steps : 647
Episode : 284  steps : 57
Episode : 285  steps : 412
Episode : 286  steps : 752
Episode : 287  steps : 1107
Episode : 288  steps : 316
Episode : 289  steps : 124
Episode : 290  steps : 988
Episode : 291  steps : 84
Loss :  9.868605
Episode : 292  steps : 1068
Episode : 293  steps : 807
Episode : 294  steps : 115
Episode : 295  steps : 1152
Episode : 296  steps : 184
Episode : 297  steps : 95
Episode : 298  steps : 561
Episode : 299  steps : 89
Episode : 300  steps : 1812
Episode : 301  steps : 85
Loss :  2.2826207
Episode : 302  steps : 2017
Episode : 303  steps : 2603
Episode : 304  steps : 10001
