In [1]:
%matplotlib inline
import random
from time import sleep

import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

env = gym.make('CartPole-v0')


In [2]:
nepisode = 200

In [3]:
class Network:
    def __init__(self, learning_rate=0.01):
#         with tf.variable_scope('network'):
        self.input_state = tf.placeholder(shape=[None, 4], dtype=tf.float32)
        self.weight_1 = tf.Variable(tf.random_uniform([4, 10], dtype=tf.float32))
        self.bias_1 =  tf.Variable(tf.random_uniform([1, 10], dtype=tf.float32))
        self.hidden_1 = tf.nn.tanh(tf.add(tf.matmul(self.input_state, self.weight_1), self.bias_1))

        self.weight_2 = tf.Variable(tf.random_uniform([10, 2], dtype=tf.float32))
        self.bias_2 =  tf.Variable(tf.random_uniform([1, 2], dtype=tf.float32))
        self.pred_Q = tf.add(tf.matmul(self.hidden_1, self.weight_2), self.bias_2)
        self.select_action = tf.argmax(self.pred_Q, 1)

        self.expect_Q = tf.placeholder(shape=[None, 2], dtype=tf.float32)
#         self.error = tf.reduce_mean(tf.square(self.expect_Q - self.pred_Q))
        self.loss = tf.losses.mean_squared_error(self.expect_Q, self.pred_Q)
        self.trainer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)
    
    def predict(self, sess, state):
        select_action, pred_Q = sess.run([self.select_action, self.pred_Q], \
                                         feed_dict={self.input_state:state})
        return select_action, pred_Q
    
    def update(self, sess, X, Y):
        loss, _ = sess.run([self.loss, self.trainer], feed_dict={self.input_state:X, self.expect_Q:Y})
        return loss
        
class Agent:
    def __init__(self, session, beta=0.9, discount=0.3, batch_iter=32, batch_size=16, max_memory=1000):
        self.sess = session
        self.net = Network()
        self.beta = beta
        self.experience_memory = list()
        self.max_memory = max_memory
        self.batch_size = batch_size
        self.batch_iter = batch_iter
        self.discount = discount
        
    # state, action, reward, next_state, done
    def memory(self, pack):
        if len(self.experience_memory) > self.max_memory:
            self.experience_memory.pop(0)
        self.experience_memory.append( pack )
    
    def predict(self, state):
        is_random = False
        select_action, pred_Q = self.net.predict(self.sess, state)
        if np.random.rand(1) < self.beta:
            is_random = True
            select_action = [env.action_space.sample()]
        return select_action, pred_Q, is_random
    
    
    def replay_experience(self):
        losses = list()
        for batchiter in range(self.batch_iter):
            X = list()
            Y = list()
            batch = random.sample(self.experience_memory, \
                min(len(self.experience_memory), self.batch_size))

            for state, action, reward, next_state, done in batch:
                pq = self.predict([state])[1]
                if done:
                    pq[0, action] = reward
                else:
                    pq[0, action] = reward + self.discount * np.max(self.predict([next_state])[1])
                X.append(state)
                Y.append(pq[0])
            loss = self.net.update(sess, np.array(X), np.array(Y))
            losses.append( loss )
        return np.mean(losses)

        

In [4]:

steps = []
memory = []

with tf.Session() as sess:
    agent = Agent(sess)    
    sess.run(tf.global_variables_initializer())
    
    for episode in range(nepisode+1):
        done = False
        step = 0
        state = env.reset()
        while not done:
            action, pred_Q, _ = agent.predict([state])
            next_state, reward, done, _ = env.step(action[0])
            
            # Big Penalty to Agent
            if done: 
                reward = -100

            agent.memory( (state, action, reward, next_state, done) )
            
            step += 1
            state = next_state
        steps.append(step)
        agent.beta = agent.beta*0.7    
        
        if episode%10 == 0 and episode != 0:
            loss = agent.replay_experience()
            print("in Episode {:4d} mean step: {:.2f}, Agent Replay Experience, get loss: {:.2f} "\
                      .format(episode, np.mean(steps[episode-10:episode+1]), loss))

    
    step = 0
    done = False
    state = env.reset()
    while not done:
        action, Q, _ = agent.predict([state])
        state, reward, done, _ = env.step(action[0])
        step += 1
        env.render()
        sleep(0.1)
    print('Bot play game and archive {} step'.format(step))
    env.close()
    

in Episode   10 mean step: 10.27, Agent Replay Experience, get loss: 513.46 
in Episode   20 mean step: 182.64, Agent Replay Experience, get loss: 30.24 
in Episode   30 mean step: 200.00, Agent Replay Experience, get loss: 20.33 
in Episode   40 mean step: 169.27, Agent Replay Experience, get loss: 40.64 
in Episode   50 mean step: 46.73, Agent Replay Experience, get loss: 81.38 
in Episode   60 mean step: 11.82, Agent Replay Experience, get loss: 91.03 
in Episode   70 mean step: 9.55, Agent Replay Experience, get loss: 151.38 
in Episode   80 mean step: 9.82, Agent Replay Experience, get loss: 176.87 
in Episode   90 mean step: 16.45, Agent Replay Experience, get loss: 202.30 
in Episode  100 mean step: 17.09, Agent Replay Experience, get loss: 273.75 
in Episode  110 mean step: 16.82, Agent Replay Experience, get loss: 304.31 
in Episode  120 mean step: 18.91, Agent Replay Experience, get loss: 254.69 
in Episode  130 mean step: 11.18, Agent Replay Experience, get loss: 405.91 
in 