In [1]:
import gym
import tensorflow as tf
import numpy as np
import random

In [2]:
class Q_nn():
    
    def __init__(self, scope = 'estimator'):
        self.scope = scope
        with tf.variable_scope(scope):
            self.__build_graph()
            

    def __build_graph(self):
        # state matrix
        self.X_states = tf.placeholder(shape = [None, 4], dtype = tf.float32)
        
        # target values (R+maxQ)
        self.Q_targets = tf.placeholder(shape = [None], dtype = tf.float32)
            
        # action as an index
        self.actions = tf.placeholder(shape = [None], dtype = tf.int32)
        
        self.batch_size = self.X_states.shape[0]
        self.action_size = 2
        self.action_one_hots = tf.one_hot(self.actions,self.action_size,axis = -1)
        
        self.W_fc1 = tf.Variable(tf.truncated_normal([4,32],0.1))
        self.b_fc1 = tf.Variable(tf.constant(0.1,shape=[32]))
        self.fc1 = tf.matmul(self.X_states,self.W_fc1) + self.b_fc1
        
        self.relu1 = tf.nn.relu(self.fc1)
        
        self.W_fc2 = tf.Variable(tf.truncated_normal([32,2],0.1))
        self.b_fc2 = tf.Variable(tf.constant(0.1,shape=[2]))
        
        self.Q_est = tf.matmul(self.relu1,self.W_fc2) + self.b_fc2
        
        
        
        self.action_Qs = tf.reduce_sum(tf.multiply(self.Q_est,self.action_one_hots),-1)
               
        self.loss = tf.losses.mean_squared_error(self.Q_targets, self.action_Qs)
        
        self.optimizer = tf.train.GradientDescentOptimizer(1e-3)
        self.train_op = self.optimizer.minimize(self.loss)
        

    def predict(self, sess, state):
        return sess.run(self.Q_est, { self.X_states: state })
        
        
    def update(self, sess, states, actions, targets):
        feed_dict = { self.X_states: states, self.Q_targets: targets, self.actions: actions}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

    
class NetworkCopier():
    def __init__(self, estimator,target):
        est_vars = [variable for variable in tf.trainable_variables() if variable.name.startswith(estimator.scope)]
        est_vars = sorted(est_vars, key = lambda x: x.name)

        tar_vars = [variable for variable in tf.trainable_variables() if variable.name.startswith(target.scope)]
        tar_vars = sorted(tar_vars, key = lambda x: x.name)

        self.update_ops = [tar_var.assign(est_var) for est_var,tar_var in zip(est_vars,tar_vars)]
    
    
    def copy_and_freeze(self,sess): 
        sess.run(self.update_ops)
        
        
class ReplayBuffer():
    def __init__(self, max_size = 10000):
        self.buffer = []
        self.max_size = max_size
        
        
    def add_new(self, state, action, reward, next_state):
        if len(self.buffer) >= self.max_size:
            _ = self.buffer.pop()
        entry = (state,action,reward,next_state)
        self.buffer.append(entry)
        
        
    def batch(self, n = 100):
        if len(self.buffer) < n:
            return self.buffer
        else:
            return random.sample(self.buffer, n)

In [3]:
episodes = 1000
explore_steps = int(episodes*0.25) # how many episodes to do pure exploration
iterations = 10 # per episode
max_steps = 1000 # per episode
batch_size = 100
discount = 0.99


tf.reset_default_graph()

Q_estimator = Q_nn(scope = 'estimator')
Q_target = Q_nn(scope = 'target')
Freezer = NetworkCopier(Q_estimator,Q_target)
Buffer = ReplayBuffer(2500)

sess = tf.Session()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
env = gym.make('CartPole-v0')
print('Graph Set')

[2017-12-27 13:30:56,521] Making new env: CartPole-v0


Graph Set


In [4]:
%%time
for e in range(episodes):
    cumulative_loss = 0
    if e < explore_steps: # pure exploration
        epsilon = 1
    else: # epsilon-greedy exploration/exploitation
        epsilon = max(np.exp((e-explore_steps)/-50),0.1)
    observation = env.reset()
    total_reward = 0
    
    for step in range(max_steps):
        curr_state = observation
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            curr_state_reshape = np.reshape(curr_state,(1,4))
            q_est = Q_estimator.predict(sess,curr_state_reshape)
            action = np.argmin(q_est)
        
        observation, reward, done, info = env.step(action)
        next_state = observation
        Buffer.add_new(curr_state, action, reward, next_state)
        total_reward += reward
        if done:
            break
    for i in range(iterations):
        states, actions, rewards, next_states = zip(*Buffer.batch(batch_size))
        targets = []
        for reward, next_state in zip(rewards,next_states):
            next_state_reshape = np.reshape(next_state, (1,4))
            q_max = np.max(Q_target.predict(sess,next_state_reshape))
            target = reward + discount*q_max
            targets.append(target)
        state_array = np.stack(states,axis=0)
        action_array = np.array(actions)
        target_array = np.array(targets)
        loss = Q_estimator.update(sess, state_array,action_array,target_array)
        cumulative_loss += loss
    if e%25 == 0:
        Freezer.copy_and_freeze(sess)
        print('Episode {}, loss: {}, total steps = {}'.format(e,cumulative_loss/iterations,total_reward))

Episode 0, loss: 19.87629795074463, total steps = 12.0
Episode 25, loss: 0.5317947566509247, total steps = 57.0
Episode 50, loss: 0.3819993257522583, total steps = 9.0
Episode 75, loss: 0.7842229396104813, total steps = 35.0
Episode 100, loss: 0.4830306738615036, total steps = 19.0
Episode 125, loss: 0.44875511825084685, total steps = 28.0
Episode 150, loss: 0.47208432853221893, total steps = 16.0
Episode 175, loss: 0.4970659613609314, total steps = 24.0
Episode 200, loss: 0.5723933339118957, total steps = 13.0
Episode 225, loss: 0.6282437562942504, total steps = 28.0
Episode 250, loss: 0.6839518189430237, total steps = 35.0
Episode 275, loss: 0.7458848416805267, total steps = 18.0
Episode 300, loss: 1.0102652251720428, total steps = 20.0
Episode 325, loss: 1.1592188477516174, total steps = 19.0
Episode 350, loss: 1.6888458967208861, total steps = 23.0
Episode 375, loss: 1.8304792046546936, total steps = 30.0
Episode 400, loss: 3.0613433837890627, total steps = 21.0
Episode 425, loss: 

In [7]:
%%time
random_results = []
agent_results = []
for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_steps += 1
    random_results.append(total_steps)    

for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        state_reshape = np.reshape(observation,(1,4))
        q_est = Q_estimator.predict(sess,state_reshape)
        action = np.argmin(q_est)
        observation, reward, done, info = env.step(action)
        total_steps += 1
    agent_results.append(total_steps)
print('Luckiest of 100 random agents survives {} time steps'.format(max(random_results)))
print('  Average random agent survived {} time steps'.format(np.mean(random_results)))
print('Best learned agent survives {} time steps'.format(max(agent_results)))
print('  Average learned agent survives {} time steps'.format(np.mean(agent_results)))

Luckiest of 100 random agents survives 55 time steps
  Average random agent survived 20.76 time steps
Best learned agent survives 140 time steps
  Average learned agent survives 120.71 time steps
Wall time: 20.2 s
