In [1]:
import gym
import tensorflow as tf
import numpy as np
import random

In [2]:
class Q_nn():
    
    def __init__(self, scope = 'estimator'):
        self.scope = scope
        with tf.variable_scope(scope):
            self.__build_graph()
            

    def __build_graph(self):
        # state matrix
        self.X_states = tf.placeholder(shape = [None, 4], dtype = tf.float32)
        
        # target values (R+maxQ)
        self.Q_targets = tf.placeholder(shape = [None], dtype = tf.float32)
            
        # action as an index
        self.actions = tf.placeholder(shape = [None], dtype = tf.int32)
        
        self.batch_size = self.X_states.shape[0]
        self.action_size = 2
        self.action_one_hots = tf.one_hot(self.actions,self.action_size,axis = -1)
        
        self.W_fc1 = tf.Variable(tf.truncated_normal([4,32],0.1))
        self.b_fc1 = tf.Variable(tf.constant(0.1,shape=[32]))
        self.fc1 = tf.matmul(self.X_states,self.W_fc1) + self.b_fc1
        
        self.relu1 = tf.nn.relu(self.fc1)
        
        self.W_fc2 = tf.Variable(tf.truncated_normal([32,2],0.1))
        self.b_fc2 = tf.Variable(tf.constant(0.1,shape=[2]))
        
        self.Q_est = tf.matmul(self.relu1,self.W_fc2) + self.b_fc2
        
        
        
        self.action_Qs = tf.reduce_sum(tf.multiply(self.Q_est,self.action_one_hots),-1)
               
        self.loss = tf.losses.mean_squared_error(self.Q_targets, self.action_Qs)
        
        self.optimizer = tf.train.GradientDescentOptimizer(1e-3)
        self.train_op = self.optimizer.minimize(self.loss)
        

    def predict(self, sess, state):
        return sess.run(self.Q_est, { self.X_states: state })
        
        
    def update(self, sess, states, actions, targets):
        feed_dict = { self.X_states: states, self.Q_targets: targets, self.actions: actions}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

    
class NetworkCopier():
    def __init__(self, estimator,target):
        est_vars = [variable for variable in tf.trainable_variables() if variable.name.startswith(estimator.scope)]
        est_vars = sorted(est_vars, key = lambda x: x.name)

        tar_vars = [variable for variable in tf.trainable_variables() if variable.name.startswith(target.scope)]
        tar_vars = sorted(tar_vars, key = lambda x: x.name)

        self.update_ops = [tar_var.assign(est_var) for est_var,tar_var in zip(est_vars,tar_vars)]
    
    
    def copy_and_freeze(self,sess): 
        sess.run(self.update_ops)
        
        
class ReplayBuffer():
    def __init__(self, max_size = 10000):
        self.buffer = []
        self.max_size = max_size
        
        
    def add_new(self, state, action, reward, next_state):
        if len(self.buffer) >= self.max_size:
            _ = self.buffer.pop()
        entry = (state,action,reward,next_state)
        self.buffer.append(entry)
        
        
    def batch(self, n = 100):
        if len(self.buffer) < n:
            return self.buffer
        else:
            return random.sample(self.buffer, n)

In [31]:
episodes = 10000
explore_steps = 250 # how many episodes to do pure exploration
iterations = 10 # per episode
max_steps = 1000 # per episode
batch_size = 100
discount = 0.9


tf.reset_default_graph()

Q_estimator = Q_nn(scope = 'estimator')
Q_target = Q_nn(scope = 'target')
Freezer = NetworkCopier(Q_estimator,Q_target)
Buffer = ReplayBuffer(2500)

sess = tf.Session()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
env = gym.make('CartPole-v0')
print('Graph Set')

[2017-12-28 00:34:56,226] Making new env: CartPole-v0


Graph Set


In [32]:
%%time
for e in range(episodes):
    cumulative_loss = 0
    if e < explore_steps: # pure exploration
        epsilon = 1
    else: # epsilon-greedy exploration/exploitation
        epsilon = max(np.exp((e-explore_steps)/-50),0.1)
    observation = env.reset()
    total_reward = 0
    
    for step in range(max_steps):
        curr_state = observation
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            curr_state_reshape = np.reshape(curr_state,(1,4))
            q_est = Q_estimator.predict(sess,curr_state_reshape)
            action = np.argmax(q_est)
        
        observation, reward, done, info = env.step(action)
        next_state = observation
        Buffer.add_new(curr_state, action, reward, next_state)
        total_reward += reward
        if done:
            break
    for i in range(iterations):
        states, actions, rewards, next_states = zip(*Buffer.batch(batch_size))
        targets = []
        for reward, next_state in zip(rewards,next_states):
            next_state_reshape = np.reshape(next_state, (1,4))
            q_max = np.max(Q_target.predict(sess,next_state_reshape))
            target = reward + discount*q_max
            targets.append(target)
        state_array = np.stack(states,axis=0)
        action_array = np.array(actions)
        target_array = np.array(targets)
        loss = Q_estimator.update(sess, state_array,action_array,target_array)
        cumulative_loss += loss
    if e%25 == 0:
        Freezer.copy_and_freeze(sess)
        print('Episode {}, loss: {}, total steps = {}'.format(e,cumulative_loss/iterations,total_reward))

Episode 0, loss: 33.80260791778564, total steps = 18.0
Episode 25, loss: 0.5409657686948777, total steps = 22.0
Episode 50, loss: 0.34227833300828936, total steps = 16.0
Episode 75, loss: 0.21838380992412568, total steps = 13.0
Episode 100, loss: 0.16771242171525955, total steps = 20.0
Episode 125, loss: 0.141452556848526, total steps = 22.0
Episode 150, loss: 0.11481173038482666, total steps = 34.0
Episode 175, loss: 0.12878579497337342, total steps = 12.0
Episode 200, loss: 0.10050932168960572, total steps = 12.0
Episode 225, loss: 0.10115797594189643, total steps = 27.0
Episode 250, loss: 0.10691907331347465, total steps = 38.0
Episode 275, loss: 0.09224972799420357, total steps = 14.0
Episode 300, loss: 0.09253381937742233, total steps = 9.0
Episode 325, loss: 0.08250379301607609, total steps = 10.0
Episode 350, loss: 0.07631386779248714, total steps = 8.0
Episode 375, loss: 0.07134317383170127, total steps = 9.0
Episode 400, loss: 0.08036250546574593, total steps = 10.0
Episode 42

Episode 3475, loss: 0.030641797184944152, total steps = 10.0
Episode 3500, loss: 0.02973885778337717, total steps = 10.0
Episode 3525, loss: 0.033653955720365045, total steps = 9.0
Episode 3550, loss: 0.029173130728304388, total steps = 8.0
Episode 3575, loss: 0.029712295904755593, total steps = 9.0
Episode 3600, loss: 0.02933363225311041, total steps = 10.0
Episode 3625, loss: 0.0288596298545599, total steps = 11.0
Episode 3650, loss: 0.02986489497125149, total steps = 9.0
Episode 3675, loss: 0.028802960738539697, total steps = 10.0
Episode 3700, loss: 0.027466177381575106, total steps = 9.0
Episode 3725, loss: 0.0286336675286293, total steps = 9.0
Episode 3750, loss: 0.02598653994500637, total steps = 8.0
Episode 3775, loss: 0.028859657421708106, total steps = 12.0
Episode 3800, loss: 0.0259414903819561, total steps = 9.0
Episode 3825, loss: 0.025734616629779338, total steps = 9.0
Episode 3850, loss: 0.029600246250629424, total steps = 10.0
Episode 3875, loss: 0.028010027296841145, t

Episode 6900, loss: 0.011798636056482792, total steps = 9.0
Episode 6925, loss: 0.01101508061401546, total steps = 11.0
Episode 6950, loss: 0.011285526398569345, total steps = 10.0
Episode 6975, loss: 0.011519200354814529, total steps = 8.0
Episode 7000, loss: 0.010351148946210743, total steps = 11.0
Episode 7025, loss: 0.009652772545814514, total steps = 11.0
Episode 7050, loss: 0.010334680788218975, total steps = 10.0
Episode 7075, loss: 0.009628517972305416, total steps = 9.0
Episode 7100, loss: 0.010258476622402668, total steps = 9.0
Episode 7125, loss: 0.012495662551373243, total steps = 9.0
Episode 7150, loss: 0.011182739911600948, total steps = 11.0
Episode 7175, loss: 0.010953758843243122, total steps = 9.0
Episode 7200, loss: 0.009741020668298007, total steps = 10.0
Episode 7225, loss: 0.01278283940628171, total steps = 10.0
Episode 7250, loss: 0.009994249511510133, total steps = 10.0
Episode 7275, loss: 0.00949524063616991, total steps = 9.0
Episode 7300, loss: 0.009801310906

In [33]:
%%time
random_results = []
agent_results = []
for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_steps += 1
    random_results.append(total_steps)    

for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        state_reshape = np.reshape(observation,(1,4))
        q_est = Q_estimator.predict(sess,state_reshape)
        action = np.argmax(q_est)
        observation, reward, done, info = env.step(action)
        total_steps += 1
    agent_results.append(total_steps)
print('Luckiest of 100 random agents survives {} time steps'.format(max(random_results)))
print('  Average random agent survived {} time steps'.format(np.mean(random_results)))
print('Best learned agent survives {} time steps'.format(max(agent_results)))
print('  Average learned agent survives {} time steps'.format(np.mean(agent_results)))

Luckiest of 100 random agents survives 83 time steps
  Average random agent survived 22.56 time steps
Best learned agent survives 11 time steps
  Average learned agent survives 9.45 time steps
Wall time: 977 ms
