In [1]:
import gym
import tensorflow as tf
import numpy as np
import random

In [2]:
class Q_nn():
    
    def __init__(self, scope = 'estimator'):
        self.scope = scope
        with tf.variable_scope(scope):
            self.__build_graph()
            

    def __build_graph(self):
        # state matrix
        self.X_states = tf.placeholder(shape = [None, 4], dtype = tf.float32)
        
        # target values (R+maxQ)
        self.Q_targets = tf.placeholder(shape = [None], dtype = tf.float32)
            
        # action as an index
        self.actions = tf.placeholder(shape = [None], dtype = tf.int32)
        
        self.batch_size = self.X_states.shape[0]
        self.action_size = 2
        self.action_one_hots = tf.one_hot(self.actions,self.action_size,axis = -1)
        
        self.W_fc1 = tf.Variable(tf.truncated_normal([4,32],0.1))
        self.b_fc1 = tf.Variable(tf.constant(0.1,shape=[32]))
        self.fc1 = tf.matmul(self.X_states,self.W_fc1) + self.b_fc1
        
        self.relu1 = tf.nn.relu(self.fc1)
        
        self.W_fc2 = tf.Variable(tf.truncated_normal([32,2],0.1))
        self.b_fc2 = tf.Variable(tf.constant(0.1,shape=[2]))
        
        self.Q_est = tf.matmul(self.relu1,self.W_fc2) + self.b_fc2
        
        
        
        self.action_Qs = tf.reduce_sum(tf.multiply(self.Q_est,self.action_one_hots),-1)
               
        self.loss = tf.losses.mean_squared_error(self.Q_targets, self.action_Qs)
        
        self.optimizer = tf.train.GradientDescentOptimizer(1e-3)
        self.train_op = self.optimizer.minimize(self.loss)
        

    def predict(self, sess, state):
        return sess.run(self.Q_est, { self.X_states: state })
        
        
    def update(self, sess, states, actions, targets):
        feed_dict = { self.X_states: states, self.Q_targets: targets, self.actions: actions}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

    
class NetworkCopier():
    def __init__(self, estimator,target):
        est_params = [variable for variable in tf.trainable_variables() if variable.name.startswith(estimator.scope)]
        est_params = sorted(est_params, key = lambda x: x.name)

        tar_params = [variable for variable in tf.trainable_variables() if variable.name.startswith(target.scope)]
        tar_params = sorted(tar_params, key = lambda x: x.name)

        self.update_ops = [tar_var.assign(est_var) for est_var,tar_var in zip(est_params,tar_params)]
    
    
    def copy_and_freeze(self,sess): 
        sess.run(self.update_ops)
        
        
class ReplayBuffer():
    def __init__(self, max_size = 10000):
        self.buffer = []
        self.max_size = max_size
        
        
    def add_new(self, state, action, reward, next_state):
        if len(self.buffer) >= self.max_size:
            _ = self.buffer.pop()
        entry = (state,action,reward,next_state)
        self.buffer.append(entry)
        
        
    def batch(self, n = 100):
        if len(self.buffer) < n:
            return self.buffer
        else:
            return random.sample(self.buffer, n)

In [3]:
episodes = 1000
explore_steps = int(episodes*0.25) # how many episodes to do pure exploration
iterations = 10 # per episode
max_steps = 1000 # per episode
batch_size = 100
discount = 0.99


tf.reset_default_graph()

Q_estimator = Q_nn(scope = 'estimator')
Q_target = Q_nn(scope = 'target')
Freezer = NetworkCopier(Q_estimator,Q_target)
Buffer = ReplayBuffer(2500)

sess = tf.Session()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
env = gym.make('CartPole-v0')
print('Graph Set')

[2017-12-21 17:24:09,261] Making new env: CartPole-v0


Graph Set


In [4]:
%%time
for e in range(episodes):
    cumulative_loss = 0
    if e < explore_steps: # pure exploration
        epsilon = 1
    else: # epsilon-greedy exploration/exploitation
        epsilon = max(np.exp((e-explore_steps)/-50),0.1)
    observation = env.reset()
    total_reward = 0
    
    for step in range(max_steps):
        curr_state = observation
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            curr_state_reshape = np.reshape(curr_state,(1,4))
            q_est = Q_estimator.predict(sess,curr_state_reshape)
            action = np.argmin(q_est)
        
        observation, reward, done, info = env.step(action)
        next_state = observation
        Buffer.add_new(curr_state, action, reward, next_state)
        total_reward += reward
        if done:
            break
    for i in range(iterations):
        states, actions, rewards, next_states = zip(*Buffer.batch(batch_size))
        targets = []
        for reward, next_state in zip(rewards,next_states):
            next_state_reshape = np.reshape(next_state, (1,4))
            q_max = np.max(Q_target.predict(sess,next_state_reshape))
            target = reward + discount*q_max
            targets.append(target)
        state_array = np.stack(states,axis=0)
        action_array = np.array(actions)
        target_array = np.array(targets)
        loss = Q_estimator.update(sess, state_array,action_array,target_array)
        cumulative_loss += loss
    if e%25 == 0:
        Freezer.copy_and_freeze(sess)
        print('Episode {}, loss: {}, total steps = {}'.format(e,cumulative_loss/iterations,total_reward))

Episode 0, loss: 36.20223426818848, total steps = 13.0
Episode 25, loss: 0.5808361381292343, total steps = 75.0
Episode 50, loss: 0.31379234790802, total steps = 11.0
Episode 75, loss: 0.2676816999912262, total steps = 20.0
Episode 100, loss: 0.2788293197751045, total steps = 15.0
Episode 125, loss: 0.2887959197163582, total steps = 11.0
Episode 150, loss: 0.28190267384052276, total steps = 12.0
Episode 175, loss: 0.3077401012182236, total steps = 16.0
Episode 200, loss: 0.32511214911937714, total steps = 23.0
Episode 225, loss: 0.32575283050537107, total steps = 15.0
Episode 250, loss: 0.3532680481672287, total steps = 25.0
Episode 275, loss: 0.4300054043531418, total steps = 10.0
Episode 300, loss: 0.5083339989185334, total steps = 29.0
Episode 325, loss: 0.6105148643255234, total steps = 48.0
Episode 350, loss: 0.8126034796237945, total steps = 63.0
Episode 375, loss: 0.9641392529010773, total steps = 60.0
Episode 400, loss: 1.3265783131122588, total steps = 37.0
Episode 425, loss: 

In [26]:
%%time
random_results = []
agent_results = []
for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_steps += 1
    random_results.append(total_steps)    

for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        env.render()
        state_reshape = np.reshape(observation,(1,4))
        q_est = Q_estimator.predict(sess,state_reshape)
    #     print(q_est)
        action = np.argmin(q_est)
    #     print(action)
        observation, reward, done, info = env.step(action)
        total_steps += 1
    agent_results.append(total_steps)
env.render(close=True)
print('Luckiest of 100 random agents survives {} time steps'.format(max(random_results)))
print('Best learned agent survives {} time steps'.format(max(agent_results)))

AttributeError: 'NoneType' object has no attribute 'set_current'

In [25]:
env.reset()
for i in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        env.render()
        state_reshape = np.reshape(observation,(1,4))
        q_est = Q_estimator.predict(sess,state_reshape)
    #     print(q_est)
        action = env.action_space.sample()
    #     print(action)
        observation, reward, done, info = env.step(action)
        total_steps += 1
    agent_results.append(total_steps)
env.render(close=True)

AttributeError: 'NoneType' object has no attribute 'set_current'