In [1]:
import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque

In [5]:
class NetworkCopier():
    def __init__(self, estimator,target):
        est_params = [variable for variable in tf.trainable_variables() if variable.name.startswith(estimator.scope)]
        est_params = sorted(est_params, key = lambda x: x.name)

        tar_params = [variable for variable in tf.trainable_variables() if variable.name.startswith(target.scope)]
        tar_params = sorted(tar_params, key = lambda x: x.name)

        self.update_ops = [tar_var.assign(est_var) for est_var,tar_var in zip(est_params,tar_params)]
        return
    
    
    def copy_and_freeze(self,sess): 
        sess.run(self.update_ops)
        return
        
        
class ReplayBuffer():
    def __init__(self, max_size = 50000):
        self.buffer = deque(maxlen = max_size)
        return
        
        
    def add_new(self, state, action, reward, next_state, done):
        entry = (state,action,reward,next_state,done)
        self.buffer.append(entry)
        return
        
        
    def batch(self, n = 100):
        if len(self.buffer) < n:
            minibatch = 0
        else:
            minibatch = random.sample(self.buffer, n)
        return minibatch


class Q_learner():
    def __init__(self, state_size, action_size, lr = 0.001, scope='default'):
        self.scope = scope
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        
        return
    
    
    def setup(self):
        with tf.variable_scope(self.scope):
            self.__set_placeholders(self.state_size, self.action_size)
            self.__build_model()
            self.__set_loss_and_opt(self.lr)
            
        return
    
    
    def __set_placeholders(self, state_size, action_size):
                # state
        self.X_states = tf.placeholder(shape = [None, self.state_size], dtype = tf.float32)
        
        # target values (R+maxQ)
        self.Q_targets = tf.placeholder(shape = [None, self.action_size], dtype = tf.float32)
        
        return
    
    
    def __build_model(self):
        pass
    
    
    def __set_loss_and_opt(self, lr = 0.001):
        pass
    
    
    def predict(self, sess, state):
        return sess.run(self.Q_est, { self.X_states: state })
        
        
    def update(self, sess, states, targets):
        feed_dict = { self.X_states: states, self.Q_targets: targets }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss
    
    
class Q_nn(Q_learner):
# class Q_nn():
#     def __init__(self, state_size, action_size, lr = 0.001, scope='default'):
#         self.scope = scope
#         self.state_size = state_size
#         self.action_size = action_size
#         self.lr = lr
        
#         return
    
    
    
#     def setup(self):
#         with tf.variable_scope(self.scope):
#             self.__set_placeholders(self.state_size, self.action_size)
#             self.__build_model()
#             self.__set_loss_and_opt(self.lr)
            
#         return
    
    
#     def __set_placeholders(self, state_size, action_size):
#                 # state
#         self.X_states = tf.placeholder(shape = [None, self.state_size], dtype = tf.float32)
        
#         # target values (R+maxQ)
#         self.Q_targets = tf.placeholder(shape = [None, self.action_size], dtype = tf.float32)
        
#         return
    
    
    def __build_model(self):

        
        self.dense1 = tf.layers.dense(inputs = self.X_states, units=12, activation = tf.nn.relu)
        self.dense2 = tf.layers.dense(self.dense1,12, activation = tf.nn.relu)
        self.dense3 = tf.layers.dense(self.dense2,12, activation = tf.nn.relu)
        self.Q_est = tf.layers.dense(self.dense3,2)
        
        return
    
    
    def __set_loss_and_opt(self, lr):
        self.loss = tf.losses.mean_squared_error(self.Q_targets, self.Q_est)
        self.optimizer = tf.train.AdamOptimizer(lr)
        self.train_op = self.optimizer.minimize(self.loss)
        
        return
    
    
#     def predict(self, sess, state):
#         return sess.run(self.Q_est, { self.X_states: state })
     
    
#     def update(self, sess, states, targets):
#         feed_dict = { self.X_states: states, self.Q_targets: targets }
#         _, loss = sess.run([self.train_op, self.loss], feed_dict)
#         return loss

In [6]:
episodes = 2000
epsilon_decay = 0.995
epsilon_min = 0.1
batch_size = 32
discount = 0.95
max_steps = 1000
tf.reset_default_graph()

Q_estimator = Q_nn(4,2,0.001,scope = 'estimator')
Q_target = Q_nn(4,2,0.001,scope = 'target')
Q_estimator.setup()
Q_target.setup()
Freezer = NetworkCopier(Q_estimator,Q_target)
Buffer = ReplayBuffer(50000)

sess = tf.Session()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
env = gym.make('CartPole-v1')
print('Graph Set')

ValueError: No variables to save

In [7]:
for e in range(episodes):
    epsilon = max(epsilon_decay**e, epsilon_min)
    observation = env.reset()
    total_reward = 0
    loss = None
    
    for step in range(max_steps):
        curr_state = observation
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            curr_state_reshape = np.reshape(curr_state,(1,4))
            q_est = Q_estimator.predict(sess,curr_state_reshape)[0]
            action = np.argmax(q_est)
        
        observation, reward, done, info = env.step(action)
        next_state = observation
        Buffer.add_new(curr_state, action, reward, next_state, done)
        total_reward += reward
        if done:
            break
        
    for i in range(10):
        minibatch = Buffer.batch(batch_size)
        if minibatch:
            states = []
            targets = []
            for state, action, reward, next_state, done in minibatch:
                state_reshape = np.reshape(state, (1,4))
                next_state_reshape = np.reshape(next_state, (1,4))            
                target = Q_estimator.predict(sess,state_reshape)[0]
                q_max = np.amax(Q_target.predict(sess,next_state_reshape)[0])

                target[action] = reward
                if not done:
                    target[action] += discount*q_max

                states.append(state)
                targets.append(target)
            state_array = np.stack(states)
            target_array = np.stack(targets)
            loss = Q_estimator.update(sess, state_array, target_array)
        
    if loss:
        Freezer.copy_and_freeze(sess)
        print('Episode {}, loss = {}, total steps = {}'.format(e,loss,total_reward))

AttributeError: 'Q_nn' object has no attribute 'Q_est'

In [None]:
%%time
random_results = []
agent_results = []
for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_steps += 1
    random_results.append(total_steps)    

for t in range(100):
    done = False
    observation = env.reset()
    total_steps = 0
    while not done:
#         env.render()
        state_reshape = np.reshape(observation,(1,4))
        q_est = Q_estimator.predict(sess,state_reshape)[0]
    #     print(q_est)
        action = np.argmax(q_est)
    #     print(action)
        observation, reward, done, info = env.step(action)
        total_steps += 1
    agent_results.append(total_steps)
# env.render(close=True)
print('Luckiest of 100 random agents survives {} time steps'.format(max(random_results)))
print('Best of 100 learned agent survives {} time steps'.format(max(agent_results)))