In [1]:
%matplotlib inline

import gym
from gym import wrappers
import itertools
import numpy as np
import os
import random
import sys
import tensorflow as tf
from IPython.display import clear_output

if "../" not in sys.path:
  sys.path.append("../")

from collections import deque, namedtuple

In [2]:
# Create the Atari Breakout environment; later cells read `env.spec.id` and pass `env` to DDPG.
env = gym.envs.make('Breakout-v0')


[2017-02-04 20:02:32,686] Making new env: Breakout-v0


In [3]:
# Action ids handed to env.step(); len(valid_actions) also sets the width of the actor's output layer.
valid_actions = [0,1,2,3]

In [4]:
def build_shared_network(X):
    """
    Builds the three-layer convolutional trunk used by both the actor and
    the critic.

    Args:
      X: Input tensor of stacked frames; it is cast to float and divided by
         255 before the first convolution

    Returns:
      The output tensor of the last convolutional layer ("conv3")
    """
    conv = tf.contrib.layers.conv2d
    # (variable scope, number of filters, kernel size, stride), applied in order
    layer_specs = (
        ("conv1", 32, 7, 3),
        ("conv2", 16, 5, 2),
        ("conv3", 16, 5, 2),
    )

    net = tf.to_float(X) / 255.0
    for layer_scope, num_filters, kernel_size, stride in layer_specs:
        net = conv(net, num_filters, kernel_size, stride,
                   activation_fn=tf.nn.relu, scope=layer_scope)
    return net

In [5]:
# Assign ops built by copy_model_parameters, keyed by (source scope, destination
# scope). Each value is (graph, ops) so that ops belonging to a graph that is no
# longer the default one (e.g. after tf.reset_default_graph()) are rebuilt
# instead of being reused.
_COPY_OPS_CACHE = {}


def copy_model_parameters(sess, estimator1, estimator2):
    """
    Copies the model parameters of one estimator to another.

    The trainable variables whose names start with each estimator's `scope`
    are sorted by name and paired up in that order. The assign ops are built
    once per (scope pair, graph) and reused on later calls, so calling this
    function repeatedly does not keep adding new ops to the graph.

    Args:
      sess: Tensorflow session instance
      estimator1: Estimator to copy the parameters from
      estimator2: Estimator to copy the parameters to

    Raises:
      ValueError: If the two scopes do not hold the same number of trainable
        variables, in which case pairing them by sorted name is not meaningful.
    """
    graph = tf.get_default_graph()
    cache_key = (estimator1.scope, estimator2.scope)
    cached = _COPY_OPS_CACHE.get(cache_key)

    if cached is None or cached[0] is not graph:
        trainable = tf.trainable_variables()
        e1_params = sorted(
            (v for v in trainable if v.name.startswith(estimator1.scope)),
            key=lambda v: v.name)
        e2_params = sorted(
            (v for v in trainable if v.name.startswith(estimator2.scope)),
            key=lambda v: v.name)

        # zip() would silently drop the unmatched tail; fail loudly instead.
        if len(e1_params) != len(e2_params):
            raise ValueError(
                "Cannot copy parameters: scope {!r} has {} trainable variables "
                "but scope {!r} has {}".format(
                    estimator1.scope, len(e1_params),
                    estimator2.scope, len(e2_params)))

        update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params, e2_params)]
        cached = (graph, update_ops)
        _COPY_OPS_CACHE[cache_key] = cached

    sess.run(cached[1])

In [6]:
# Soft-update ops built by update_target_parameters, keyed by (behavior scope,
# target scope, taw). Each value is (graph, ops) so that ops belonging to a
# graph that is no longer the default one are rebuilt instead of being reused.
_SOFT_UPDATE_OPS_CACHE = {}


def update_target_parameters(behavior, target, taw, sess):
    """
    Moves the target estimator's parameters a fraction `taw` of the way
    towards the behavior estimator's parameters.

    The trainable variables whose names start with each estimator's `scope`
    are sorted by name and paired up in that order; every target variable is
    then assigned `taw * behavior_var + (1 - taw) * target_var`. The assign
    ops are built once per (scope pair, taw, graph) and reused afterwards, so
    calling this inside the training loop does not keep growing the graph.

    NOTE(review): only variables under `behavior.scope` / `target.scope` are
    touched. Variables created under a different scope prefix (such as the
    shared convolutional layers in this file) are not updated here - confirm
    that this is intended.

    Args:
      behavior: Estimator to blend the parameters from (reads `behavior.scope`)
      target: Estimator whose parameters are updated (reads `target.scope`)
      taw: Interpolation weight given to the behavior parameters
      sess: Tensorflow session instance

    Raises:
      ValueError: If the two scopes do not hold the same number of trainable
        variables, in which case pairing them by sorted name is not meaningful.
    """
    graph = tf.get_default_graph()
    cache_key = (behavior.scope, target.scope, taw)
    cached = _SOFT_UPDATE_OPS_CACHE.get(cache_key)

    if cached is None or cached[0] is not graph:
        trainable = tf.trainable_variables()
        e1_params = sorted(
            (v for v in trainable if v.name.startswith(behavior.scope)),
            key=lambda v: v.name)
        e2_params = sorted(
            (v for v in trainable if v.name.startswith(target.scope)),
            key=lambda v: v.name)

        # zip() would silently drop the unmatched tail; fail loudly instead.
        if len(e1_params) != len(e2_params):
            raise ValueError(
                "Cannot update target parameters: scope {!r} has {} trainable "
                "variables but scope {!r} has {}".format(
                    behavior.scope, len(e1_params),
                    target.scope, len(e2_params)))

        update_ops = [
            e2_v.assign(taw * e1_v + (1 - taw) * e2_v)
            for e1_v, e2_v in zip(e1_params, e2_params)
        ]
        cached = (graph, update_ops)
        _SOFT_UPDATE_OPS_CACHE[cache_key] = cached

    sess.run(cached[1])

In [7]:
class StateProcessor():
    """
    Processes a raw Atari image: converts it to grayscale, crops a 160x160
    window starting at row 34, resizes it to 84x84 with nearest-neighbour
    sampling and squeezes out size-1 dimensions.
    """
    def __init__(self):
        # The whole preprocessing pipeline is a small Tensorflow graph built once
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            gray = tf.image.rgb_to_grayscale(self.input_state)
            cropped = tf.image.crop_to_bounding_box(gray, 34, 0, 160, 160)
            resized = tf.image.resize_images(
                cropped, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(resized)

    def process(self, sess, state):
        """
        Runs the preprocessing graph on a single frame.

        Args:
            sess: A Tensorflow session object
            state: A [210, 160, 3] Atari RGB State

        Returns:
            The processed 84x84 grayscale frame. The channel axis has been
            squeezed away, so callers that need [84, 84, 1] must reshape.
        """
        feed = {self.input_state: state}
        return sess.run(self.output, feed)

In [8]:
class actor :
    """
    Policy network: fully connected layers on top of the shared convolutional
    features, ending in a softmax over `valid_actions`.

    Its own variables live under the scope 'actor' + shared_scope, which is
    also stored as `self.scope`; only those variables are trained by
    `update`.
    """
    def __init__(self, shared_scope='behaviour', batch_size=32, learning_rate=0.01):
        """
        Args:
          shared_scope: Variable scope holding the shared convolutional layers.
            It is opened with reuse=True, so those variables must already exist.
          batch_size: Not used by this class
          learning_rate: Learning rate of the Adam optimizer
        """
        own_scope = 'actor' + shared_scope
        self.scope = own_scope
        self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")

        with tf.variable_scope(shared_scope, reuse=True):
            conv_features = build_shared_network(self.states)

        dense = tf.contrib.layers.fully_connected
        with tf.variable_scope(own_scope):
            flat = tf.contrib.layers.flatten(conv_features)
            self.fc1 = dense(flat, 256, activation_fn=tf.nn.relu)
            self.fc2 = dense(self.fc1, 64, activation_fn=tf.nn.relu)
            self.fc3 = dense(self.fc2, 64, activation_fn=tf.nn.relu)

            # NOTE(review): the output layer also goes through a ReLU before
            # the softmax - confirm this is intended.
            self.fc_out = dense(self.fc3, len(valid_actions), activation_fn=tf.nn.relu)
            self.probs = tf.nn.softmax(self.fc_out)

            # Gradient supplied from outside (see `update`); its negation is
            # used as the initial gradient for `probs`.
            # NOTE(review): this placeholder is [None, 1] while `probs` is
            # [None, len(valid_actions)] - confirm the intended weighting.
            self.action_gradient = tf.placeholder(tf.float32,[None, 1])
            self.weights = [v for v in tf.trainable_variables() if v.name.startswith(own_scope)]
            self.params_grad = tf.gradients(self.probs, self.weights, -self.action_gradient)
            self.optimizer = tf.train.AdamOptimizer(learning_rate).apply_gradients(
                zip(self.params_grad, self.weights))

    def predict(self, state, sess):
        """Returns the action probabilities computed for the batch `state`."""
        feed = {self.states: state}
        return sess.run(self.probs, feed)

    def update(self, state, grads, sess):
        """Runs one optimizer step, feeding `grads` as the action gradient."""
        feed = {self.states: state, self.action_gradient: grads}
        sess.run(self.optimizer, feed)

In [9]:
class ValueEstimator:
    """
    Critic network: predicts one value per (state, action) pair.

    The state goes through the shared convolutional layers (created here under
    `shared_scope` with reuse=False) and a dense layer; the action goes through
    its own dense layer; both are concatenated and fed through two more dense
    layers. The critic's own variables live under scope + shared_scope, which
    is also stored as `self.scope`.
    """

    def __init__(self, shared_scope, batch_size=32, scope='critic', learning_rate=1e-4):
        """
        Args:
          shared_scope: Variable scope in which the shared convolutional
            layers are created
          batch_size: Stored on the instance; not otherwise used here
          scope: Prefix of the critic's own variable scope
          learning_rate: Learning rate of the Adam optimizer
        """
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.scope = scope + shared_scope
        with tf.variable_scope(scope + shared_scope):
            self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
            self.actions = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="actions")
            self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="targets")

        with tf.variable_scope(shared_scope, reuse=False):
            conv3 = build_shared_network(self.states)

        with tf.variable_scope(scope + shared_scope):
            flattened = tf.contrib.layers.flatten(conv3)
            self.a_fc = tf.contrib.layers.fully_connected(self.actions, 64, activation_fn=tf.nn.relu)
            self.fc1 = tf.contrib.layers.fully_connected(flattened, 256, activation_fn=tf.nn.relu)
            self.merge = tf.concat(1, [self.fc1, self.a_fc])
            self.fc2 = tf.contrib.layers.fully_connected(self.merge, 64, activation_fn=tf.nn.relu)
            self.fc3 = tf.contrib.layers.fully_connected(self.fc2, 64, activation_fn=tf.nn.relu)

            # NOTE(review): the output passes through a ReLU, so predictions
            # are never negative - confirm this is intended.
            self.predictions = tf.contrib.layers.fully_connected(self.fc3, 1, activation_fn=tf.nn.relu)

            # `predictions` is [batch, 1] but `targets` is [batch]. Flatten the
            # predictions first so the difference is taken element-wise;
            # otherwise the two broadcast to a [batch, batch] matrix and the
            # loss compares every prediction with every target.
            flat_predictions = tf.reshape(self.predictions, [-1])
            self.loss = tf.squared_difference(flat_predictions, self.targets)
            self.losses = tf.reduce_mean(self.loss)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train = self.optimizer.minimize(self.losses, global_step=tf.contrib.framework.get_global_step())

            # Gradient of the predicted value with respect to the action input
            self.grads = tf.gradients(self.predictions, self.actions)

    def predict(self, states, actions, sess):
        """Returns the predicted values for the given states and actions."""
        return sess.run(self.predictions, {self.states: states, self.actions: actions})

    def update(self, states, actions, targets, sess):
        """Runs one training step towards `targets` and returns the mean loss."""
        losses, _ = sess.run([self.losses, self.train], {self.states: states, self.targets: targets, self.actions: actions})
        return losses

    def gradient(self, states, actions, sess):
        """Returns the result of evaluating `self.grads` (a list, as built by tf.gradients)."""
        return sess.run(self.grads, {self.states: states, self.actions: actions})

In [None]:
def DDPG (sess,
          env,
          behavior_v,
          target_v,
          behavior_policy,
          target_policy,
          project_dir,
          state_processor,
          num_episodes=100000,
          replay_mem_size=500000,
          replay_mem_init_size=50000,
          update_target_estimator_every=10000,
          discount_factor=0.99,
          batch_size=32,
          record_video_every=10):
    """
    Trains the behavior actor/critic pair on `env` with experience replay and
    slowly-updated target networks.

    The replay memory is first filled by acting uniformly at random. Then, on
    every environment step, an action is sampled from the behavior policy, the
    transition is stored, a minibatch is sampled, the behavior critic is
    trained towards a bootstrapped target from the target networks, the
    behavior actor is updated with the critic's action gradient, and (whenever
    the step counter is a multiple of 5) both target networks are soft-updated.
    A checkpoint is saved at the start of every episode and the episode reward
    and length are written as summaries at its end.

    Args:
      sess: Tensorflow session instance
      env: OpenAI Gym environment; it is wrapped in a Monitor inside this function
      behavior_v: Critic that is trained (uses predict-free `update` and `gradient`)
      target_v: Critic used to compute the bootstrapped targets
      behavior_policy: Actor used to act and that is trained
      target_policy: Actor used to pick the next action for the targets
      project_dir: Directory under which summaries, checkpoints and monitor
        files are written
      state_processor: Object whose `process(sess, frame)` returns an 84x84 frame
      num_episodes: Number of episodes to run
      replay_mem_size: Size at which the oldest transition starts being dropped
      replay_mem_init_size: The random warm-up stops at the first episode end
        after more than this many steps
      update_target_estimator_every: Not used by this function
      discount_factor: Discount applied to the bootstrapped next-state value
      batch_size: Number of transitions sampled per training step
      record_video_every: A video is recorded for episodes whose count is a
        multiple of this value
    """
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_mem = []

    summary_dir = os.path.join(project_dir, "summaries_{}".format('tezy'))
    if not os.path.exists(summary_dir):
        os.makedirs(summary_dir)
    summary_writer = tf.train.SummaryWriter(summary_dir)

    # Directories for checkpoints and monitor files. They are derived from the
    # `project_dir` argument (not from a notebook-level global) so the function
    # does not depend on hidden state.
    checkpoint_dir = os.path.join(project_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(project_dir, "monitorr")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # Record videos
    env = wrappers.Monitor(env, monitor_path, resume=True,
                           video_callable=lambda count: count % record_video_every == 0)

    def _reset_stacked_state():
        # Reset the environment and repeat the first frame 4 times along the last axis.
        first_frame = state_processor.process(sess, env.reset())
        return np.stack([first_frame] * 4, axis=2)

    def _push_frame(stacked, raw_frame):
        # Put the newest processed frame in front and keep the 3 most recent older ones.
        newest = state_processor.process(sess, raw_frame).reshape((84, 84, 1))
        return np.concatenate((newest, stacked[:, :, :3]), axis=2)

    # Populate the replay memory with initial experience from a uniformly random policy
    print("Populating replay memory...")
    state = _reset_stacked_state()
    i = 0
    while True:
        action = np.random.choice(len(valid_actions))
        # Step with the action id, as the training loop does; the index is what gets stored.
        next_state, reward, done, _ = env.step(valid_actions[action])
        next_state = _push_frame(state, next_state)
        replay_mem.append(Transition(state, action, reward, next_state, done))
        state = next_state
        i += 1
        if done:
            # Only stop on an episode boundary so training starts from a fresh reset.
            if i > replay_mem_init_size:
                break
            state = _reset_stacked_state()

    for i_episode in range(num_episodes):
        state = _reset_stacked_state()
        saver.save(sess, checkpoint_path)
        total_reward = 0
        lenght = 0
        for t in itertools.count():
            print (t)

            # Act by sampling from the behavior policy's action distribution
            actions_prob = behavior_policy.predict(np.reshape(state, (-1, 84, 84, 4)), sess)[0]
            action = np.random.choice(np.arange(len(actions_prob)), p=actions_prob)
            nextstate, reward, done, _ = env.step(valid_actions[action])
            nextstate = _push_frame(state, nextstate)

            total_reward += reward
            # `>=` rather than `==`: if the warm-up overshot the cap, the memory
            # must still stop growing.
            if len(replay_mem) >= replay_mem_size:
                replay_mem.pop(0)
            replay_mem.append(Transition(state, action, reward, nextstate, done))

            samples = random.sample(replay_mem, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Bootstrapped target: reward + discount * Q_target(s', argmax pi_target(s')),
            # with the bootstrap term zeroed for terminal transitions.
            next_actions = target_policy.predict(next_states_batch, sess)
            greedy_next_actions = np.reshape(np.argmax(next_actions, axis=1), (-1, 1))
            next_values = np.reshape(
                target_v.predict(next_states_batch, greedy_next_actions, sess), (-1,))
            not_done = np.reshape(np.invert(done_batch).astype(np.float32), (-1,))
            targets = reward_batch + not_done * discount_factor * next_values

            behavior_v.update(states_batch, np.reshape(action_batch, (-1, 1)), targets, sess)

            # Actor step: feed the critic's gradient w.r.t. the greedy action
            actions_prob = behavior_policy.predict(states_batch, sess)
            greedy_actions = np.reshape(np.argmax(actions_prob, axis=1), (-1, 1))
            Q_ = behavior_v.gradient(states_batch, greedy_actions, sess)
            behavior_policy.update(states_batch, Q_[0], sess)

            if total_t % 5 == 0:
                update_target_parameters(behavior=behavior_v, target=target_v, taw=0.01, sess=sess)
                update_target_parameters(behavior=behavior_policy, target=target_policy, taw=0.01, sess=sess)

            if done:
                lenght = t
                break

            state = nextstate
            total_t += 1
            clear_output()

        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=total_reward, node_name='episode_reward', tag='episode_reward')
        episode_summary.value.add(simple_value=lenght, node_name='episode_lenght', tag='episode_lenght')
        summary_writer.add_summary(episode_summary, total_t)
        summary_writer.flush()
        

In [None]:
# Build the networks on a fresh default graph and start training.
tf.reset_default_graph()

# All summaries, checkpoints and monitor files go under this directory.
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Non-trainable step counter; ValueEstimator and DDPG look it up through
# tf.contrib.framework.get_global_step().
global_step = tf.Variable(0, name='global_step', trainable=False)
    
# The critics must be built before the actors: each critic creates the shared
# convolutional variables of its scope (reuse=False) and the actor of the same
# scope then reuses them (reuse=True).
V_t = ValueEstimator(shared_scope='target')
V_b = ValueEstimator(shared_scope='behaviour')

P_t = actor(shared_scope='target')
P_b = actor(shared_scope='behaviour')

state_processor=StateProcessor()
    
# Interrupting the kernel ends training without a traceback.
try:
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        DDPG(sess, 
              env, 
              behavior_v=V_b, 
              target_v=V_t, 
              behavior_policy=P_b, 
              target_policy=P_t, 
              state_processor=state_processor, 
              project_dir=experiment_dir, 
              num_episodes=10000, 
              replay_mem_size=500000, 
              replay_mem_init_size=50000, 
              update_target_estimator_every=10000, 
              discount_factor=0.99, 
              batch_size=32)
            
except KeyboardInterrupt:
    pass

150
