In [None]:
%matplotlib inline

import gym
import universe
from gym import wrappers
import itertools
import numpy as np
import os
import random
import sys
import tensorflow as tf
from IPython.display import clear_output

if "../" not in sys.path:
  sys.path.append("../")

from collections import deque, namedtuple

In [None]:
# Instantiate the `flashgames.UltimateEscape-v0` environment and configure it
# with `remotes=1`. `env` is used as a notebook-level global by the cells below.
env = gym.make('flashgames.UltimateEscape-v0')
env.configure(remotes=1) 

In [None]:
def _key_action(w, d, a):
    """Return one action: a single-element list holding the key events that
    set the pressed (True) / released (False) state of 'w', 'd' and 'a'."""
    return [[('KeyEvent', 'w', w), ('KeyEvent', 'd', d), ('KeyEvent', 'a', a)]]


# Discrete action set; an action id is an index into this list.
valid_actions = [
    _key_action(True, False, False),    # w
    _key_action(False, True, False),    # d
    _key_action(False, False, True),    # a
    _key_action(True, True, False),     # w + d
    _key_action(True, False, True),     # w + a
    _key_action(False, False, False),   # no key held
]

In [None]:
def build_shared_network(X):
    """
    Build the three-layer convolutional trunk on top of `X`.

    `X` is cast to float and passed through three ReLU conv layers created
    under the scopes "conv1", "conv2" and "conv3" of whatever variable scope
    is active at call time.

    Args:
      X: Input tensor (cast with `tf.to_float` before the first layer).

    Returns:
      The output tensor of the last convolution ("conv3").
    """
    # (scope name, number of filters, kernel size, stride) for each layer.
    layer_specs = (
        ("conv1", 64, 7, 3),
        ("conv2", 32, 5, 2),
        ("conv3", 16, 5, 2),
    )

    net = tf.to_float(X)
    for layer_name, num_filters, kernel, stride in layer_specs:
        net = tf.contrib.layers.conv2d(
            net, num_filters, kernel, stride, activation_fn=tf.nn.relu, scope=layer_name)
    return net

In [None]:
def copy_model_parameters(sess, estimator1, estimator2):
    """
    Overwrite the trainable variables of `estimator2` with those of `estimator1`.

    Variables are selected by name prefix (`estimator.scope`), sorted by name
    and paired up positionally, so both estimators are expected to own
    identically structured variables.

    Args:
      sess: Tensorflow session instance used to run the assign ops
      estimator1: Estimator to copy the parameters from (must expose `.scope`)
      estimator2: Estimator to copy the parameters to (must expose `.scope`)
    """
    def _scoped_variables(estimator):
        # Trainable variables whose name starts with the estimator's scope,
        # in name order so that the two lists line up.
        owned = [var for var in tf.trainable_variables()
                 if var.name.startswith(estimator.scope)]
        owned.sort(key=lambda var: var.name)
        return owned

    source_vars = _scoped_variables(estimator1)
    destination_vars = _scoped_variables(estimator2)

    assign_ops = [dst.assign(src) for src, dst in zip(source_vars, destination_vars)]
    sess.run(assign_ops)

In [None]:
class StateProcessor():
    """
    Crops and downsizes a raw screen frame with a small Tensorflow graph.

    The graph takes a [768, 1024, 3] uint8 image, crops a 432 x 658
    (height x width) window whose top-left corner is at row 120, column 19,
    and resizes that window to 128 x 256 with nearest-neighbour sampling.
    No grayscale conversion is performed.
    """
    def __init__(self):
        # Build the Tensorflow graph
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[768, 1024, 3], dtype=tf.uint8)
            # NOTE(review): crop_to_bounding_box takes (offset_height,
            # offset_width, target_height, target_width), so 432 and 658 are
            # window sizes, not end coordinates - confirm this is the
            # intended region.
            cropped = tf.image.crop_to_bounding_box(self.input_state, 120, 19, 432, 658)
            self.output = tf.image.resize_images(
                cropped, [128, 256], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    def process(self, sess, state):
        """
        Run one frame through the crop/resize graph.

        Args:
            sess: A Tensorflow session object
            state: A [768, 1024, 3] RGB frame matching `self.input_state`

        Returns:
            The cropped and resized frame (128 x 256 spatial size).
        """
        feed = { self.input_state: state }
        return sess.run(self.output, feed)

In [None]:
class actor :
    """
    Policy network: shared conv trunk followed by three fully connected
    layers and a softmax over `valid_actions`.

    The conv trunk is obtained with `reuse=True`, so its variables must
    already exist under `shared_scope` (e.g. created by a ValueEstimator
    built earlier with the same `shared_scope`).

    Args:
        shared_scope: Variable scope that holds the shared conv layers.
        batch_size: Kept for backward compatibility; the batch size is now
            read from the fed tensors, so this value is not used.
        learning_rate: Learning rate of the Adam optimizer.
    """
    def __init__(self, shared_scope='behaviour', batch_size=32, learning_rate=0.01):
        scope='actor'
        # Name prefix of this network's own variables; copy_model_parameters
        # selects variables through this attribute.
        self.scope = scope + shared_scope

        self.states = tf.placeholder(shape=[None, 128, 256, 3], dtype=tf.uint8, name="X")
        self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        with tf.variable_scope(shared_scope, reuse=True):
            conv3 = build_shared_network(self.states)

        with tf.variable_scope(self.scope):
            flattened = tf.contrib.layers.flatten(conv3)
            self.fc1 = tf.contrib.layers.fully_connected(flattened, 512, activation_fn=tf.nn.relu)
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, 256, activation_fn=tf.nn.relu)
            self.fc3 = tf.contrib.layers.fully_connected(self.fc2, 128, activation_fn=tf.nn.relu)

            # Logits are left linear: a ReLU here would clamp them at zero
            # and block the gradient for every action with a negative
            # pre-activation.
            self.fc_out = tf.contrib.layers.fully_connected(self.fc3, len(valid_actions), activation_fn=None)
            self.probs = tf.nn.softmax(self.fc_out)

            # Small constant keeps log() finite if a probability underflows to 0.
            log_probs = tf.log(self.probs + 1e-8)
            self.entropy = -tf.reduce_sum(self.probs * log_probs, 1, name="entropy")
            self.entropy_mean = tf.reduce_mean(self.entropy, name="entropy_mean")

            # Pick, for every row, the probability of the action that was
            # taken. The row count comes from the tensor itself so that any
            # batch size can be fed.
            num_rows = tf.shape(self.probs)[0]
            gather_indices = tf.range(num_rows) * tf.shape(self.probs)[1] + self.actions
            self.picked_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)

            # Policy-gradient loss with an entropy bonus weighted by 0.01.
            self.losses = - (tf.log(self.picked_action_probs + 1e-8) * self.targets + 0.01 * self.entropy)
            self.loss = tf.reduce_sum(self.losses, name="loss")

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                                self.loss, global_step=tf.contrib.framework.get_global_step())


        # tf.scalar_summary(self.loss.op.name, self.loss)
        # tf.scalar_summary(self.entropy_mean.op.name, self.entropy_mean)
        # tf.histogram_summary(self.entropy.op.name, self.entropy)


    def predict(self, stateS, sess=None):
        """
        Return the action probabilities for a batch of states.

        Args:
            stateS: Batch of states matching the `states` placeholder
                ([batch, 128, 256, 3]).
            sess: Session to run in; defaults to the current default session.

        Returns:
            Whatever `sess.run` yields for `self.probs` (one row per state).
        """
        sess = sess or tf.get_default_session()
        return sess.run(self.probs, { self.states: stateS })

    def update(self, state, target, action, sess=None):
        """
        Run one optimizer step and return the loss.

        Args:
            state: Batch of states ([batch, 128, 256, 3]).
            target: Per-sample weights fed to the `targets` placeholder.
            action: Per-sample indices of the actions that were taken.
            sess: Session to run in; defaults to the current default session.

        Returns:
            The value of `self.loss` for this batch.
        """
        sess = sess or tf.get_default_session()
        feed_dict = { self.states: state, self.targets: target, self.actions: action  }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

In [None]:
class ValueEstimator:
    """
    State-value network: conv trunk (created under `shared_scope`) followed
    by three fully connected layers and a single linear output.

    Args:
        shared_scope: Variable scope under which the conv trunk is created.
        scope: Prefix for this network's own scope (`scope + shared_scope`).
        learning_rate: Learning rate of the Adam optimizer.
    """

    def __init__ (self, shared_scope, scope='critic', learning_rate=1e-4):
        self.learning_rate=learning_rate
        # Name prefix of this network's own variables; copy_model_parameters
        # selects variables through this attribute.
        self.scope = scope + shared_scope
        with tf.variable_scope(self.scope):
            self.states = tf.placeholder(shape=[None, 128, 256, 3], dtype=tf.uint8, name="X")
            self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="targets")

        with tf.variable_scope(shared_scope, reuse=False):
            conv3 = build_shared_network(self.states)

        with tf.variable_scope(self.scope):
            flattened = tf.contrib.layers.flatten(conv3)
            self.fc1 = tf.contrib.layers.fully_connected(flattened, 512, activation_fn=tf.nn.relu)
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, 256, activation_fn=tf.nn.relu)
            self.fc3 = tf.contrib.layers.fully_connected(self.fc2, 128, activation_fn=tf.nn.relu)

            # Linear output: a ReLU here could never predict a negative value.
            self.predictions = tf.contrib.layers.fully_connected(self.fc3, 1, activation_fn=None)

            # `predictions` is [batch, 1] while `targets` is [batch]; flatten
            # before subtracting, otherwise the difference broadcasts to
            # [batch, batch].
            flat_predictions = tf.reshape(self.predictions, [-1])
            self.loss = tf.squared_difference(flat_predictions, self.targets)
            self.losses = tf.reduce_mean(self.loss)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train = self.optimizer.minimize(self.losses, global_step=tf.contrib.framework.get_global_step())

    def predict (self, states, sess=None):
        """
        Return the value predictions ([batch, 1]) for a batch of states.

        Args:
            states: Batch of states ([batch, 128, 256, 3]).
            sess: Session to run in. The default session is looked up at call
                time (a default-argument lookup would be evaluated only once,
                when the class is defined).
        """
        sess = sess or tf.get_default_session()
        return sess.run(self.predictions, {self.states: states})

    def update (self, states, targets, sess=None):
        """
        Run one optimizer step and return the mean squared error of the batch.

        Args:
            states: Batch of states ([batch, 128, 256, 3]).
            targets: Regression targets, one per state ([batch]).
            sess: Session to run in; defaults to the current default session.
        """
        sess = sess or tf.get_default_session()
        losses, _ = sess.run([self.losses, self.train], {self.states: states, self.targets: targets})
        return losses

In [None]:
# Module-level flags shared between successive step() calls.
BEGIN = False        # set when a run is detected as started, cleared when it ends
END = False          # set once the end-of-run marker was seen; cleared on restart
TAKE_REWARD = False  # while True, the next usable frame is returned to the caller
def step(env, state_processor, action_n, sess=None):
    """
    Repeat `action_n` in `env` until a frame worth returning is available.

    While TAKE_REWARD is set, the first non-empty observation is processed
    and returned: with reward 1 and done=False when `reward_n[0] % 50 == 0`,
    otherwise with reward -10 and done=True (TAKE_REWARD is then cleared).
    While TAKE_REWARD is not set, the function keeps stepping and watches
    pixel [452, 27] (channel 0) of the raw frame: a value below 45 sets END,
    and a value strictly between 45 and 55 seen after END sets BEGIN and
    TAKE_REWARD again.
    NOTE(review): the pixel thresholds presumably detect the game's
    end/restart screens - confirm against the game.

    Args:
        env: Environment whose `step` takes a list of actions and returns
            (observation_n, reward_n, done_n, info).
        state_processor: Object with `process(sess, frame)`.
        action_n: An entry of `valid_actions`; its first element is sent.
        sess: Tensorflow session used for frame processing. Defaults to the
            current default session.

    Returns:
        (processed_frame, reward, done)

    Raises:
        RuntimeError: if no session is passed and there is no default session.
    """
    global TAKE_REWARD
    global END
    global BEGIN
    if sess is None:
        sess = tf.get_default_session()
        if sess is None:
            raise RuntimeError("step() needs a Tensorflow session: pass `sess` "
                               "or call it inside a `with tf.Session()` block")
    print(action_n)
    while True :
        observation_n, reward_n, done_n, _ = env.step([action_n[0]])
        observation = observation_n[0]
        # Nothing can be inspected or returned for an empty observation;
        # keep stepping.
        if not observation:
            continue

        if TAKE_REWARD:
            processed = state_processor.process(sess, observation['vision'])
            if reward_n[0] % 50 == 0:
                return processed, 1, False
            TAKE_REWARD = False
            return processed, -10, True

        marker = observation['vision'][452, 27, 0]
        if END:
            if marker > 45 and marker < 55:
                BEGIN = True
                TAKE_REWARD = True
                END = False

        # Evaluated after the block above on purpose: END may just have been
        # cleared by it.
        if not END:
            if marker < 45:
                BEGIN = False
                END = True
                TAKE_REWARD = False


def DDPG (sess, 
          env, 
          behavior_v, 
          target_v, 
          behavior_policy,
          target_policy,
          project_dir, 
          state_processor,
          num_episodes=100000, 
          replay_mem_size=500000, 
          replay_mem_init_size=50000, 
          update_target_estimator_every=10000, 
          discount_factor=0.99, 
          batch_size=32, 
          record_video_every=10):
    """
    Actor-critic training loop with a replay memory.

    Flow, as implemented:
      1. Restore the latest checkpoint from `<project_dir>/checkpoints`, if any.
      2. Fill the replay memory with transitions from uniformly random actions.
      3. Per step: act by sampling from `behavior_policy`, store the
         transition, sample a minibatch, build TD targets with `target_v`
         and update `target_v` and `target_policy` on that minibatch.
      4. Every `update_target_estimator_every` steps, copy the `target_*`
         parameters into the `behavior_*` networks.
      5. After each episode, write reward/length summaries to
         `<project_dir>/summaries`.

    NOTE(review): the networks named `target_*` are the ones being trained
    and `behavior_*` are the periodically synced copies used for acting,
    which is the reverse of the usual naming - confirm this is intended.
    NOTE(review): the policy update is weighted by the TD target itself, not
    by an advantage (target minus value estimate) - confirm.

    Args:
        sess: Tensorflow session.
        env: Environment, driven through `step()`.
        behavior_v, target_v: Value estimators.
        behavior_policy, target_policy: Policy estimators.
        project_dir: Directory for checkpoints, monitor files and summaries.
        state_processor: Frame pre-processor passed on to `step()`.
        num_episodes: Number of episodes to train for.
        replay_mem_size: Maximum number of stored transitions.
            NOTE(review): each stored frame is presumably 128x256x3 bytes, so
            the default of 500000 needs tens of GB of RAM - confirm.
        replay_mem_init_size: Random transitions collected before training.
        update_target_estimator_every: Parameter copy period, in steps.
        discount_factor: Discount applied to the bootstrapped next-state value.
        batch_size: Minibatch size sampled from the replay memory.
        record_video_every: Currently unused (the monitor wrapper below is
            commented out).
    """
    checkpoint_dir = os.path.join(project_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(project_dir, "monitor")
    summary_dir = os.path.join(project_dir, "summaries")
    transition=namedtuple("transition",["state","action","reward" ,"nextstate","done"])

    for directory in (summary_dir, checkpoint_dir, monitor_path):
        if not os.path.exists(directory):
            os.makedirs(directory)
    summary_writer = tf.train.SummaryWriter(summary_dir)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

#     env =wrappers.Monitor(env,monitor_path,resume=True ,video_callable= lambda x : x % record_video_every == 0 )

    # Initialize the replay memory with random-action transitions.
    replay_mem = []
    env.reset()
    state, _, _ = step(env, state_processor, valid_actions[0])
    for t in itertools.count():
        action=np.random.choice(len(valid_actions))
        nextstate,reward,done = step(env, state_processor, valid_actions[action])
        replay_mem.append(transition(state,action,reward,nextstate,done))
        state=nextstate
        if t > replay_mem_init_size : 
            break

    for i_episode in range(num_episodes):
        loss = 0.0
        saver.save(sess, checkpoint_path)
        total_reward = 0
        lenght = 0
        for t in itertools.count():

            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess=sess, estimator1=target_v, estimator2=behavior_v)
                copy_model_parameters(sess=sess, estimator1=target_policy, estimator2=behavior_policy)

            # The policy expects a batch, so add and strip a leading batch axis.
            actions_prob = behavior_policy.predict(np.expand_dims(state, 0), sess)[0]
            action = np.random.choice(np.arange(len(actions_prob)),p=actions_prob)
            nextstate,reward,done=step(env,state_processor,valid_actions[action])
            total_reward += reward

            # Drop the oldest transition once the memory is full.
            if len(replay_mem) >= replay_mem_size:
                replay_mem.pop(0)
            replay_mem.append(transition(state,action,reward,nextstate,done))

            samples = random.sample(replay_mem, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                                                                    np.array, zip(*samples))

            # Flatten the value predictions so the target stays one number
            # per sample, whether predict() returns [batch, 1] or [batch].
            next_values = np.reshape(target_v.predict(next_states_batch, sess), [-1])
            targets = (reward_batch
                       + np.invert(done_batch).astype(np.float32) * discount_factor * next_values)
            loss += target_v.update(states_batch, targets, sess)
            target_policy.update(states_batch, targets, action_batch, sess)

            # Advance before the terminal check so the next episode does not
            # start from a stale state or repeat the same global step.
            state = nextstate
            total_t += 1

            if done:
                lenght = t
                break

        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=total_reward, node_name='episode_reward', tag='episode_reward')
        episode_summary.value.add(simple_value=lenght, node_name='episode_lenght', tag='episode_lenght')
        summary_writer.add_summary(episode_summary, total_t)
        summary_writer.flush()
        

In [None]:
# Start from an empty graph so that re-running this cell does not collide
# with variables created by a previous run.
tf.reset_default_graph()

# Output directory for this environment; passed to DDPG as `project_dir`.
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Step counter; the estimators fetch it via
# tf.contrib.framework.get_global_step() when building their train ops.
global_step = tf.Variable(0, name='global_step', trainable=False)
    
# Order matters: each ValueEstimator creates the conv layers under its
# `shared_scope`, and the actor built afterwards with the same `shared_scope`
# reuses them (reuse=True).
V_t = ValueEstimator(shared_scope='target')
V_b = ValueEstimator(shared_scope='behaviour')

P_t = actor(shared_scope='target')
P_b = actor(shared_scope='behaviour')

state_processor=StateProcessor()
    
# Train until finished; a manual interrupt (KeyboardInterrupt) is swallowed
# on purpose so that stopping the cell does not print a traceback.
try:
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        DDPG(sess, 
              env, 
              behavior_v=V_b, 
              target_v=V_t, 
              behavior_policy=P_b, 
              target_policy=P_t, 
              state_processor=state_processor, 
              project_dir=experiment_dir, 
              num_episodes=10000, 
              replay_mem_size=500000, 
              replay_mem_init_size=50000, 
              update_target_estimator_every=10000, 
              discount_factor=0.99, 
              batch_size=32)
            
except KeyboardInterrupt:
    pass