In [1]:
import tensorflow as tf
import numpy as np
import gym

from skimage import transform
from skimage.color import rgb2gray
import matplotlib.pyplot as plt

from collections import deque
import random

from multiprocessing import Process, Pipe

import time

In [2]:
def preprocess_screen(screen, sz_to_process):
    print(screen.shape)
    print(screen)
    gray = rgb2gray(screen)
    #cropped_screen = gray[8:-12,5:-12]  # For Space Invaders, TODO make it cleaner
    cropped_screen = gray[25:-10,5:-5]  # For Pong, TODO make it cleaner
    preprocessed_screen = transform.resize(cropped_screen, sz_to_process, mode='constant', anti_aliasing=True)
    
    return preprocessed_screen

In [3]:
def stack_observations(stacked_observations, observation, is_new_episode):
    assert isinstance(stacked_observations, deque), "stacked_observations has not type deque"
    sz_to_process = stacked_observations[0].shape  #TODO make it cleaner
        
    if is_new_episode:
        for _ in range(len(stacked_observations)):
            stacked_observations.append(observation)
    else:
        stacked_observations.append(observation)
    
    last_axis = len(stacked_observations[0].shape)
    state = np.stack(stacked_observations, axis=last_axis)
                    
    return state, stacked_observations

In [4]:
def compute_target_values(batch_rewards, next_estimated_values, batch_dones, gamma):
    nb_seq, len_seq = batch_rewards.shape
    batch_target_values = np.zeros_like(batch_rewards, dtype=np.float)
    cums = next_estimated_values
        
    for i in range(len_seq-1, -1, -1):
        cums = np.where(batch_dones[:, i], batch_rewards[:, i], gamma * cums + batch_rewards[:, i])
        
        batch_target_values[:, i] = cums
        
    return batch_target_values

In [5]:
class EnvWrapper:
    # Do not have to handle stacked frames externally
    
    def __init__(self, game_name, state_size):
        self.env = gym.make(game_name)
        self.stacked_observations = deque([np.zeros(state_size[:-1]) for _ in range(state_size[-1])],
                                    maxlen=state_size[-1])
        self.nb_actions = self.env.action_space.n
        self.observation_shape = self.env.observation_space.shape
        self.obs_is_image = self.observation_shape == (210, 160, 3)
        
    def step(self, action, render=False):
        observation, reward, done, info = self.env.step(action)
        if self.obs_is_image:
            observation = preprocess_screen(observation, sz_to_process)
        if render:
            self.render()
            
        if done:
            # Reset env and state
            observation = self.env.reset()
            if self.obs_is_image:
                observation = preprocess_screen(observation, sz_to_process)

            stacked_state, self.stacked_observations = stack_observations(self.stacked_observations, observation, True)
        else:
            stacked_state, self.stacked_observations = stack_observations(self.stacked_observations, observation, False)

        return stacked_state, reward, done, info
            
    def reset(self):
        observation = self.env.reset()
        if self.obs_is_image:
            observation = preprocess_screen(observation, sz_to_process)

        stacked_state, self.stacked_observations = stack_observations(self.stacked_observations, observation, True)
        
        return stacked_state
        
    def render(self):
        self.env.render()

In [48]:
def worker(env_remote, worker_remote, env):
    env_remote.close()
    while True:
        cmd, data = worker_remote.recv()

        if cmd == 'step':
            stacked_input, reward, done, info = env.step(data)
            worker_remote.send((stacked_input, reward, done, info))

        elif cmd == 'reset':
            stacked_input = env.reset()
            worker_remote.send(stacked_input)
            
        elif cmd == 'close':
            env_remote.close()

        else:
            raise NotImplementedError         

class VecEnvWrapper:
    def __init__(self, env_list):
        self.env_remotes, self.worker_remotes = zip(*[Pipe() for _ in range(len(env_list))])
        self.processes = [Process(target=worker, args=(e_remote, w_remote, env))
                            for (e_remote, w_remote, env) in zip(self.env_remotes, self.worker_remotes, env_list)]

        for p in self.processes:
            p.daemon = True
            p.start()
        for wr in self.worker_remotes:
            wr.close()
            
    def step(self, actions):   
        for r, a in zip(self.env_remotes, actions):
            r.send(('step', a))
        
        step_outputs = [r.recv() for r in self.env_remotes]

        stacked_inputs, rewards, dones, infos = zip(*step_outputs)

        return stacked_inputs, rewards, dones, infos

    def reset(self):
        for r in self.env_remotes:
            r.send(('reset', None))
        return [r.recv() for r in self.env_remotes]
    
    def close(self):
        for r in self.env_remotes:
            r.send(('close', None))
        for r in self.env_remotes:
            r.close()
        for p in self.processes:
            p.join()

In [16]:
class ActorCriticGraphImages:
    
    def __init__(self, state_size, nb_actions, learning_rate, scope_name):
        self.state_size = state_size
        self.nb_actions = nb_actions
        self.learning_rate = learning_rate
        self.scope_name = scope_name
        
        with tf.variable_scope(self.scope_name):
            self.state = tf.placeholder(tf.float32, (None, *(self.state_size)), name="state")
            self.action = tf.placeholder(tf.uint8, (None,), name="action")
            self.action_OH = tf.one_hot(self.action, self.nb_actions, name="action_OH")

 
            self.target_value = tf.placeholder(tf.float32, (None,), name="target_value")
            self.advantage = tf.placeholder(tf.float32, (None,), name="advantage")
        
            initializer = tf.contrib.layers.xavier_initializer()

            device = '/device:GPU:0' if tf.test.is_gpu_available() else '/device:CPU:0'
            with tf.device(device):
                # Neural net
                conv1_F = tf.Variable(initializer((7, 7, 4, 8)))
                self.conv1 = tf.nn.conv2d(input=self.state,
                                          filter=conv1_F,
                                          strides=(1,4,4,1),
                                          padding="VALID",
                                          name="conv1")
                self.conv1_act = tf.nn.relu(self.conv1)

                conv2_F = tf.Variable(initializer((5, 5, 8, 16)))
                self.conv2 = tf.nn.conv2d(input=self.conv1_act,
                                          filter=conv2_F,
                                          strides=(1,2,2,1),
                                          padding="VALID",
                                          name="conv2")
                self.conv2_act = tf.nn.relu(self.conv2)
            
            
                conv3_F = tf.Variable(initializer((3, 3, 16, 16)))
                self.conv3 = tf.nn.conv2d(input=self.conv2_act,
                                          filter=conv3_F,
                                          strides=(1,2,2,1),
                                          padding="VALID",
                                          name="conv3")
                self.conv3_act = tf.nn.relu(self.conv3)
            
                self.flatten = tf.keras.layers.Flatten()(self.conv3_act)

                # Actor part
                self.fc_actions = tf.keras.layers.Dense(units=512,
                                                        activation=tf.nn.relu,
                                                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                        name="fc_action")(self.flatten)

                self.prob_actions = tf.keras.layers.Dense(units=self.nb_actions,
                                                          activation=tf.nn.softmax,
                                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                          name="action_distribution")(self.fc_actions)

                # Critic part
                self.fc_value = tf.keras.layers.Dense(
                                                units=512,
                                                activation=tf.nn.relu,
                                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                name="fc_value")(self.flatten)

                self.value = tf.keras.layers.Dense(units=1,
                                                   activation=None,
                                                   kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                   name="value")(self.fc_value)
            
                # Losses
                # Actor loss
                self.log_prob_actions = tf.math.log(self.prob_actions)

                self.log_prob_chosen_action = tf.reduce_sum(self.log_prob_actions * self.action_OH, axis=1)
                self.actor_loss = - tf.reduce_mean(self.log_prob_chosen_action * self.advantage)

                # Critic loss
                self.critic_loss = tf.reduce_mean(tf.square(self.target_value - self.value))

                # Entropy: sum(p(x) * -log(p(x)))
                self.entropy = tf.reduce_sum(tf.multiply(self.prob_actions, - self.log_prob_actions))

                # TODO put coeffs as parameters
                self.loss = 0.5 * self.critic_loss + self.actor_loss - 0.01 * self.entropy

                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.train_op = self.optimizer.minimize(self.loss)

In [89]:
class ActorCriticGraphVectors:
    
    def __init__(self, state_size, nb_actions, learning_rate, max_grad_norm=None, scope_name="ACNet"):
        self.state_size = state_size
        self.nb_actions = nb_actions
        self.learning_rate = learning_rate
        self.max_grad_norm = max_grad_norm
        self.scope_name = scope_name
        
        with tf.variable_scope(self.scope_name):
            self.state = tf.placeholder(tf.float32, (None, *(self.state_size)), name="state")
            self.action = tf.placeholder(tf.uint8, (None,), name="action")
            self.action_OH = tf.one_hot(self.action, self.nb_actions, name="action_OH")

 
            self.target_value = tf.placeholder(tf.float32, (None,), name="target_value")
            self.advantage = tf.placeholder(tf.float32, (None,), name="advantage")
        
            initializer = tf.contrib.layers.xavier_initializer()

            device = '/device:GPU:0' if tf.test.is_gpu_available() else '/device:CPU:0'
            with tf.device(device):
                # Neural net
                self.flatten_state = tf.keras.layers.Flatten()(self.state)
                
                # Actor part
                self.fc_actions = tf.keras.layers.Dense(units=10,
                                                        activation=tf.nn.relu,
                                                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                        name="fc_action")(self.flatten_state)

                self.prob_actions = tf.keras.layers.Dense(units=self.nb_actions,
                                                          activation=tf.nn.softmax,
                                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                          name="action_distribution")(self.fc_actions)

                # Critic part
                self.fc_value = tf.keras.layers.Dense(units=10,
                                                      activation=tf.nn.relu,
                                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                      name="fc_value")(self.flatten_state)

                self.value = tf.keras.layers.Dense(units=1,
                                                   activation=None,
                                                   kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                   name="value")(self.fc_value)
            
                # Losses
                # Actor loss
                self.log_prob_actions = tf.math.log(self.prob_actions)

                self.log_prob_chosen_action = tf.reduce_sum(self.log_prob_actions * self.action_OH, axis=1)
                self.actor_loss = - tf.reduce_mean(self.log_prob_chosen_action * self.advantage)

                # Critic loss
                self.critic_loss = tf.reduce_mean(tf.square(self.target_value - self.value))

                # Entropy: sum(p(x) * -log(p(x)))
                self.entropy = tf.reduce_sum(tf.multiply(self.prob_actions, - self.log_prob_actions))

                # TODO put coeffs as parameters
                self.loss = 0.5 * self.critic_loss + self.actor_loss - 0.01 * self.entropy
                
                # Clip norm
                params = tf.trainable_variables(self.scope_name)
                grads = tf.gradients(self.loss, params)
                if max_grad_norm is not None:
                    # Clip the gradients (normalize)
                    grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
                grads = list(zip(grads, params))
        
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                #self.train_op = self.optimizer.minimize(self.loss)
                self.train_op = self.optimizer.apply_gradients(grads)

In [90]:
class ActorCriticTrainer:
    def __init__(self, graph, vec_env, nb_env, eval_env, gamma, state_size, ckpt_file):
        self.graph = graph
        self.vec_env = vec_env
        self.nb_env = nb_env
        self.eval_env = eval_env
        self.nb_actions = self.eval_env.nb_actions
        self.gamma = gamma
        self.state_size = state_size
        self.ckpt_file = ckpt_file
        self.saver = tf.train.Saver()

    def choose_actions(self, input_states, session):
        prob_actions = session.run(self.graph.prob_actions,
                                    feed_dict={self.graph.state: input_states})

        actions = [np.random.choice(np.arange(self.nb_actions), p=p) for p in prob_actions]

        return actions
    
    def get_values(self, input_states, session):
        values = session.run(self.graph.value,
                             feed_dict={self.graph.state: input_states})

        return values.flatten()
        
    def choose_actions_and_get_values(self, input_states, session):
        prob_actions, values = session.run([self.graph.prob_actions, self.graph.value],
                                           feed_dict={self.graph.state: input_states})

        actions = [np.random.choice(np.arange(self.nb_actions), p=p) for p in prob_actions]

        return actions, values.flatten()
        
    def run_episode(self, env_to_run, max_step, render, session):

        # Reset state
        state = env_to_run.reset()        
        total_reward = 0
        
        for step in range(max_step):
            state = np.expand_dims(state, axis=0)
            action = self.choose_actions(state, session)[0]

            # Apply action to env and get next state, reward, and done bool
            state, reward, done, _ = env_to_run.step(action, render)
            total_reward += reward

            if done:
                break

        print("Reward on episode: %f" % total_reward)
        
    def train_on_batch(self, states, actions, target_values, advantages, session):
        loss, _ = session.run([self.graph.loss, self.graph.train_op],
                              feed_dict={self.graph.state: states,
                                         self.graph.action: actions,
                                         self.graph.target_value: target_values,
                                         self.graph.advantage: advantages})
        
        return loss
         
    def play_and_learn(self, n_iterations, steps_per_iteration,
                       evaluate_every, save_every,
                       ckpt_file, session):

        # Reset states
        states = self.vec_env.reset()       

        states_all_env = np.empty((self.nb_env, steps_per_iteration, *self.state_size), dtype=np.float)
        actions_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.int)
        rewards_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.float)
        values_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.float)
        dones_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.bool)
        
        T = time.time()
        for iteration in range(1, n_iterations):

            for step in range(steps_per_iteration):
                actions, values = self.choose_actions_and_get_values(states, session)
                next_states, rewards, dones, _ = self.vec_env.step(actions)

                clipped_rewards = np.clip(rewards, -1, 1)
                                
                states_all_env[:, step] = states
                actions_all_env[:, step] = actions
                rewards_all_env[:, step] = clipped_rewards
                values_all_env[:, step] = values
                dones_all_env[:, step] = dones
                
                states = next_states
            
            # Estimated values for the future
            next_estimated_values = self.get_values(states, session)
            next_estimated_values = np.where(dones, 0, next_estimated_values)

            target_values_all_env = compute_target_values(np.array(rewards_all_env),
                                                          next_estimated_values,
                                                          np.array(dones_all_env),
                                                          self.gamma)
            
            advantages_all_env = target_values_all_env - values_all_env
            
            # Concatenate the experiences from the different envionments
            batch_states = np.concatenate(states_all_env)
            batch_actions = np.concatenate(actions_all_env)
            batch_target_values = np.concatenate(target_values_all_env)
            batch_advantages = np.concatenate(advantages_all_env)
            
            loss = self.train_on_batch(batch_states, batch_actions, batch_target_values, batch_advantages, session)
            
            if iteration % evaluate_every == 0:
                self.run_episode(self.eval_env, 10000, False, session)
                print("Time to play %i iterations: %f" %(iteration, time.time() - T))
                
            if iteration % save_every == 0:
                self.saver.save(session, self.ckpt_file)
                print("Saved model after %i iterations." % iteration)
                
    def restore(self, session, ckpt=None):
        if ckpt is None:
            ckpt = self.ckpt_file
        self.saver.restore(session, ckpt)

In [91]:
vec_env.close()

BrokenPipeError: [Errno 32] Broken pipe

In [94]:
sz_to_process = (4,) #(110,84)
stack_size = 4
state_size = (*sz_to_process, stack_size)

# Create the environments
game_name = 'CartPole-v0' #'LunarLander-v2'  #'Pong-v0'  #'SpaceInvaders-v0'
nb_env = 16
env_list = [EnvWrapper(game_name, state_size) for _ in range(nb_env)]
eval_env = EnvWrapper(game_name, state_size)
for e in env_list:
    e.env._max_episode_steps = 2500
eval_env.env._max_episode_steps = 2500


vec_env = VecEnvWrapper(env_list)

nb_actions = eval_env.nb_actions

gamma = 0.99

n_iterations = int(5e6)
steps_per_iteration = 4
learning_rate = 0.01 #0.00075
max_grad_norm = 0.5

evaluate_every = 5000
save_every = 50000

ckpt_file = "./models/model_cartpole.ckpt"

In [None]:
tf.reset_default_graph()

with tf.Session() as sess:
    acnet = ActorCriticGraphVectors(state_size, nb_actions, learning_rate, max_grad_norm, "ACNet")  # TODO Put lr and grad norm in trainer

    actrainer = ActorCriticTrainer(acnet,
                                   vec_env=vec_env, nb_env=nb_env,
                                   eval_env=eval_env,
                                   gamma=gamma,
                                   state_size=state_size,
                                   ckpt_file=ckpt_file)

    # Setup TensorBoard
    #writer = tf.summary.FileWriter("./tensorboard/", sess.graph)

    sess.run(tf.global_variables_initializer())
    #actrainer.restore(sess, ckpt="./models/model_pong.ckpt")
    actrainer.play_and_learn(n_iterations, steps_per_iteration,
                             evaluate_every, save_every,
                             ckpt_file, sess)

Reward on episode: 180.000000
Time to play 5000 iterations: 78.180474
Reward on episode: 17.000000
Time to play 10000 iterations: 155.509469
Reward on episode: 131.000000
Time to play 15000 iterations: 233.698744
Reward on episode: 222.000000
Time to play 20000 iterations: 312.848570
Reward on episode: 60.000000
Time to play 25000 iterations: 388.877827
Reward on episode: 12.000000
Time to play 30000 iterations: 474.449958
Reward on episode: 20.000000
Time to play 35000 iterations: 550.608641
Reward on episode: 73.000000
Time to play 40000 iterations: 628.217841
Reward on episode: 28.000000
Time to play 45000 iterations: 705.565625
Reward on episode: 237.000000
Time to play 50000 iterations: 780.526319
Saved model after 50000 iterations.
Reward on episode: 48.000000
Time to play 55000 iterations: 855.541730


In [43]:
tf.reset_default_graph()

with tf.Session() as sess:
    acnet = ActorCriticGraphVectors(state_size, nb_actions, learning_rate, "ACNet")
    
    actrainer = ActorCriticTrainer(acnet,
                                   vec_env=vec_env, nb_env=nb_env,
                                   eval_env=eval_env,
                                   gamma=gamma,
                                   state_size=state_size,
                                   ckpt_file=ckpt_file)
    
    actrainer.restore(sess)
    actrainer.run_episode(eval_env, 10000, True, sess)

INFO:tensorflow:Restoring parameters from ./models/model_cartpole.ckpt
Reward on episode: 43.000000
