In [1]:
import tensorflow as tf
import numpy as np
import gym

from skimage import transform
from skimage.color import rgb2gray
import matplotlib.pyplot as plt

from collections import deque
import random

from multiprocessing import Process, Pipe

import time
import datetime

In [2]:
def preprocess_screen(screen, sz_to_process):
    print(screen.shape)
    print(screen)
    gray = rgb2gray(screen)
    #cropped_screen = gray[8:-12,5:-12]  # For Space Invaders, TODO make it cleaner
    cropped_screen = gray[25:-10,5:-5]  # For Pong, TODO make it cleaner
    preprocessed_screen = transform.resize(cropped_screen, sz_to_process, mode='constant', anti_aliasing=True)
    
    return preprocessed_screen

In [3]:
def stack_observations(stacked_observations, observation, is_new_episode):
    assert isinstance(stacked_observations, deque), "stacked_observations has not type deque"
    sz_to_process = stacked_observations[0].shape  #TODO make it cleaner
        
    if is_new_episode:
        for _ in range(len(stacked_observations)):
            stacked_observations.append(observation)
    else:
        stacked_observations.append(observation)
    
    last_axis = len(stacked_observations[0].shape)
    state = np.stack(stacked_observations, axis=last_axis)
                    
    return state, stacked_observations

In [4]:
def compute_target_values(batch_rewards, next_estimated_values, batch_dones, gamma):
    nb_seq, len_seq = batch_rewards.shape
    batch_target_values = np.zeros_like(batch_rewards, dtype=np.float)
    cums = next_estimated_values
        
    for i in range(len_seq-1, -1, -1):
        cums = np.where(batch_dones[:, i], batch_rewards[:, i], gamma * cums + batch_rewards[:, i])
        
        batch_target_values[:, i] = cums
        
    return batch_target_values

In [5]:
class EnvWrapper:
    # Do not have to handle stacked frames externally
    
    def __init__(self, game_name, state_size):
        self.env = gym.make(game_name)
        self.stacked_observations = deque([np.zeros(state_size[:-1]) for _ in range(state_size[-1])],
                                    maxlen=state_size[-1])
        self.nb_actions = self.env.action_space.n
        self.observation_shape = self.env.observation_space.shape
        self.obs_is_image = self.observation_shape == (210, 160, 3)
        
    def step(self, action, render=False):
        observation, reward, done, info = self.env.step(action)
        if self.obs_is_image:
            observation = preprocess_screen(observation, sz_to_process)
        if render:
            self.render()
            
        if done:
            # Reset env and state
            observation = self.env.reset()
            if self.obs_is_image:
                observation = preprocess_screen(observation, sz_to_process)

            stacked_state, self.stacked_observations = stack_observations(self.stacked_observations, observation, True)
        else:
            stacked_state, self.stacked_observations = stack_observations(self.stacked_observations, observation, False)

        return stacked_state, reward, done, info
            
    def reset(self):
        observation = self.env.reset()
        if self.obs_is_image:
            observation = preprocess_screen(observation, sz_to_process)

        stacked_state, self.stacked_observations = stack_observations(self.stacked_observations, observation, True)
        
        return stacked_state
        
    def render(self):
        self.env.render()

In [6]:
def worker(env_remote, worker_remote, env):
    env_remote.close()
    while True:
        cmd, data = worker_remote.recv()

        if cmd == 'step':
            stacked_input, reward, done, info = env.step(data)
            worker_remote.send((stacked_input, reward, done, info))

        elif cmd == 'reset':
            stacked_input = env.reset()
            worker_remote.send(stacked_input)
            
        elif cmd == 'close':
            worker_remote.close()

        else:
            raise NotImplementedError         

class VecEnvWrapper:
    def __init__(self, env_list):
        self.env_remotes, self.worker_remotes = zip(*[Pipe() for _ in range(len(env_list))])
        self.processes = [Process(target=worker, args=(e_remote, w_remote, env))
                            for (e_remote, w_remote, env) in zip(self.env_remotes, self.worker_remotes, env_list)]

        for p in self.processes:
            p.daemon = True
            p.start()
        for wr in self.worker_remotes:
            wr.close()
            
    def step(self, actions):   
        for r, a in zip(self.env_remotes, actions):
            r.send(('step', a))
        
        step_outputs = [r.recv() for r in self.env_remotes]

        stacked_inputs, rewards, dones, infos = zip(*step_outputs)

        return stacked_inputs, rewards, dones, infos

    def reset(self):
        for r in self.env_remotes:
            r.send(('reset', None))
        return [r.recv() for r in self.env_remotes]
    
    def close(self):
        for r in self.env_remotes:
            r.send(('close', None))
        for r in self.env_remotes:
            r.close()
        for p in self.processes:
            p.join()

In [7]:
class ActorCriticGraphImages:
    
    def __init__(self, state_size, nb_actions, learning_rate, scope_name):
        self.state_size = state_size
        self.nb_actions = nb_actions
        self.learning_rate = learning_rate
        self.scope_name = scope_name
        
        with tf.variable_scope(self.scope_name):
            self.state = tf.placeholder(tf.float32, (None, *(self.state_size)), name="state")
            self.action = tf.placeholder(tf.uint8, (None,), name="action")
            self.action_OH = tf.one_hot(self.action, self.nb_actions, name="action_OH")

 
            self.target_value = tf.placeholder(tf.float32, (None,), name="target_value")
            self.advantage = tf.placeholder(tf.float32, (None,), name="advantage")
        
            initializer = tf.contrib.layers.xavier_initializer()

            device = '/device:GPU:0' if tf.test.is_gpu_available() else '/device:CPU:0'
            with tf.device(device):
                # Neural net
                conv1_F = tf.Variable(initializer((7, 7, 4, 8)))
                self.conv1 = tf.nn.conv2d(input=self.state,
                                          filter=conv1_F,
                                          strides=(1,4,4,1),
                                          padding="VALID",
                                          name="conv1")
                self.conv1_act = tf.nn.relu(self.conv1)

                conv2_F = tf.Variable(initializer((5, 5, 8, 16)))
                self.conv2 = tf.nn.conv2d(input=self.conv1_act,
                                          filter=conv2_F,
                                          strides=(1,2,2,1),
                                          padding="VALID",
                                          name="conv2")
                self.conv2_act = tf.nn.relu(self.conv2)
            
            
                conv3_F = tf.Variable(initializer((3, 3, 16, 16)))
                self.conv3 = tf.nn.conv2d(input=self.conv2_act,
                                          filter=conv3_F,
                                          strides=(1,2,2,1),
                                          padding="VALID",
                                          name="conv3")
                self.conv3_act = tf.nn.relu(self.conv3)
            
                self.flatten = tf.keras.layers.Flatten()(self.conv3_act)

                # Actor part
                self.fc_actions = tf.keras.layers.Dense(units=512,
                                                        activation=tf.nn.relu,
                                                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                        name="fc_action")(self.flatten)

                self.prob_actions = tf.keras.layers.Dense(units=self.nb_actions,
                                                          activation=tf.nn.softmax,
                                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                          name="action_distribution")(self.fc_actions)

                # Critic part
                self.fc_value = tf.keras.layers.Dense(
                                                units=512,
                                                activation=tf.nn.relu,
                                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                name="fc_value")(self.flatten)

                self.value = tf.keras.layers.Dense(units=1,
                                                   activation=None,
                                                   kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                   name="value")(self.fc_value)
            
                # Losses
                # Actor loss
                self.log_prob_actions = tf.math.log(self.prob_actions)

                self.log_prob_chosen_action = tf.reduce_sum(self.log_prob_actions * self.action_OH, axis=1)
                self.actor_loss = - tf.reduce_mean(self.log_prob_chosen_action * self.advantage)

                # Critic loss
                self.critic_loss = tf.reduce_mean(tf.square(self.target_value - self.value))

                # Entropy: sum(p(x) * -log(p(x)))
                self.entropy = tf.reduce_sum(tf.multiply(self.prob_actions, - self.log_prob_actions))

                # TODO put coeffs as parameters
                self.loss = 0.5 * self.critic_loss + self.actor_loss - 0.01 * self.entropy

                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.train_op = self.optimizer.minimize(self.loss)

In [112]:
class ActorCriticGraphVectors:
    
    def __init__(self, state_size, nb_actions, learning_rate, max_grad_norm=None, scope_name="ACNet"):
        self.state_size = state_size
        self.nb_actions = nb_actions
        self.learning_rate = learning_rate
        self.max_grad_norm = max_grad_norm
        self.scope_name = scope_name
        
        with tf.variable_scope(self.scope_name):
            self.state = tf.placeholder(tf.float32, (None, *(self.state_size)), name="state")
            self.action = tf.placeholder(tf.uint8, (None,), name="action")
            self.action_OH = tf.one_hot(self.action, self.nb_actions, name="action_OH")

 
            self.target_value = tf.placeholder(tf.float32, (None,), name="target_value")
            self.advantage = tf.placeholder(tf.float32, (None,), name="advantage")
        
            initializer = tf.contrib.layers.xavier_initializer()

            device = '/device:GPU:0' if tf.test.is_gpu_available() else '/device:CPU:0'
            with tf.device(device):
                # Neural net
                self.flatten_state = tf.keras.layers.Flatten()(self.state)
                
                # Actor part
                self.fc_actions = tf.keras.layers.Dense(units=32,
                                                        activation=tf.nn.relu,
                                                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                        name="fc_action")(self.flatten_state)

                self.prob_actions = tf.keras.layers.Dense(units=self.nb_actions,
                                                          activation=tf.nn.softmax,
                                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                          name="action_distribution")(self.fc_actions)

                # Critic part
                self.fc_value = tf.keras.layers.Dense(units=32,
                                                      activation=tf.nn.relu,
                                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                      name="fc_value")(self.flatten_state)

                self.value = tf.keras.layers.Dense(units=1,
                                                   activation=None,
                                                   kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                   name="value")(self.fc_value)
        
        with tf.variable_scope("Losses"):
            # Actor loss
            self.log_prob_actions = tf.math.log(self.prob_actions)

            self.log_prob_chosen_action = tf.reduce_sum(self.log_prob_actions * self.action_OH, axis=1)
            self.actor_loss = - tf.reduce_mean(self.log_prob_chosen_action * self.advantage)

            # Critic loss
            self.critic_loss = tf.reduce_mean(tf.square(self.target_value - self.value))

            # Entropy: sum(p(x) * -log(p(x)))
            self.entropy = tf.reduce_sum(tf.multiply(self.prob_actions, - self.log_prob_actions))

            # TODO put coeffs as parameters
            self.loss = 0.5 * self.critic_loss + self.actor_loss - 0.001 * self.entropy

            # Optimization
            #self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.9)
                
            #params = tf.trainable_variables(self.scope_name)
            grads_vars = self.optimizer.compute_gradients(self.loss)
            grads = [gv[0] for gv in grads_vars]
            params = [gv[1] for gv in grads_vars]
            if max_grad_norm is not None:
                # Clip the gradients
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))

            # Summaries
            tf.summary.scalar('actor_loss', self.actor_loss)
            tf.summary.scalar('critic_loss', self.critic_loss)
            tf.summary.scalar('entropy', self.entropy)
            tf.summary.scalar('loss', self.loss)
            self.merged_summaries = tf.summary.merge_all()

            #self.train_op = self.optimizer.minimize(self.loss)
            self.train_op = self.optimizer.apply_gradients(grads)

In [113]:
class ActorCriticTrainer:
    def __init__(self, graph, vec_env, nb_env, eval_env, gamma, state_size, ckpt_file, session):
        self.graph = graph
        self.vec_env = vec_env
        self.nb_env = nb_env
        self.eval_env = eval_env
        self.nb_actions = self.eval_env.nb_actions
        self.gamma = gamma
        self.state_size = state_size
        self.ckpt_file = ckpt_file
        self.session = session
        self.saver = tf.train.Saver()
        self.summary_writer = tf.summary.FileWriter("./tensorboard/", self.session.graph)

    def choose_actions(self, input_states):
        prob_actions = self.session.run(self.graph.prob_actions,
                                        feed_dict={self.graph.state: input_states})

        actions = [np.random.choice(np.arange(self.nb_actions), p=p) for p in prob_actions]

        return actions
    
    def get_values(self, input_states):
        values = self.session.run(self.graph.value,
                                  feed_dict={self.graph.state: input_states})

        return values.flatten()
        
    def choose_actions_and_get_values(self, input_states):
        prob_actions, values = self.session.run([self.graph.prob_actions, self.graph.value],
                                                feed_dict={self.graph.state: input_states})

        actions = [np.random.choice(np.arange(self.nb_actions), p=p) for p in prob_actions]

        return actions, values.flatten()
        
    def run_episode(self, env_to_run, max_step, render):

        # Reset state
        state = env_to_run.reset()        
        total_reward = 0
        
        for step in range(max_step):
            state = np.expand_dims(state, axis=0)
            action = self.choose_actions(state)[0]

            # Apply action to env and get next state, reward, and done bool
            state, reward, done, _ = env_to_run.step(action, render)
            total_reward += reward

            if done:
                break

        return total_reward
    
    def evaluate_agent(self, env_to_run, max_step, n_trials):
        rewards = []
        for _ in range(n_trials):
            rewards.append(self.run_episode(env_to_run, max_step, False))
        avg_reward = np.mean(rewards)
        std_reward = np.std(rewards)
        
        return avg_reward, std_reward
        
    def train_on_batch(self, states, actions, target_values, advantages, iteration):
        summaries, _ = self.session.run([self.graph.merged_summaries, self.graph.train_op],
                                         feed_dict={self.graph.state: states,
                                                    self.graph.action: actions,
                                                    self.graph.target_value: target_values,
                                                    self.graph.advantage: advantages})
        if iteration % 250 == 0:
            self.summary_writer.add_summary(summaries, iteration)
         
    def play_and_learn(self, n_iterations, steps_per_iteration,
                       evaluate_every, save_every,
                       ckpt_file):

        # Reset states
        states = self.vec_env.reset()       

        states_all_env = np.empty((self.nb_env, steps_per_iteration, *self.state_size), dtype=np.float)
        actions_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.int)
        rewards_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.float)
        values_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.float)
        dones_all_env = np.empty((self.nb_env, steps_per_iteration), dtype=np.bool)
        
        T = time.time()
        for iteration in range(1, n_iterations):

            for step in range(steps_per_iteration):
                actions, values = self.choose_actions_and_get_values(states)
                next_states, rewards, dones, _ = self.vec_env.step(actions)

                clipped_rewards = np.clip(rewards, -1, 1)
                                
                states_all_env[:, step] = states
                actions_all_env[:, step] = actions
                rewards_all_env[:, step] = clipped_rewards
                values_all_env[:, step] = values
                dones_all_env[:, step] = dones
                
                states = next_states
            
            # Estimated values for the future
            next_estimated_values = self.get_values(states)
            next_estimated_values = np.where(dones, 0, next_estimated_values)

            target_values_all_env = compute_target_values(np.array(rewards_all_env),
                                                          next_estimated_values,
                                                          np.array(dones_all_env),
                                                          self.gamma)
            
            advantages_all_env = target_values_all_env - values_all_env
            
            # Concatenate the experiences from the different envionments
            batch_states = np.concatenate(states_all_env)
            batch_actions = np.concatenate(actions_all_env)
            batch_target_values = np.concatenate(target_values_all_env)
            batch_advantages = np.concatenate(advantages_all_env)
            
            self.train_on_batch(batch_states, batch_actions, batch_target_values, batch_advantages, iteration)
            
            if iteration % evaluate_every == 0:
                avg_reward, std_reward = self.evaluate_agent(self.eval_env, 10000, 10)
                print("Average reward over 10 trials: %f (+- %f)" % (avg_reward, std_reward))
                print("Time to play %i iterations: %s" %(iteration, str(datetime.timedelta(seconds=time.time() - T))))
                
            if iteration % save_every == 0:
                self.saver.save(self.session, self.ckpt_file)
                print("Saved model after %i iterations." % iteration)
                
    def restore(self, ckpt=None):
        if ckpt is None:
            ckpt = self.ckpt_file
        self.saver.restore(self.session, ckpt)

In [138]:
vec_env.close()

BrokenPipeError: [Errno 32] Broken pipe

In [119]:
sz_to_process = (4,) #(110,84)
stack_size = 2
state_size = (*sz_to_process, stack_size)

# Create the environments
game_name = 'CartPole-v0' #'LunarLander-v2'  #'Pong-v0'  #'SpaceInvaders-v0'
nb_env = 16
env_list = [EnvWrapper(game_name, state_size) for _ in range(nb_env)]
eval_env = EnvWrapper(game_name, state_size)
for e in env_list:
    e.env._max_episode_steps = 2500
eval_env.env._max_episode_steps = 2500


vec_env = VecEnvWrapper(env_list)

nb_actions = eval_env.nb_actions

gamma = 0.95

n_iterations = int(5e6)
steps_per_iteration = 32
learning_rate = 7e-4 #0.00075
max_grad_norm = 0.5

evaluate_every = 250
save_every = 2500

ckpt_file = "./models/model_cartpole.ckpt"

Process Process-163:
Process Process-164:
Process Process-158:
Process Process-160:
Process Process-165:
Process Process-162:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Process Process-157:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process Process-161:
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process Process-15

  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "<ipython-input-6-3dcfb0764467>", line 8, in worker
    worker_remote.send((stacked_input, reward, done, info))
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "<ipython-input-6-3dcfb0764467>", line 4, in worker
    cmd, data = worker_remote.recv()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "<ipython-input-6-3dcfb0764467>", line 4, in worker
    cmd, data = worker_remote.recv()
  File "/usr/lib/python3.5/multiprocess

In [120]:
tf.reset_default_graph()

with tf.Session() as sess:
    acnet = ActorCriticGraphVectors(state_size, nb_actions, learning_rate, max_grad_norm, "ACNet")  # TODO Put lr and grad norm in trainer

    actrainer = ActorCriticTrainer(acnet,
                                   vec_env=vec_env, nb_env=nb_env,
                                   eval_env=eval_env,
                                   gamma=gamma,
                                   state_size=state_size,
                                   ckpt_file=ckpt_file,
                                   session=sess)

    sess.run(tf.global_variables_initializer())
    #actrainer.restore(sess, ckpt="./models/model_pong.ckpt")
    actrainer.play_and_learn(n_iterations, steps_per_iteration,
                             evaluate_every, save_every,
                             ckpt_file)

Average reward over 10 trials: 65.500000 (+- 35.878266)
Time to play 250 iterations: 0:00:22.670544
Average reward over 10 trials: 154.500000 (+- 64.527901)
Time to play 500 iterations: 0:00:46.551461
Average reward over 10 trials: 199.700000 (+- 149.970030)
Time to play 750 iterations: 0:01:10.621818
Average reward over 10 trials: 205.000000 (+- 62.486799)
Time to play 1000 iterations: 0:01:34.049691
Average reward over 10 trials: 500.800000 (+- 333.120939)
Time to play 1250 iterations: 0:01:58.933450
Average reward over 10 trials: 254.100000 (+- 114.835056)
Time to play 1500 iterations: 0:02:22.607178
Average reward over 10 trials: 365.200000 (+- 236.246397)
Time to play 1750 iterations: 0:02:46.941789
Average reward over 10 trials: 156.700000 (+- 89.213284)
Time to play 2000 iterations: 0:03:10.042778
Average reward over 10 trials: 222.000000 (+- 52.074946)
Time to play 2250 iterations: 0:03:34.070725
Average reward over 10 trials: 181.300000 (+- 42.666263)
Time to play 2500 iterati

Average reward over 10 trials: 412.000000 (+- 285.918170)
Time to play 19750 iterations: 0:32:50.773338
Average reward over 10 trials: 301.700000 (+- 122.293949)
Time to play 20000 iterations: 0:33:14.998080
Saved model after 20000 iterations.
Average reward over 10 trials: 336.200000 (+- 182.283186)
Time to play 20250 iterations: 0:33:39.057260
Average reward over 10 trials: 315.900000 (+- 139.325841)
Time to play 20500 iterations: 0:34:03.206326
Average reward over 10 trials: 298.800000 (+- 105.363941)
Time to play 20750 iterations: 0:34:27.930250
Average reward over 10 trials: 265.300000 (+- 80.992654)
Time to play 21000 iterations: 0:34:54.310464
Average reward over 10 trials: 473.800000 (+- 211.576842)
Time to play 21250 iterations: 0:35:19.599437
Average reward over 10 trials: 819.500000 (+- 398.514303)
Time to play 21500 iterations: 0:35:47.082050
Average reward over 10 trials: 460.900000 (+- 246.737289)
Time to play 21750 iterations: 0:36:11.938902
Average reward over 10 trials

KeyboardInterrupt: 

In [137]:
tf.reset_default_graph()

with tf.Session() as sess:
    acnet = ActorCriticGraphVectors(state_size, nb_actions, learning_rate, max_grad_norm, "ACNet")  # TODO Put lr and grad norm in trainer

    actrainer = ActorCriticTrainer(acnet,
                                   vec_env=vec_env, nb_env=nb_env,
                                   eval_env=eval_env,
                                   gamma=gamma,
                                   state_size=state_size,
                                   ckpt_file=ckpt_file,
                                   session=sess)
    
    actrainer.restore()
    actrainer.run_episode(eval_env, 10000, True)

INFO:tensorflow:Restoring parameters from ./models/model_cartpole.ckpt


In [140]:
eval_env.env.close()