In [2]:
import tensorflow as tf
import numpy as np
import gym

from skimage import transform
from skimage.color import rgb2gray
import matplotlib.pyplot as plt

from collections import deque
import random

from multiprocessing import Process, Pool
from multiprocessing import sharedctypes

from threading import Thread

import time

In [3]:
def preprocess_screen(screen, sz_to_process):
    gray = rgb2gray(screen)
    cropped_screen = gray[8:-12,5:-12]  # For Space Invaders, TODO make it cleaner
    
    preprocessed_screen = transform.resize(cropped_screen, sz_to_process)
    
    return preprocessed_screen

In [5]:
def stack_frames(stacked_frames, screen, is_new_episode):
    assert isinstance(stacked_frames, deque), "stacked_frames has not type deque"
    sz_to_process = stacked_frames[0].shape
    frame = preprocess_screen(screen, sz_to_process)
    # Not tested yet:
    frame = np.maximum(frame, stacked_frames[-1])
    if is_new_episode:
        for _ in range(len(stacked_frames)):
            stacked_frames.append(frame)

    else:
        stacked_frames.append(frame)
        
    input_state = np.stack(stacked_frames, axis=2)
    
    return input_state, stacked_frames

In [6]:
def compute_target_values(rewards, next_estimated_value, is_done, gamma):
    target_values = np.zeros_like(rewards, dtype=np.float)
    cum = next_estimated_value
    end_idx = len(rewards) - 1
    
    rev_rewards = reversed(rewards)
    rev_is_done = reversed(is_done)
    
    for i, (r, done) in enumerate(zip(rev_rewards, rev_is_done)):
        if done:
            cum = r
        else:
            cum = gamma * cum + r
        target_values[end_idx - i] = cum
            
    return target_values

In [13]:
class EnvWrapper:
    # Do not have to handle stacked frames externally
    
    def __init__(self, game_name, state_size, frame_skipping):
        self.env = gym.make(game_name)
        self.stacked_frames = deque([np.zeros(state_size[:-1], dtype=np.float) for _ in range(state_size[-1])],
                                     maxlen=state_size[-1])
        self.nb_actions = self.env.action_space.n
        self.frame_skipping = frame_skipping
        
    def step(self, action, render=False):
        for _ in range(self.frame_skipping):
            screen, reward, done, info = self.env.step(action)
            if render:
                self.render()
            if done:
                break
        if done:
            # Reset env and state
            screen = self.env.reset()
            stacked_input, self.stacked_frames = stack_frames(self.stacked_frames, screen, True)
        else:
            stacked_input, self.stacked_frames = stack_frames(self.stacked_frames, screen, False)

        return stacked_input, reward, done, info
            
    def reset(self):
        screen = self.env.reset()
        stacked_input, self.stacked_frames = stack_frames(self.stacked_frames, screen, True)
        
        return stacked_input
        
    def render(self):
        self.env.render()

In [8]:
class ActorCriticGraph:
    
    def __init__(self, state_size, nb_actions, learning_rate, scope_name):
        self.state_size = state_size
        self.nb_actions = nb_actions
        self.learning_rate = learning_rate
        self.scope_name = scope_name
        
        with tf.variable_scope(self.scope_name):
            self.state = tf.placeholder(tf.float32, (None, *(self.state_size)), name="state")
            self.action = tf.placeholder(tf.uint8, (None,), name="action")
            self.action_OH = tf.one_hot(self.action, self.nb_actions, name="action_OH")

 
            self.target_value = tf.placeholder(tf.float32, (None,), name="target_value")
            self.advantage = tf.placeholder(tf.float32, (None,), name="advantage")
                        
            # Neural net
            self.conv1 = tf.layers.conv2d(inputs=self.state,
                                          filters=32,
                                          kernel_size=(8,8),
                                          strides=(4,4),
                                          padding="valid",
                                          activation=tf.nn.relu,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="conv1")
            
            self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                          filters=64,
                                          kernel_size=(4,4),
                                          strides=(2,2),
                                          padding="valid",
                                          activation=tf.nn.relu,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="conv2")
            
            self.conv3 = tf.layers.conv2d(inputs=self.conv2,
                                          filters=64,
                                          kernel_size=(3,3),
                                          strides=(2,2),
                                          padding="valid",
                                          activation=tf.nn.relu,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="conv3")
            
            self.flatten = tf.layers.flatten(self.conv3)
            
            # Actor part
            self.fc_actions = tf.layers.dense(self.flatten,
                                              units=512,
                                              activation=tf.nn.relu,
                                              kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                              name="fc_action")
            
            self.prob_actions = tf.layers.dense(self.fc_actions,
                                                units=self.nb_actions,
                                                activation=tf.nn.softmax,
                                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                name="action_distribution")
            
            # Critic part
            self.fc_value = tf.layers.dense(self.flatten,
                                            units=512,
                                            activation=tf.nn.relu,
                                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                            name="fc_value")
            
            self.value = tf.layers.dense(self.fc_value,
                                         units=1,
                                         activation=None,
                                         kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                         name="value")
            
            # Losses
            # Actor loss
            self.log_prob_actions = tf.math.log(self.prob_actions)
            
            self.log_prob_chosen_action = tf.reduce_sum(self.log_prob_actions * self.action_OH, axis=1)
            self.actor_loss = - tf.reduce_mean(self.log_prob_chosen_action * self.advantage)
            
            # Critic loss
            self.critic_loss = tf.reduce_mean(tf.square(self.target_value - self.value))

            # Entropy: sum(p(x) * -log(p(x)))
            self.entropy = tf.reduce_sum(tf.multiply(self.prob_actions, - self.log_prob_actions))

            # TODO put coeffs as parameters
            self.loss = 0.5 * self.critic_loss + self.actor_loss - 0.01 * self.entropy

            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

In [9]:
class ActorCriticTrainer:
    def __init__(self, graph, env, eval_env, gamma, state_size, ckpt_file):
        self.graph = graph
        self.env = env
        self.nb_actions = self.env.nb_actions
        #self.env_list = env_list
        self.eval_env = eval_env
        #self.nb_actions = self.env_list[0].nb_actions
        self.gamma = gamma
        self.state_size = state_size
        self.ckpt_file = ckpt_file
        self.saver = tf.train.Saver()

    def choose_action(self, input_state, session):
        prob_actions = session.run(self.graph.prob_actions,
                                    feed_dict={self.graph.state: np.expand_dims(input_state, axis=0)})

        action = np.random.choice(np.arange(nb_actions),
                                  p=np.ravel(prob_actions))
        
        return action
    
    def get_val(self, input_state, session):
        value = session.run(self.graph.value,
                            feed_dict={self.graph.state: np.expand_dims(input_state, axis=0)})

        return value.flatten()[0]
        
    def choose_action_and_get_val(self, input_state, session):
        prob_actions, value = session.run([self.graph.prob_actions, self.graph.value],
                                           feed_dict={self.graph.state: np.expand_dims(input_state, axis=0)})

        action = np.random.choice(np.arange(nb_actions),
                                  p=np.ravel(prob_actions))

        return action, value.flatten()[0]
        
    def run_episode(self, env_to_run, max_step, render, session):

        # Reset state
        state = env_to_run.reset()        
        total_reward = 0
        
        for step in range(max_step):
            action = self.choose_action(state, session)

            # Apply action to env and get next state, reward, and done bool
            state, reward, done, _ = env_to_run.step(action, render)
            total_reward += reward

            if done:
                break

        print("Reward on episode: %f" % total_reward)
        
    def train_on_batch(self, states, actions, target_values, advantages, session):
        loss, _ = session.run([self.graph.loss, self.graph.train_op],
                              feed_dict={self.graph.state: states,
                                         self.graph.action: actions,
                                         self.graph.target_value: target_values,
                                         self.graph.advantage: advantages})
        
        return loss
         
    def play_and_learn(self, n_iterations, steps_per_iteration,
                       evaluate_every, save_every,
                       ckpt_file, session):

        # Reset states
        state = self.env.reset()       

        for iteration in range(1, n_iterations):
            
            states, actions, rewards, values = [], [], [], []
            is_done = np.zeros(steps_per_iteration, dtype=bool)
            
            for step in range(steps_per_iteration):
                action, value = self.choose_action_and_get_val(state, session)
                next_state, reward, done, _ = self.env.step(action)

                clipped_reward = np.clip(reward, -1, 1)
                
                states.append(state)
                actions.append(action)
                rewards.append(clipped_reward)
                values.append(value)
                
                state = next_state

                if done:
                    is_done[step] = True
                    
            # Target values
            next_estimated_value = 0 if done else self.get_val(state, session)
            target_values = compute_target_values(rewards, next_estimated_value, is_done, self.gamma)
            
            advantages = target_values - values
                                    
            loss = self.train_on_batch(states, actions, target_values, advantages, session)
                
            if iteration % evaluate_every == 0:
                self.run_episode(self.eval_env, 10000, False, session)
                
            if iteration % save_every == 0:
                self.saver.save(session, self.ckpt_file)
                print("Saved model after %i iterations." % iteration)
                
    def restore(self, session):
        self.saver.restore(session, self.ckpt_file)

In [14]:
sz_to_process = (110,84)
stack_size = 4
state_size = (*sz_to_process, stack_size)

frame_skipping = 2

# Create the environments
env = EnvWrapper('SpaceInvaders-v0', state_size, frame_skipping)
eval_env = EnvWrapper('SpaceInvaders-v0', state_size, frame_skipping)

nb_actions = env.nb_actions

gamma = 0.95

n_iterations = int(5e6)
steps_per_iteration = 8
learning_rate = 0.00025

evaluate_every = 2500
save_every = 10000

ckpt_file = "./models/model.ckpt"

In [None]:
tf.reset_default_graph()

with tf.Session() as sess:
    acnet = ActorCriticGraph(state_size, nb_actions, learning_rate, "ACNet")
    
    actrainer = ActorCriticTrainer(acnet,
                                   env=env,
                                   eval_env=eval_env,
                                   gamma=gamma,
                                   state_size=state_size,
                                   ckpt_file=ckpt_file)

    # Setup TensorBoard
    #writer = tf.summary.FileWriter("./tensorboard/", sess.graph)
    
    #sess.run(tf.global_variables_initializer())
    actrainer.restore(sess)
    actrainer.play_and_learn(n_iterations, steps_per_iteration,
                             evaluate_every, save_every,
                             ckpt_file, sess)

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Reward on episode: 35.000000
Reward on episode: 30.000000
Reward on episode: 155.000000
Reward on episode: 70.000000
Saved model after 10000 iterations.
Reward on episode: 90.000000
Reward on episode: 15.000000
Reward on episode: 60.000000
Reward on episode: 50.000000
Saved model after 20000 iterations.
Reward on episode: 120.000000
Reward on episode: 50.000000
Reward on episode: 80.000000


In [49]:
tf.reset_default_graph()

with tf.Session() as sess:
    acnet = ActorCriticGraph(state_size, nb_actions, learning_rate, "ACNet")
    
    actrainer = ActorCriticTrainer(acnet,
                                   env=env,
                                   eval_env=eval_env,
                                   gamma=gamma,
                                   state_size=state_size,
                                   ckpt_file=ckpt_file)
    
    actrainer.restore(sess)
    actrainer.run_episode(eval_env, 10000, True, sess)

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Reward on episode: 155.000000
