## This notebook is intended to solve the CartPole-v0 problem with the Advantage Actor Critic (A2C).

It will be a benchmark to future algorithms devoloped by Guilherme Viveiros.


> Solved Requirements for CartPole: Considered solved when the average return is greater than or equal to 195.0 over 100 consecutive trials.

In [255]:
import gym #envrionment to test the algorithms
import numpy as np #vector calculations
import tensorflow as tf #tensor / ML operations
#pip install jdc
import os #operative system
from tqdm import tqdm #progress bar
from tqdm import trange

import collections #collect experiences from real environment to sample within DQN
import random #random environment's
import math#math
import time #time 

from typing import Any, List, Sequence, Tuple #to define custom function returns

# Small epsilon value for stabilizing division operations
eps = np.finfo(np.float32).eps.item()

## CartPole-V0 environment

In [256]:
env = gym.make('CartPole-v0')

## Let's define the core part, the Agent

In [257]:
class ActorCritic(tf.keras.models.Model):
    def __init__(
        self,
        number_of_actions : int,
        number_hidden_units : int
        #**kargs
    ):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(number_hidden_units,activation='relu')
        self.dense2 = tf.keras.layers.Dense(number_hidden_units*2,activation='relu')
        self.actor =  tf.keras.layers.Dense(number_of_actions,name='Actor')
        self.critic =  tf.keras.layers.Dense(1,name='Critic')
    
    #return the action logits as the actor and the value state as the critic
    def call(self,inputs : tf.Tensor) -> Tuple[tf.Tensor,tf.Tensor]:
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.actor(x),self.critic(x)
    
action_space = env.action_space.n
number_hidden_units = 64
model = ActorCritic(action_space,number_hidden_units)

## Perform an action and change to another state

> Also, tf_env_step it's the wrapper function that transforms env_step into a tensorflow function to be added in tf.graph 

In [258]:
#returns the next state, the associated reward and a boolean indicating if the agent reached the terminal state
#I receive and output arrays because I'm using tf.numpy_function to wrap env_step to graph_mode and this functions
#expects that the python function receives as its arguments an array and returns arrays as its outputs
def env_step(action : np.array) -> Tuple[np.array,np.array,np.array]:
    state, reward, done, _ = env.step(action)
    return ( state.astype(np.float32),
             np.array(reward,dtype=np.int32),
             np.array(done,dtype=np.int32)
    )
    
def tf_env_step(action : tf.Tensor ) -> List[tf.Tensor]:
    return tf.numpy_function(func = env_step, inp = [action], Tout = [tf.float32,tf.int32,tf.int32])

## Main function

In [259]:
#return array containing rewards, values and the associated action probabilities of each state
#.mark_used() in graph time as to be gone
def run_episode(initial_state : tf.Tensor , model : tf.keras.Model, max_steps : int ) -> List[tf.Tensor]:
    
    rewards = tf.TensorArray(dtype = tf.int32, size = 0, dynamic_size=True)
    values = tf.TensorArray(dtype = tf.float32, size = 0, dynamic_size=True)
    action_probs_t = tf.TensorArray(dtype = tf.float32, size = 0, dynamic_size=True)
    
    
    initial_state_shape = initial_state.shape
    state = initial_state
    
    #Iterate until the terminal state, True == 1
    for step in tf.range(max_steps):
        
        state = tf.expand_dims(state,0)
        
        #Agent choose the action according to the actor network
        action_logits, state_value = model(state)
        
        #use multionomial function from tensorflow probability to sample a given action from action_logits
        action = tf.random.categorical(logits=action_logits,num_samples=1)[0,0]
        
        #action probabilities
        action_probs = tf.nn.softmax(action_logits)
        
        #append value and proability actions associated
        values = values.write(step,tf.squeeze(state_value))#.mark_used
        action_probs_t = action_probs_t.write(step,action_probs[0,action])#.mark_used
        
        #new state after executing the action chosen by the agent
        state,reward,done = tf_env_step(action)
        #state = tf.reshape(state,[1,4])
        state.set_shape(initial_state_shape)
        
                
        #append reward associated
        rewards = rewards.write(step,reward)#.mark_used

        
        if tf.cast(done,tf.bool):
            break
        
    action_probs_t = action_probs_t.stack()
    values = values.stack()
    rewards = rewards.stack()
        
    return rewards,values,action_probs_t

#rewards,values,action_probs = run_episode(tf.constant(env.reset(), dtype = tf.float32) , model, 200)

### Advantage Actor Critic with Monte Carlo Updates

In this case instead of $Q(at,st)$ and  subsequently $ Rt + V(st') $ is replaced by $Gt$, the actual final reward.
   
> $A(at,st) = Q(at,st) - V(st)$
  
 Since it useless to define a DNN to predict value states and q values (value-action pairs), we can use the Bellman equations optimallity in our advantage
  
  > $ Q*(at,st) = E[Rt + V*(st')] $ (1)
  
 So
  
>$ A(at,st) = Rt + V(st') - V(st) $ (2) when using TD(0) updates\
>$ A(at,st) = Gt - V(st) $ (2) when using MC updates

And this is simply the TD Error
  
  Now I only need a NN to predict value states.
  
  The $E$ in equation (1) stands for expectation, it's the expectation cumulative reward of follwing $at$ in $st$. It's removed in equation **2** because I'm using TD(0), a boostrap method. Since I sample every step following the current policy I'm receiving the true rewards, thats why we can remove the expectation symbol in (2).

In [260]:
#.mark_used() in graph time as to be gone
def compute_cumulative_rewards(
        rewards : tf.Tensor,
        gamma : float,
        standardize : bool = True
    ) -> [tf.Tensor]:
    
    #G(t) = R(t+1) + γ*R(t+2) + γ^2*R(t+3) + ...
    #Cumulative reward -> G(t) = R + y*G(t+1)
    
    n = tf.shape(rewards)[0]
    rewards = tf.cast(rewards[::-1],tf.float32)
    
    
    cumulative_returns = tf.TensorArray(dtype=tf.float32,size = n)
    gamma = tf.cast(gamma,tf.float32)
    
    cumulative_reward = tf.constant(0.0)
    cumulative_reward_shape = cumulative_reward.shape
    
    

    for i in tf.range(n):
        cumulative_reward = rewards[i] + gamma * cumulative_reward
        #to ensure a known shape in graph time
        cumulative_reward.set_shape(cumulative_reward_shape)
        cumulative_returns = cumulative_returns.write(i,cumulative_reward)#.mark_used() 
        
    
    cumulative_returns = cumulative_returns.stack()[::-1]
    
    if(standardize):
        mean,std = tf.math.reduce_mean(cumulative_returns),tf.math.reduce_std(cumulative_returns)
        #use epsilon to ensure non-zero divisions
        cumulative_returns = (cumulative_returns - mean)/(std + eps)
    
    return cumulative_returns

#cumulative_returns = compute_cumulative_rewards(rewards,gamma)

### Loss Function

In [261]:
#I will use the huber loss as the critic loss
#as reductin technique, instead of aggregating and computing the mean, I sum up the errors of each step within an episode
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
        rewards : tf.Tensor,
        values : tf.Tensor,
        action_probs : tf.Tensor
    ) ->  tf.Tensor :
    """ Computes the combined actor-critic loss with advantage as the baseline"""
    
    #compute the associated advantage 
    advantage = rewards - values
    
    #actor loss
    log_action = tf.math.log(action_probs)
    actor_loss = -tf.reduce_sum(log_action * advantage)
    
    #critic loss
    critic_loss = huber_loss(rewards,values)
    
    return actor_loss + critic_loss

loss = compute_loss(cumulative_returns,values,action_probs)

## Main Part

> Defining the training step to update parameters

In [262]:
lr = 1e-2
optimizer = tf.keras.optimizers.Adam(lr=lr)

@tf.function
def train_step(
        model : tf.keras.Model,
        initial_state : tf.Tensor,
        optimizer : tf.keras.optimizers.Optimizer,
        gamma : float,
        max_steps : int
    ) -> tf.Tensor:
    """ Runs a model training step """
    
    with tf.GradientTape() as tape:
        
        #tape.watch(model.trainable_variables)
        #run an episode
        #action_probs, values, rewards = run_episode(initial_state,model,max_steps)
        rewards, values, action_probs = run_episode(initial_state,model,max_steps)
        
        #cumpute the cumulative rewards
        #cumulative_rewards = compute_cumulative_rewards(rewards,gamma)
        cumulative_rewards = compute_cumulative_rewards(rewards,gamma)
        
        #Convert training data to appropriate TF tensor shapes
        #the previous return as the shape -> (steps,) , so ensure to change it to (steps,1)
        action_probs, values, cumulative_rewards = [
            tf.expand_dims(x, 1) for x in [action_probs, values, cumulative_rewards]] 
        
        #compute the loss
        loss = compute_loss(cumulative_rewards,values,action_probs)        
    
    #comput the gradients 
    gradients = tape.gradient(loss,model.trainable_variables)
    #apply the gradients
    optimizer.apply_gradients(zip(gradients,model.trainable_variables))
    
    #return the associated reward
    episode_reward = tf.math.reduce_sum(rewards)
    
    return episode_reward

#reward = train_step(model,tf.constant(env.reset(),dtype=tf.float32),optimizer,tf.Variable(0.99),20)

## Running the training loop

In [263]:
checkpoint_directory = "/tmp/training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    actor_critic_model = model,
)

# saving (checkpoint) the model every 2 epochs
#checkpoint.save(file_prefix = checkpoint_prefix)

#restore the last checkpoint
#status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))

In [264]:
%%time 

max_episodes = 10000
max_steps_per_episode = 200
gamma = 0.99
weighted_moving_average = 0

reward_threshold = 195

with trange(max_episodes) as t:
    #each episode
    for i in t:
        #retrieve the initial state
        initial_state = tf.constant(env.reset(),dtype=tf.float32)
        #episode reward
        episode_reward = int (train_step(model,initial_state,optimizer,gamma,max_steps_per_episode))
        
        weighted_moving_average = episode_reward * 0.01 + weighted_moving_average * 0.99
        
        '''
        #exponential scheduling
        if(i % 200 == 0 and i > 0 and optimizer.learning_rate >= 1e-4):
            optimizer.learning_rate  = optimizer.learning_rate * 0.1
            tf.print("Learning changed to {0}".format(optimizer.learning_rate))
        '''
        
        t.set_description(f'Episode {i}')
        t.set_postfix(
            episode_reward=episode_reward, weighted_moving_average=weighted_moving_average
        )
        
        #I'm using a weighted moving average with B = 0.99, so it is equivelent to an average over 100 episodes
        if(weighted_moving_average >= reward_threshold):  
            break

print(f'\nSolved at episode {i}: average reward: {weighted_moving_average:.2f}!')

Episode 3474:  35%|███▍      | 3474/10000 [01:02<01:57, 55.58it/s, episode_reward=200, weighted_moving_average=195]  


Solved at episode 3474: average reward: 195.03!
CPU times: user 1min 14s, sys: 4.33 s, total: 1min 18s
Wall time: 1min 2s





In [117]:
## solved at episode 659, best so far

## Display Visiualization

In [116]:
num_episodes_to_display = 10
for i_episode in range(num_episodes_to_display):
    state = env.reset()
    for t in range(200):
        env.render()
        state = tf.expand_dims(tf.constant(state,dtype=tf.float32),0)
        action_logits, state_value = model(state)  
        #use multionomial function from tensorflow probability to sample a given action from action_logits
        action = tf.random.categorical(logits=action_logits,num_samples=1)[0,0]
        
        state, reward, done, info = env.step(action.numpy())
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()


Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps
Episode finished after 200 timesteps


## Render an episode and save as a GIF file

> Not working yet

In [63]:
# Render an episode and save as a GIF file

from IPython import display as ipythondisplay
from PIL import Image
from pyvirtualdisplay import Display


display = Display(visible=0, size=(400, 300))
display.start()


def render_episode(env: gym.Env, model: tf.keras.Model, max_steps: int): 
    
    screen = env.render(mode='rgb_array')
    im = Image.fromarray(screen)
    
    images = [im]

    state = tf.constant(env.reset(), dtype=tf.float32)
    
    for i in range(1, max_steps + 1):
        state = tf.expand_dims(state, 0)
        action_probs, _ = model(state)
        action = np.argmax(np.squeeze(action_probs))

        state, _, done, _ = env.step(action)
        state = tf.constant(state, dtype=tf.float32)

        # Render screen every 10 steps
        if i % 10 == 0:
            screen = env.render(mode='rgb_array')
            images.append(Image.fromarray(screen))

        if done:
            break

    return images


# Save GIF image
images = render_episode(env, model, 200)
#image_file = 'cartpole-v0.gif'
# loop=0: loop forever, duration=1: play each frame for 1ms
#images[0].save(
#    image_file, save_all=True, append_images=images[1:], loop=0, duration=1)

#import tensorflow_docs.vis.embed as embed
#embed.embed_file(image_file)

EasyProcessError: start error <EasyProcess cmd_param=['Xvfb', '-help'] cmd=['Xvfb', '-help'] oserror=[Errno 2] No such file or directory: 'Xvfb' return_code=None stdout="None" stderr="None" timeout_happened=False>