## RL with Soft Actor Critic (SAC)

In [None]:
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay

!pip install matplotlib
!pip install tf-agents[reverb]
!pip install pyglet xvfbwrapper
!pip install pybullet
!pip install tf-keras


!pip install dm-reverb[tensorflow]
!sudo apt-get install swig
!pip install gym[box2d]

In [None]:
import os
# Keep using keras-2 (tf-keras) rather than keras-3 (keras).
# NOTE(review): this runs in its own cell ahead of the cell that imports tensorflow; presumably the
# flag is only honoured if it is set before that import — confirm before reordering cells.
os.environ['TF_USE_LEGACY_KERAS'] = '1'

## Setup

In [None]:
import base64
import imageio
import IPython
import matplotlib.pyplot as plt
import os
import reverb
import tempfile
import PIL.Image
import pyvirtualdisplay


import tensorflow as tf

from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.agents.sac import tanh_normal_projection_network
from tf_agents.environments import suite_pybullet, suite_gym
from tf_agents.metrics import py_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import greedy_policy
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_py_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.train import actor
from tf_agents.train import learner
from tf_agents.train import triggers
from tf_agents.train.utils import spec_utils
from tf_agents.train.utils import strategy_utils
from tf_agents.train.utils import train_utils

# Base directory under which this notebook writes its summaries, checkpoints and saved policies.
tempdir = tempfile.gettempdir()


# Virtual (off-screen) display so that gym environments can be rendered without a physical screen.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900))
display.start()

## Hyperparameters

In [None]:

# Environment registries.
# SAC requires a continuous action space, so every entry must be a continuous-control task.
# Each registry maps an environment id to a config dict with two keys:
#   "env_kwarg":   keyword arguments forwarded to the environment constructor, or None for defaults
#   "description": human-readable summary that is printed when the environment is loaded

PY_BULLET_ENVS = {
    "MinitaurBulletEnv-v0": {
        # Must be spelled "env_kwarg" (same as in GYM_ENVS): that is the key the loading code reads.
        "env_kwarg": None,
        "description": """This environment is a empty land with a robot, the goal is to control the Minitaur robot and have it move forward as fast as possible.""",
    },
}

GYM_ENVS = {
    "LunarLander-v2": {
        # Other constructor options that can be added here:
        # "gravity": -10.0, "enable_wind": False, "wind_power": 15.0, "turbulence_power": 1.5
        "env_kwarg": {"continuous": True},
        "description": """This environment is a classic rocket trajectory optimization problem, the goal is to help the pad to land safely. [full](https://www.gymlibrary.dev/environments/box2d/lunar_lander/)""",
    },
}

In [None]:


# Environment to train on; must be a key of PY_BULLET_ENVS or GYM_ENVS.
env_name = "LunarLander-v2" # @param ["LunarLander-v2",  "MinitaurBulletEnv-v0"]


# Number of iterations of the training loop (one collect step + one learner step per iteration).
# Use "num_iterations = 1e6" for better results (2 hrs)
# 1e5 is just so this doesn't take too long (1 hr)
num_iterations = 100_000 # @param {type:"integer"}

# Steps collected with the random policy to seed the replay buffer before training starts.
initial_collect_steps = 10_000 # @param {type:"integer"}
# Environment steps to collect per training iteration.
collect_steps_per_iteration = 1 # @param {type:"integer"}
# Maximum number of items kept in the Reverb table (oldest items are removed first).
replay_buffer_capacity = 10000 # @param {type:"integer"}

# Number of sampled trajectories per training batch.
batch_size = 256 # @param {type:"integer"}


# @markdown ---
# @markdown ### Agent hyperparams:
# Agent hyperparams
# Learning rates of the three Adam optimizers (critic, actor, entropy coefficient alpha).
critic_learning_rate = 3e-4 # @param {type:"number"}
actor_learning_rate = 3e-4 # @param {type:"number"}
alpha_learning_rate = 3e-4 # @param {type:"number"}
# Target-network update settings passed to the SAC agent.
target_update_tau = 0.005 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
# Discount factor and reward scaling passed to the SAC agent.
gamma = 0.99 # @param {type:"number"}
reward_scale_factor = 1.0 # @param {type:"number"}


# @markdown ---
# @markdown ### Actor NN hyperparams:
# actor NN hyperparams
# Sizes of the actor's fully connected layers.
actor_fc_layer_params = (256,  256) # @param
actor_dropout_layer_params = 0.15     # @param {type:"slider", min:0, max:1, step:0.05}

# NOTE: the slider value above is overridden with None just below, i.e. actor dropout is disabled.
# Uncomment the next line (and remove the None assignment) to expand the slider value to one
# dropout probability per fully connected layer.
# actor_dropout_layer_params = [actor_dropout_layer_params] * len(actor_fc_layer_params) if (not actor_fc_layer_params == None) else None
actor_dropout_layer_params = None


# @markdown ---
# @markdown ### Critic NN hyperparams:
# critic NN hyperparams
# Layer sizes of the critic's action branch, observation branch and joint (merged) layers.
critic_action_fc_layer_params = (64, ) # @param
critic_observation_fc_layer_params = (64, ) # @param
critic_joint_fc_layer_params = (128, 256)   # @param
critic_observation_dropout_layer_params = 0.15     # @param {type:"slider", min:0, max:1, step:0.05}
critic_action_dropout_layer_params = 0.15          # @param {type:"slider", min:0, max:1, step:0.05}

# NOTE: both slider values above are overridden with None below, i.e. critic dropout is disabled.
# Uncomment the two lines below (and remove the None assignments) to use one dropout probability per layer.
# critic_observation_dropout_layer_params = [critic_observation_dropout_layer_params] * len(critic_observation_fc_layer_params) if (not critic_observation_fc_layer_params == None) else None
# critic_action_dropout_layer_params      = [critic_action_dropout_layer_params] * len(critic_action_fc_layer_params) if (not critic_action_fc_layer_params == None) else None

critic_observation_dropout_layer_params = None
critic_action_dropout_layer_params = None


# @markdown ---
# @markdown ### training eval and log params:
#training eval and log params
# Print/log the training loss every `log_interval` train steps.
log_interval = 5000 # @param {type:"integer"}
# Number of episodes per evaluation run.
num_eval_episodes = 20 # @param {type:"integer"}
# Evaluate the policy every `eval_interval` train steps.
eval_interval = 10000 # @param {type:"integer"}
# Save the policy every `policy_save_interval` train steps.
policy_save_interval = 5000 # @param {type:"integer"}


#GPU support
use_gpu  = True #@param {type:"boolean"}
# Distribution strategy under whose scope the networks and the agent are created.
strategy = strategy_utils.get_strategy(tpu=False, use_gpu=use_gpu)

## Environment

Load and examine the environment.

In [None]:



# Load the environment selected by `env_name` from whichever suite registers it.
# An unknown name fails immediately with an explicit error; genuine load failures propagate with
# their own traceback instead of being swallowed and resurfacing later as an unrelated error.
current_suite = None
ENV_CONFIG = None

if env_name in PY_BULLET_ENVS:
  current_suite = suite_pybullet
  ENV_CONFIG = PY_BULLET_ENVS[env_name]
  suite_label = "suite_pybullet"
elif env_name in GYM_ENVS:
  current_suite = suite_gym
  ENV_CONFIG = GYM_ENVS[env_name]
  suite_label = "suite_gym"
else:
  raise ValueError(f"""Something went wrong:: {env_name} is not present in either Pybullet or Gym """)

# .get(): a config without an "env_kwarg" entry simply loads the environment with its defaults.
env = current_suite.load(env_name, gym_kwargs=ENV_CONFIG.get("env_kwarg"))
print(f""" {env_name} loaded from {suite_label} """)


print( f""" ** Description ** :: { ENV_CONFIG.get("description") } \n\n""" )

#examine spec
env.reset()
# NOTE: despite its name, this variable holds only the *observation* part of the time-step spec.
time_step_spec = env.time_step_spec().observation
action_spec = env.action_spec()

print('Observation Spec:')
print(time_step_spec)
print('\nAction Spec:')
print(action_spec)


#render env (the image must stay the last expression of the cell so the notebook displays it)
print('\n', '--------------------'*5, 'render environment')
PIL.Image.fromarray(env.render())

In [None]:
def get_random_action(seed = None):
  """Draw one action uniformly at random from the bounds of the global `action_spec`.

  Args:
    seed: Optional seed forwarded to `tf.random.uniform`.

  Returns:
    A tensor with the shape and dtype of `action_spec`, with values drawn between
    `action_spec.minimum` and `action_spec.maximum`.
  """
  return tf.random.uniform(
      shape=action_spec.shape,
      minval=action_spec.minimum,
      maxval=action_spec.maximum,
      dtype=action_spec.dtype,
      seed=seed,
      name=None)

# Reset the environment and show the first time step.
time_step = env.reset()
print('Time step:')
print(time_step)

# Apply one uniformly random action and show the resulting time step.
action = get_random_action()

next_time_step = env.step(action.numpy())
print('Next time step:')
print(next_time_step)


# Define separate training (collect) and evaluation environments.
# .get() instead of indexing: a config without an "env_kwarg" entry loads the environment with
# default arguments instead of raising KeyError.
env_kwargs = ENV_CONFIG.get("env_kwarg")
collect_env = current_suite.load(env_name, gym_kwargs=env_kwargs)
eval_env    = current_suite.load(env_name, gym_kwargs=env_kwargs)

## Agent [Critic + Actor]

The `CriticNetwork` will give an estimation of Q(s,a)


The `ActorNetwork` will predict parameters for a tanh-squashed [MultivariateNormalDiag](https://www.tensorflow.org/probability/api_docs/python/tfp/distributions/MultivariateNormalDiag) distribution. This distribution will then be sampled, conditioned on the current observation, whenever we need to generate actions.

In [None]:
# Tensor specs of the collect environment: observation, action and full time step.
observation_spec, action_spec, time_step_spec = spec_utils.get_tensor_specs(collect_env)


# Critic network: estimates Q(s, a), i.e. for a {state, action} pair it outputs how good the action is.
with strategy.scope():
  critic_net = critic_network.CriticNetwork(
      input_tensor_spec=(observation_spec, action_spec),
      # Observation branch (layer sizes / dropout may be None).
      observation_fc_layer_params=critic_observation_fc_layer_params,
      observation_dropout_layer_params=critic_observation_dropout_layer_params,
      # Action branch (layer sizes / dropout may be None).
      action_fc_layer_params=critic_action_fc_layer_params,
      action_dropout_layer_params=critic_action_dropout_layer_params,
      # Layers applied after the two branches are merged.
      joint_fc_layer_params=critic_joint_fc_layer_params,
      kernel_initializer='glorot_uniform',
      last_kernel_initializer='glorot_uniform',
      # No output activation; use tf.keras.activations.tanh to bound Q(s, a) to [-1, +1].
      output_activation_fn=None,
  )





# Actor network: maps an observation to a tanh-squashed normal distribution over actions.
with strategy.scope():
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      input_tensor_spec  = observation_spec,
      output_tensor_spec = action_spec,
      fc_layer_params=actor_fc_layer_params,
      # Wire in the actor dropout hyperparameter so that it actually takes effect when set.
      # While it is None (its current value) no dropout is applied, which matches the library default.
      dropout_layer_params=actor_dropout_layer_params,
      continuous_projection_net=(tanh_normal_projection_network.TanhNormalProjectionNetwork)
      )




# Build the SAC agent from the critic and actor networks.
with strategy.scope():
  # Global step counter shared by the agent, the actors and the learner.
  train_step = train_utils.create_train_step()

  # One Adam optimizer each for the actor, the critic and the entropy coefficient alpha.
  actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_learning_rate)
  critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_learning_rate)
  alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_learning_rate)

  tf_agent = sac_agent.SacAgent(
      time_step_spec,
      action_spec,
      actor_network=actor_net,
      critic_network=critic_net,
      actor_optimizer=actor_optimizer,
      critic_optimizer=critic_optimizer,
      alpha_optimizer=alpha_optimizer,
      target_update_tau=target_update_tau,
      target_update_period=target_update_period,
      td_errors_loss_fn=tf.math.squared_difference,
      gamma=gamma,
      reward_scale_factor=reward_scale_factor,
      train_step_counter=train_step,
  )

  tf_agent.initialize()

## Replay Buffer

In [None]:
# Reverb table: uniform sampling, oldest-first (FIFO) removal once `replay_buffer_capacity` is reached.
table_name = 'uniform_table'
table = reverb.Table(
    table_name,
    max_size=replay_buffer_capacity,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
)

# Local Reverb server hosting the table.
reverb_server = reverb.Server([table])


# sequence_length=2 because the SAC agent needs both the current and the next observation to compute the loss.
reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
    tf_agent.collect_data_spec,
    sequence_length=2,
    table_name=table_name,
    local_server=reverb_server,
)


# Expose the replay buffer as a TensorFlow dataset of 2-step batches.
dataset = reverb_replay.as_dataset(sample_batch_size=batch_size, num_steps=2).prefetch(100)


def experience_dataset_fn():
  """Return the replay-buffer dataset; handed to the Learner to sample training experience."""
  return dataset

## Policies

In TF-Agents, policies represent the standard notion of policies in RL: given a `time_step` produce an action or a distribution over actions. The main method is `policy_step = policy.step(time_step)` where `policy_step` is a named tuple `PolicyStep(action, state, info)`.  The `policy_step.action` is the `action` to be applied to the environment, `state` represents the state for stateful (RNN) policies and `info` may contain auxiliary information such as log probabilities of the actions.


In [None]:
# Evaluation policy: the agent's main policy, also the one used for deployment.
tf_eval_policy = tf_agent.policy
eval_policy = py_tf_eager_policy.PyTFEagerPolicy(
    tf_eval_policy,
    use_tf_function=True,
)


# Collect policy: the policy used to gather experience during training.
tf_collect_policy = tf_agent.collect_policy
collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
    tf_collect_policy,
    use_tf_function=True,
)


# Random policy built from the collect environment's specs.
random_policy = random_py_policy.RandomPyPolicy(
    collect_env.time_step_spec(),
    collect_env.action_spec(),
)

## Actors
The actor manages interactions between a policy and an environment.
  * The Actor components contain an instance of the environment (as `py_environment`) and a copy of the policy variables.
  * Each Actor worker runs a sequence of data collection steps given the local values of the policy variables.
  * Variable updates are done explicitly using the variable container client instance in the training script before calling `actor.run()`.
  * The observed experience is written into the replay buffer in each data collection step.

As the Actors run data collection steps, they pass trajectories of (state, action, reward) to the observer, which caches and writes them to the Reverb replay system.

We're storing trajectories for frames [(t0,t1) (t1,t2) (t2,t3), ...] because `stride_length=1`.

In [None]:
# Observer handed to the actors: it receives the collected trajectories and writes them to the
# Reverb table `table_name` through the replay buffer's Python client.
# sequence_length=2 matches the sequence length the replay buffer was created with;
# stride_length=1 makes consecutive stored pairs overlap: (t0,t1), (t1,t2), (t2,t3), ...
rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
  reverb_replay.py_client,
  table_name,
  sequence_length=2,
  stride_length=1)

In [None]:
# Seed the replay buffer: run the random policy for `initial_collect_steps` environment steps.
initial_collect_actor = actor.Actor(
  collect_env,
  random_policy,
  train_step,
  steps_per_run=initial_collect_steps,
  observers=[rb_observer])
initial_collect_actor.run()



# Actor that gathers more experience with the collect policy during training.
# Each run() collects `collect_steps_per_iteration` steps (previously hard-coded to 1, which made
# that hyperparameter a no-op).
env_step_metric = py_metrics.EnvironmentSteps()
collect_actor = actor.Actor(
  collect_env,
  collect_policy,
  train_step,
  steps_per_run=collect_steps_per_iteration,
  metrics=actor.collect_metrics(10),
  summary_dir=os.path.join(tempdir, learner.TRAIN_DIR),
  observers=[rb_observer, env_step_metric])



# Actor used to evaluate the policy during training: each run() plays `num_eval_episodes` episodes.
# actor.eval_metrics(num_eval_episodes) provides the metrics that are read and logged later.
eval_actor = actor.Actor(
  eval_env,
  eval_policy,
  train_step,
  episodes_per_run=num_eval_episodes,
  metrics=actor.eval_metrics(num_eval_episodes),
  summary_dir=os.path.join(tempdir, 'eval'),
)

## Learners
The Learner component contains the agent and performs gradient step updates to the policy variables using experience data from the replay buffer. After one or more training steps, the Learner can push a new set of variable values to the variable container.

In [None]:
# Directory that receives the saved policies.
saved_model_dir = os.path.join(tempdir, learner.POLICY_SAVED_MODEL_DIR)

# Trigger that saves the agent's policy every `policy_save_interval` train steps.
policy_save_trigger = triggers.PolicySavedModelTrigger(
    saved_model_dir,
    tf_agent,
    train_step,
    interval=policy_save_interval,
)
# Trigger that logs training throughput every 1000 train steps.
steps_per_second_trigger = triggers.StepPerSecondLogTrigger(train_step, interval=1000)

learning_triggers = [policy_save_trigger, steps_per_second_trigger]

# The Learner applies gradient updates to the agent using experience from `experience_dataset_fn`.
agent_learner = learner.Learner(
    tempdir,
    train_step,
    tf_agent,
    experience_dataset_fn,
    triggers=learning_triggers,
    strategy=strategy,
)

## Metrics and Evaluation

We instantiated the eval Actor with `actor.eval_metrics` above, which creates the most commonly used metrics during policy evaluation:
* Average return. The return is the sum of rewards obtained while running a policy in an environment for an episode, and we usually average this over a few episodes.
* Average episode length.

We run the Actor to generate these metrics.

In [None]:

import datetime

# Root directory of the TensorBoard logs written by this notebook.
log_dir="logs/"

# Every run gets its own timestamped sub-directory: logs/fit/<YYYYmmdd-HHMMSS>.
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
summary_writer = tf.summary.create_file_writer(f"{log_dir}fit/{run_timestamp}")



def get_eval_metrics():
  """Run the evaluation actor once and collect its metrics.

  Returns:
    A dict mapping each metric's name to its current result.
  """
  eval_actor.run()
  return {metric.name: metric.result() for metric in eval_actor.metrics}


def log_eval_metrics(step, metrics, is_training = False):
  """Print evaluation metrics and, during training, also write them as TensorBoard scalars.

  Args:
    step: Train step the metrics belong to.
    metrics: Dict mapping metric name to a numeric result.
    is_training: When True, each metric is additionally written to `summary_writer`
      under the tag "<name>_eval".

  Returns:
    None.
  """
  formatted = [f'{name} = {result:.6f}' for name, result in metrics.items()]
  summary_line = ', '.join(formatted)
  print(f'step = {step}: {summary_line}')

  # Outside of training only the console output is wanted.
  if not is_training:
    return

  with summary_writer.as_default():
    for name, result in metrics.items():
      tf.summary.scalar(name + "_eval", result, step=step)




# Baseline: evaluate the still-untrained policy and print the result as step 0.
# is_training keeps its default (False), so nothing is written to TensorBoard here.
metrics = get_eval_metrics()
log_eval_metrics(0, metrics)




## Training the agent

The training loop involves both collecting data from the environment and optimizing the agent's networks. Along the way, we will occasionally evaluate the agent's policy to see how we are doing.

In [None]:

try:
  %%time
except:
  pass


%load_ext tensorboard
%tensorboard --logdir {log_dir}

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = get_eval_metrics()["AverageReturn"]
returns = [avg_return]

for _ in range(num_iterations):
  # Training.
  collect_actor.run()
  loss_info = agent_learner.run(iterations=1)

  # Evaluating.
  step = agent_learner.train_step_numpy

  if eval_interval and step % eval_interval == 0:
    metrics = get_eval_metrics()
    log_eval_metrics(step, metrics, is_training = True)
    returns.append(metrics["AverageReturn"])

  if log_interval and step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, loss_info.loss.numpy()))

    with summary_writer.as_default():
      tf.summary.scalar('batch_loss', loss_info.loss.numpy(), step=step)

rb_observer.close()
reverb_server.stop()

## Visualization


### Plots

We can plot average return vs global steps to see the performance of our agent. When `env_name` is `MinitaurBulletEnv-v0`, the reward function is based on how far the minitaur walks in 1000 steps and penalizes the energy expenditure.

In [None]:


# One x value per recorded evaluation: returns[0] is the evaluation made before training (step 0)
# and every later entry was appended `eval_interval` train steps after the previous one.
# Deriving the axis from len(returns) keeps x and y the same length even if training was
# interrupted before `num_iterations` was reached.
steps = [index * eval_interval for index in range(len(returns))]
plt.plot(steps, returns)
plt.ylabel('Average Return')
plt.xlabel('Step')
plt.show()

### Videos

It is helpful to visualize the performance of an agent by rendering the environment at each step. Before we do that, let us first create a function to embed videos in this colab.

In [None]:



def embed_mp4(filename):
  """Embeds an mp4 file in the notebook.

  Args:
    filename: Path of the mp4 file to embed.

  Returns:
    An HTML `<video>` tag (str) that carries the file's content inline as a base64 data URI.
  """
  # Context manager so the file handle is closed deterministically.
  with open(filename, 'rb') as video_file:
    video = video_file.read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="620" height="470" controls autoplay>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return tag



def embed_multiple_mp4(filenames, titles=None):
  """Build an HTML snippet that shows several mp4 videos side by side.

  Args:
    filenames: Paths of the mp4 files to embed, in display order.
    titles: Optional captions, one per video. Missing captions are shown as empty
      strings and surplus captions are ignored. The passed list is never modified.

  Returns:
    An HTML string: a flex row containing one titled `<video>` block per file.
  """
  # Work on a private copy: the previous `titles=[]` default combined with an in-place `+=`
  # modified the caller's list (and the shared default list) on every call.
  titles = list(titles) if titles else []
  titles += [""] * (len(filenames) - len(titles))  # pad; a negative count adds nothing
  titles = titles[:len(filenames)]                  # drop surplus captions

  video_tags = ""
  for filename, title in zip(filenames, titles):
    div = f"""
      <div style="margin-inline: 1%;">
        <p> {title} </p>
        {embed_mp4(filename)}
      </div>
    """
    video_tags += div

  html = f"""
  <div style="display: flex; flex-direction: row; align-items: center; width: 100%; ">
    {video_tags}
  </div>
  """
  return html




def display_video(html_video):
  """Wrap an HTML string in an `IPython.display.HTML` object so the notebook renders it.

  Args:
    html_video: HTML markup (str) to render.

  Returns:
    The `IPython.display.HTML` object built from `html_video`.
  """
  rendered = IPython.display.HTML(html_video)
  return rendered

The following code visualizes the agent's policy for a few episodes:

In [None]:
# Dedicated environment for video recording, with episodes capped at 15000 steps.
# .get() instead of indexing: a config without an "env_kwarg" entry loads the environment with
# default arguments instead of raising KeyError.
video_eval_env = current_suite.load(env_name, gym_kwargs=ENV_CONFIG.get("env_kwarg"), max_episode_steps=15000)

def gen_video(video_filename = env_name, num_episodes = 5, is_random = False, fps=60):
  """Record episodes played in `video_eval_env` into an mp4 file.

  Args:
    video_filename: Output file name without extension.
    num_episodes: Number of episodes to record.
    is_random: If True, actions come from `get_random_action()` and "_random" is appended
      to the file name; otherwise actions come from the evaluation actor's policy.
    fps: Frame rate of the written video.

  Returns:
    The name of the written file (including the ".mp4" extension).
  """
  suffix = "_random.mp4" if is_random else ".mp4"
  output_path = video_filename + suffix

  def choose_action(current_step):
    """Pick the next action from the random sampler or from the evaluation policy."""
    if is_random:
      return get_random_action().numpy()
    return eval_actor.policy.action(current_step).action

  recording_env = video_eval_env
  with imageio.get_writer(output_path, fps=fps) as writer:
    for _episode in range(num_episodes):
      current_step = recording_env.reset()
      writer.append_data(recording_env.render())  # first frame of the episode
      while not current_step.is_last():
        current_step = recording_env.step(choose_action(current_step))
        writer.append_data(recording_env.render())

  return output_path


# Record one video with the trained policy and one with uniformly random actions.
trained_video_fname = gen_video(is_random=False)
random_video_fname = gen_video(is_random=True)
saved_video_fnames = [trained_video_fname, random_video_fname]

# Show both videos side by side; the display call stays last so the notebook renders its result.
videos_html = embed_multiple_mp4(
    filenames=saved_video_fnames,
    titles=['trained policy Agent', 'uniform random policy Agent'],
)
display_video(videos_html)

