In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf
tf.compat.v1.enable_v2_behavior()

from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_pybullet
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import normal_projection_network
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.environments import suite_gym
import gym


In [None]:
# --- Experiment configuration ---------------------------------------------
# Gym id of the environment that every suite_gym.load() call uses.
env_name = "fishing-v1"

# Length of training and amount of experience collected.
num_iterations = 10 ** 5            # passes through the training loop
initial_collect_steps = 10 ** 4     # steps gathered before training starts
collect_steps_per_iteration = 1     # num_steps of the per-iteration driver
replay_buffer_capacity = 10 ** 6    # max_length of the replay buffer

# Number of samples drawn from the replay buffer for each training batch.
batch_size = 256

# Learning rates of the Adam optimizers for critic, actor and alpha.
critic_learning_rate = actor_learning_rate = alpha_learning_rate = 3e-4

# Remaining hyperparameters, handed to SacAgent unchanged.
# presumably tau is the soft-update factor for the target networks and
# period is the number of train steps between those updates -- TODO confirm
# against the SacAgent documentation.
target_update_tau = 0.005
target_update_period = 1
gamma = 0.99                        # discount
reward_scale_factor = 1.0           # factor for reward
gradient_clipping = None            # None: no clipping value is supplied

# One entry per fully connected layer; presumably each entry is that layer's
# unit count (two layers of 256 units) -- TODO confirm.
actor_fc_layer_params = (256, 256)
critic_joint_fc_layer_params = (256, 256)

# Reporting cadence, measured in agent train steps.
log_interval = 5000                 # print the loss every log_interval steps
eval_interval = 10 ** 4             # evaluate the policy every eval_interval steps
num_eval_episodes = 30              # episodes averaged per evaluation

In [None]:
#load environment
# Builds a standalone copy of the environment, used only to inspect its specs.
# NOTE(review): "fishing-v1" looks like a custom (non-core) Gym id; presumably
# the package that registers it must be imported before this call -- confirm
# that this cell works on a fresh kernel.
env = suite_gym.load(env_name)
env.reset()
# Optional visual check of the initial state (left disabled):
#PIL.Image.fromarray(env.render())

In [None]:
#read action and observation specs
# Each heading is printed before its spec is fetched, one pair at a time.
for heading, fetch_spec in (
    ("Observation spec", lambda: env.time_step_spec().observation),
    ("Action spec", lambda: env.action_spec()),
):
  print(heading)
  print(fetch_spec())

In [None]:
# Two independent Python environments: one for collecting training data and
# one reserved for evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# Wrap both in TFPyEnvironment; the wrapped versions are what the agent,
# drivers and compute_avg_return use in the rest of the notebook.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [None]:
#Critic Network
# The critic receives an (observation, action) pair; the specs are read from
# the training environment and reused by the actor and the agent.
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()
critic_net = critic_network.CriticNetwork(
    (observation_spec, action_spec),
    # No separate layer stacks are configured for the observation or the
    # action input.
    observation_fc_layer_params=None,
    action_fc_layer_params=None,
    # joint is the fully connected stack applied after the observation and
    # action layers.
    joint_fc_layer_params=critic_joint_fc_layer_params)

In [None]:
#Normal distribution - creates an action distribution based on observations
def normal_projection_net(action_spec, init_means_output_factor=0.1):
  """Build the NormalProjectionNetwork used as the actor's continuous head.

  Args:
    action_spec: action spec forwarded to the projection network.
    init_means_output_factor: forwarded unchanged under the same keyword.

  Returns:
    A `normal_projection_network.NormalProjectionNetwork` instance.
  """
  # Keyword meanings below follow the TF-Agents NormalProjectionNetwork
  # docs -- TODO confirm against the installed version.
  projection_options = dict(
      mean_transform=None,        # no transform is applied to the means
      state_dependent_std=True,   # stddevs are produced from the state
      init_means_output_factor=init_means_output_factor,
      std_transform=sac_agent.std_clip_transform,
      scale_distribution=True,    # scale the distribution to the action spec
  )
  return normal_projection_network.NormalProjectionNetwork(
      action_spec, **projection_options)

# Actor network: takes the observation spec, uses actor_fc_layer_params for its
# fully connected layers and normal_projection_net to build the continuous
# action-distribution head.
# NOTE(review): dtype is forced to float64 here, while critic_net is built
# without an explicit dtype -- confirm both networks end up with the dtype the
# environment's observations and actions use.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=actor_fc_layer_params, dtype=tf.float64,
    continuous_projection_net=normal_projection_net)

In [None]:
#Instantiate Agent
# Switch the Keras default float type to float64.
# NOTE(review): this runs after critic_net and actor_net were constructed in
# earlier cells; presumably layers created before this call keep the previous
# default -- confirm the ordering is intended.
tf.keras.backend.set_floatx(
   'float64'
)
# Global step variable; the agent uses it as its train step counter.
global_step = tf.compat.v1.train.get_or_create_global_step()
tf_agent = sac_agent.SacAgent(
    train_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    # One Adam optimizer each for the actor, the critic and alpha.
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=critic_learning_rate),
    alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=alpha_learning_rate),
    target_update_tau=target_update_tau, # presumably the soft-update factor for the target networks -- TODO confirm
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    train_step_counter=global_step)
tf_agent.initialize()


In [None]:
# Evaluation uses the agent's main policy wrapped in GreedyPolicy; data
# collection uses the agent's separate collect policy.
eval_policy = greedy_policy.GreedyPolicy(tf_agent.policy)
collect_policy = tf_agent.collect_policy

In [None]:
def compute_avg_return(environment, policy, num_episodes=5):
  """Average the undiscounted episode return of `policy` on `environment`.

  Args:
    environment: object whose `reset()` and `step(action)` return a time step
      exposing `is_last()` and `reward`.
    policy: object whose `action(time_step)` result has an `.action` attribute.
    num_episodes: number of episodes to roll out and average over.

  Returns:
    Element 0 of `.numpy()` called on the mean return, so the accumulated
    rewards must provide a `.numpy()` method.
  """
  return_sum = 0.0
  for _episode in range(num_episodes):
    current_step = environment.reset()
    this_episode = 0.0

    # Roll the episode forward until the environment flags the final step.
    while True:
      if current_step.is_last():
        break
      chosen = policy.action(current_step)
      current_step = environment.step(chosen.action)
      this_episode += current_step.reward

    return_sum += this_episode

  mean_return = return_sum / num_episodes
  return mean_return.numpy()[0]

# Average return of the evaluation policy before any training; as the cell's
# last expression, the value is displayed as the cell output.
compute_avg_return(eval_env, eval_policy, num_eval_episodes)

In [None]:
# Replay buffer that stores the experience gathered by the collect drivers.
# Its batch size follows the training environment and its length is capped at
# replay_buffer_capacity.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity
)

# collect_data_spec is supplied by the agent; presumably it describes the
# structure of the trajectories stored in the buffer -- TODO confirm.

In [None]:
# Seed the replay buffer before training: run the collect policy in the
# training environment for initial_collect_steps steps, passing the collected
# experience to replay_buffer.add_batch.
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps)

initial_collect_driver.run()

In [None]:
# Expose the replay buffer as a dataset that yields batches of batch_size
# samples, with 3 parallel calls and a prefetch depth of 3.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
#num_steps=2 because the buffer can sample two adjacent rows:
#(the current observation and the next observation)

# Iterator consumed by the training loop, one batch per next() call.
iterator = iter(dataset)

In [None]:
# Driver used inside the training loop: each run() call steps train_env with
# the collect policy for collect_steps_per_iteration steps and stores the
# collected experience in the replay buffer.
collect_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration
)

#Reset train step
tf_agent.train_step_counter.assign(0)

#Evaluate metric of policy before training
avg_return = compute_avg_return(eval_env, eval_policy,
                                num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

  # Collect collect_steps_per_iteration steps and save them to the buffer.
  # The driver is already configured with num_steps, so run() is called once;
  # calling it in a loop of the same length would collect the square of the
  # intended number of steps.
  collect_driver.run()

  # Sample a batch of data from the buffer and update the agent.
  experience, unused_info = next(iterator)
  train_loss = tf_agent.train(experience)

  # Logging and evaluation are keyed on the agent's own train step counter.
  step = tf_agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, eval_policy,
                                    num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

In [None]:
# One x position per recorded evaluation: the first at step 0, then one every
# eval_interval train steps. Deriving the axis from len(returns) keeps x and y
# the same length even if training was stopped early.
steps = range(0, len(returns) * eval_interval, eval_interval)
plt.plot(steps, returns)
plt.ylabel('Average Return')
plt.xlabel("Step")
# With no arguments this only reads the current y limits, which are shown as
# the cell output.
plt.ylim()