In [None]:
'''
DDPG implementation using the TF-Agents library.

Before use, set PATH to an existing directory that will receive the video,
train and eval output (template default: '~/').
Also set RUN_ID to an integer used to identify the subdirectory of each run
(template default: 28420).
'''
PATH = '/home/jovyan/Masterarbeit/DDPG'
RUN_ID = 28420

# Almost done: once the variable below is set, run all cells.
# Set CS5Gamma to True to test CS5Gamma (template default: False).
# NOTE(review): in the cells shown in this notebook, PATH, RUN_ID and CS5Gamma
# are never read; the output directory actually used is `root_dir`, which is
# assigned separately further down. Confirm whether these should be wired in.
CS5Gamma = True

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import os
import time
import sys
sys.path.insert(1, '/home/jovyan/Masterarbeit/reinforce-one/Environment')
sys.path.insert(1, '/home/jovyan/Masterarbeit/reinforce-one/Agent/DDPG/Test1_Frequent_Returns')
sys.path.insert(1, '/home/jovyan/Masterarbeit/reinforce-one/Environment/Simplifications')

from absl import app
from absl import flags
from absl import logging

import gin
from six.moves import range
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
import numpy as np

from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import parallel_py_environment
from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment
from tf_agents.policies import scripted_py_policy
from tf_agents.policies import policy_saver
from tf_agents.eval import metric_utils
from tf_agents.keras_layers import inner_reshape
from tf_agents.metrics import tf_metrics
from tf_agents.networks import nest_map
from tf_agents.networks import sequential
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

from Env import Env
from FR_Env import FREnv
from Env_Simple import Env_S

In [None]:
# Directory handed to train_eval(); the train/, eval/ and policy/ outputs are
# created beneath it.
# NOTE(review): this repeats the literal assigned to PATH in the config cell
# instead of reusing it - confirm whether `root_dir = PATH` was intended.
root_dir = '/home/jovyan/Masterarbeit/DDPG'

# Environment sizing. `num_herds` is also read as a global by
# compute_avg_return(), and `py_env` is read as a global by train_eval().
num_herds = 2
total_population = 300

# Env_S (imported from Env_Simple) with a fixed episode length of 200 steps.
# An earlier variant built the full `Env` with the same herd/population
# arguments and average_episode_length=100; it is no longer used here.
py_env = Env_S(
    num_herds=num_herds,
    total_population=total_population,
    fix_episode_length=True,
    average_episode_length=200,
)


In [None]:
def compute_avg_return(environment, policy, num_episodes=50, verbose=False):
  """Roll out `policy` for several episodes and average return and cull steps.

  Each episode starts with `environment.reset()` and runs until the current
  time step reports `is_last()`.

  Args:
    environment: Object exposing `reset()` and `step(action)`; both must return
      a time step with `is_last()`, `reward` and `observation`.
    policy: Object exposing `get_initial_state(...)` and
      `action(time_step, policy_state)`. A `ScriptedPyPolicy` gets its initial
      state without a batch size; every other policy is asked for
      `batch_size=1`.
    num_episodes: Number of episodes to run. Must be non-zero, because both
      results are divided by it.
    verbose: When true, print one line per step with the action, observation
      and reward.

  Returns:
    A tuple `(avg_return, cullsteps)`:
      avg_return: Sum of all step rewards divided by `num_episodes`; its type
        follows the type of `time_step.reward`.
      cullsteps: Number of steps in which at least one of the first
        `num_herds` entries of `action_step.action[0]` exceeded 0.1, divided by
        `num_episodes`.

  Note:
    Reads the module-level `num_herds`.
  """
  total_return = 0.0
  cullsteps = 0
  for e in range(num_episodes):
    time_step = environment.reset()
    if isinstance(policy, scripted_py_policy.ScriptedPyPolicy):
      # Scripted policies keep track of their position in the script.
      policy_state = policy.get_initial_state()
    else:
      # Other policies are queried for a single-environment batch.
      policy_state = policy.get_initial_state(batch_size=1)

    episode_return = 0.0
    # Step counter for the verbose output. It has its own name so that the
    # per-herd scan below cannot overwrite it (the original reused `i` for
    # both, which made the printed step number wrong).
    step = 0
    while not time_step.is_last():
      step += 1
      action_step = policy.action(time_step, policy_state)

      # Count this step once if any of the first `num_herds` action entries is
      # above the 0.1 threshold; any() stops at the first hit.
      if any(action_step.action[0][herd] > 0.1 for herd in range(num_herds)):
        cullsteps += 1

      policy_state = action_step.state
      time_step = environment.step(action_step.action)

      state = None  # TF environment from wrapper does not have get_state()
      episode_return += time_step.reward
      if verbose:
        print(f"episode {e:>2} step{step:>4} action: ", action_step.action,
              "state=", state, "obs=", time_step.observation,
              "reward=", time_step.reward)
    total_return += episode_return

  avg_return = total_return / num_episodes
  cullsteps /= num_episodes
  return avg_return, cullsteps

In [None]:
def train_eval(
    root_dir,
    env_name='Env',
    eval_env_name=None,
    env_load_fn=suite_mujoco.load,
    num_iterations=2000000,
    actor_fc_layers=(400, 300),
    critic_obs_fc_layers=(400,),
    critic_action_fc_layers=None,
    critic_joint_fc_layers=(300,),
    # Params for collect
    initial_collect_steps=1000,
    collect_steps_per_iteration=200,
    num_parallel_environments=1,
    replay_buffer_capacity=100000,
    ou_stddev=0.2,
    ou_damping=0.15,
    # Params for target update
    target_update_tau=0.05,
    target_update_period=5,
    # Params for train
    train_steps_per_iteration=200,
    batch_size=64,
    actor_learning_rate=1e-4,
    critic_learning_rate=1e-3,
    dqda_clipping=None,
    td_errors_loss_fn=tf.compat.v1.losses.huber_loss,
    gamma=0.995,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    use_tf_functions=True,
    # Params for eval
    num_eval_episodes=100,
    eval_interval=10000,
    # Params for checkpoints, summaries, and logging
    log_interval=1000,
    summary_interval=1000,
    summaries_flush_secs=10,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None):
  """Train a DDPG agent on the module-level `py_env` and evaluate it periodically.

  Each iteration collects `collect_steps_per_iteration` environment steps into
  a uniform replay buffer and then runs `train_steps_per_iteration` training
  steps. Whenever the global step is a multiple of `eval_interval`, the greedy
  policy is evaluated and, if its average return beats the best value seen so
  far, the policy is saved under `<root_dir>/policy/<global_step>`. Training
  stops early once a new best average return is above -300.

  Args:
    root_dir: Output root; `~` is expanded. Summaries go to `train/` and
      `eval/`, saved policies to `policy/`.
    env_name: Name passed to `env_load_fn`; only used when
      `num_parallel_environments > 1`.
    eval_env_name: Falls back to `env_name` when empty; not used afterwards.
    env_load_fn: Environment loader; only used when
      `num_parallel_environments > 1`.
    num_iterations: Maximum number of collect/train iterations.
    actor_fc_layers: Hidden layer sizes passed to `create_actor_network`.
    critic_obs_fc_layers: Observation-branch layer sizes for
      `create_critic_network`.
    critic_action_fc_layers: Action-branch layer sizes for
      `create_critic_network`.
    critic_joint_fc_layers: Joint layer sizes for `create_critic_network`.
    initial_collect_steps: Steps collected into the replay buffer before
      training starts.
    collect_steps_per_iteration: Steps collected per iteration.
    num_parallel_environments: With a value above 1 the training environment
      is built from `env_load_fn(env_name)` copies; otherwise the module-level
      `py_env` is wrapped.
    replay_buffer_capacity: `max_length` of the replay buffer.
    ou_stddev, ou_damping, target_update_tau, target_update_period,
    dqda_clipping, td_errors_loss_fn, gamma, reward_scale_factor,
    gradient_clipping, debug_summaries, summarize_grads_and_vars: Forwarded
      unchanged to `ddpg_agent.DdpgAgent`.
    train_steps_per_iteration: Training steps per iteration.
    batch_size: Sample batch size drawn from the replay buffer.
    actor_learning_rate: Learning rate of the actor's Adam optimizer.
    critic_learning_rate: Learning rate of the critic's Adam optimizer.
    use_tf_functions: When true, wrap the drivers, `tf_agent.train` and the
      training step with `common.function`.
    num_eval_episodes: Episodes per evaluation, used for the metric buffers,
      `metric_utils.eager_compute` and the `compute_avg_return` rollout.
    eval_interval: Evaluate when the global step is a multiple of this value.
    log_interval: Log loss and steps/sec when the global step is a multiple of
      this value.
    summary_interval: Record summaries when the global step is a multiple of
      this value.
    summaries_flush_secs: Flush period of both summary writers, in seconds.
    eval_metrics_callback: Optional callable invoked as
      `callback(results, global_step_value)` after each metric evaluation.

  Returns:
    The result of the last training step, or None if no training step ran.
  """
  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')

  # Baseline the first evaluation has to beat before a policy is saved.
  best_return = -4000
  policy_dir = os.path.join(root_dir, 'policy')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
  ]

  # Defined up front so the final `return` is valid even when the training
  # loop never executes (e.g. num_iterations == 0).
  train_loss = None

  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(
      lambda: tf.math.equal(global_step % summary_interval, 0)):
    if num_parallel_environments > 1:
      tf_env = tf_py_environment.TFPyEnvironment(
          parallel_py_environment.ParallelPyEnvironment(
              [lambda: env_load_fn(env_name)] * num_parallel_environments))
    else:
      tf_env = tf_py_environment.TFPyEnvironment(py_env)
    eval_env_name = eval_env_name or env_name
    # NOTE(review): in the default single-environment case the evaluation
    # wrapper is built around the same `py_env` object as the training
    # wrapper, so both presumably share one underlying environment state -
    # confirm this is intended.
    eval_tf_env = tf_py_environment.TFPyEnvironment(py_env)

    actor_net = create_actor_network(actor_fc_layers, tf_env.action_spec())
    critic_net = create_critic_network(critic_obs_fc_layers,
                                       critic_action_fc_layers,
                                       critic_joint_fc_layers)

    tf_agent = ddpg_agent.DdpgAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=actor_learning_rate),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=critic_learning_rate),
        ou_stddev=ou_stddev,
        ou_damping=ou_damping,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        dqda_clipping=dqda_clipping,
        td_errors_loss_fn=td_errors_loss_fn,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)
    tf_agent.initialize()

    saver = policy_saver.PolicySaver(tf_agent.policy)

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # The initial driver only fills the replay buffer; the per-iteration
    # driver additionally updates the training metrics.
    initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=initial_collect_steps)

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_steps=collect_steps_per_iteration)

    if use_tf_functions:
      initial_collect_driver.run = common.function(initial_collect_driver.run)
      collect_driver.run = common.function(collect_driver.run)
      tf_agent.train = common.function(tf_agent.train)

    # Collect initial replay data (the driver above uses `collect_policy`).
    logging.info(
        'Initializing replay buffer by collecting experience for %d steps with '
        'a random policy.', initial_collect_steps)
    initial_collect_driver.run()

    # Evaluate once before any training step has run.
    results = metric_utils.eager_compute(
        eval_metrics,
        eval_tf_env,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )
    if eval_metrics_callback is not None:
      eval_metrics_callback(results, global_step.numpy())
    metric_utils.log_metrics(eval_metrics)

    time_step = None
    policy_state = collect_policy.get_initial_state(tf_env.batch_size)

    timed_at_step = global_step.numpy()
    time_acc = 0

    # Dataset generates trajectories with shape [Bx2x...]
    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=2).prefetch(3)
    iterator = iter(dataset)

    def train_step():
      experience, _ = next(iterator)
      return tf_agent.train(experience)

    if use_tf_functions:
      train_step = common.function(train_step)

    for _ in range(num_iterations):
      start_time = time.time()
      time_step, policy_state = collect_driver.run(
          time_step=time_step,
          policy_state=policy_state,
      )
      for _ in range(train_steps_per_iteration):
        train_loss = train_step()
      time_acc += time.time() - start_time

      if global_step.numpy() % log_interval == 0:
        logging.info('step = %d, loss = %f', global_step.numpy(),
                     train_loss.loss)
        steps_per_sec = (global_step.numpy() - timed_at_step) / time_acc
        logging.info('%.3f steps/sec', steps_per_sec)
        tf.compat.v2.summary.scalar(
            name='global_steps_per_sec', data=steps_per_sec, step=global_step)
        timed_at_step = global_step.numpy()
        time_acc = 0

      for train_metric in train_metrics:
        train_metric.tf_summaries(
            train_step=global_step, step_metrics=train_metrics[:2])

      if global_step.numpy() % eval_interval == 0:
        results = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        if eval_metrics_callback is not None:
          eval_metrics_callback(results, global_step.numpy())
        metric_utils.log_metrics(eval_metrics)

        # Second rollout that also reports the cull-step statistic. It now
        # honours `num_eval_episodes` instead of a hard-coded 100 (which is
        # also the default, so default behaviour is unchanged).
        avg_return, cullsteps = compute_avg_return(
            eval_tf_env, eval_policy, num_episodes=num_eval_episodes,
            verbose=False)
        print('step {0}: average return = {1:.1f} cullsteps = {2:.1f}'.format(global_step.numpy(), 
                                                                            avg_return.numpy().item(), cullsteps))
        if avg_return > best_return:
          if avg_return > -300:
            # Target reached: save the policy and stop training.
            best_return = avg_return
            print('Final best return: ', best_return)
            saver.save(os.path.join(policy_dir, str(global_step.numpy())))
            break
          else:
            best_return = avg_return
            print('New best return: ', best_return)
            saver.save(os.path.join(policy_dir, str(global_step.numpy())))
        # NOTE(review): when an evaluation inside one of these step windows
        # does not improve on the best return, the baseline is reset to its
        # initial value, so the next evaluation above -4000 is saved again.
        # The reason for these specific windows is not evident from the code.
        elif (70000 <= global_step.numpy() <= 80000):
          best_return = -4000
        elif (100000 <= global_step.numpy() <= 130000):
          best_return = -4000

  return train_loss


# Factory for the hidden layers used by the actor and critic networks: a Keras
# Dense layer with ReLU activation and a uniform, fan-in variance-scaling
# initializer (scale 1/3). Call as `dense(num_units)`.
dense = functools.partial(
    tf.keras.layers.Dense,
    kernel_initializer=tf.compat.v1.variance_scaling_initializer(
        scale=1.0 / 3.0,
        mode='fan_in',
        distribution='uniform'),
    activation=tf.keras.activations.relu)


def create_identity_layer():
  """Return a Keras Lambda layer that passes its input through unchanged."""
  passthrough = tf.keras.layers.Lambda(lambda tensor: tensor)
  return passthrough


def create_fc_network(layer_units):
  """Stack one `dense` layer per entry of `layer_units` into a Sequential.

  Args:
    layer_units: Iterable of layer widths, in order from input to output.

  Returns:
    A `sequential.Sequential` network made of the created layers.
  """
  stack = []
  for width in layer_units:
    stack.append(dense(width))
  return sequential.Sequential(stack)


def create_actor_network(fc_layer_units, action_spec):
  """Build the DDPG actor: hidden layers, a tanh head, and spec scaling.

  Args:
    fc_layer_units: Iterable of hidden layer widths.
    action_spec: Action spec (possibly nested); it must flatten to a single
      tensor spec.

  Returns:
    A `sequential.Sequential` whose last layer passes the tanh output through
    `common.scale_to_spec` with the action spec.

  Raises:
    ValueError: If `action_spec` flattens to more than one spec.
  """
  specs = tf.nest.flatten(action_spec)
  if len(specs) > 1:
    raise ValueError('Only a single action tensor is supported by this network')
  single_spec = specs[0]

  hidden_layers = [dense(width) for width in fc_layer_units]

  # One output unit per action element; tanh output with a small uniform
  # weight initialisation in [-0.003, 0.003].
  output_layer = tf.keras.layers.Dense(
      single_spec.shape.num_elements(),
      activation=tf.keras.activations.tanh,
      kernel_initializer=tf.keras.initializers.RandomUniform(
          minval=-0.003, maxval=0.003))

  rescale_layer = tf.keras.layers.Lambda(
      lambda raw: common.scale_to_spec(raw, single_spec))

  return sequential.Sequential([*hidden_layers, output_layer, rescale_layer])


def create_critic_network(obs_fc_layer_units,
                          action_fc_layer_units,
                          joint_fc_layer_units):
  """Build the DDPG critic from an observation, an action and a joint branch.

  The input is indexed as `(observation, action)`. Each part goes through its
  own branch, the results are concatenated, passed through the joint branch
  and a single linear output unit, and finally reshaped by `InnerReshape`.

  Args:
    obs_fc_layer_units: Layer widths for the observation branch; an identity
      layer is used when this is empty or None.
    action_fc_layer_units: Layer widths for the action branch; an identity
      layer is used when this is empty or None.
    joint_fc_layer_units: Layer widths for the joint branch; an identity layer
      is used when this is empty or None.

  Returns:
    A `sequential.Sequential` critic network.
  """

  def split_inputs(pair):
    return {'observation': pair[0], 'action': pair[1]}

  def branch_for(units):
    # Fully connected stack when widths are given, pass-through otherwise.
    if units:
      return create_fc_network(units)
    return create_identity_layer()

  obs_network = branch_for(obs_fc_layer_units)
  action_network = branch_for(action_fc_layer_units)
  joint_network = branch_for(joint_fc_layer_units)

  # Linear single-unit head with small uniform initial weights.
  value_fc_layer = tf.keras.layers.Dense(
      1,
      activation=None,
      kernel_initializer=tf.keras.initializers.RandomUniform(
          minval=-0.003, maxval=0.003))

  pipeline = [
      tf.keras.layers.Lambda(split_inputs),
      nest_map.NestMap({
          'observation': obs_network,
          'action': action_network
      }),
      nest_map.NestFlatten(),
      tf.keras.layers.Concatenate(),
      joint_network,
      value_fc_layer,
      inner_reshape.InnerReshape([1], []),
  ]
  return sequential.Sequential(pipeline)


In [None]:
# Start training with the default hyperparameters; output is written below
# `root_dir`.
train_eval(root_dir=root_dir)