In [1]:
# DDPG implementation using the TF-Agents library

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import time

from absl import app
from absl import logging

import gin
import tensorflow as tf
import gym
from gym import wrappers

from tf_agents.agents.ddpg import actor_network
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import parallel_py_environment
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.trajectories import time_step
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
from gym.envs import box2d

In [3]:
# Defaults
# Module-level configuration read by the training function defined further down.
env_name = 'BipedalWalker-v2'    # Gym environment id loaded through suite_gym.load
num_iterations = 2500000    # Number of collect/train iterations of the main loop
use_tf_functions = True    # If True, driver runs and training are wrapped with common.function

# Replay Buffer Parameters & Noise Function Parameters
initial_collect_steps = 1000     # Environment steps collected before training to pre-fill the replay buffer
collect_steps_per_iteration = 1    # Environment steps collected in every loop iteration
replay_buffer_capacity = 100000    # max_length of the uniform replay buffer
ou_stddev = 0.2     # Passed to DdpgAgent as ou_stddev (exploration noise of the collect policy)
ou_damping = 0.15     # Passed to DdpgAgent as ou_damping

# Target Update Parameters
target_update_tau = 0.05    # Passed to DdpgAgent; presumably the soft-update mixing factor
target_update_period = 5    # Passed to DdpgAgent; presumably train steps between target updates

# Train Step Parameters
train_steps_per_iteration = 1     # Training steps executed in every loop iteration
batch_size = 64    # Mini-batch size intended for sampling from the replay buffer
actor_learning_rate = 1e-4    # Learning rate of the actor's Adam optimizer
critic_learning_rate = 1e-3    # Learning rate of the critic's Adam optimizer
td_errors_loss_fn = tf.compat.v1.losses.mean_squared_error     # Loss function for the DdpgAgent td_errors_loss_fn argument
gamma = 0.99     # Passed to DdpgAgent as gamma (discount factor)
reward_scale_factor = 1.0     # Value for the DdpgAgent reward_scale_factor argument

# Evaluation and Summary Parameters
num_eval_episodes = 100    # Episodes per evaluation; also the buffer size of the eval metrics
eval_interval = 10000    # Evaluate whenever global_step is a multiple of this value
log_interval = 1000    # Log loss and steps/sec whenever global_step is a multiple of this value
summary_interval = 1000    # Summaries are recorded only when global_step is a multiple of this value
summaries_flush_secs = 10    # Flush period of the summary writers (converted to milliseconds on use)
run_id = 20420    # ID to differentiate between runs
root_dir = '~/Bachelorarbeit/Baseline'    # Has to be an existing directory

# For training on Brain
# When True, the next cell reads an optional --run_id override from the command line.
use_brain = False

In [4]:
# Optional command-line override of run_id, only active when use_brain is set.
if use_brain:
    parser = argparse.ArgumentParser(description = 'DDPG Arguments')
    parser.add_argument('--run_id', type = int, help = "identifying substring for folder names (default: date)")
    # parse_known_args ignores arguments that do not belong to this parser.
    # A Jupyter kernel injects its own argv entries (e.g. "-f <connection file>"),
    # which would make a strict parse_args() abort with SystemExit.
    args, _unknown_args = parser.parse_known_args()

    # Keep the run_id from the configuration cell unless one was given explicitly.
    if args.run_id is not None:
        run_id = args.run_id

In [5]:
def DDPG_Bipedal(root_dir, eval_metrics_callback=None):
    """Train a DDPG agent on the configured gym environment and evaluate it periodically.

    Hyperparameters are read from the module-level configuration variables
    (env_name, num_iterations, batch_size, run_id, ...).

    Args:
        root_dir: Base directory for results. '~' is expanded. Summaries are
            written to '<root_dir>/train/<run_id>' and '<root_dir>/eval/<run_id>'.
        eval_metrics_callback: Optional callable invoked after every evaluation
            as ``eval_metrics_callback(results, global_step_value)``, where
            ``results`` is what metric_utils.eager_compute returned.

    Returns:
        The loss information returned by the last agent.train call, or None if
        no training step was executed.
    """

    # Setting up directories for results
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train', str(run_id))
    eval_dir = os.path.join(root_dir, 'eval', str(run_id))

    # Set up summary writers for training and evaluation
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000
    )
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000
    )
    eval_metrics = [
        # Metric to record average return
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        # Metric to record average episode length
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
    ]

    # Create global step; it doubles as the agent's train step counter
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Stays None when the training loop does not run, so the return is always defined.
    train_loss = None

    # Only record summaries every summary_interval global steps
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        # Separate environments for data collection and for evaluation
        tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))
        eval_tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))

        # Define Actor Network
        actorNN = actor_network.ActorNetwork(
            tf_env.time_step_spec().observation,
            tf_env.action_spec(),
            fc_layer_params=(400, 300),
        )

        # Define Critic Network (takes observation and action as inputs)
        NN_input_specs = (
            tf_env.time_step_spec().observation,
            tf_env.action_spec(),
        )
        criticNN = critic_network.CriticNetwork(
            NN_input_specs,
            observation_fc_layer_params=(400,),
            action_fc_layer_params=None,
            joint_fc_layer_params=(300,),
        )

        # Define & initialize DDPG Agent
        agent = ddpg_agent.DdpgAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            actor_network=actorNN,
            critic_network=criticNN,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=actor_learning_rate),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=critic_learning_rate),
            ou_stddev=ou_stddev,
            ou_damping=ou_damping,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            dqda_clipping=None,
            # Use the configured values instead of repeating them as literals
            td_errors_loss_fn=td_errors_loss_fn,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=None,
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=global_step,
        )
        agent.initialize()

        # Determine which train metrics to display with summary writer
        train_metrics = [
            tf_metrics.NumberOfEpisodes(),
            tf_metrics.EnvironmentSteps(),
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]

        # Set policies for evaluation and collection
        eval_policy = agent.policy    # Actor policy
        collect_policy = agent.collect_policy    # Actor policy with OUNoise

        # Set up replay buffer
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            agent.collect_data_spec,
            batch_size=tf_env.batch_size,
            max_length=replay_buffer_capacity,
        )

        # Driver for the initial replay buffer filling (policy is still untrained)
        initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch],
            num_steps=initial_collect_steps,
        )

        # Driver for the collect steps of every iteration; also feeds the train metrics
        collect_driver = dynamic_step_driver.DynamicStepDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_steps=collect_steps_per_iteration,
        )

        if use_tf_functions:
            initial_collect_driver.run = common.function(initial_collect_driver.run)
            collect_driver.run = common.function(collect_driver.run)
            agent.train = common.function(agent.train)

        # Pre-fill the replay buffer before any training happens
        logging.info(
            'Initializing replay buffer by collecting experience for %d steps with '
            'a random policy.', initial_collect_steps)
        initial_collect_driver.run()

        def evaluate():
            """Run the evaluation episodes, write summaries, notify the callback and log."""
            results = metric_utils.eager_compute(
                eval_metrics,
                eval_tf_env,
                eval_policy,
                num_episodes=num_eval_episodes,
                train_step=global_step,
                summary_writer=eval_summary_writer,
                summary_prefix='Metrics',
            )
            if eval_metrics_callback is not None:
                eval_metrics_callback(results, global_step.numpy())
            metric_utils.log_metrics(eval_metrics)

        # Baseline evaluation of the untrained policy
        evaluate()

        # Named current_time_step so the imported time_step module is not shadowed
        current_time_step = None
        policy_state = collect_policy.get_initial_state(tf_env.batch_size)

        timed_at_step = global_step.numpy()
        time_acc = 0

        # Dataset yields mini-batches of batch_size samples, each spanning two
        # consecutive steps (num_steps = 2)
        dataset = replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2).prefetch(3)
        iterator = iter(dataset)

        def train_step():
            experience, _ = next(iterator)    # Get experience from dataset (replay buffer)
            return agent.train(experience)    # Train agent on that experience

        if use_tf_functions:
            train_step = common.function(train_step)

        for _ in range(num_iterations):
            start_time = time.time()
            # Collect data for replay buffer
            current_time_step, policy_state = collect_driver.run(
                time_step=current_time_step,
                policy_state=policy_state,
            )
            # Train on experience
            for _ in range(train_steps_per_iteration):
                train_loss = train_step()
            time_acc += time.time() - start_time

            # The global step does not change again within this iteration
            step_value = global_step.numpy()

            if step_value % log_interval == 0:
                logging.info('step = %d, loss = %f', step_value, train_loss.loss)
                steps_per_sec = (step_value - timed_at_step) / time_acc
                logging.info('%.3f steps/sec', steps_per_sec)
                tf.compat.v2.summary.scalar(
                    name='global_steps_per_sec', data=steps_per_sec,
                    step=global_step
                )
                timed_at_step = step_value
                time_acc = 0

            # Train metrics are additionally plotted against the first two
            # metrics (number of episodes and environment steps)
            for train_metric in train_metrics:
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=train_metrics[:2])

            if step_value % eval_interval == 0:
                evaluate()

    return train_loss

# Start training with the configured root directory. The function returns the
# loss information of the last training step, which is shown as the cell output.
DDPG_Bipedal(root_dir)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


LossInfo(loss=<tf.Tensor: shape=(), dtype=float32, numpy=120.05078>, extra=DdpgInfo(actor_loss=<tf.Tensor: shape=(), dtype=float32, numpy=82.62599>, critic_loss=<tf.Tensor: shape=(), dtype=float32, numpy=37.42479>))