# DDPG Agent for epidemic control model  
This notebook will train an agent in an epidemic control environment using DDPG with RNNs or ANNs.  
  
For use, please complete the following steps:
- Edit PATH variable below to any folder where training outputs can be stored.  
- Create a folder titled 'policy' in PATH directory.  
- Edit the sys path insertions to the directories where environments are stored.  

The default variant is ANN DDPG; for RNN DDPG, set the `use_rnns` variable below to `True`.  
The default environment is EE0; for different environments, see the "Environment" section below.  
For more in-depth changes in hyperparameters or neural networks, the relevant sections are labeled accordingly.

In [None]:


# Output folder. The leading '~' is kept here on purpose: DDPG() expands it
# with os.path.expanduser before creating its output sub-directories.
PATH = '~/.out/'

# Paths to the environment folders
import os
import sys
# sys.path entries are used verbatim by the import system, so a literal '~'
# would never match the home directory; expand it explicitly before inserting.
sys.path.insert(1, os.path.expanduser('~/reinforce-one/Environments'))
sys.path.insert(1, os.path.expanduser('~/reinforce-one/Environments/Variations'))

# Decide whether to use RNN DDPG or ANN DDPG
use_rnns = False

## Imports

Firstly, all relevant dependencies will be imported.  
Comments indicate what imports are generally used for or related to.

In [None]:
import tensorflow as tf 
import numpy as np

# Environment 
from tf_agents.environments import tf_py_environment
from tf_agents.environments import py_environment
from tf_agents.policies import scripted_py_policy
from tf_agents.policies import random_tf_policy
# Neural Networks
from tf_agents.agents.ddpg import actor_rnn_network
from tf_agents.agents.ddpg import critic_rnn_network
from tf_agents.networks import sequential
from tf_agents.networks import nest_map
from tf_agents.keras_layers import inner_reshape
import functools
# Agent 
from tf_agents.agents.ddpg import ddpg_agent
# Experience Replay
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.drivers import dynamic_step_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
#Training
from tf_agents.utils import common
#Evaluation
from tf_agents.policies import policy_saver
from tf_agents.trajectories import time_step
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
import os
import matplotlib
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
#

## Environment

Next, an environment will be imported and initialized.  
To train on a different environment, edit the parameters and the environment constructor call in the cell below.

In [None]:
from EE0 import EE0
from EE0_A import EE0_A
from EE0_NT import EE0_NT
from EE1 import EE1
from EE1_A import EE1_A
from EE2 import EE2

num_herds = 2
total_population = 300
average_episode_length=200
fix_episode_length = True
py_env = EE0(num_herds = num_herds, total_population = total_population, fix_episode_length = fix_episode_length, 
               average_episode_length = average_episode_length)
# Evaluation gets its own environment instance. If training and evaluation
# shared one Python environment, every evaluation rollout would reset and step
# the environment that the training collect driver is in the middle of using.
# When switching to another environment class, change both constructor calls.
eval_py_env = EE0(num_herds = num_herds, total_population = total_population, fix_episode_length = fix_episode_length, 
                    average_episode_length = average_episode_length)

# Transforms py environments into tensorflow environments (i/o are now tensors)
train_env = tf_py_environment.TFPyEnvironment(py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

## Training
In this section, define a function for agent training and evaluation.  
First, create the neural networks used by the different training variants.

### RNN DDPG

Set up actor and critic recurrent neural networks for training with DDPG using RNNs.  
Edit hyperparams for different layer sizes.

In [None]:
# --- RNN hyperparameters -----------------------------------------------------
# Actor: fully connected layers before the LSTM, the LSTM itself, then
# fully connected layers after it.
actor_fc_layers = (200, 150)
actor_lstm_size = (40,)
actor_output_fc_layers = (50,)
# Critic: an observation branch and an (unused here, None) action branch,
# followed by joint layers, the LSTM and the output layers.
critic_obs_fc_layers = (200,)
critic_action_fc_layers = None
critic_joint_fc_layers = (150,)
critic_lstm_size = (40,)
critic_output_fc_layers = (50,)

# --- Recurrent actor ---------------------------------------------------------
actor_rnn = actor_rnn_network.ActorRnnNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    input_fc_layer_params=actor_fc_layers,
    lstm_size=actor_lstm_size,
    output_fc_layer_params=actor_output_fc_layers,
)

# --- Recurrent critic --------------------------------------------------------
# The critic receives an (observation, action) pair as its input.
critic_net_input_specs = (
    train_env.observation_spec(),
    train_env.action_spec(),
)

critic_rnn = critic_rnn_network.CriticRnnNetwork(
    critic_net_input_specs,
    observation_fc_layer_params=critic_obs_fc_layers,
    action_fc_layer_params=critic_action_fc_layers,
    joint_fc_layer_params=critic_joint_fc_layers,
    lstm_size=critic_lstm_size,
    output_fc_layer_params=critic_output_fc_layers,
)

### ANN DDPG  
Create actor and critic artificial neural networks for DDPG.  
Again, edit hyperparams for different layer sizes.

In [None]:
# ANN hyperparameters: hidden-layer widths of the actor and of the critic's
# observation, action and joint sub-networks (None means "no layers").
actor_fc_layers = (400, 300)
critic_obs_fc_layers = (400,)
critic_action_fc_layers = None
critic_joint_fc_layers = (300,)


# Layer factories

# `dense(units)` builds a ReLU Dense layer. The initializer object is created
# once here and bound into the partial, so every layer made through `dense`
# receives that same initializer instance.
dense = functools.partial(
    tf.keras.layers.Dense,
    activation=tf.keras.activations.relu,
    kernel_initializer=tf.compat.v1.variance_scaling_initializer(
        scale=1. / 3.0,
        mode='fan_in',
        distribution='uniform',
    ),
)


def create_identity_layer():
    """Return a Keras Lambda layer that passes its input through unchanged."""
    passthrough = tf.keras.layers.Lambda(lambda tensor: tensor)
    return passthrough


def create_fc_network(layer_units):
    """Stack one `dense` layer per entry of `layer_units` into a Sequential network.

    Args:
        layer_units: iterable with the number of units of each layer, in order.

    Returns:
        A `sequential.Sequential` network made of the created layers.
    """
    hidden_layers = []
    for width in layer_units:
        hidden_layers.append(dense(width))
    return sequential.Sequential(hidden_layers)


def create_actor_network(fc_layer_units, action_spec):
    """Build the feed-forward actor network.

    The network is a stack of `dense` hidden layers, a tanh output layer with
    one unit per action element, and a final layer that passes the output
    through `common.scale_to_spec` with the action spec.

    Args:
        fc_layer_units: iterable with the width of each hidden layer.
        action_spec: action spec of the environment; it must flatten to
            at most one tensor spec.

    Returns:
        A `sequential.Sequential` actor network.

    Raises:
        ValueError: if `action_spec` flattens to more than one tensor spec.
    """
    action_specs = tf.nest.flatten(action_spec)
    if len(action_specs) > 1:
        raise ValueError('Only a single action tensor is supported by this network')
    single_action_spec = action_specs[0]

    hidden_layers = [dense(width) for width in fc_layer_units]

    # Small uniform initial weights for the output layer.
    output_initializer = tf.keras.initializers.RandomUniform(
        minval=-0.003, maxval=0.003)
    output_layer = tf.keras.layers.Dense(
        single_action_spec.shape.num_elements(),
        activation=tf.keras.activations.tanh,
        kernel_initializer=output_initializer,
    )

    rescale_layer = tf.keras.layers.Lambda(
        lambda raw_action: common.scale_to_spec(raw_action, single_action_spec))

    all_layers = hidden_layers + [output_layer, rescale_layer]
    return sequential.Sequential(all_layers)


def create_critic_network(obs_fc_layer_units,
                          action_fc_layer_units,
                          joint_fc_layer_units):
    """Build the feed-forward critic network.

    The input pair is split into an observation branch and an action branch,
    each branch is processed by its own sub-network, the results are
    concatenated, passed through the joint sub-network and reduced to a
    single value per input.

    Args:
        obs_fc_layer_units: layer widths of the observation branch, or a
            falsy value for an identity branch.
        action_fc_layer_units: layer widths of the action branch, or a falsy
            value for an identity branch.
        joint_fc_layer_units: layer widths applied after concatenation, or a
            falsy value for an identity layer.

    Returns:
        A `sequential.Sequential` critic network.
    """
    def fc_or_identity(layer_units):
        # A falsy spec (None or empty) means "no layers": use a pass-through.
        if not layer_units:
            return create_identity_layer()
        return create_fc_network(layer_units)

    def split_inputs(inputs):
        return {'observation': inputs[0], 'action': inputs[1]}

    # Branches are created in the order observation, action, joint, value.
    obs_network = fc_or_identity(obs_fc_layer_units)
    action_network = fc_or_identity(action_fc_layer_units)
    joint_network = fc_or_identity(joint_fc_layer_units)

    value_initializer = tf.keras.initializers.RandomUniform(minval=-0.003, maxval=0.003)
    value_fc_layer = tf.keras.layers.Dense(
        1,
        activation=None,
        kernel_initializer=value_initializer,
    )

    branch_map = nest_map.NestMap({'observation': obs_network,
                                   'action': action_network})
    pipeline = [
        tf.keras.layers.Lambda(split_inputs),
        branch_map,
        nest_map.NestFlatten(),
        tf.keras.layers.Concatenate(),
        joint_network,
        value_fc_layer,
        # Drop the trailing size-1 dimension of the value output.
        inner_reshape.InnerReshape([1], []),
    ]
    return sequential.Sequential(pipeline)


# Instantiate the feed-forward actor and critic from the ANN hyperparameters

actor_ann = create_actor_network(
    fc_layer_units=actor_fc_layers,
    action_spec=train_env.action_spec(),
)
critic_ann = create_critic_network(
    obs_fc_layer_units=critic_obs_fc_layers,
    action_fc_layer_units=critic_action_fc_layers,
    joint_fc_layer_units=critic_joint_fc_layers,
)

### Hyperparameters  
Set hyperparameters for DDPG training.

In [None]:
# Number of collect/train iterations of the main loop
num_iterations = 1_000_000

# --- Agent ---
actor_learning_rate = 1e-4
critic_learning_rate = 1e-3
ou_stddev = 0.2
ou_damping = 0.15
target_update_tau = 0.05
target_update_period = 100
gamma = 0.995

# --- Training ---
# Number of train steps executed after each collection phase
train_steps_per_iteration = 1

# --- Experience replay ---
rb_capacity = 500_000
batch_size = 64
train_sequence_length = 200    # Automatically set to 1 for ANN DDPG
# ANN DDPG collects a fixed number of steps
collect_steps_per_iteration = 200
initial_collect_steps = 25_000
# RNN DDPG collects whole episodes
initial_collect_episodes = 10
collect_episodes_per_iteration = 1

# --- Summaries ---
summary_interval = 1_000

# --- Evaluation ---
eval_interval = 1_000
eval_episodes = 200
# A policy is saved only when its average evaluation return beats this value;
# the bar is put back to this value every `threshhold_reset_interval` steps.
threshhold_return = -30
threshhold_reset_interval = 5_000
plots = False  # Only works if num_herds = 2

### DDPG  
Finally, define the training function using the TF-Agents DDPG agent.

In [None]:
def DDPG(num_iterations = num_iterations,
         actor_net = None,
         critic_net = None,
         directory = PATH,
         plots = plots,
         eval_interval = eval_interval,
         summary_interval = summary_interval,
         best_return = threshhold_return,
         threshhold_reset_interval = threshhold_reset_interval,
         # Agent hyperparameters
         actor_learning_rate = actor_learning_rate,
         critic_learning_rate = critic_learning_rate,
         ou_stddev = ou_stddev,
         ou_damping = ou_damping,
         target_update_tau = target_update_tau,
         target_update_period = target_update_period,
         gamma = gamma,
         # Training hyperparameters
         train_steps_per_iteration = train_steps_per_iteration,
         # Experience replay hyperparameters
         initial_collect_episodes = initial_collect_episodes,
         collect_episodes_per_iteration = collect_episodes_per_iteration,
         rb_capacity = rb_capacity,
         batch_size = batch_size,
         train_sequence_length = train_sequence_length):
    """Train a DDPG agent on `train_env` and periodically evaluate it on `eval_env`.

    Besides its arguments, the function reads the module-level names
    `train_env`, `eval_env`, `eval_episodes`, `initial_collect_steps` and
    `collect_steps_per_iteration`.

    Args:
        num_iterations: number of collect/train iterations of the main loop.
        actor_net: actor network (required). An `ActorRnnNetwork` selects
            episode-based collection; anything else selects step-based
            collection and forces `train_sequence_length` to 1.
        critic_net: critic network (required).
        directory: output folder ('~' is expanded); summaries go to its
            'train' and 'eval' sub-folders and saved policies to 'policy'.
        plots: accepted for compatibility; not used by this function.
        eval_interval: evaluate whenever the global step is a multiple of this.
        summary_interval: record summaries whenever the global step is a
            multiple of this.
        best_return: average evaluation return a policy has to exceed in
            order to be saved; raised whenever a better policy is found.
        threshhold_reset_interval: whenever the global step is a multiple of
            this, the bar is put back to the initial `best_return`.
        actor_learning_rate, critic_learning_rate: Adam learning rates.
        ou_stddev, ou_damping, target_update_tau, target_update_period, gamma:
            forwarded unchanged to `ddpg_agent.DdpgAgent`.
        train_steps_per_iteration: train steps run after each collection.
        initial_collect_episodes, collect_episodes_per_iteration: episode
            counts of the initial and per-iteration collection (RNN actor only).
        rb_capacity: `max_length` of the replay buffer.
        batch_size: batch size sampled from the replay buffer.
        train_sequence_length: sampled sequences have this length plus one.

    Returns:
        The result of the last `agent.train` call, or None if no train step
        was executed (e.g. `num_iterations` is 0).

    Raises:
        ValueError: if `actor_net` or `critic_net` is None.
    """
    if actor_net is None or critic_net is None:
        raise ValueError('Please input an actor network and critic network.')

    # Remember the caller's threshold so that the periodic reset goes back to
    # the value passed in, not to the module-level default.
    initial_best_return = best_return
    is_rnn = isinstance(actor_net, actor_rnn_network.ActorRnnNetwork)

    # Create directories for summary output
    directory = os.path.expanduser(directory)
    train_dir = os.path.join(directory, 'train')
    eval_dir = os.path.join(directory, 'eval')
    policy_dir = os.path.join(directory, 'policy')

    # Global step tracks number of train steps
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Initialize summary writers
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
                               train_dir, flush_millis=10000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
                              eval_dir, flush_millis=10000)
    eval_metrics = [tf_metrics.AverageReturnMetric(buffer_size=eval_episodes),
                    tf_metrics.AverageEpisodeLengthMetric(buffer_size=eval_episodes)]

    # Stays None when the training loop below never runs a train step
    train_loss = None

    with tf.compat.v2.summary.record_if(lambda: tf.math.equal(global_step % summary_interval, 0)):

        # DDPG Agent
        agent = ddpg_agent.DdpgAgent(train_env.time_step_spec(),
                                     train_env.action_spec(),
                                     actor_network = actor_net,
                                     critic_network = critic_net,
                                     actor_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate),
                                     critic_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate),
                                     ou_stddev = ou_stddev,
                                     ou_damping = ou_damping,
                                     target_update_tau = target_update_tau,
                                     target_update_period = target_update_period,
                                     gamma = gamma,
                                     train_step_counter = global_step)
        agent.initialize()

        # Metrics to be tracked in the tensorboard summary
        train_metrics = [tf_metrics.NumberOfEpisodes(),
                         tf_metrics.EnvironmentSteps(),
                         tf_metrics.AverageReturnMetric(),
                         tf_metrics.AverageEpisodeLengthMetric()]

        # Tools for evaluation
        eval_policy = agent.policy
        saver = policy_saver.PolicySaver(eval_policy)

        # Experience replay and sample collection tools
        collect_policy = agent.collect_policy
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(agent.collect_data_spec,
                                                                       batch_size=train_env.batch_size,
                                                                       max_length=rb_capacity)

        # Assign drivers to fill replay buffer: whole episodes for a recurrent
        # actor, a fixed number of steps otherwise
        if is_rnn:
            initial_collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(train_env,
                                                                                 collect_policy,
                                                                                 observers=[replay_buffer.add_batch],
                                                                                 num_episodes=initial_collect_episodes)

            collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(train_env,
                                                                         collect_policy,
                                                                         observers=[replay_buffer.add_batch] + train_metrics,
                                                                         num_episodes=collect_episodes_per_iteration)
        else:
            initial_collect_driver = dynamic_step_driver.DynamicStepDriver(train_env,
                                                                           collect_policy,
                                                                           observers=[replay_buffer.add_batch],
                                                                           num_steps=initial_collect_steps)

            collect_driver = dynamic_step_driver.DynamicStepDriver(train_env,
                                                                   collect_policy,
                                                                   observers=[replay_buffer.add_batch] + train_metrics,
                                                                   num_steps=collect_steps_per_iteration)

        # TF functions speed up training process
        initial_collect_driver.run = common.function(initial_collect_driver.run)
        collect_driver.run = common.function(collect_driver.run)
        agent.train = common.function(agent.train)

        # Collect initial samples for replay buffer
        initial_collect_driver.run()

        # Training starts. The local is deliberately not called `time_step`,
        # which is the name of a module imported at the top of the file.
        collect_time_step = None
        policy_state = collect_policy.get_initial_state(train_env.batch_size)

        # If it's an ANN, trajectories have to be two steps long (transitions)
        if not is_rnn:
            train_sequence_length = 1

        dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                           sample_batch_size=batch_size,
                                           num_steps=train_sequence_length + 1).prefetch(3)
        iterator = iter(dataset)

        # Where the training happens
        def train_step():
            experience, _ = next(iterator)
            return agent.train(experience)
        train_step = common.function(train_step)

        # Here, the policy updates start
        for _ in range(num_iterations):
            collect_time_step, policy_state = collect_driver.run(time_step=collect_time_step,
                                                                 policy_state=policy_state)
            for _ in range(train_steps_per_iteration):
                train_loss = train_step()
            for train_metric in train_metrics:
                train_metric.tf_summaries(train_step=global_step, step_metrics=train_metrics[:2])

            # The step counter does not change for the rest of this iteration,
            # so read it once
            step = global_step.numpy()

            # Evaluation and policy saving
            if step % eval_interval == 0:
                results = metric_utils.eager_compute(eval_metrics,
                                                     eval_env,
                                                     eval_policy,
                                                     num_episodes=eval_episodes,
                                                     train_step=global_step,
                                                     summary_writer=eval_summary_writer,
                                                     summary_prefix='Metrics')
                metric_utils.log_metrics(eval_metrics)
                average_return = results['AverageReturn'].numpy()
                print('Global Step = {0}, Average Return = {1}.'.format(step, average_return))
                if average_return > best_return:
                    best_return = average_return
                    print('New best return: ', best_return)
                    dir_name = str(step) + '_' + str(best_return)
                    saver.save(os.path.join(policy_dir, dir_name))
            if step % threshhold_reset_interval == 0:
                best_return = initial_best_return

    return train_loss

# Run Functions (rename)  
Now you can execute ddpg using either artificial or recurrent NNs!

In [None]:
# Pick the actor/critic pair that matches the selected DDPG variant
anet, cnet = (actor_rnn, critic_rnn) if use_rnns else (actor_ann, critic_ann)

# Launch training; arguments not listed here keep the defaults of DDPG()
loss = DDPG(
    num_iterations=num_iterations,
    actor_net=anet,
    critic_net=cnet,
    directory=PATH,
    plots=plots,
    eval_interval=eval_interval,
    # Agent hyperparameters
    actor_learning_rate=actor_learning_rate,
    critic_learning_rate=critic_learning_rate,
    ou_stddev=ou_stddev,
    ou_damping=ou_damping,
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    gamma=gamma,
    # Experience replay hyperparameters
    initial_collect_episodes=initial_collect_episodes,
    collect_episodes_per_iteration=collect_episodes_per_iteration,
    rb_capacity=rb_capacity,
    batch_size=batch_size,
    train_sequence_length=train_sequence_length,
)