Agent for epidemic control model  
===================================
This notebook will train an agent in an epidemic control environment using DDPG with RNNs.  
  
Before running, set the `PATH` variable below to a folder where training outputs can be stored.  
Also create a sub-folder named `policy` inside that `PATH` directory.  


In [1]:
# Folder where training outputs are stored; edit this to a location on your machine.
# A sub-folder named 'policy' is expected to exist inside it.
PATH = '/home/jovyan/Masterarbeit/RNN_DDPG'

Imports
------------------------
Firstly, all relevant dependencies will be imported.  
Comments indicate what imports are generally used for or related to.

In [2]:
import sys

import tensorflow as tf 
import numpy as np

# Environment 
from tf_agents.environments import tf_py_environment
from tf_agents.environments import py_environment
from tf_agents.policies import scripted_py_policy
# Neural Networks
from tf_agents.agents.ddpg import actor_rnn_network
from tf_agents.agents.ddpg import critic_rnn_network
# Agent 
from tf_agents.agents.ddpg import ddpg_agent
# Experience Replay
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer


Environment
------------------  
Next, an environment will be imported and initialized.

In [3]:
# Make the project-local environment module importable, then import it.
sys.path.insert(1, '/home/jovyan/Masterarbeit/reinforce-one/Environment')
from Env import Env

# Environment settings shared by every Env instance created below.
num_herds = 2
total_population = 300
average_episode_length = 200
fix_episode_length = True

env_kwargs = dict(num_herds=num_herds,
                  total_population=total_population,
                  fix_episode_length=fix_episode_length,
                  average_episode_length=average_episode_length)

# Python environment used directly (e.g. for the scripted-policy test) and
# wrapped for training.
py_env = Env(**env_kwargs)

# For use in training later.
# The evaluation environment wraps its OWN Env instance: if both wrappers
# shared `py_env`, running an evaluation episode would reset/advance the
# environment that training data collection is in the middle of using.
train_env = tf_py_environment.TFPyEnvironment(py_env)
eval_env = tf_py_environment.TFPyEnvironment(Env(**env_kwargs))

Then, the environment will be tested with a simple scripted policy.  
The average return will be saved as a threshold that is used during evaluation to decide which policies are worth saving.

In [4]:
def test_env(environment, policy, num_episodes = 50):
    """Run `policy` in `environment` and report average return and cull activity.

    Relies on the module-level `num_herds` to locate the cull entries of the
    action vector: if the action has exactly `num_herds` entries, all of them
    are treated as cull entries; otherwise entries `num_herds .. 2*num_herds-1`
    are.

    Args:
        environment: Environment to run. Only `py_environment.PyEnvironment`
            instances are supported.
        policy: Policy providing `get_initial_state` and `action`. A
            `ScriptedPyPolicy` is queried for its initial state without a
            batch size; any other policy is queried with `batch_size=1`.
        num_episodes: Number of episodes to run and average over.

    Returns:
        A tuple `(avg_return, cullsteps)` where `avg_return` is the mean
        summed reward per episode and `cullsteps` is the mean number of cull
        action entries greater than zero per episode. Returns `None` if
        `environment` is not a `PyEnvironment`.
    """
    if not isinstance(environment, py_environment.PyEnvironment):
        return None

    # Index range of the cull entries inside the action vector.
    only_culls = environment.action_spec().shape[0] == num_herds
    cull_start = 0 if only_culls else num_herds
    cull_stop = cull_start + num_herds

    total_return = 0.0
    cullsteps = 0

    for _ in range(num_episodes):
        time_step = environment.reset()
        if isinstance(policy, scripted_py_policy.ScriptedPyPolicy):
            # Scripted policies keep track of their position in the script.
            policy_state = policy.get_initial_state()
        else:
            policy_state = policy.get_initial_state(batch_size=1)

        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step, policy_state)
            # Count every cull entry that is switched on in this step.
            for idx in range(cull_start, cull_stop):
                if action_step.action[idx] > 0:
                    cullsteps += 1
            policy_state = action_step.state
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    cullsteps /= num_episodes
    return avg_return, cullsteps

In [5]:
# Scripted Policy

# Scripted baseline: repeat "8 steps of doing nothing, then 1 step of culling
# every herd". The action vectors are derived from `num_herds` so the script
# stays valid if the number of herds is changed.
if py_env.action_spec().shape[0] == num_herds:
    # Action vector holds one cull entry per herd.
    idle_action = [0] * num_herds
    cull_action = [1] * num_herds
else:
    # Action vector holds `num_herds` leading entries (left at 0 here)
    # followed by one cull entry per herd.
    idle_action = [0] * (2 * num_herds)
    cull_action = [0] * num_herds + [1] * num_herds

# Repeated often enough to cover a whole episode.
action_script = [(8, idle_action),
                 (1, cull_action)] * (2 * int(1 + average_episode_length))

scr_pol = scripted_py_policy.ScriptedPyPolicy(time_step_spec=py_env.time_step_spec(),
                                              action_spec=py_env.action_spec(),
                                              action_script=action_script)

In [6]:
# Test environment with scripted policy
#avg_return, culls = test_env(py_env, scr_pol , num_episodes = 200)

# Multiply by 1.5 to save policy progress as well
#threshhold = avg_return * 1.5

RNN DDPG
----------
Training of an Agent using DDPG with RNNs for actor and critic can begin.   
First, set all hyperparameters.

In [10]:
# --- Network architecture (layer sizes for the RNN actor and critic) ---
actor_fc_layers         = (400, 300)   # actor: input fully connected layers
actor_lstm_size         = (40,)        # actor: LSTM cell size(s)
actor_output_fc_layers  = (100,)       # actor: fully connected layers after the LSTM
critic_obs_fc_layers    = (400,)       # critic: observation branch
critic_action_fc_layers = None         # critic: no dedicated action branch layers
critic_joint_fc_layers  = (300,)       # critic: layers on joined observation/action
critic_lstm_size        = (40,)        # critic: LSTM cell size(s)
critic_output_fc_layers = (100,)       # critic: fully connected layers after the LSTM

# --- DDPG agent settings ---
actor_learning_rate  = 1e-4    # learning rate of the actor optimizer
critic_learning_rate = 1e-3    # learning rate of the critic optimizer
ou_stddev  = 0.2               # passed to the agent as `ou_stddev`
ou_damping = 0.15              # passed to the agent as `ou_damping`
tu_tau     = 0.05              # passed to the agent as `target_update_tau`
tu_period  = 5                 # passed to the agent as `target_update_period`
gamma      = 0.995             # passed to the agent as `gamma`

# --- Experience replay settings ---
initial_collect_episodes       = 10
collect_episodes_per_iteration = 1
rb_capacity                    = 100000

Initialize neural networks.

In [11]:
# Specs of the training environment that both networks are built from.
obs_spec = train_env.time_step_spec().observation
act_spec = train_env.action_spec()

# Actor: receives the observation spec and the action spec.
actor_net = actor_rnn_network.ActorRnnNetwork(
    obs_spec,
    act_spec,
    input_fc_layer_params=actor_fc_layers,
    lstm_size=actor_lstm_size,
    output_fc_layer_params=actor_output_fc_layers,
)

# Critic: receives an (observation spec, action spec) pair as its input spec.
critic_net_input_specs = (obs_spec, act_spec)

critic_net = critic_rnn_network.CriticRnnNetwork(
    critic_net_input_specs,
    observation_fc_layer_params=critic_obs_fc_layers,
    action_fc_layer_params=critic_action_fc_layers,
    joint_fc_layer_params=critic_joint_fc_layers,
    lstm_size=critic_lstm_size,
    output_fc_layer_params=critic_output_fc_layers,
)

Initialize agent and global step.

In [13]:
# Counter of performed train steps; handed to the agent as its step counter.
global_step = tf.compat.v1.train.get_or_create_global_step()

# Separate Adam optimizers for actor and critic, each with its own learning rate.
actor_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate)
critic_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate)

# DDPG agent wired to the RNN actor/critic networks and the hyperparameters above.
agent = ddpg_agent.DdpgAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=actor_optimizer,
    critic_optimizer=critic_optimizer,
    ou_stddev=ou_stddev,
    ou_damping=ou_damping,
    target_update_tau=tu_tau,
    target_update_period=tu_period,
    gamma=gamma,
    train_step_counter=global_step,
)
agent.initialize()

Set up experience replay (replay buffer).

1
Trajectory(
{'action': BoundedTensorSpec(shape=(4,), dtype=tf.float32, name=None, minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': BoundedTensorSpec(shape=(7,), dtype=tf.float32, name=None, minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'policy_info': (),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})
