## Test the environment `HerdEnv` of `herd_env.py`

In [1]:
import numpy as np
import tensorflow as tf
from tf_agents.environments import utils
from tf_agents.trajectories.time_step import StepType
from tf_agents.trajectories import TimeStep
from tf_agents.policies import scripted_py_policy
from tf_agents.policies import random_py_policy
from tf_agents.metrics import py_metrics
from tf_agents.drivers import py_driver
from tf_agents.utils import common
from tf_agents.specs import tensor_spec
from tf_agents.networks import sequential

In [2]:
from herd_env import HerdEnv

In [3]:
# Sanity check: let TF-Agents run a few episodes and validate the environment.
henv_val = HerdEnv(
    herd_sizes=[64, 64],
    rand_recovery_prob=0.1,
    rand_infection_prob=0.05,
)
utils.validate_py_environment(henv_val, episodes=10)

In [4]:
# Create the HerdEnv instance that the agent is trained on.
max_episode_length = 100
herd_sizes = [32, 32]
# Derived from herd_sizes so the two values cannot drift apart.
num_herds = len(herd_sizes)
henv = HerdEnv(
    herd_sizes=herd_sizes,
    expected_episode_length=-1,
    max_episode_length=max_episode_length,
    rand_recovery_prob=0.04,
    rand_infection_prob=0.05,
)

In [5]:
# Show the interior values of the environment during one do-nothing episode.
time_step = henv.reset()
print(time_step)
cumulative_reward = time_step.reward

finished = False
while not finished:
  time_step = henv.step(0)  # action 0: do nothing
  s = henv.get_state()
  print("state: ", s, "observation: ", time_step.observation, "\treward: ", time_step.reward)
  cumulative_reward += time_step.reward
  # Stop after the step that the environment marks as the last one.
  finished = bool(time_step.step_type == StepType.LAST)

print('Final Reward = ', cumulative_reward)

TimeStep(
{'discount': 1.0,
 'observation': array([0., 0., 0.], dtype=float32),
 'reward': 0.0,
 'step_type': array(0, dtype=int32)})
state:  [1 0 1 0 1 1] observation:  [0.01 0.01 0.01] 	reward:  0.0
state:  [4 1 5 1 2 2] observation:  [0.02 0.02 0.02] 	reward:  0.0
state:  [ 5  3 10  4  3  3] observation:  [0.03 0.03 0.03] 	reward:  0.0
state:  [ 5  4 15  8  4  4] observation:  [0.04 0.04 0.04] 	reward:  0.0
state:  [ 7  4 22 12  5  5] observation:  [0.05 0.05 0.05] 	reward:  0.0
state:  [ 9  7 31 19  6  6] observation:  [0.06 0.06 0.06] 	reward:  0.0
state:  [ 9  8 40 27  7  7] observation:  [0.07 0.07 0.07] 	reward:  0.0
state:  [11 10 51 37  8  8] observation:  [0.08 0.08 0.08] 	reward:  0.0
state:  [17 10 68 47  9  9] observation:  [0.09 0.09 0.09] 	reward:  0.0
state:  [18 13 86 60 10 10] observation:  [0.1 0.1 0.1] 	reward:  0.0
state:  [ 18  14 104  74  11  11] observation:  [0.11 0.11 0.11] 	reward:  0.0
state:  [ 17  19 121  93  12  12] observation:  [0.12 0.12 0.12] 	reward

In [6]:
# Fetch both specs once; the policy cells below reuse them.
ts_spec = henv.time_step_spec()
action_spec = henv.action_spec()
print("action spec:\n", action_spec, "\n\ntime step spec:\n", ts_spec)

action spec:
 BoundedArraySpec(shape=(), dtype=dtype('int32'), name='action', minimum=0, maximum=3) 

time step spec:
 TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
 'observation': BoundedArraySpec(shape=(3,), dtype=dtype('float32'), name='observation', minimum=0.0, maximum=1.0),
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
 'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')})


### Define scripted policies

In [7]:
# An action script is a list of (number_of_repeats, action) pairs.

# Do-nothing policy: action 0 for the whole episode, i.e. never cull.
action_script0 = [(max_episode_length, 0)]

# One 40-step cycle: 19 idle steps, one step with action 1, 19 idle steps,
# one step with action 3. The cycle is repeated often enough to cover an episode.
# NOTE(review): the original comment describes this as culling the first herd
# and then the second herd; what actions 1 and 3 mean is defined in
# herd_env.py -- confirm that 3 is the intended action.
cull_cycle = [(19, 0), (1, 1), (19, 0), (1, 3)]
action_script1 = cull_cycle * int(1+max_episode_length/40)

# Build one scripted policy per script; both share the environment's specs.
manual_scripted_policy0, manual_scripted_policy1 = (
    scripted_py_policy.ScriptedPyPolicy(
        time_step_spec=ts_spec,
        action_spec=action_spec,
        action_script=script)
    for script in (action_script0, action_script1))

init_policy_state = manual_scripted_policy0.get_initial_state()

In [8]:
# Step the second scripted policy by hand for 21 calls to watch its state
# advance. The same initial time step is passed every time; the policy state
# returned by one call is fed into the next.
ts0 = henv.reset()
policy_state = init_policy_state
for _call in range(21):
    action_step = manual_scripted_policy1.action(ts0, policy_state)
    policy_state = action_step.state
    print("action=", action_step.action, "\tpolicy_state", policy_state)

# Leave a fresh initial state behind for later cells.
policy_state = manual_scripted_policy1.get_initial_state()

action= 0 	policy_state [0, 1]
action= 0 	policy_state [0, 2]
action= 0 	policy_state [0, 3]
action= 0 	policy_state [0, 4]
action= 0 	policy_state [0, 5]
action= 0 	policy_state [0, 6]
action= 0 	policy_state [0, 7]
action= 0 	policy_state [0, 8]
action= 0 	policy_state [0, 9]
action= 0 	policy_state [0, 10]
action= 0 	policy_state [0, 11]
action= 0 	policy_state [0, 12]
action= 0 	policy_state [0, 13]
action= 0 	policy_state [0, 14]
action= 0 	policy_state [0, 15]
action= 0 	policy_state [0, 16]
action= 0 	policy_state [0, 17]
action= 0 	policy_state [0, 18]
action= 0 	policy_state [0, 19]
action= 1 	policy_state [1, 1]
action= 0 	policy_state [2, 1]


### ... and a random policy

In [9]:
# Random baseline policy built from the environment's specs.
random_policy = random_py_policy.RandomPyPolicy(
    time_step_spec=ts_spec,
    action_spec=action_spec,
)

## Drive a rollout

In [10]:
def compute_avg_return(environment, policy, num_episodes=50, verbose=False):
  """Roll out `policy` on `environment` and average the results over episodes.

  Args:
    environment: object providing `reset()` and `step(action)`; the returned
      time steps must offer `is_last()`, `reward` and `observation`.
    policy: object whose `action(time_step, policy_state)` returns a step
      with `action` and `state` attributes.
    num_episodes: number of episodes to run; must be positive.
    verbose: if True, print one line per environment step.

  Returns:
    A tuple `(avg_return, cullsteps)`: the mean per-episode sum of rewards and
    the mean number of steps per episode whose action was greater than 0.
    `avg_return` has whatever type summing the environment's rewards yields.

  Raises:
    ValueError: if `num_episodes` is not positive (previously this surfaced
      as a ZeroDivisionError only after the function had run).
  """
  if num_episodes <= 0:
    raise ValueError(f"num_episodes must be positive, got {num_episodes}")

  total_return = 0.0
  cullsteps = 0
  for e in range(num_episodes):

    time_step = environment.reset()
    if isinstance(policy, scripted_py_policy.ScriptedPyPolicy):
      policy_state = policy.get_initial_state() # remember where in the script we were
    else:
      policy_state = None # other policies without memory
    episode_return = 0.0
    i = 0
    while not time_step.is_last():
      i += 1
      action_step = policy.action(time_step, policy_state)
      # Any action other than 0 counts as a step with a cull.
      if action_step.action > 0:
        cullsteps += 1
      policy_state = action_step.state
      time_step = environment.step(action_step.action)
      if isinstance(environment, HerdEnv):
        state = environment.get_state()
      else:
        state = None # TF environment from wrapper does not have get_state()
      episode_return += time_step.reward
      if verbose:
        print (f"episode {e:>2} step{i:>4} action: ", action_step.action, "state=", state, "obs=", time_step.observation, "reward=", time_step.reward)
    total_return += episode_return

  avg_return = total_return / num_episodes
  cullsteps /= num_episodes
  return avg_return, cullsteps

In [11]:
# Baseline: average return of the random policy (default number of episodes).
random_reward, cullsteps = compute_avg_return(environment=henv, policy=random_policy)
print (f"average return of random policy: {random_reward:.3f} avg steps with culls per episode: {cullsteps}")

average return of random policy: -3366.370 avg steps with culls per episode: 75.34


In [12]:
# Print every step of a single rollout of the second scripted policy.
compute_avg_return(
    environment=henv,
    policy=manual_scripted_policy1,
    num_episodes=1,
    verbose=True,
)

episode  0 step   1 action:  0 state= [0 5 0 5 1 1] obs= [0.01 0.01 0.01] reward= 0.0
episode  0 step   2 action:  0 state= [ 4  7  4 12  2  2] obs= [0.02 0.02 0.02] reward= 0.0
episode  0 step   3 action:  0 state= [ 8  8 12 20  3  3] obs= [0.03 0.03 0.03] reward= 0.0
episode  0 step   4 action:  0 state= [ 8  9 20 29  4  4] obs= [0.04 0.04 0.04] reward= 0.0
episode  0 step   5 action:  0 state= [10 10 30 39  5  5] obs= [0.05 0.05 0.05] reward= 0.0
episode  0 step   6 action:  0 state= [10 13 40 52  6  6] obs= [0.06 0.06 0.06] reward= 0.0
episode  0 step   7 action:  0 state= [11 11 51 63  7  7] obs= [0.07 0.07 0.07] reward= 0.0
episode  0 step   8 action:  0 state= [14 10 65 73  8  8] obs= [0.08 0.08 0.08] reward= 0.0
episode  0 step   9 action:  0 state= [16 13 81 86  9  9] obs= [0.09 0.09 0.09] reward= 0.0
episode  0 step  10 action:  0 state= [ 17  16  98 102  10  10] obs= [0.1 0.1 0.1] reward= 0.0
episode  0 step  11 action:  0 state= [ 16  14 114 116  11  11] obs= [0.11 0.11 0.1

(-1506.0, 5.0)

In [13]:
# Baselines for the two scripted policies, averaged over 500 episodes each.
manual_reward0, cullsteps = compute_avg_return(
    environment=henv, policy=manual_scripted_policy0, num_episodes=500)
print (f"average return of do-nothing-policy: {manual_reward0:.3f} avg culls {cullsteps}")

manual_reward1, cullsteps = compute_avg_return(
    environment=henv, policy=manual_scripted_policy1, num_episodes=500)
print (f"average return of manual policy: {manual_reward1:.3f} avg culls {cullsteps}")

average return of do-nothing-policy: -1940.193 avg culls 0.0
average return of manual policy: -1510.665 avg culls 5.0


### Train a Deep-Q Agent

In [14]:
from tf_agents.networks.sequential import Sequential
from tensorflow.keras.layers import Dense
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory

In [15]:
# --- training loop ---
num_iterations = 30_000            # iterations of the training loop
collect_steps_per_iteration = 100  # environment steps collected per iteration
batch_size = 64                    # sample batch size drawn from the replay buffer

# --- replay buffer ---
replay_buffer_max_length = 10_000
initial_collect_steps = 100        # steps collected with the random policy before training

# --- logging / evaluation ---
log_interval = 200                 # print the loss every this many train steps
eval_interval = 1_000              # evaluate the policy every this many train steps
num_eval_episodes = 100            # episodes per evaluation

# Only referenced by a commented-out agent argument at the moment.
target_update_period = 100

In [16]:
# Keep the Q-network simple.
# Small uniform initializer, used for the final Q-value layer.
kernel_initializer = tf.keras.initializers.RandomUniform(minval=-0.03, maxval=0.03)

#q_net = Sequential([Dense(300, activation='tanh'),
#                    Dense(4, activation=None,
#                          kernel_initializer = kernel_initializer)])

# from tutorial
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(henv.action_spec())
# One Q-value per discrete action, read from the action spec. An earlier
# hard-coded `num_actions = 2**num_herds` ("this does not scale, obviously")
# was a dead store: it was always overwritten by this line, so it is removed.
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Factory for the hidden layers, so that every one of them gets the same
# activation and kernel initializer.
def dense_layer(num_units):
  """Return a ReLU `Dense` layer with `num_units` units.

  The kernel is initialized with `VarianceScaling` (scale 2.0, fan-in mode,
  truncated normal distribution).
  """
  hidden_init = tf.keras.initializers.VarianceScaling(
      scale=2.0, mode='fan_in', distribution='truncated_normal')
  layer = tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=hidden_init)
  return layer

# The Q-network is a stack of hidden Dense layers followed by a linear Dense
# layer with `num_actions` units, which produces one Q-value per available
# action as its output.
dense_layers = list(map(dense_layer, fc_layer_params))

# Output layer: no activation, small uniform kernel, constant bias of -0.2.
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=kernel_initializer,
    bias_initializer=tf.keras.initializers.Constant(-0.2),
)

q_net = sequential.Sequential([*dense_layers, q_values_layer])

In [17]:
train_step_counter = tf.Variable(0)

# Training and evaluation need independent environment instances. Wrapping the
# same `henv` object twice would let every evaluation rollout reset and advance
# the very episode that data collection is in the middle of.
# The arguments below mirror the ones used to build `henv`; keep them in sync.
eval_henv = HerdEnv(herd_sizes=[32, 32],
                    expected_episode_length=-1,
                    max_episode_length=max_episode_length,
                    rand_recovery_prob=0.04,
                    rand_infection_prob=0.05)

train_env = tf_py_environment.TFPyEnvironment(henv)
eval_env = tf_py_environment.TFPyEnvironment(eval_henv)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    #boltzmann_temperature = 0.005,
    #epsilon_greedy = 0.001,
    #gamma=0.99,
    optimizer=tf.keras.optimizers.RMSprop(), # tf.keras.optimizers.Adam(learning_rate = 1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
    #target_update_period=target_update_period)

agent.initialize()

In [18]:
# agent.policy.trainable_variables

In [19]:
# manually initialize a reasonably good policy: kill both herds if the sum of observations is large
#W = np.array([[0, 3 ,0, 2],[0, 0, 3, 2,]])
#b = np.array([1, 0, 0, 0])
#q_net.layers[0].set_weights([W,b])
#agent.policy.trainable_variables

In [20]:
# Uniform replay buffer holding the trajectories the agent collects; its batch
# size follows the training environment.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length,
)

In [21]:
# Show the field names of the trajectories stored in the replay buffer.
agent.collect_data_spec._fields

('step_type',
 'observation',
 'action',
 'policy_info',
 'next_step_type',
 'reward',
 'discount')

In [22]:
def collect_step(environment, policy, buffer):
  """Take one environment step with `policy` and store the transition.

  Args:
    environment: environment providing `current_time_step()` and `step()`.
    policy: policy whose `action(time_step)` result has an `action` attribute.
    buffer: replay buffer providing `add_batch()`.
  """
  before = environment.current_time_step()
  chosen = policy.action(before)
  after = environment.step(chosen.action)

  # Pack (time step, action, next time step) into a trajectory and store it.
  buffer.add_batch(trajectory.from_transition(before, chosen, after))

def collect_data(env, policy, buffer, steps):
  """Call `collect_step(env, policy, buffer)` `steps` times in a row."""
  for _step in range(steps):
    collect_step(environment=env, policy=policy, buffer=buffer)

# Seed the replay buffer with random-policy experience before training starts.
collect_data(
    env=train_env,
    policy=random_policy,
    buffer=replay_buffer,
    steps=initial_collect_steps,
)

In [23]:
# The dataset yields batches of two consecutive steps each: shape [B x 2 x ...].
sampled = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3,
)
dataset = sampled.prefetch(3)

iterator = iter(dataset)
dataset

Instructions for updating:
Use `tf.data.Dataset.scan(...) instead


Instructions for updating:
Use `tf.data.Dataset.scan(...) instead


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


<PrefetchDataset shapes: (Trajectory(
{action: (64, 2),
 discount: (64, 2),
 next_step_type: (64, 2),
 observation: (64, 2, 3),
 policy_info: (),
 reward: (64, 2),
 step_type: (64, 2)}), BufferInfo(ids=(64, 2), probabilities=(64,))), types: (Trajectory(
{action: tf.int32,
 discount: tf.float32,
 next_step_type: tf.int32,
 observation: tf.float32,
 policy_info: (),
 reward: tf.float32,
 step_type: tf.int32}), BufferInfo(ids=tf.int64, probabilities=tf.float32))>

In [24]:
# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
# compute_avg_return returns (avg_return, cullsteps); unpack it so that
# `returns` holds plain returns throughout instead of a tuple in slot 0.
avg_return, cullsteps = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step <= 1 or step % log_interval == 0:
    # '{0:>4}' right-aligns the step in a 4-character field. The previous
    # spec '{0:4>}' meant "fill character '4', no width" and padded nothing.
    print('step = {0:>4}: loss = {1:.4f}'.format(step, train_loss), end="\t")

  # Evaluate at the start, at steps 50 and 100, then every eval_interval steps.
  if step <= 1 or (step <= 100 and step % 50 == 0) or step % eval_interval == 0:
    avg_return, cullsteps = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step {0}: average return = {1:.1f} cullsteps = {2:.1f}'.format(step, avg_return.numpy().item(), cullsteps))
    returns.append(avg_return)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


step = 1: loss = 2019.0221	step 1: average return = -1951.1 cullsteps = 0.0
step 50: average return = -1931.6 cullsteps = 0.0
step 100: average return = -1938.6 cullsteps = 0.0
step = 200: loss = 35736.8008	step = 400: loss = 43.1391	step = 600: loss = 412.1095	step = 800: loss = 24133.8184	step = 1000: loss = 4633.9131	step 1000: average return = -1930.9 cullsteps = 0.0
step = 1200: loss = 1200.1688	step = 1400: loss = 2168.2258	step = 1600: loss = 1453.5250	step = 1800: loss = 47495.2617	step = 2000: loss = 111400.5938	step 2000: average return = -4162.6 cullsteps = 100.0
step = 2200: loss = 2209.4927	step = 2400: loss = 1778.2285	step = 2600: loss = 1331.7872	step = 2800: loss = 35893.3594	step = 3000: loss = 45362.9102	step 3000: average return = -4169.9 cullsteps = 100.0
step = 3200: loss = 264.1064	step = 3400: loss = 39.4343	step = 3600: loss = 108.8394	step = 3800: loss = 6.4145	step = 4000: loss = 88.1631	step 4000: average return = -1938.6 cullsteps = 0.0
step = 4200: loss = 

In [25]:
# Final evaluation of the learned (greedy) policy over 500 episodes.
learned_reward, culleps = compute_avg_return(
    environment=eval_env,
    policy=agent.policy,
    num_episodes=500,
)
print ("reward of learned policy: ", learned_reward.numpy(), "cullsteps=", culleps)

reward of learned policy:  [-1542.952] cullsteps= 5.0


In [26]:
init_ts = eval_env.reset()

def get_action(obs):
    """ execute the learned policy network on a single observation

       obs:  one float for global time, one float for each herd - the time since last culling
       returns: the action chosen by the agent's (greedy) policy, as a Python int
    """
    # Build a batch-of-one TimeStep. Fields are passed by name and with the
    # dtypes of the environment's time-step spec (int32 step_type, float32
    # reward/discount/observation). The earlier positional call supplied a
    # float step_type and an integer discount.
    _ts = TimeStep(step_type=tf.constant([0], dtype=tf.int32),  # 0 as before
                   reward=tf.constant([0.], dtype=tf.float32),
                   discount=tf.constant([1.], dtype=tf.float32),
                   observation=tf.constant([obs], dtype=tf.float32))
    # a = agent.collect_policy.action(_ts) # just to see how much is explored versus exploited
    a = agent.policy.action(_ts)
    return a.action.numpy().item()

In [27]:
# Display the policy object the agent uses for evaluation.
agent.policy

<tf_agents.policies.greedy_policy.GreedyPolicy at 0x7f95c84c69a0>

In [28]:
# What the learned policy does on a grid of observations: the first
# observation component is fixed at 0, the other two sweep [0, 1) in steps
# of 0.05 (20 values per row and per column). Rows vary x, columns vary y.
grid = np.arange(0., 1., .05, np.float32)
A = [[get_action([.0, x, y]) for y in grid] for x in grid]
A

[[0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 0, 0, 0, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 0, 0, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 0, 0, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 0, 0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [0, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

### Play with parameters of manually designed q_network policy

In [29]:
# W, b = agent.policy.trainable_variables
# W = W.numpy()
# b = b.numpy()
# print ("weights\n", W, "\nbias", b)

In [30]:
# def nn(obs):
#    y = np.dot(obs, W)+b
#    return y

In [31]:
# nn([0.5,.2])