## Test the environment `HerdEnv` of `herd_env.py`

In [1]:
import os
import numpy as np
import tensorflow as tf
from tf_agents.environments import utils
from tf_agents.trajectories.time_step import StepType
from tf_agents.trajectories import TimeStep
from tf_agents.policies import scripted_py_policy
from tf_agents.policies import random_py_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies import policy_saver
from tf_agents.metrics import py_metrics
from tf_agents.drivers import py_driver
from tf_agents.utils import common
from tf_agents.specs import tensor_spec

from tf_agents.networks import sequential

import sys
sys.path.insert(1, '/home/jovyan/Masterarbeit/reinforce-one/Environment')

In [2]:
from Env import Env
root_dir = '~/Masterarbeit/DDPG'
num_herds=2
total_population=300

### Train a DDPG Agent

In [3]:
def compute_avg_return(environment, policy, num_episodes=50, verbose=False):
  total_return = 0.0
  cullsteps = 0 
  for e in range(num_episodes):

    time_step = environment.reset()
    if isinstance(policy, scripted_py_policy.ScriptedPyPolicy):
        policy_state = policy.get_initial_state() # remember where in the script we were
    else:
        #print(policy.get_initial_state(batch_size=train_env.batch_size()))
        policy_state = policy.get_initial_state(batch_size=1) # other policies without memory
    episode_return = 0.0
    i=0
    while not time_step.is_last():
        i+=1
        action_step = policy.action(time_step, policy_state)
        for i in range (num_herds, num_herds*2):
            if action_step.action[0][i] >= 0.5:
                cullsteps += 1
                break
        policy_state = action_step.state
        time_step = environment.step(action_step.action)

        state = None # TF environment from wrapper does not have get_state()
        episode_return += time_step.reward
        if verbose:
            print (f"episode {e:>2} step{i:>4} action: ", action_step.action, 
                   "state=", state, "obs=", time_step.observation, "reward=", time_step.reward)
    total_return += episode_return

  avg_return = total_return / num_episodes
  cullsteps /= num_episodes
  return avg_return, cullsteps

In [4]:

import functools
from tf_agents.networks import nest_map
from tf_agents.networks import sequential
from tf_agents.keras_layers import inner_reshape
from tf_agents.networks.sequential import Sequential
from tensorflow.keras.layers import Dense
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory

In [5]:
num_iterations = 50000
replay_buffer_max_length = 100000
batch_size = 64
num_eval_episodes = 100
initial_collect_steps = 5000
collect_steps_per_iteration = 200
log_interval = 1000
eval_interval = 500
threshhold_reset_interval = 10000
target_update_period = 5

In [6]:
train_env = tf_py_environment.TFPyEnvironment(Env(num_herds = num_herds, total_population = total_population, 
                                                  fix_episode_length = True, average_episode_length = 200))
eval_env = tf_py_environment.TFPyEnvironment(Env(num_herds = num_herds, total_population = total_population, 
                                                 fix_episode_length = True, average_episode_length = 200))

In [7]:
#kernel_initializer=tf.keras.initializers.RandomUniform(minval=-0.03, maxval=0.03)

#Make critic and actor net for ddpg

dense = functools.partial(
    tf.keras.layers.Dense,
    activation=tf.keras.activations.relu,
    kernel_initializer=tf.compat.v1.variance_scaling_initializer(
        scale=1./ 3.0, mode='fan_in', distribution='uniform'))


def create_identity_layer():
    return tf.keras.layers.Lambda(lambda x: x)


def create_fc_network(layer_units):
    return sequential.Sequential([dense(num_units) for num_units in layer_units])


def create_actor_network(fc_layer_units, action_spec):
    """Create an actor network for DDPG."""
    flat_action_spec = tf.nest.flatten(action_spec)
    if len(flat_action_spec) > 1:
        raise ValueError('Only a single action tensor is supported by this network')
    flat_action_spec = flat_action_spec[0]

    fc_layers = [dense(num_units) for num_units in fc_layer_units]

    num_actions = flat_action_spec.shape.num_elements()
    action_fc_layer = tf.keras.layers.Dense(
        num_actions,
        activation=tf.keras.activations.tanh,
        kernel_initializer=tf.keras.initializers.RandomUniform(
            minval=-0.003, maxval=0.003))

    scaling_layer = tf.keras.layers.Lambda(
        lambda x: common.scale_to_spec(x, flat_action_spec))
    return sequential.Sequential(fc_layers + [action_fc_layer, scaling_layer])


def create_critic_network(obs_fc_layer_units,
                          action_fc_layer_units,
                          joint_fc_layer_units):
    """Create a critic network for DDPG."""

    def split_inputs(inputs):
        return {'observation': inputs[0], 'action': inputs[1]}

    obs_network = create_fc_network(
        obs_fc_layer_units) if obs_fc_layer_units else create_identity_layer()
    action_network = create_fc_network(
        action_fc_layer_units
        ) if action_fc_layer_units else create_identity_layer()
    joint_network = create_fc_network(
        joint_fc_layer_units) if joint_fc_layer_units else create_identity_layer()
    
    value_fc_layer = tf.keras.layers.Dense(
        1,
        activation=None,
        kernel_initializer=tf.keras.initializers.RandomUniform(
            minval=-0.003, maxval=0.003))

    return sequential.Sequential([
        tf.keras.layers.Lambda(split_inputs),
        nest_map.NestMap({
            'observation': obs_network,
            'action': action_network
        }),
        nest_map.NestFlatten(),
        tf.keras.layers.Concatenate(),
        joint_network,
        value_fc_layer,
        inner_reshape.InnerReshape([1], [])
    ])

actor_fc_layers=(200, 150)
critic_obs_fc_layers=(200,)
critic_action_fc_layers=None
critic_joint_fc_layers=(150,)

actor_net = create_actor_network(actor_fc_layers, train_env.action_spec())
critic_net = create_critic_network(critic_obs_fc_layers,
                                       critic_action_fc_layers,
                                       critic_joint_fc_layers)

In [8]:
train_step_counter = tf.Variable(0)
global_step = tf.compat.v1.train.get_or_create_global_step()

agent = ddpg_agent.DdpgAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=1e-4),    #1e-4
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=1e-3),    #1e-3
        ou_stddev=0.2,
        ou_damping=0.15,
        target_update_tau=0.05,
        target_update_period=5,
        #dqda_clipping=dqda_clipping,
        td_errors_loss_fn=tf.compat.v1.losses.huber_loss,
        gamma=0.99,
        #reward_scale_factor=reward_scale_factor,
        #gradient_clipping=gradient_clipping,
        #debug_summaries=debug_summaries,
        #summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)

agent.initialize()

In [9]:
# agent.policy.trainable_variables

In [10]:
# manually initialize a reasonably good policy: kill both herds if the sum of observations is large
#W = np.array([[0, 3 ,0, 2],[0, 0, 3, 2,]])
#b = np.array([1, 0, 0, 0])
#q_net.layers[0].set_weights([W,b])
#agent.policy.trainable_variables

In [11]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

In [12]:
agent.collect_data_spec._fields

('step_type',
 'observation',
 'action',
 'policy_info',
 'next_step_type',
 'reward',
 'discount')

In [13]:
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
    for _ in range(steps):
        collect_step(env, policy, buffer)

random_policy = random_tf_policy.RandomTFPolicy(
    action_spec=train_env.action_spec(), time_step_spec=train_env.time_step_spec())
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)

In [14]:
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

iterator = iter(dataset)
dataset

Instructions for updating:
Use `tf.data.Dataset.scan(...) instead
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


<PrefetchDataset shapes: (Trajectory(
{action: (64, 2, 4),
 discount: (64, 2),
 next_step_type: (64, 2),
 observation: (64, 2, 7),
 policy_info: (),
 reward: (64, 2),
 step_type: (64, 2)}), BufferInfo(ids=(64, 2), probabilities=(64,))), types: (Trajectory(
{action: tf.float32,
 discount: tf.float32,
 next_step_type: tf.int32,
 observation: tf.float32,
 policy_info: (),
 reward: tf.float32,
 step_type: tf.int32}), BufferInfo(ids=tf.int64, probabilities=tf.float32))>

In [None]:
# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
# Initialize policy saver
saver = policy_saver.PolicySaver(agent.policy)
best_return = -1000000
root_dir = os.path.expanduser(root_dir)
policy_dir = os.path.join(root_dir, 'policy')
#compute_avg_return = common.function(compute_avg_return)

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0:4>}: loss = {1:.4f}'.format(step, train_loss), end="\t")

  if step % eval_interval == 0:
    avg_return, cullsteps = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step {0}: average return = {1:.1f} cullsteps = {2:.1f}'.format(step, avg_return.numpy().item(), cullsteps))
    returns.append(avg_return)
   # if avg_return <= -100:
      #  eval_interval = 500
    #elif avg_return > -100 and avg_return < -40:
       # eval_interval = 100
    #elif avg_return >= -40:
      #  eval_interval = 100
    if avg_return <= -100:
        eval_interval = 500
    if avg_return > best_return:
            best_return = avg_return
            if best_return >= -33:
                eval_interval = 5
            print('New best return: ', best_return)
            saver.save(os.path.join(policy_dir, str(global_step.numpy())))
    elif step % threshhold_reset_interval == 0:
        best_return = -1000000

step 500: average return = -586.1 cullsteps = 0.0
New best return:  tf.Tensor([-586.05316], shape=(1,), dtype=float32)




INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/500/assets


step = 1000: loss = 1.0825	step 1000: average return = -587.3 cullsteps = 0.0




step 1500: average return = -584.7 cullsteps = 0.0
New best return:  tf.Tensor([-584.6961], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/1500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/1500/assets


step = 2000: loss = 2.0383	



step 2000: average return = -584.0 cullsteps = 0.0
New best return:  tf.Tensor([-583.9886], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/2000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/2000/assets


step 2500: average return = -289.4 cullsteps = 161.8
New best return:  tf.Tensor([-289.4167], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/2500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/2500/assets


step = 3000: loss = 0.9718	step 3000: average return = -311.5 cullsteps = 174.5
step 3500: average return = -442.9 cullsteps = 82.8
step = 4000: loss = 1.9933	step 4000: average return = -476.8 cullsteps = 44.5




step 4500: average return = -216.1 cullsteps = 140.5
New best return:  tf.Tensor([-216.14601], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/4500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/4500/assets


step = 5000: loss = 1.6512	step 5000: average return = -270.1 cullsteps = 168.0
step 5500: average return = -502.0 cullsteps = 13.6
step = 6000: loss = 0.9712	



step 6000: average return = -69.2 cullsteps = 52.8
New best return:  tf.Tensor([-69.178444], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/6000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/6000/assets


step 6500: average return = -69.3 cullsteps = 45.2
step = 7000: loss = 1.0256	



step 7000: average return = -45.1 cullsteps = 23.9
New best return:  tf.Tensor([-45.105785], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/7000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/7000/assets


step 7500: average return = -33.7 cullsteps = 15.8
New best return:  tf.Tensor([-33.659115], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/7500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/7500/assets


step = 8000: loss = 0.4521	step 8000: average return = -34.2 cullsteps = 15.0
step 8500: average return = -37.7 cullsteps = 14.0
step = 9000: loss = 0.4773	step 9000: average return = -38.2 cullsteps = 12.9
step 9500: average return = -36.1 cullsteps = 13.1
step = 10000: loss = 0.1384	step 10000: average return = -38.1 cullsteps = 13.2




step 10500: average return = -40.6 cullsteps = 12.7
New best return:  tf.Tensor([-40.590405], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/10500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/10500/assets


step = 11000: loss = 0.0898	



step 11000: average return = -39.4 cullsteps = 12.7
New best return:  tf.Tensor([-39.400803], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/11000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/11000/assets


step 11500: average return = -36.7 cullsteps = 14.2
New best return:  tf.Tensor([-36.72893], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/11500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/11500/assets


step = 12000: loss = 0.2219	step 12000: average return = -47.7 cullsteps = 12.8
step 12500: average return = -41.3 cullsteps = 13.2
step = 13000: loss = 0.0595	step 13000: average return = -40.9 cullsteps = 13.4
step 13500: average return = -51.4 cullsteps = 14.6
step = 14000: loss = 0.1596	step 14000: average return = -226.8 cullsteps = 92.0
step 14500: average return = -242.1 cullsteps = 122.5
step = 15000: loss = 0.1315	step 15000: average return = -183.5 cullsteps = 52.1
step 15500: average return = -180.8 cullsteps = 26.9
step = 16000: loss = 0.0746	step 16000: average return = -189.2 cullsteps = 31.3
step 16500: average return = -352.0 cullsteps = 117.6
step = 17000: loss = 0.2729	step 17000: average return = -130.4 cullsteps = 16.8
step 17500: average return = -47.6 cullsteps = 13.6
step = 18000: loss = 0.1635	step 18000: average return = -49.8 cullsteps = 12.7
step 18500: average return = -46.9 cullsteps = 13.3
step = 19000: loss = 0.0873	step 19000: average return = -42.3 cull



step 20500: average return = -237.2 cullsteps = 53.1
New best return:  tf.Tensor([-237.24811], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/20500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/20500/assets


step = 21000: loss = 0.2637	step 21000: average return = -585.2 cullsteps = 0.1
step 21500: average return = -417.4 cullsteps = 32.9
step = 22000: loss = 0.1604	step 22000: average return = -272.5 cullsteps = 57.1
step 22500: average return = -240.2 cullsteps = 150.1
step = 23000: loss = 0.2990	step 23000: average return = -423.1 cullsteps = 8.4




step 23500: average return = -202.5 cullsteps = 45.2
New best return:  tf.Tensor([-202.54333], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/23500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/23500/assets


step = 24000: loss = 0.1892	step 24000: average return = -319.7 cullsteps = 30.3




step 24500: average return = -160.1 cullsteps = 15.6
New best return:  tf.Tensor([-160.09172], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/24500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/24500/assets


step = 25000: loss = 0.3236	step 25000: average return = -297.7 cullsteps = 151.3
step 25500: average return = -251.3 cullsteps = 19.7
step = 26000: loss = 0.4156	step 26000: average return = -219.1 cullsteps = 21.8
step = 27000: loss = 0.1495	step 27000: average return = -249.4 cullsteps = 17.1
step 27500: average return = -490.7 cullsteps = 4.0
step = 28000: loss = 0.0976	step 28000: average return = -413.1 cullsteps = 7.0
step 28500: average return = -565.9 cullsteps = 0.6
step = 29000: loss = 0.1104	



step 29000: average return = -60.2 cullsteps = 15.5
New best return:  tf.Tensor([-60.243286], shape=(1,), dtype=float32)




INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/29000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/29000/assets


step 29500: average return = -75.5 cullsteps = 9.9
step = 30000: loss = 0.4207	step 30000: average return = -145.6 cullsteps = 26.5




step 30500: average return = -198.3 cullsteps = 35.4
New best return:  tf.Tensor([-198.27051], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/30500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/30500/assets


step = 31000: loss = 0.2544	step 31000: average return = -215.2 cullsteps = 20.9




step 31500: average return = -119.7 cullsteps = 16.4
New best return:  tf.Tensor([-119.70339], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/31500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/31500/assets


step = 32000: loss = 0.1672	step 32000: average return = -219.5 cullsteps = 24.3




step 32500: average return = -97.8 cullsteps = 16.0
New best return:  tf.Tensor([-97.84627], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/32500/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/32500/assets


step = 33000: loss = 0.0645	



step 33000: average return = -54.8 cullsteps = 8.3
New best return:  tf.Tensor([-54.7697], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/33000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/DDPG/policy/33000/assets


step 33500: average return = -111.6 cullsteps = 9.8
step = 34000: loss = 0.2982	step 34000: average return = -115.9 cullsteps = 43.1
step 34500: average return = -210.9 cullsteps = 12.6
step = 35000: loss = 0.3199	step 35000: average return = -254.7 cullsteps = 17.8
step 35500: average return = -88.0 cullsteps = 11.4
step = 36000: loss = 0.1141	step 36000: average return = -302.8 cullsteps = 11.1


In [None]:
learned_reward, culleps = compute_avg_return(eval_env, agent.policy, num_episodes=500)
print ("reward of learned policy: ", learned_reward.numpy(), "cullsteps=", culleps)

In [None]:
init_ts = eval_env.reset()

def get_action(obs):
    """ execute the learned policy network 
       obs:  one float for global time, one float for each herd - the time since last culling
    """
    _ts = TimeStep(tf.constant([0.]),
                   tf.constant([0.]),
                   tf.constant([1]),
                   tf.constant([obs]))
    # a = agent.collect_policy.action(_ts) # just to see how much is explored versus exploited
    a = agent.policy.action(_ts)
    return a.action.numpy().item()

In [None]:
agent.policy

In [None]:
# what the learned policy does on a grid of observations (5 steps per row&col)
'''A = [[get_action([.0, x,y])
 for y in np.arange(0.,1.,.05,np.float32)]
 for x in np.arange(0.,1.,.05,np.float32)]
A'''

### Play with parameters of manually designed q_network policy

In [None]:
# W, b = agent.policy.trainable_variables
# W = W.numpy()
# b = b.numpy()
# print ("weights\n", W, "\nbias", b)

In [None]:
# def nn(obs):
#    y = np.dot(obs, W)+b
#    return y

In [None]:
# nn([0.5,.2])