## Env Development  
This notebook contains tests used while developing the environment.

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import os
import time

from absl import app
from absl import logging

import gin
from six.moves import range
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

from tf_agents.agents.ddpg import actor_rnn_network
from tf_agents.agents.ddpg import critic_rnn_network
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.environments import suite_dm_control
from tf_agents.environments import tf_py_environment
from tf_agents.environments import py_environment
from tf_agents.environments import wrappers
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

import numpy 
from tf_agents.environments import utils
from tf_agents.trajectories.time_step import StepType
from tf_agents.trajectories import TimeStep
from tf_agents.policies import scripted_py_policy
from tf_agents.policies import random_py_policy
from tf_agents.policies import policy_saver
from tf_agents.metrics import py_metrics
from tf_agents.drivers import py_driver
from tf_agents.specs import tensor_spec
from tf_agents.networks import sequential

from Env import Env
# Notebook-wide settings shared by the cells below.
max_episode_length = 1000  # sizes the scripted action scripts
num_herds = 2              # passed to Env; also read by test_rnn_env
total_population = 300     # passed to Env

In [2]:
# Build the environment under test from the notebook-level settings.
py_env = Env(
    num_herds=num_herds,
    total_population=total_population,
)

Define two scripted policies:

In [3]:
def _make_scripted_policy(idle_steps):
    """Build an action script and the ScriptedPyPolicy that replays it.

    The script alternates between `idle_steps` repetitions of the all-zero
    action and a single step of `[0, 0, 1, 1]`.  Each script entry is a
    `(repeat_count, action)` pair, the format ScriptedPyPolicy expects.

    Args:
        idle_steps: How many times the all-zero action is repeated before
            each `[0, 0, 1, 1]` step.

    Returns:
        A `(action_script, policy)` tuple.
    """
    cycle = [
        (idle_steps, [0, 0, 0, 0]),
        (1, [0, 0, 1, 1]),
        (idle_steps, [0, 0, 0, 0]),
        (1, [0, 0, 1, 1]),
    ]
    # NOTE(review): the cycle is repeated 1 + max_episode_length times,
    # presumably so the script cannot run out mid-episode -- confirm against
    # the episode length enforced by Env.
    script = cycle * int(1 + max_episode_length)
    policy = scripted_py_policy.ScriptedPyPolicy(
        time_step_spec=py_env.time_step_spec(),
        action_spec=py_env.action_spec(),
        action_script=script,
    )
    return script, policy


# Policy 1 idles for 10 steps between [0, 0, 1, 1] actions; policy 2 for 7.
action_script1, scr_pol_1 = _make_scripted_policy(10)
action_script2, scr_pol_2 = _make_scripted_policy(7)

And create a random policy:

In [4]:
# Random baseline built from the environment's own time-step and action specs.
random_policy = random_py_policy.RandomPyPolicy(
    time_step_spec=py_env.time_step_spec(),
    action_spec=py_env.action_spec(),
)

Now write a function that tests an environment with any policy.  
It returns the average return over a set number of episodes and the average number of steps per episode in which the agent culled one or more herds.

In [5]:
def test_rnn_env(environment, policy, num_episodes=50):
    """Roll out `policy` in `environment` and report per-episode averages.

    Every episode starts with `environment.reset()` and runs until the
    current time step reports `is_last()`.  A step is counted as a "cull
    step" when at least one action entry at an index in
    `range(num_herds, num_herds * 2)` is greater than zero.

    NOTE: `num_herds` is read from the notebook's global scope, not from
    `environment`, so it must be defined before this function is called.

    Args:
        environment: Environment to evaluate.  Only instances of
            `py_environment.PyEnvironment` are handled.
        policy: Python policy exposing `get_initial_state` and `action`.
        num_episodes: Number of episodes to average over.

    Returns:
        A tuple `(avg_return, cullsteps)` holding the summed reward per
        episode and the number of cull steps per episode, both averaged over
        `num_episodes`; or `None` when `environment` is not a
        `py_environment.PyEnvironment`.
    """
    if not isinstance(environment, py_environment.PyEnvironment):
        # Other environment types are not handled here; the None result is
        # kept for backward compatibility.
        return None

    # Action entries inspected when deciding whether a step culled a herd.
    cull_indices = range(num_herds, num_herds * 2)

    total_return = 0.0
    cullsteps = 0
    for _ in range(num_episodes):
        time_step = environment.reset()
        if isinstance(policy, scripted_py_policy.ScriptedPyPolicy):
            # Scripted policies keep their position in the script in the
            # policy state.
            policy_state = policy.get_initial_state()
        else:
            policy_state = policy.get_initial_state(batch_size=1)

        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step, policy_state)
            # Count the step at most once, however many entries are positive.
            if any(action_step.action[idx] > 0 for idx in cull_indices):
                cullsteps += 1
            policy_state = action_step.state
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward

        total_return += episode_return

    avg_return = total_return / num_episodes
    cullsteps /= num_episodes
    return avg_return, cullsteps

In [6]:
# Score the first scripted policy over 200 episodes and print both averages.
avg_return, culls = test_rnn_env(
    environment=py_env, policy=scr_pol_1, num_episodes=200)
print('average return = {0} cullsteps = {1}'.format(avg_return, culls))

average return = -31565.545988415186 cullsteps = 24.03


In [7]:
# Score the second scripted policy over 200 episodes and print both averages.
avg_return, culls = test_rnn_env(
    environment=py_env, policy=scr_pol_2, num_episodes=200)
print('average return = {0} cullsteps = {1}'.format(avg_return, culls))

average return = -32381.563486573006 cullsteps = 33.78


In [8]:
# Score the random policy; this run uses 2000 episodes rather than 200.
avg_return, culls = test_rnn_env(
    environment=py_env, policy=random_policy, num_episodes=2000)
print('average return = {0} cullsteps = {1}'.format(avg_return, culls))

average return = -107995.97262813791 cullsteps = 271.095
