## RNN DDPG  
This code is taken from the TF-Agents library, with minor alterations.  
Configuration: 5 herds, total population of 1000.  
After training, the best average return is roughly 90.000.  
The best result with a scripted policy is roughly 15.000 (see test_rnn_env.ipynb).

In [1]:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python2, python3
r"""Train and Eval DDPG.

To run:

```bash
tensorboard --logdir $HOME/tmp/ddpg_rnn/dm/CartPole-Balance/ --port 2223 &

python tf_agents/agents/ddpg/examples/v2/train_eval_rnn.py \
  --root_dir=$HOME/tmp/ddpg_rnn/dm/CartPole-Balance/ \
  --num_iterations=100000 \
  --alsologtostderr
```
"""

# `from __future__` imports must be the first statements after the module
# docstring, so they come before any assignment.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Root output directory. train_eval() expands '~' and creates its `train`,
# `eval` and `policy` sub-directories underneath it.
root_dir = '~/Masterarbeit/RNN_DDPG'

import functools
import os
import time
import sys
sys.path.insert(1, '/home/jovyan/Masterarbeit/reinforce-one/Environment')

from absl import app
from absl import logging

import gin
from six.moves import range
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

from tf_agents.agents.ddpg import actor_rnn_network
from tf_agents.agents.ddpg import critic_rnn_network
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.environments import suite_dm_control
from tf_agents.environments import tf_py_environment
from tf_agents.environments import wrappers
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

import numpy 
from tf_agents.environments import utils
from tf_agents.trajectories.time_step import StepType
from tf_agents.trajectories import TimeStep
from tf_agents.policies import scripted_py_policy
from tf_agents.policies import random_py_policy
from tf_agents.policies import policy_saver
from tf_agents.metrics import py_metrics
from tf_agents.drivers import py_driver
from tf_agents.specs import tensor_spec
from tf_agents.networks import sequential
from Env import Env

from RNN_Env_P2 import Env_P2_N
# --- Experiment configuration -------------------------------------------
# NOTE(review): the notebook header mentions 5 herds / 1000 population, while
# the values below are 2 / 300 -- confirm which configuration is intended.

# Not referenced by the code visible in this notebook.
max_episode_length = 1000

# Number of herds; passed to Env and read by compute_avg_return() to locate
# the action components it inspects.
num_herds = 2

# Total population; passed to Env.
total_population = 300

# Presumably the episode length in environment steps -- TODO confirm units.
average_episode_length = 100

In [2]:
# Single Python environment instance used by train_eval(), which wraps it in
# TFPyEnvironment for both data collection and evaluation.
# The episode length now comes from the `average_episode_length` constant
# instead of a duplicated literal, so the two cannot drift apart.
py_env = Env(
    num_herds=num_herds,
    total_population=total_population,
    fix_episode_length=True,
    average_episode_length=average_episode_length,
)

In [3]:
def compute_avg_return(environment, policy, num_episodes=50, verbose=False):
  """Roll out `policy` for several episodes and average the episode return.

  In addition to the return, the function counts "cull steps": environment
  steps in which at least one action component at an index in
  `[num_herds, 2 * num_herds)` of the first batch entry exceeds 0.1.
  `num_herds` is read from module scope.

  Args:
    environment: Environment exposing `reset()` and `step(action)` that return
      time steps with `is_last()`, `reward` and `observation`.
    policy: Policy exposing `get_initial_state(...)` and
      `action(time_step, policy_state)`.
    num_episodes: Number of episodes to run. Must be non-zero, since the
      totals are divided by it.
    verbose: When True, print action, observation and reward for every step.

  Returns:
    A tuple `(avg_return, cullsteps)`: the summed rewards divided by
    `num_episodes` (same type as the accumulated rewards) and the average
    number of cull steps per episode.
  """
  total_return = 0.0
  cullsteps = 0
  for e in range(num_episodes):
    time_step = environment.reset()
    if isinstance(policy, scripted_py_policy.ScriptedPyPolicy):
      # Scripted policies keep their position in the script in the state.
      policy_state = policy.get_initial_state()
    else:
      policy_state = policy.get_initial_state(batch_size=1)

    episode_return = 0.0
    step = 0
    while not time_step.is_last():
      step += 1
      action_step = policy.action(time_step, policy_state)

      # Count the step once if any inspected component exceeds the threshold.
      # A dedicated loop variable is used so that the step counter printed in
      # verbose mode is no longer overwritten by the herd index.
      if any(action_step.action[0][herd] > 0.1
             for herd in range(num_herds, num_herds * 2)):
        cullsteps += 1

      policy_state = action_step.state
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward

      if verbose:
        # The wrapped TF environment offers no get_state(), hence "state= None".
        print (f"episode {e:>2} step{step:>4} action: ", action_step.action,
               "state=", None, "obs=", time_step.observation,
               "reward=", time_step.reward)
    total_return += episode_return

  avg_return = total_return / num_episodes
  cullsteps /= num_episodes
  return avg_return, cullsteps

In [4]:
### @gin.configurable
def train_eval(
    root_dir,
    env_name='cartpole',
    task_name='balance',
    observations_allowlist='position',
    num_iterations=200000,
    actor_fc_layers=(400, 300),
    actor_output_fc_layers=(100,),
    actor_lstm_size=(40,),
    critic_obs_fc_layers=(400,),
    critic_action_fc_layers=None,
    critic_joint_fc_layers=(300,),
    critic_output_fc_layers=(100,),
    critic_lstm_size=(40,),
    # Params for collect
    initial_collect_episodes=1,  #1000(me)
    collect_episodes_per_iteration=1,    #5(me)
    replay_buffer_capacity=10000,
    ou_stddev=0.2,
    ou_damping=0.15,
    # Params for target update
    target_update_tau=0.05,
    target_update_period=5,
    # Params for train
    train_steps_per_iteration=200,    #200
    batch_size=64,
    train_sequence_length=20,    #10
    actor_learning_rate=1e-4,
    critic_learning_rate=1e-3,
    dqda_clipping=None,
    td_errors_loss_fn=None,
    gamma=0.995,    #.995
    reward_scale_factor=1.0,
    gradient_clipping=None,
    use_tf_functions=True,
    # Params for eval
    num_eval_episodes=200,    #10
    eval_interval=1000,    #1000
    # Params for checkpoints, summaries, and logging
    log_interval=1000,
    summary_interval=1000,
    summaries_flush_secs=10,
    debug_summaries=True,
    summarize_grads_and_vars=True,
    eval_metrics_callback=None):
  """Train a recurrent DDPG agent on the module-level `py_env` and evaluate it.

  The function builds actor/critic RNN networks and a `DdpgAgent`, fills a
  uniform replay buffer with the agent's collect policy, and then alternates
  between collecting episodes and running training steps. Every
  `eval_interval` global steps the greedy policy is evaluated; a new best
  average return causes the policy to be exported under
  `<root_dir>/policy/<global_step>`. Training stops early once the average
  return exceeds -300.

  Args:
    root_dir: Output directory ('~' is expanded). Summaries go to `train` and
      `eval`, exported policies to `policy`.
    env_name: Unused; kept for signature compatibility.
    task_name: Unused; kept for signature compatibility.
    observations_allowlist: Unused; kept for signature compatibility.
    num_iterations: Number of collect-then-train iterations.
    actor_fc_layers: `input_fc_layer_params` of the actor RNN network.
    actor_output_fc_layers: `output_fc_layer_params` of the actor RNN network.
    actor_lstm_size: `lstm_size` of the actor RNN network.
    critic_obs_fc_layers: `observation_fc_layer_params` of the critic.
    critic_action_fc_layers: `action_fc_layer_params` of the critic.
    critic_joint_fc_layers: `joint_fc_layer_params` of the critic.
    critic_output_fc_layers: `output_fc_layer_params` of the critic.
    critic_lstm_size: `lstm_size` of the critic.
    initial_collect_episodes: Episodes collected before training starts.
    collect_episodes_per_iteration: Episodes collected in each iteration.
    replay_buffer_capacity: `max_length` of the replay buffer.
    ou_stddev: Forwarded to `DdpgAgent`.
    ou_damping: Forwarded to `DdpgAgent`.
    target_update_tau: Forwarded to `DdpgAgent`.
    target_update_period: Forwarded to `DdpgAgent`.
    train_steps_per_iteration: Training steps run after each collection.
    batch_size: Number of sequences sampled per training step.
    train_sequence_length: Sampled sequences have this length plus one.
    actor_learning_rate: Learning rate of the actor's Adam optimizer.
    critic_learning_rate: Learning rate of the critic's Adam optimizer.
    dqda_clipping: Forwarded to `DdpgAgent`.
    td_errors_loss_fn: Forwarded to `DdpgAgent`.
    gamma: Forwarded to `DdpgAgent`.
    reward_scale_factor: Forwarded to `DdpgAgent`.
    gradient_clipping: Forwarded to `DdpgAgent`.
    use_tf_functions: Wrap driver runs and training in `common.function`.
    num_eval_episodes: Episodes per `metric_utils.eager_compute` evaluation.
    eval_interval: Evaluate when the global step is a multiple of this value.
    log_interval: Log loss and throughput when the global step is a multiple
      of this value.
    summary_interval: Record summaries when the global step is a multiple of
      this value.
    summaries_flush_secs: Flush period of both summary writers, in seconds.
    debug_summaries: Forwarded to `DdpgAgent`.
    summarize_grads_and_vars: Forwarded to `DdpgAgent`.
    eval_metrics_callback: Optional callable invoked as
      `callback(results, global_step_value)` after each evaluation.

  Returns:
    The value returned by the last training step, or None if no iteration
    was executed (`num_iterations == 0`).
  """

  best_return = -10000
  # Initialised up front so that the final `return` is well defined even when
  # the training loop body never runs.
  train_loss = None

  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')
  policy_dir = os.path.join(root_dir, 'policy')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
  ]

  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(
      lambda: tf.math.equal(global_step % summary_interval, 0)):
    # NOTE(review): both wrappers share the single module-level `py_env`
    # instance, so evaluation episodes advance the same underlying
    # environment that is used for collection. Consider a separate
    # environment instance for evaluation.
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    eval_tf_env = tf_py_environment.TFPyEnvironment(py_env)

    actor_net = actor_rnn_network.ActorRnnNetwork(
        tf_env.time_step_spec().observation,
        tf_env.action_spec(),
        input_fc_layer_params=actor_fc_layers,
        lstm_size=actor_lstm_size,
        output_fc_layer_params=actor_output_fc_layers)

    critic_net_input_specs = (tf_env.time_step_spec().observation,
                              tf_env.action_spec())

    critic_net = critic_rnn_network.CriticRnnNetwork(
        critic_net_input_specs,
        observation_fc_layer_params=critic_obs_fc_layers,
        action_fc_layer_params=critic_action_fc_layers,
        joint_fc_layer_params=critic_joint_fc_layers,
        lstm_size=critic_lstm_size,
        output_fc_layer_params=critic_output_fc_layers,
    )

    tf_agent = ddpg_agent.DdpgAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=actor_learning_rate),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=critic_learning_rate),
        ou_stddev=ou_stddev,
        ou_damping=ou_damping,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        dqda_clipping=dqda_clipping,
        td_errors_loss_fn=td_errors_loss_fn,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)
    tf_agent.initialize()

    # The first two metrics double as step metrics for the summaries below.
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    saver = policy_saver.PolicySaver(eval_policy)

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    initial_collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_episodes=initial_collect_episodes)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_episodes=collect_episodes_per_iteration)

    if use_tf_functions:
      initial_collect_driver.run = common.function(initial_collect_driver.run)
      collect_driver.run = common.function(collect_driver.run)
      tf_agent.train = common.function(tf_agent.train)

    # Collect initial replay data. Despite the wording of the log message,
    # the driver above uses the agent's collect policy.
    logging.info(
        'Initializing replay buffer by collecting experience for %d episodes '
        'with a random policy.', initial_collect_episodes)
    initial_collect_driver.run()

    # Baseline evaluation before any training step.
    results = metric_utils.eager_compute(
        eval_metrics,
        eval_tf_env,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )
    if eval_metrics_callback is not None:
      eval_metrics_callback(results, global_step.numpy())
    metric_utils.log_metrics(eval_metrics)

    time_step = None
    policy_state = collect_policy.get_initial_state(tf_env.batch_size)

    timed_at_step = global_step.numpy()
    time_acc = 0

    # Dataset generates trajectories with shape [BxTx...]
    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=train_sequence_length + 1).prefetch(3)
    iterator = iter(dataset)

    def train_step():
      experience, _ = next(iterator)
      return tf_agent.train(experience)

    if use_tf_functions:
      train_step = common.function(train_step)

    for _ in range(num_iterations):
      start_time = time.time()
      time_step, policy_state = collect_driver.run(
          time_step=time_step,
          policy_state=policy_state,
      )
      for _ in range(train_steps_per_iteration):
        train_loss = train_step()
      time_acc += time.time() - start_time

      # The global step only changes inside train_step(), so one read per
      # iteration is enough for all checks below.
      current_step = global_step.numpy()

      if current_step % log_interval == 0:
        logging.info('step = %d, loss = %f', current_step, train_loss.loss)
        steps_per_sec = (current_step - timed_at_step) / time_acc
        logging.info('%.3f steps/sec', steps_per_sec)
        tf.compat.v2.summary.scalar(
            name='global_steps_per_sec', data=steps_per_sec, step=global_step)
        timed_at_step = current_step
        time_acc = 0

      for train_metric in train_metrics:
        train_metric.tf_summaries(
            train_step=global_step, step_metrics=train_metrics[:2])

      if current_step % eval_interval == 0:
        results = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        if eval_metrics_callback is not None:
          eval_metrics_callback(results, current_step)
        metric_utils.log_metrics(eval_metrics)

        avg_return, cullsteps = compute_avg_return(
            eval_tf_env, eval_policy, num_episodes=100, verbose=False)
        print('step {0}: average return = {1:.1f} cullsteps = {2:.1f}'.format(
            current_step, avg_return.numpy().item(), cullsteps))

        if avg_return > best_return:
          # Export every improvement; an average return above -300 is treated
          # as the final goal and ends training.
          best_return = avg_return
          reached_target = bool(avg_return > -300)
          if reached_target:
            print('Final best return: ', best_return)
          else:
            print('New best return: ', best_return)
          saver.save(os.path.join(policy_dir, str(current_step)))
          if reached_target:
            break
        elif (70000 <= current_step <= 80000
              or 100000 <= current_step <= 130000):
          # Within these step windows the bar is reset whenever an evaluation
          # does not improve on it -- presumably so that policies keep being
          # exported later in training; confirm intent.
          best_return = -14000

    return train_loss

In [None]:
# Start training with the default hyper-parameters; summaries and exported
# policies are written below `root_dir`.
train_eval(root_dir=root_dir)

Instructions for updating:
Use `tf.data.Dataset.scan(...) instead


Instructions for updating:
Use `tf.data.Dataset.scan(...) instead


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.






step 1000: average return = -44524.4 cullsteps = 0.0
step 2000: average return = -43560.8 cullsteps = 5.0
step 3000: average return = -43654.4 cullsteps = 2.0
step 4000: average return = -45961.1 cullsteps = 0.0
step 5000: average return = -42653.5 cullsteps = 2.0
step 6000: average return = -39595.1 cullsteps = 3.0
step 7000: average return = -40563.2 cullsteps = 3.0
step 8000: average return = -39789.9 cullsteps = 4.0
step 9000: average return = -39123.3 cullsteps = 5.0
step 10000: average return = -42153.0 cullsteps = 5.0
step 11000: average return = -42511.1 cullsteps = 5.0
step 12000: average return = -17100.2 cullsteps = 54.3
step 13000: average return = -12619.6 cullsteps = 63.2
step 14000: average return = -11497.6 cullsteps = 50.5
step 15000: average return = -19375.7 cullsteps = 37.1
step 16000: average return = -24350.3 cullsteps = 35.1
step 17000: average return = -11892.5 cullsteps = 41.7
step 18000: average return = -11707.7 cullsteps = 45.0
step 19000: average return = -



step 25000: average return = -9730.4 cullsteps = 36.3
New best return:  tf.Tensor([-9730.371], shape=(1,), dtype=float32)




INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/25000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/25000/assets


step 26000: average return = -10422.7 cullsteps = 38.9
step 27000: average return = -10772.4 cullsteps = 43.6
step 28000: average return = -10930.4 cullsteps = 43.0
step 29000: average return = -10971.6 cullsteps = 45.1
step 30000: average return = -11452.3 cullsteps = 40.2
step 31000: average return = -11458.1 cullsteps = 38.4
step 32000: average return = -9968.4 cullsteps = 32.4




step 33000: average return = -9532.5 cullsteps = 31.5
New best return:  tf.Tensor([-9532.468], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/33000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/33000/assets


step 34000: average return = -9345.0 cullsteps = 36.1
New best return:  tf.Tensor([-9345.019], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/34000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/34000/assets


step 35000: average return = -9407.1 cullsteps = 36.7




step 36000: average return = -9247.0 cullsteps = 28.0
New best return:  tf.Tensor([-9247.004], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/36000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/36000/assets


step 37000: average return = -9636.6 cullsteps = 27.6
step 38000: average return = -9821.8 cullsteps = 27.6
step 39000: average return = -9921.7 cullsteps = 23.4
step 40000: average return = -9971.1 cullsteps = 27.7
step 41000: average return = -10033.6 cullsteps = 22.0
step 42000: average return = -10740.6 cullsteps = 36.4
step 43000: average return = -9843.5 cullsteps = 28.4
step 44000: average return = -10252.1 cullsteps = 31.5
step 45000: average return = -10277.2 cullsteps = 34.4
step 46000: average return = -12990.0 cullsteps = 58.9
step 47000: average return = -17788.9 cullsteps = 76.9
step 48000: average return = -12431.2 cullsteps = 72.6
step 49000: average return = -11726.0 cullsteps = 45.0
step 50000: average return = -13124.2 cullsteps = 29.9
step 51000: average return = -11075.9 cullsteps = 29.5
step 52000: average return = -11434.1 cullsteps = 39.1
step 53000: average return = -11040.4 cullsteps = 43.3
step 54000: average return = -12739.3 cullsteps = 43.8
step 55000: ave



step 72000: average return = -13817.8 cullsteps = 37.1
New best return:  tf.Tensor([-13817.782], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/72000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/72000/assets


step 73000: average return = -11134.8 cullsteps = 31.8
New best return:  tf.Tensor([-11134.838], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/73000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/73000/assets


step 74000: average return = -10994.7 cullsteps = 37.6
New best return:  tf.Tensor([-10994.746], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/74000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/74000/assets


step 75000: average return = -12385.7 cullsteps = 39.9




step 76000: average return = -13795.1 cullsteps = 80.7
New best return:  tf.Tensor([-13795.105], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/76000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/76000/assets


step 77000: average return = -11880.7 cullsteps = 53.0
New best return:  tf.Tensor([-11880.695], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/77000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/77000/assets


step 78000: average return = -11252.1 cullsteps = 48.2
New best return:  tf.Tensor([-11252.104], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/78000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/78000/assets


step 79000: average return = -10421.8 cullsteps = 41.7
New best return:  tf.Tensor([-10421.787], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/79000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/79000/assets


step 80000: average return = -10783.0 cullsteps = 40.8




step 81000: average return = -10998.6 cullsteps = 38.5
New best return:  tf.Tensor([-10998.588], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/81000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/81000/assets


step 82000: average return = -10581.7 cullsteps = 49.6
New best return:  tf.Tensor([-10581.74], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/82000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/82000/assets


step 83000: average return = -10003.2 cullsteps = 46.6
New best return:  tf.Tensor([-10003.169], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/83000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/83000/assets


step 84000: average return = -10087.1 cullsteps = 46.6
step 85000: average return = -10433.4 cullsteps = 45.1




step 86000: average return = -9981.4 cullsteps = 43.7
New best return:  tf.Tensor([-9981.443], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/86000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/86000/assets


step 87000: average return = -9561.0 cullsteps = 40.9
New best return:  tf.Tensor([-9561.001], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/87000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/87000/assets


step 88000: average return = -8774.0 cullsteps = 29.8
New best return:  tf.Tensor([-8774.021], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/88000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/88000/assets


step 89000: average return = -9412.2 cullsteps = 32.3
step 90000: average return = -10537.5 cullsteps = 38.7
step 91000: average return = -10448.9 cullsteps = 46.1
step 92000: average return = -9928.0 cullsteps = 30.5
step 93000: average return = -11310.0 cullsteps = 27.8
step 94000: average return = -11389.8 cullsteps = 27.1
step 95000: average return = -12508.6 cullsteps = 32.9
step 96000: average return = -10804.0 cullsteps = 34.1




step 97000: average return = -8773.1 cullsteps = 38.9
New best return:  tf.Tensor([-8773.056], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/97000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/97000/assets


step 98000: average return = -8873.8 cullsteps = 50.5
step 99000: average return = -8987.5 cullsteps = 44.7
step 100000: average return = -9359.6 cullsteps = 26.0




step 101000: average return = -9575.5 cullsteps = 26.9
New best return:  tf.Tensor([-9575.45], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/101000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/101000/assets


step 102000: average return = -10008.9 cullsteps = 32.1




step 103000: average return = -10146.2 cullsteps = 34.8
New best return:  tf.Tensor([-10146.154], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/103000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/103000/assets


step 104000: average return = -9941.8 cullsteps = 30.9
New best return:  tf.Tensor([-9941.825], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/104000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/104000/assets


step 105000: average return = -10254.4 cullsteps = 36.0




step 106000: average return = -12494.0 cullsteps = 33.6
New best return:  tf.Tensor([-12493.954], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/106000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/106000/assets


step 107000: average return = -12802.9 cullsteps = 32.2




step 108000: average return = -13743.2 cullsteps = 32.1
New best return:  tf.Tensor([-13743.2], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/108000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/108000/assets


step 109000: average return = -11646.0 cullsteps = 38.1
New best return:  tf.Tensor([-11645.972], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/109000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/109000/assets


step 110000: average return = -9772.2 cullsteps = 38.8
New best return:  tf.Tensor([-9772.229], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/110000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/110000/assets


step 111000: average return = -10881.9 cullsteps = 40.7




step 112000: average return = -10799.9 cullsteps = 43.1
New best return:  tf.Tensor([-10799.867], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/112000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/112000/assets


step 113000: average return = -10392.4 cullsteps = 47.7
New best return:  tf.Tensor([-10392.381], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/113000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/113000/assets


step 114000: average return = -10910.6 cullsteps = 40.4




step 115000: average return = -12151.3 cullsteps = 33.4
New best return:  tf.Tensor([-12151.295], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/115000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/115000/assets


step 116000: average return = -10050.0 cullsteps = 42.9
New best return:  tf.Tensor([-10049.988], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/116000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/116000/assets


step 117000: average return = -11251.7 cullsteps = 43.7
step 118000: average return = -14042.2 cullsteps = 31.7
step 119000: average return = -21683.3 cullsteps = 18.5
step 120000: average return = -14036.3 cullsteps = 29.1




step 121000: average return = -10782.7 cullsteps = 30.9
New best return:  tf.Tensor([-10782.707], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/121000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/121000/assets


step 122000: average return = -31650.7 cullsteps = 8.1
step 123000: average return = -28146.9 cullsteps = 9.5
step 124000: average return = -29288.8 cullsteps = 8.5
step 125000: average return = -19581.2 cullsteps = 19.0




step 126000: average return = -13790.0 cullsteps = 32.7
New best return:  tf.Tensor([-13789.977], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/126000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/126000/assets


step 127000: average return = -15983.4 cullsteps = 37.1




step 128000: average return = -12245.8 cullsteps = 45.8
New best return:  tf.Tensor([-12245.77], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/128000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/128000/assets


step 129000: average return = -11707.8 cullsteps = 36.3
New best return:  tf.Tensor([-11707.8], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/129000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/129000/assets


step 130000: average return = -10075.0 cullsteps = 36.0
New best return:  tf.Tensor([-10074.97], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/130000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/130000/assets


step 131000: average return = -13559.0 cullsteps = 26.1
step 132000: average return = -10498.5 cullsteps = 29.1
step 133000: average return = -10619.6 cullsteps = 31.0
step 134000: average return = -10961.7 cullsteps = 32.3
step 135000: average return = -11540.0 cullsteps = 33.5
step 136000: average return = -11949.0 cullsteps = 37.5
step 137000: average return = -11484.5 cullsteps = 34.1
step 138000: average return = -11670.1 cullsteps = 44.7
step 139000: average return = -12130.0 cullsteps = 51.4
step 140000: average return = -10426.5 cullsteps = 39.6
step 141000: average return = -11765.1 cullsteps = 43.9
step 142000: average return = -11463.5 cullsteps = 40.1
step 143000: average return = -12219.2 cullsteps = 49.1
step 144000: average return = -12452.3 cullsteps = 71.4
step 145000: average return = -11347.2 cullsteps = 55.5
step 146000: average return = -10514.5 cullsteps = 37.8
step 147000: average return = -12005.6 cullsteps = 42.3
step 148000: average return = -15772.0 cullsteps



step 154000: average return = -9930.9 cullsteps = 41.7
New best return:  tf.Tensor([-9930.911], shape=(1,), dtype=float32)
INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/154000/assets


INFO:tensorflow:Assets written to: /home/jovyan/Masterarbeit/RNN_DDPG/policy/154000/assets


step 155000: average return = -12772.0 cullsteps = 35.0
step 156000: average return = -14872.1 cullsteps = 35.9
step 157000: average return = -13917.1 cullsteps = 38.0
step 158000: average return = -13480.2 cullsteps = 40.6
step 159000: average return = -13495.2 cullsteps = 35.4
step 160000: average return = -11860.3 cullsteps = 33.9
step 161000: average return = -12329.1 cullsteps = 35.0
step 162000: average return = -11352.7 cullsteps = 37.7
step 163000: average return = -11531.0 cullsteps = 32.2
step 164000: average return = -11497.5 cullsteps = 55.0
step 165000: average return = -11132.7 cullsteps = 51.9
step 166000: average return = -10838.2 cullsteps = 49.4
step 167000: average return = -11499.6 cullsteps = 32.0
step 168000: average return = -10048.9 cullsteps = 34.2
step 169000: average return = -10114.7 cullsteps = 41.5
step 170000: average return = -10842.1 cullsteps = 44.7
step 171000: average return = -11687.6 cullsteps = 18.1
step 172000: average return = -12917.6 cullsteps