# *** Unified PCL Trainer for pendulum ***

In [875]:
import gym_wrapper
import env_spec
import objective_PCL
import policy
import baseline

import tensorflow as tf
import numpy as np

In [876]:
import imp
imp.reload(policy)
imp.reload(baseline)
imp.reload(env_spec)
imp.reload(gym_wrapper)

<module 'gym_wrapper' from '/home/adrian/PycharmProjects/PCL/gym_wrapper.py'>

At first setup the environment

In [877]:
env_str = 'Pendulum-v0'
#env_str = 'HalfCheetah-v1'
env_gym = gym_wrapper.GymWrapper(env_str,
                             distinct= 1,
                             count= 1)

env_spec_gym =  env_spec.EnvSpec(env_gym.get_one())

Setup required algorithm parameters

In [878]:
LOG_DIR = '/home/adrian/Schreibtisch/Uni/Data-Innovation-Lab/tensorflowlogs'

batch_size_episodes = 20
replay_batch_size_episodes = None #--> replay batch size; defaults to batch_size

max_steps_episodes = 100
total_number_of_steps = 1000
critic_weight = 1.0
policy_weight = 1.0
learning_rate = 0.01
# Clip_adv (Advantage) is required in objective class
clip_adv = 0.0 # --> Clip advantages at this value --> Leave as 0 to not clip at all
# Clip norm is required in objective class to clip the gradients
clip_norm = 5.0
# Entropy regularization paramter tau
tau = 0.1 # --> If using decaying tau, this is the final value
tau_decy = None # --> decay tau by this much every 100 steps
tau_start = 0.1 # --> start tau at this value
gamma = 1.0 # --> Discount factor
rollout = 10 # --> Rollout length for PCL objective
# If we use unified episodes we need to ensure our batch_size_episodes is 1
unify_episodes = False #--> Make sure replay buffer holds entire episodes, even across distinct sampling steps
batch_by_steps = False #--> ensure each training batch has batch_size * max_step steps'

# Neural network settings now
input_prev_actions = True #--> Required for unified PCL since Q(a.s) is modeled
recurrent = True #--> Indicate that we are going to use a recurrent policy (Q function approximator)
input_time_step = False #--> Indicator if the current time step scould also be an input to the model
internal_dim = 254 #--> Internal RNN dimension 
fixed_std = False #--> fix the std in Gaussian distributions
# If fixed we obtain the following std
# log_std = tf.get_variable('std%d' % i, [1, sampling_dim // 2])

# Settings related to value function if considered seperatly
value_hidden_layers = 0 #--> number of hidden layers in value estimate

# Settings related to the replay buffer
replay_buffer_size = 5000 
replay_buffer_alpha = 0.5 #--> replay buffer alpha param
replay_buffer_freq = 0 #--> replay buffer frequency (only supports -1/0/1)'
eviction = 'rand' #--> How to evict from replay buffer: rand/rank/fifo
prioritize_by = 'rewards' #--> Prioritize replay buffer by "rewards" or "step"

# The following are only required for Trust-PCL
eps_lambda = 0.0 #--> start tau at this value
update_eps_lambda = False #--> Update lambda automatically based on last 100 episodes

Setup PCL-Objective object

Setup Recurrent Policy - In the unified approach the output is also used as foundation to compute the values
* Try recurrent

Determine unified baseline:
* In case of a seperate network one needs to clarify the function of the "input_policy_state" paramter
* If false the "input" looks like the following "obs, action, time_step"
* If true the "input" looks like "internal_policy_states, time_step"

In [879]:
tf.reset_default_graph()
sess = tf.Session()

objective_pendulum = objective_PCL.Objective(learning_rate = learning_rate,
                                     clip_norm = clip_norm,
                                     policy_weight = policy_weight,
                                     critic_weight = critic_weight,
                                     tau = tau, 
                                     gamma = gamma,
                                     rollout = rollout,
                                     eps_lambda = eps_lambda,
                                     clip_adv = clip_adv)

policy_unified_pendulum = policy.Policy(env_spec_gym,
                                       internal_dim,
                                       recurrent = recurrent,
                                       input_prev_actions = input_prev_actions)


baseline_unified_pendulum = baseline.UnifiedBaseline(env_spec_gym,
                                                    internal_dim,
                                                    input_prev_actions=input_prev_actions,
                                                    input_time_step=input_time_step,
                                                    input_policy_state=recurrent,  # may want to change this
                                                    n_hidden_layers=value_hidden_layers,
                                                    hidden_dim=internal_dim,
                                                    tau=tau,
                                                    eps_lambda = eps_lambda)

In the unified case we do not need a value function optimizer "get_value_opt"

## Now it es time to setup the tensorflow graph

### Start with setup of placeholder

In [880]:
# summary placeholder

avg_episode_reward = tf.placeholder(
        tf.float32, [], 'avg_episode_reward')

# sampling placeholders

internal_state = tf.placeholder(tf.float32,
                                    [None, policy_unified_pendulum.rnn_state_dim],
                                    'internal_state')

# One episode of observations (Time_Steps, Observation dimension)
single_observation = []
for i, (obs_dim, obs_type) in enumerate(zip(env_spec_gym.obs_dims, env_spec_gym.obs_types)):
    if env_spec_gym.is_discrete(obs_type):
        single_observation.append(
            tf.placeholder(tf.int32, [None], 'obs%d' % i))
    elif env_spec_gym.is_box(obs_type):
        single_observation.append(
            tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
    else:
        assert False
        
# One episode of actions (Time_steps, action dimension)        
single_action = []
for i, (action_dim, action_type) in enumerate(zip(env_spec_gym.act_dims, env_spec_gym.act_types)):
    if env_spec_gym.is_discrete(action_type):
        single_action.append(
            tf.placeholder(tf.int32, [None], 'act%d' % i))
    elif env_spec_gym.is_box(action_type):
        single_action.append(
            tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
    else:
        assert False
        
# training placeholders

# Observations batch size many episodes of time length [batch size, time length, observation dim]
observations = []
for i, (obs_dim, obs_type) in enumerate(zip(env_spec_gym.obs_dims, env_spec_gym.obs_types)):
    if env_spec_gym.is_discrete(obs_type):
        observations.append(
            tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
    else:
        observations.append(
            tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))
        
# Actions batch size many episodes of time length [batch size, time length, action dim]        
actions = []
for i, (action_dim, action_type) in enumerate(zip(env_spec_gym.act_dims, env_spec_gym.act_types)):
    if env_spec_gym.is_discrete(action_type):
        actions.append(
            tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
    if env_spec_gym.is_box(action_type):
        actions.append(
            tf.placeholder(tf.float32, [None, None, action_dim],
                       'all_act%d' % i))
        
# Rewards of Batch Size many episodes of time length [batch size, time length]
rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
# Indicator if episode has terminated 
terminated = tf.placeholder(tf.float32, [None], 'terminated')
# Batch Size many episodes of time length indicators if episode has ended
pads = tf.placeholder(tf.float32, [None, None], 'pads')

### Setup computation graph

In [881]:
tf.summary.scalar('avg_episode_reward', avg_episode_reward)

with tf.variable_scope('model', reuse=None):
    # policy network
    with tf.variable_scope('policy_net'):
        # ,entropies, self_kls)
        (policy_internal_states, logits, log_probs) = \
                policy_unified_pendulum.multi_step(observations,
                                                internal_state,
                                                actions)
        out_log_probs = sum(log_probs)

"""
       # value network
    with tf.variable_scope('value_net'):
        (values,
         regression_input,
         regression_weight) = baseline_unified_pendulum.get_values(
            observations, actions,
            policy_internal_states, logits)

    
      # evaluate objective
    (loss, raw_loss, regression_target,
     gradient_ops, summary) = objective_pendulum.get(
      rewards, pads,
      values[:-1, :],
      values[-1, :] * (1 - terminated),
      log_probs)
"""

    
tf.summary.FileWriter("/home/adrian/Schreibtisch/Uni/Data-Innovation-Lab/tensorflowlogs", sess.graph).close()





In [882]:
 # we re-use variables for the sampling operations
with tf.variable_scope('model', reuse=True):
    scope = 'policy_net'
    with tf.variable_scope(scope):
        next_internal_state, sampled_actions = \
            policy_unified_pendulum.sample_step(single_observation,
                                internal_state,
                                single_action)
        greedy_next_internal_state, greedy_sampled_actions = \
            policy_unified_pendulum.sample_step(single_observation,
                               internal_state,
                               single_action,
                                greedy=True)









In [883]:
sess.run(tf.initialize_all_variables())

## Sample Batch

Original sample episodes code from the paper code 'pcl_rl'

In [884]:
def initial_internal_state():
    return np.zeros(policy_unified_pendulum.rnn_state_dim)

In [885]:
last_obs = env_spec_gym.initial_obs(1)
last_act = env_spec_gym.initial_act(len(env_gym))
last_pad = np.zeros(len(env_gym))

start_episode = np.array([True] * len(env_gym))
step_count = np.array([0] * len(env_gym))
episode_running_rewards = np.zeros(len(env_gym))
episode_running_lengths = np.zeros(len(env_gym))
episode_rewards = []
episode_lengths = []
total_rewards = []

cutoff_agent = 1
max_step = 1000

unify_episodes = False

all_obs_global = []
all_act_global = []
all_pad_global = []
rewards_global = []

internal_state_global = np.array([initial_internal_state()] * len(env_gym))

start_id_global = 0

In [886]:
(initial_state_, observations_, actions_, rewards_, terminated_, pads_) = sample_episodes_pcl(sess)

In [887]:
print(np.shape(initial_state_))
print(np.shape(observations_))
print(np.shape(actions_))
print(np.shape(rewards_))
print(np.shape(terminated_))
print(np.shape(pads_))

(5, 254)
(1, 201, 5, 3)
(1, 201, 5, 1)
(200, 5)
(5,)
(200, 5)


In [891]:
batch_size = np.shape(initial_state_)[0]
initial_actions = [act[0] for act in actions_]
all_actions = [tf.concat([act[1:], act[0:1]], 0)
                   for act in actions_]

Given those observations we can feed them now to the model to obtain the logits

In [888]:
outputs = [policy_internal_states, logits, log_probs]

feed_dict = {internal_state: internal_state_}
for action_place, action in zip(actions, actions_):
    feed_dict[action_place] = action
for obs_place, obs in zip(observations, observations_):
    feed_dict[obs_place] = obs

internals, logits, log_probs = sess.run(outputs, feed_dict=feed_dict)

InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [5,127] vs. shape[1] = [1,127]
	 [[Node: model/policy_net/scan/while/policy_net/output_projection_wrapper/output_projection_wrapper/lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](model/policy_net/scan/while/add_1, model/policy_net/scan/while/policy_net/output_projection_wrapper/output_projection_wrapper/lstm_cell/Slice_1, model/policy_net/scan/while/concat/axis)]]

Caused by op 'model/policy_net/scan/while/policy_net/output_projection_wrapper/output_projection_wrapper/lstm_cell/concat', defined at:
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-881-63b9fa6ec5ff>", line 9, in <module>
    actions)
  File "/home/adrian/PycharmProjects/PCL/policy.py", line 307, in multi_step
    batch_size, initial_state, initial_actions))
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/functional_ops.py", line 584, in scan
    back_prop=back_prop, swap_memory=swap_memory)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2816, in while_loop
    result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2640, in BuildLoop
    pred, body, original_loop_vars, loop_vars, shape_invariants)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2590, in _BuildLoop
    body_result = body(*packed_vars_for_body)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/functional_ops.py", line 574, in compute
    a_out = fn(packed_a, packed_elems)
  File "/home/adrian/PycharmProjects/PCL/policy.py", line 275, in single_step
    obs, prev_internal_state, prev_actions)
  File "/home/adrian/PycharmProjects/PCL/policy.py", line 104, in core
    output, next_state = cell(cell_input, prev_internal_state)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/rnn_cell_impl.py", line 183, in __call__
    return super(RNNCell, self).__call__(inputs, state)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/layers/base.py", line 575, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py", line 230, in call
    output, res_state = self._cell(inputs, state)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/rnn_cell_impl.py", line 183, in __call__
    return super(RNNCell, self).__call__(inputs, state)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/layers/base.py", line 575, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/rnn_cell_impl.py", line 611, in call
    lstm_matrix = self._linear1([inputs, m_prev])
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/rnn_cell_impl.py", line 1189, in __call__
    res = math_ops.matmul(array_ops.concat(args, 1), self._weights)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1099, in concat
    return gen_array_ops._concat_v2(values=values, axis=axis, name=name)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 706, in _concat_v2
    "ConcatV2", values=values, axis=axis, name=name)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/adrian/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): ConcatOp : Dimensions of inputs should match: shape[0] = [5,127] vs. shape[1] = [1,127]
	 [[Node: model/policy_net/scan/while/policy_net/output_projection_wrapper/output_projection_wrapper/lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](model/policy_net/scan/while/add_1, model/policy_net/scan/while/policy_net/output_projection_wrapper/output_projection_wrapper/lstm_cell/Slice_1, model/policy_net/scan/while/concat/axis)]]


In [861]:
def convert_from_batched_episodes_pcl(initial_state_, observations_, actions_, rewards_, terminated_, pads_):
    """Convert time-major batch of episodes to batch-major list of episodes."""

    rewards_ = np.array(rewards_)
    pads_ = np.array(pads_)
    observations_ = [np.array(obs) for obs in observations_]
    actions_ = [np.array(act) for act in actions_]

    total_rewards_ = np.sum(rewards_ * (1 - pads_), axis=0)
    total_length_ = np.sum(1 - pads_, axis=0).astype('int32')

    episodes_ = []
    num_episodes_ = rewards_.shape[1]
    for i in range(num_episodes_):
        length = total_length_[i]
        ep_initial = initial_state_[i]
        ep_obs = [obs[:length, i, ...] for obs in observations_]
        ep_act = [act[:length + 1, i, ...] for act in actions_]
        ep_rewards = rewards_[:length, i]
        
        episodes_.append(
          [ep_initial, ep_obs, ep_act, ep_rewards, terminated_[i]])
    
    return episodes_

In [860]:
def sample_episodes_pcl(sess):
    """Sample steps from the environment until we have enough for a batch."""

    # define global variables
    global max_step, env_gym, start_id, total_rewards, episode_running_rewards, episode_running_lengths
    global episode_running_rewards, episode_running_lengths, start_episode, cutoff_agent, step_count
    global episode_rewards, episode_lengths
    
    # check if last batch ended with episode that was not terminated
    #if unify_episodes:
    #    all_new_ep = self.start_episode[0]

    # sample episodes until we either have enough episodes or enough steps
    episodes = []
    total_steps = 0
    while total_steps < max_step * len(env_gym):
        (initial_state_,
        observations, actions, rewards,
        pads) = _sample_episodes_pcl(sess)

        observations = zip(*observations)
        actions = zip(*actions)

        terminated = np.array(env_gym.dones)

        total_rewards = np.sum(np.array(rewards[start_id_global:]) *
                                  (1 - np.array(pads[start_id_global:])), axis=0)
        episode_running_rewards *= 1 - start_episode
        episode_running_lengths *= 1 - start_episode
        episode_running_rewards += total_rewards
        episode_running_lengths += np.sum(1 - np.array(pads[start_id_global:]), axis=0)

        episodes.extend(convert_from_batched_episodes_pcl(
          initial_state_, observations, actions, rewards,
          terminated, pads))
        total_steps += np.sum(1 - np.array(pads))

        # set next starting episodes
        start_episode = np.logical_or(terminated,
                                         step_count >= cutoff_agent)
        episode_rewards = episode_running_rewards[start_episode].tolist()
        episode_rewards.extend(episode_rewards)
        episode_lengths.extend(episode_running_lengths[start_episode].tolist())
        # ToDo: Check why 100
        episode_rewards = episode_rewards[-100:]
        episode_lengths = episode_lengths[-100:]

        """
        if (self.save_trajectories_file is not None and
          (self.best_batch_rewards is None or
           np.mean(self.total_rewards) > self.best_batch_rewards)):
        self.best_batch_rewards = np.mean(self.total_rewards)
        my_episodes = self.convert_from_batched_episodes(
          initial_state, observations, actions, rewards,
          terminated, pads)
        with gfile.GFile(self.save_trajectories_file, 'w') as f:
            pickle.dump(my_episodes, f)
        """
        """
        if not self.batch_by_steps:
        return (initial_state,
                observations, actions, rewards,
                terminated, pads)        
        """
        
    return convert_to_batched_episodes(episodes)

In [718]:
def _sample_episodes_pcl(sess, greedy=False):
    """Sample episodes from environment using model."""
    
    # Define global variables
    global start_episode, env_gym, step_count, internal_state, last_obs, last_act, last_pad, env_spec_gym, start_id
    global internal_state_global, all_obs_global, all_act_global, all_pad_global, rewards_global, last_act
    global last_obs, max_step
    
    # reset environments as necessary
    obs_after_reset = env_gym.reset_if(start_episode)

    for i, obs in enumerate(obs_after_reset):
        if obs is not None:
            step_count[i] = 0
            internal_state_global[i] = initial_internal_state()
            for j in range(len(env_spec_gym.obs_dims)):
                last_obs[j][i] = obs[j]
            for j in range(len(env_spec_gym.act_dims)):
                last_act[j][i] = -1
            last_pad[i] = 0

    # maintain episode as a single unit if the last sampling
    # batch ended before the episode was terminated
    """
    if unify_episodes:
        assert len(obs_after_reset) == 1
        new_ep = obs_after_reset[0] is not None
    else:
        new_ep = True
    """
    new_ep = True
    
    # ToDo: Make start_id global
    start_id_global = 0 if new_ep else len(all_obs_global[:])

    initial_state = internal_state_global
    all_obs = [] if new_ep else all_obs_global[:]
    all_act = ([last_act] if new_ep else all_act_global[:])
    all_pad = [] if new_ep else all_pad_global[:]
    rewards = [] if new_ep else rewards_global[:]

    # start stepping in the environments
    step = 0
    while not env.all_done():
        step_count += 1 - np.array(env.dones)

        next_internal_state, sampled_actions = sample_step_pcl(
          sess, last_obs, internal_state_global, last_act,
          greedy=greedy)

        env_actions = env_spec_gym.convert_actions_to_env(sampled_actions)
        next_obs, reward, next_dones, _ = env_gym.step(env_actions)

        all_obs.append(last_obs)
        all_act.append(sampled_actions)
        all_pad.append(last_pad)
        rewards.append(reward)

        internal_state_global = next_internal_state
        last_obs = next_obs
        last_act = sampled_actions
        last_pad = np.array(next_dones).astype('float32')

        step += 1
        if max_step and step >= max_step:
            break

    all_obs_global = all_obs[:]
    all_act_global = all_act[:]
    all_pad_global = all_pad[:]
    rewards_global = rewards[:]

    # append final observation
    all_obs_global.append(last_obs)

    return initial_state, all_obs, all_act, rewards, all_pad

In [719]:
def sample_step_pcl(sess, single_observation_, internal_state_, single_action_, greedy=False):
    """Sample batch of steps from policy."""
    global greedy_next_internal_state, greedy_sampled_actions, next_internal_state, sampled_actions
    global internal_state, single_action, single_observation
    
    if greedy:
        outputs = [greedy_next_internal_state, greedy_sampled_actions]
    else:
        outputs = [next_internal_state, sampled_actions]

    feed_dict = {internal_state: internal_state_}
    for action_place, action in zip(single_action, single_action_):
        feed_dict[action_place] = action
    for obs_place, obs in zip(single_observation, single_observation_):
        feed_dict[obs_place] = obs

    return sess.run(outputs, feed_dict=feed_dict)

In [759]:
def convert_to_batched_episodes_pcl(episodes_, max_length=None):
    """Convert batch-major list of episodes to time-major batch of episodes."""
    lengths = [len(ep[-2]) for ep in episodes_]
    max_length = max_length or max(lengths)

    new_episodes = []
    for ep, length in zip(episodes_, lengths):
        initial_, observations_, actions_, rewards_, terminated_ = ep
        observations_ = [np.resize(obs, [max_length + 1] + list(obs.shape)[1:])
                      for obs in observations_]
        actions = [np.resize(act, [max_length + 1] + list(act.shape)[1:])
                     for act in actions_]
        pads_ = np.array([0] * length + [1] * (max_length - length))
        rewards_ = np.resize(rewards_, [max_length]) * (1 - pads_)
        new_episodes.append([initial_, observations_, actions_, rewards_,
                           terminated_, pads_])

    (initial_, observations_, actions_, rewards_,
     terminated_, pads_) = zip(*new_episodes)
    observations_ = [np.swapaxes(obs, 0, 1)
                    for obs in zip(*observations_)]
    actions_ = [np.swapaxes(act, 0, 1)
               for act in zip(*actions_)]
    rewards_ = np.transpose(rewards_)
    pads_ = np.transpose(pads_)

    return (initial_, observations_, actions_, rewards_, terminated_, pads_)