# *** Unified PCL Trainer for pendulum ***

In [110]:
import gym_wrapper
import env_spec
import objective_PCL
import policy
import baseline
import replay_buffer
import optimizers
import tensorflow as tf
import numpy as np

In [111]:
import imp
imp.reload(policy)
imp.reload(baseline)
imp.reload(env_spec)
imp.reload(gym_wrapper)
imp.reload(objective_PCL)

<module 'objective_PCL' from '/home/adrian/PycharmProjects/PCL/objective_PCL.py'>

At first setup the environment

In [112]:
#env_str = 'Pendulum-v0'
env_str = 'HalfCheetah-v1'
env_gym = gym_wrapper.GymWrapper(env_str,
                             distinct= 1,
                             count= 1)

env_spec_gym =  env_spec.EnvSpec(env_gym.get_one())

Setup required algorithm parameters

In [128]:
LOG_DIR = '/home/adrian/Schreibtisch/Uni/Data-Innovation-Lab/tensorflowlogs'

batch_size_episodes = 1
replay_batch_size = 25 #--> replay batch size; defaults to batch_size
batch_by_steps = False #--> ensure each training batch has batch_size * max_step steps

cutoff_agent = 1000
max_step = 4000
unify_episodes = False

total_number_of_steps = 1000
critic_weight = 0.0
policy_weight = 1.0
learning_rate = 0.002
# Clip_adv (Advantage) is required in objective class
clip_adv = 1.0 # --> Clip advantages at this value --> Leave as 0 to not clip at all
# Clip norm is required in objective class to clip the gradients
clip_norm = 40
# Entropy regularization paramter tau
tau = 0.1 # --> If using decaying tau, this is the final value
tau_decy = None # --> decay tau by this much every 100 steps
tau_start = 0.1 # --> start tau at this value
gamma = 0.995 # --> Discount factor
rollout = 10 # --> Rollout length for PCL objective
# If we use unified episodes we need to ensure our batch_size_episodes is 1
unify_episodes = False #--> Make sure replay buffer holds entire episodes, even across distinct sampling steps
batch_by_steps = False #--> ensure each training batch has batch_size * max_step steps'

# Neural network settings now
input_prev_actions = True #--> Required for unified PCL since Q(a.s) is modeled
recurrent = True #--> Indicate that we are going to use a recurrent policy (Q function approximator)
input_time_step = False #--> Indicator if the current time step scould also be an input to the model
internal_dim = 64 #--> Internal RNN dimension 
fixed_std = False #--> fix the std in Gaussian distributions

# If fixed we obtain the following std
# log_std = tf.get_variable('std%d' % i, [1, sampling_dim // 2])

# Settings related to value function if considered seperatly
value_hidden_layers = 1 #--> number of hidden layers in value estimate

# Settings related to the replay buffer
replay_buffer_size = 20000
replay_buffer_alpha = 0.1 #--> replay buffer alpha param
replay_buffer_freq = 1 #--> replay buffer frequency (only supports -1/0/1)'
eviction = 'fifo' #--> How to evict from replay buffer: rand/rank/fifo
prioritize_by = 'step' #--> Prioritize replay buffer by "rewards" or "step"

# The following are only required for Trust-PCL
eps_lambda = 0.0 #--> start tau at this value
update_eps_lambda = False #--> Update lambda automatically based on last 100 episodes
max_divergence=0.001
sample_from = 'target' 
update_eps_lambda = True
target_network_lag=0.99
max_divergence = 0.01

# Determine what updates to use
use_offline_batch = True
use_online_batch = False

# Sampling setup 
last_obs = env_spec_gym.initial_obs(1)
last_act = env_spec_gym.initial_act(len(env_gym))
last_pad = np.zeros(len(env_gym))

start_episode = np.array([True] * len(env_gym))
step_count = np.array([0] * len(env_gym))
episode_running_rewards = np.zeros(len(env_gym))
episode_running_lengths = np.zeros(len(env_gym))
episode_rewards = []
episode_lengths = []
total_rewards = []

all_obs_global = []
all_act_global = []
all_pad_global = []
rewards_global = []

internal_state_global = np.array([initial_internal_state()] * len(env_gym))

start_id_global = 0

Setup PCL-Objective object

Setup Recurrent Policy - In the unified approach the output is also used as foundation to compute the values
* Try recurrent

Determine unified baseline:
* In case of a seperate network one needs to clarify the function of the "input_policy_state" paramter
* If false the "input" looks like the following "obs, action, time_step"
* If true the "input" looks like "internal_policy_states, time_step"

In [114]:
tf.reset_default_graph()
sess = tf.Session()

objective_pendulum = objective_PCL.PCL(learning_rate = learning_rate,
                                     clip_norm = clip_norm,
                                     policy_weight = policy_weight,
                                     critic_weight = critic_weight,
                                     tau = tau, 
                                     gamma = gamma,
                                     rollout = rollout,
                                     eps_lambda = eps_lambda,
                                     clip_adv = clip_adv)
"""
policy_unified_pendulum = policy.Policy(env_spec_gym,
                                       internal_dim,
                                       recurrent = recurrent,
                                       input_prev_actions = input_prev_actions)
"""

# Non recurrent policy
policy_unified_pendulum = policy.MLPPolicy(env_spec_gym,
                                       internal_dim,
                                       recurrent = input_time_step,
                                       input_prev_actions = input_prev_actions)

"""
# Unified model
baseline_unified_pendulum = baseline.UnifiedBaseline(env_spec_gym,
                                                    internal_dim,
                                                    input_prev_actions=input_prev_actions,
                                                    input_time_step=input_time_step,
                                                    input_policy_state=recurrent,  # may want to change this
                                                    n_hidden_layers=value_hidden_layers,
                                                    hidden_dim=internal_dim,
                                                    tau=tau,
                                                    eps_lambda = eps_lambda)
"""

# Seperate Value function
baseline_sep_pendulum = baseline.Baseline(env_spec_gym,
                                          internal_dim,
                                          input_prev_actions=input_prev_actions,
                                          input_time_step=input_time_step,
                                          input_policy_state=False,  # may want to change this
                                          n_hidden_layers=value_hidden_layers,
                                          hidden_dim=internal_dim,
                                          tau=tau,
                                          eps_lambda = eps_lambda)

# Setup corresponding optimizer for value function
value_opt_pendulum = optimizers.GradOptimization(
                        learning_rate=learning_rate,
                        max_iter=5,
                        mix_frac=0.05)

# Setup replay buffer
replay_buffer_pendulum = replay_buffer.PrioritizedReplayBuffer(replay_buffer_size,
                                                               alpha=replay_buffer_alpha,
                                                               eviction_strategy=eviction)

In the unified case we do not need a value function optimizer "get_value_opt"

## Now it es time to setup the tensorflow graph

### Start with setup of placeholder

In [115]:
# summary placeholder

avg_episode_reward = tf.placeholder(
        tf.float32, [], 'avg_episode_reward')

# sampling placeholders

internal_state = tf.placeholder(tf.float32,
                                    [None, policy_unified_pendulum.rnn_state_dim],
                                    'internal_state')

# One episode of observations (Time_Steps, Observation dimension)
single_observation = []
for i, (obs_dim, obs_type) in enumerate(zip(env_spec_gym.obs_dims, env_spec_gym.obs_types)):
    if env_spec_gym.is_discrete(obs_type):
        single_observation.append(
            tf.placeholder(tf.int32, [None], 'obs%d' % i))
    elif env_spec_gym.is_box(obs_type):
        single_observation.append(
            tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
    else:
        assert False
        
# One episode of actions (Time_steps, action dimension)        
single_action = []
for i, (action_dim, action_type) in enumerate(zip(env_spec_gym.act_dims, env_spec_gym.act_types)):
    if env_spec_gym.is_discrete(action_type):
        single_action.append(
            tf.placeholder(tf.int32, [None], 'act%d' % i))
    elif env_spec_gym.is_box(action_type):
        single_action.append(
            tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
    else:
        assert False
        
# training placeholders

# Observations batch size many episodes of time length [batch size, time length, observation dim]
observations = []
for i, (obs_dim, obs_type) in enumerate(zip(env_spec_gym.obs_dims, env_spec_gym.obs_types)):
    if env_spec_gym.is_discrete(obs_type):
        observations.append(
            tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
    else:
        observations.append(
            tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))
        
# Actions batch size many episodes of time length [batch size, time length, action dim]        
actions = []
for i, (action_dim, action_type) in enumerate(zip(env_spec_gym.act_dims, env_spec_gym.act_types)):
    if env_spec_gym.is_discrete(action_type):
        actions.append(
            tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
    if env_spec_gym.is_box(action_type):
        actions.append(
            tf.placeholder(tf.float32, [None, None, action_dim],
                       'all_act%d' % i))
        
# Rewards of Batch Size many episodes of time length [batch size, time length]
rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
# Indicator if episode has terminated 
terminated = tf.placeholder(tf.float32, [None], 'terminated')
# Batch Size many episodes of time length indicators if episode has ended
pads = tf.placeholder(tf.float32, [None, None], 'pads')

### Setup computation graph

In [116]:
tf.summary.scalar('avg_episode_reward', avg_episode_reward)

with tf.variable_scope('model', reuse=None):
    # policy network
    with tf.variable_scope('policy_net'):
        # ,entropies, self_kls)
        (policy_internal_states, logits, log_probs) = \
                policy_unified_pendulum.multi_step(observations,
                                                internal_state,
                                                actions)
        out_log_probs = sum(log_probs)


       # value network
    with tf.variable_scope('value_net'):
        (values,
         regression_input,
         regression_weight) = baseline_sep_pendulum.get_values(
            observations, actions,
            policy_internal_states, logits)

    # target policy network
    with tf.variable_scope('target_policy_net'):
        (target_policy_internal_states,
         target_logits, target_log_probs) = \
            policy_unified_pendulum.multi_step(observations,
                                               internal_state,
                                               actions)

    # target value network
    with tf.variable_scope('target_value_net'):
        (target_values, _, _) = baseline_sep_pendulum.get_values(
                            observations, actions,
                            target_policy_internal_states, target_logits)
    
    

    # construct copy op online --> target
    all_vars = tf.trainable_variables()
    online_vars = [p for p in all_vars if
                 '/policy_net' in p.name or '/value_net' in p.name]
    target_vars = [p for p in all_vars if
                 'target_policy_net' in p.name or 'target_value_net' in p.name]
    online_vars.sort(key=lambda p: p.name)
    target_vars.sort(key=lambda p: p.name)
    aa = target_network_lag
    copy_op = tf.group(*[
      target_p.assign(aa * target_p + (1 - aa) * online_p)
    for online_p, target_p in zip(online_vars, target_vars)])    
    
    
    # evaluate objective
    (loss, raw_loss, regression_target,
     gradient_ops, summary) = objective_pendulum.get(
      rewards, pads,
      values[:-1, :],
      values[-1, :] * (1 - terminated),
      log_probs,
      target_log_probs)

    regression_target = tf.reshape(regression_target, [-1])
    
    policy_vars = [
        v for v in tf.trainable_variables()
        if '/policy_net' in v.name]
    value_vars = [
        v for v in tf.trainable_variables()
        if '/value_net' in v.name]


# value optimizer
if value_opt_pendulum is not None:
    with tf.variable_scope('trust_region_value', reuse=None):
        value_opt_pendulum.setup(
            value_vars,
            tf.reshape(values[:-1, :], [-1]),
            regression_target,
            tf.reshape(pads, [-1]),
            regression_input, 
            regression_weight)
    

 # we re-use variables for the sampling operations
with tf.variable_scope('model', reuse=True):
    scope = ('target_policy_net' if sample_from == 'target'
            else 'policy_net')
    with tf.variable_scope(scope):
        next_internal_state, sampled_actions = \
            policy_unified_pendulum.sample_step(single_observation,
                                internal_state,
                                single_action)
        greedy_next_internal_state, greedy_sampled_actions = \
            policy_unified_pendulum.sample_step(single_observation,
                               internal_state,
                               single_action,
                                greedy=True)
            
   
         

In [117]:
merged = tf.summary.merge_all()    
train_writer = tf.summary.FileWriter("/home/adrian/Schreibtisch/Uni/Data-Innovation-Lab/tensorflowlogs", sess.graph)
sess.run(tf.initialize_all_variables())

## Sample Batch

Original sample episodes code from the paper code 'pcl_rl'

In [118]:
def initial_internal_state():
    return np.zeros(policy_unified_pendulum.rnn_state_dim)

In [129]:
(initial_state_, observations_, actions_, rewards_, terminated_, pads_) = sample_episodes_pcl(sess)

In [132]:
pads_

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ..., 
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [119]:
global_step = tf.train.get_or_create_global_step()
cur_step = 0

losses_train = []
rewards_train = []
all_ep_rewards = []
for step in range(10000):
    

    
    # sample from env
    (initial_state_,
     observations_, actions_, rewards_, terminated_, pads_) = sample_episodes_pcl(sess)
    
    if step % 10 == 0:
        print(step)
        
        
    # Add sampled episodes to replay batch
    add_to_replay_buffer_pcl(replay_buffer_pendulum, initial_state_, observations_, actions_, rewards_, terminated_, pads_, step)
    
    loss_step, summary_step = 0, None
    
    if update_eps_lambda:
        episode_rewards_ = np.array(episode_rewards)
        episode_lengths_ = np.array(episode_lengths)
        eps_lambda_ = find_best_eps_lambda_(episode_rewards_, episode_lengths_)
        sess.run(objective_pendulum.assign_eps_lambda,
                feed_dict={objective_pendulum.new_eps_lambda: eps_lambda_})
        
    if use_online_batch:
        outputs = [raw_loss, gradient_ops, summary]
        feed_dict = {internal_state: initial_state_,
                 rewards: rewards_,
                 terminated: terminated_,
                 pads: pads_,
                 avg_episode_reward: np.mean(episode_rewards)}
        for action_place, action in zip(actions, actions_):
            feed_dict[action_place] = action
        for obs_place, obs in zip(observations, observations_):
            feed_dict[obs_place] = obs

        loss_step, _, summary_step = sess.run(outputs, feed_dict=feed_dict)

    if use_offline_batch:
        replay_batch_, replay_probs_ = get_from_replay_buffer_pcl(replay_buffer_pendulum, replay_batch_size)
       
        #print(len(replay_buffer_pendulum))
        # Check if replay batch is not none
        if replay_batch_:
            
            (initial_state_, observations_, actions_, rewards_, terminated_, pads_) = replay_batch_
        
            outputs = [raw_loss, gradient_ops, summary]
            feed_dict = {internal_state: initial_state_,
                     rewards: rewards_,
                     terminated: terminated_,
                     pads: pads_,
                     avg_episode_reward: np.mean(episode_rewards)}
            for action_place, action in zip(actions, actions_):
                feed_dict[action_place] = action
            for obs_place, obs in zip(observations, observations_):
                feed_dict[obs_place] = obs

            loss_step, _, summary_step = sess.run(outputs, feed_dict=feed_dict)        
          
            # Perform value function update based on sampled data
            if value_opt_pendulum is not None:

                feed_dict_opt = {internal_state: initial_state_,
                                 rewards: rewards_,
                                 terminated: terminated_,
                                 pads: pads_}
                
                for action_place, action in zip(actions, actions_):
                    feed_dict_opt[action_place] = action
                for obs_place, obs in zip(observations, observations_):
                    feed_dict_opt[obs_place] = obs
            
                value_opt_pendulum.optimize(sess, feed_dict_opt)
        
    losses_train.append(loss_step)
    rewards_train.append(total_rewards)
    all_ep_rewards.extend(episode_rewards)
    
    if summary_step is not None:
        train_writer.add_summary(summary_step, step)
        
    

0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


ValueError: bad axis2 argument to swapaxes

In [96]:
def find_best_eps_lambda_(rewards, lengths):
    """Find the best lambda given a desired epsilon = FLAGS.max_divergence."""
    # perhaps not the best way to do this
    global max_divergence
    
    desired_div = max_divergence * np.mean(lengths)
    
    def calc_divergence(eps_lambda):
        max_reward = np.max(rewards)
        logz = (max_reward / eps_lambda +
            np.log(np.mean(np.exp((rewards - max_reward) / eps_lambda))))
        exprr = np.mean(np.exp(rewards / eps_lambda - logz) *
                    rewards / eps_lambda)
        return exprr - logz

    left = 0.0
    right = 1000.0

    if len(rewards) <= 8:
        return (left + right) / 2

    num_iter = max(4, 1 + int(np.log((right - left) / 0.1) / np.log(2.0)))
    
    for _ in range(num_iter):
        mid = (left + right) / 2
        cur_div = calc_divergence(mid)
        if cur_div > desired_div:
            left = mid
        else:
            right = mid
        
    
    return (left + right) / 2

In [1407]:
print(np.shape(initial_state_))
print(np.shape(observations_))
print(np.shape(actions_))
print(np.shape(rewards_))
print(np.shape(terminated_))
print(np.shape(pads_))

(1, 64)
(1, 201, 1, 3)
(1, 201, 1, 1)
(200, 1)
(1,)
(200, 1)


## Get value estimates

Understand tf.scan:
* fn: The callable to be performed 
* initializer:  A tensor or (possibly nested) sequence of tensors, initial value for the accumulator, and the expected output type of fn

The callable fn takes two tensors as arguments:
* The first argument is the accumulated value computed from the preceding invocation of fn --> (next_state, tuple(actions), tuple(logits), tuple(log_probs))
* The second argument is the one 

In [23]:
def sample_episodes_pcl(sess):
    """Sample steps from the environment until we have enough for a batch."""

    # define global variables
    global max_step, env_gym, start_id, total_rewards, episode_running_rewards, episode_running_lengths
    global episode_running_rewards, episode_running_lengths, start_episode, cutoff_agent, step_count
    global episode_rewards, episode_lengths
    
    # check if last batch ended with episode that was not terminated
    #if unify_episodes:
    #    all_new_ep = self.start_episode[0]

    # sample episodes until we either have enough episodes or enough steps
    episodes = []
    total_steps = 0
    while total_steps < max_step * len(env_gym):
        (initial_state_,
        observations, actions, rewards,
        pads) = _sample_episodes_pcl(sess)

        observations = zip(*observations)
        actions = zip(*actions)

        terminated = np.array(env_gym.dones)

        total_rewards = np.sum(np.array(rewards[start_id_global:]) *
                                  (1 - np.array(pads[start_id_global:])), axis=0)
        episode_running_rewards *= 1 - start_episode
        episode_running_lengths *= 1 - start_episode
        episode_running_rewards += total_rewards
        episode_running_lengths += np.sum(1 - np.array(pads[start_id_global:]), axis=0)

        episodes.extend(convert_from_batched_episodes_pcl(
          initial_state_, observations, actions, rewards,
          terminated, pads))
        total_steps += np.sum(1 - np.array(pads))

        # set next starting episodes
        start_episode = np.logical_or(terminated,
                                         step_count >= cutoff_agent)
        episode_rewards = episode_running_rewards[start_episode].tolist()
        episode_rewards.extend(episode_rewards)
        episode_lengths.extend(episode_running_lengths[start_episode].tolist())
        # ToDo: Check why 100
        episode_rewards = episode_rewards[-100:]
        episode_lengths = episode_lengths[-100:]

        """
        if (self.save_trajectories_file is not None and
          (self.best_batch_rewards is None or
           np.mean(self.total_rewards) > self.best_batch_rewards)):
        self.best_batch_rewards = np.mean(self.total_rewards)
        my_episodes = self.convert_from_batched_episodes(
          initial_state, observations, actions, rewards,
          terminated, pads)
        with gfile.GFile(self.save_trajectories_file, 'w') as f:
            pickle.dump(my_episodes, f)
        """
        """
        if not self.batch_by_steps:
        return (initial_state,
                observations, actions, rewards,
                terminated, pads)        
        """
        
    return convert_to_batched_episodes_pcl(episodes)

In [21]:
def _sample_episodes_pcl(sess, greedy=False):
    """Sample episodes from environment using model."""
    
    # Define global variables
    global start_episode, env_gym, step_count, internal_state, last_obs, last_act, last_pad, env_spec_gym, start_id
    global internal_state_global, all_obs_global, all_act_global, all_pad_global, rewards_global, last_act
    global last_obs, max_step
    
    # reset environments as necessary
    obs_after_reset = env_gym.reset_if(start_episode)

    for i, obs in enumerate(obs_after_reset):
        if obs is not None:
            step_count[i] = 0
            internal_state_global[i] = initial_internal_state()
            for j in range(len(env_spec_gym.obs_dims)):
                last_obs[j][i] = obs[j]
            for j in range(len(env_spec_gym.act_dims)):
                last_act[j][i] = -1
            last_pad[i] = 0

    # maintain episode as a single unit if the last sampling
    # batch ended before the episode was terminated
    """
    if unify_episodes:
        assert len(obs_after_reset) == 1
        new_ep = obs_after_reset[0] is not None
    else:
        new_ep = True
    """
    new_ep = True
    
    # ToDo: Make start_id global
    start_id_global = 0 if new_ep else len(all_obs_global[:])

    initial_state = internal_state_global
    all_obs = [] if new_ep else all_obs_global[:]
    all_act = ([last_act] if new_ep else all_act_global[:])
    all_pad = [] if new_ep else all_pad_global[:]
    rewards = [] if new_ep else rewards_global[:]

    # start stepping in the environments
    step = 0
    while not env_gym.all_done():
        step_count += 1 - np.array(env_gym.dones)

        next_internal_state, sampled_actions = sample_step_pcl(
          sess, last_obs, internal_state_global, last_act,
          greedy=greedy)

        env_actions = env_spec_gym.convert_actions_to_env(sampled_actions)
        next_obs, reward, next_dones, _ = env_gym.step(env_actions)

        all_obs.append(last_obs)
        all_act.append(sampled_actions)
        all_pad.append(last_pad)
        rewards.append(reward)

        internal_state_global = next_internal_state
        last_obs = next_obs
        last_act = sampled_actions
        last_pad = np.array(next_dones).astype('float32')

        step += 1
        if max_step and step >= max_step:
            break

    all_obs_global = all_obs[:]
    all_act_global = all_act[:]
    all_pad_global = all_pad[:]
    rewards_global = rewards[:]

    # append final observation
    all_obs_global.append(last_obs)

    return initial_state, all_obs, all_act, rewards, all_pad

In [13]:
def sample_step_pcl(sess, single_observation_, internal_state_, single_action_, greedy=False):
    """Sample batch of steps from policy."""
    global greedy_next_internal_state, greedy_sampled_actions, next_internal_state, sampled_actions
    global internal_state, single_action, single_observation
    
    if greedy:
        outputs = [greedy_next_internal_state, greedy_sampled_actions]
    else:
        outputs = [next_internal_state, sampled_actions]

    feed_dict = {internal_state: internal_state_}
    for action_place, action in zip(single_action, single_action_):
        feed_dict[action_place] = action
    for obs_place, obs in zip(single_observation, single_observation_):
        feed_dict[obs_place] = obs

    return sess.run(outputs, feed_dict=feed_dict)

In [14]:
def convert_from_batched_episodes_pcl(initial_state_, observations_, actions_, rewards_, terminated_, pads_):
    """Convert time-major batch of episodes to batch-major list of episodes."""

    rewards_ = np.array(rewards_)
    pads_ = np.array(pads_)
    observations_ = [np.array(obs) for obs in observations_]
    actions_ = [np.array(act) for act in actions_]

    total_rewards_ = np.sum(rewards_ * (1 - pads_), axis=0)
    total_length_ = np.sum(1 - pads_, axis=0).astype('int32')

    episodes_ = []
    num_episodes_ = rewards_.shape[1]
    for i in range(num_episodes_):
        length = total_length_[i]
        ep_initial = initial_state_[i]
        ep_obs = [obs[:length, i, ...] for obs in observations_]
        ep_act = [act[:length + 1, i, ...] for act in actions_]
        ep_rewards = rewards_[:length, i]
        
        episodes_.append(
          [ep_initial, ep_obs, ep_act, ep_rewards, terminated_[i]])
    
    return episodes_

In [15]:
def convert_to_batched_episodes_pcl(episodes_, max_length=None):
    """Convert batch-major list of episodes to time-major batch of episodes."""
    lengths = [len(ep[-2]) for ep in episodes_]
    max_length = max_length or max(lengths)

    new_episodes = []
    for ep, length in zip(episodes_, lengths):
        initial_, observations_, actions_, rewards_, terminated_ = ep
        observations_ = [np.resize(obs, [max_length + 1] + list(obs.shape)[1:])
                      for obs in observations_]
        actions = [np.resize(act, [max_length + 1] + list(act.shape)[1:])
                     for act in actions_]
        pads_ = np.array([0] * length + [1] * (max_length - length))
        rewards_ = np.resize(rewards_, [max_length]) * (1 - pads_)
        new_episodes.append([initial_, observations_, actions_, rewards_,
                           terminated_, pads_])

    (initial_, observations_, actions_, rewards_,
     terminated_, pads_) = zip(*new_episodes)
    observations_ = [np.swapaxes(obs, 0, 1)
                    for obs in zip(*observations_)]
    actions_ = [np.swapaxes(act, 0, 1)
               for act in zip(*actions_)]
    rewards_ = np.transpose(rewards_)
    pads_ = np.transpose(pads_)

    return (initial_, observations_, actions_, rewards_, terminated_, pads_)

## Functions replay buffer

In [64]:
def add_to_replay_buffer_pcl(replay_buffer_, initial_state,
                        observations, actions, rewards,
                        terminated, pads, step):
    
    global prioritize_by
    
    """Add batch of episodes to replay buffer."""
    if replay_buffer_ is None:
        return

    rewards = np.array(rewards)
    pads = np.array(pads)
    total_rewards = np.sum(rewards * (1 - pads), axis=0)

    episodes = convert_from_batched_episodes_pcl(
      initial_state, observations, actions, rewards,
      terminated, pads)

    priorities = (total_rewards if prioritize_by == 'reward'
                 else step)
    
    #if not self.unify_episodes or self.all_new_ep:
    slast_idxs = replay_buffer_.add(episodes, priorities)
    #else:
      # If we are unifying episodes, we attempt to
      # keep them unified in the replay buffer.
      # The first episode sampled in the current batch is a
      # continuation of the last episode from the previous batch
    #self.replay_buffer.add(episodes[:1], priorities, self.last_idxs[-1:])
    #if len(episodes) > 1:
     #   self.replay_buffer.add(episodes[1:], priorities)

def get_from_replay_buffer_pcl(replay_buffer_, batch_size):
    """Sample a batch of episodes from the replay buffer."""
    if replay_buffer_ is None or len(replay_buffer_) < 1 * batch_size:
        return None, None

    desired_count = batch_size * max_step
    # in the case of batch_by_steps, we sample larger and larger
    # amounts from the replay buffer until we have enough steps.
    while True:
        if batch_size > len(replay_buffer_):
            batch_size = len(replay_buffer_)
        episodes, probs = replay_buffer_.get_batch(batch_size)
        count = sum(len(ep[-2]) for ep in episodes)
        if count >= desired_count or not batch_by_steps:
            break
        if batch_size == len(replay_buffer_):
            return None, None
        batch_size *= 1.2

    return (convert_to_batched_episodes_pcl(episodes), probs)

def seed_replay_buffer_pcl(replay_buffer_, episodes):
    """Seed the replay buffer with some episodes."""
    if replay_buffer_ is None:
        return

    # just need to add initial state
    for i in range(len(episodes)):
        episodes[i] = [initial_internal_state()] + episodes[i]

    replay_buffer_.seed_buffer(episodes)