# *** Implement Unified PCL ***

First, we need to define all placeholders.

In [None]:
# Input placeholders for the model. `obs_dim`, `action_dim` and the name
# suffix `i` are taken from the surrounding code (not shown in this cell).

# Observation input of shape [None, obs_dim].
# NOTE(review): the original note described this as one episode with a leading
# time-step axis, but the tensor is passed to policy.sample_step below, so the
# leading axis may really be the batch axis of a single step -- TODO confirm.
self.single_observation = tf.placeholder(
    tf.float32, shape=[None, obs_dim], name='obs%d' % i)

# Action input of shape [None, action_dim]; same leading-axis caveat as above.
self.single_action = tf.placeholder(
    tf.float32, shape=[None, action_dim], name='act%d' % i)

# Observations for whole episodes, shape [None, None, obs_dim].
# NOTE(review): the original note gives the axes as
# [batch size, time length, observation dim], but the objective call slices
# self.values along axis 0 as if it were time -- TODO confirm the axis order.
self.observations = tf.placeholder(
    tf.float32, shape=[None, None, obs_dim], name='all_obs%d' % i)

# Actions for whole episodes, shape [None, None, action_dim]; same axis-order
# caveat as for the observations.
self.actions = tf.placeholder(
    tf.float32, shape=[None, None, action_dim], name='all_act%d' % i)

# Rewards for every step of every episode (rank 2: batch and time axes).
self.rewards = tf.placeholder(
    tf.float32, shape=[None, None], name='rewards')

# Rank-1 float indicator of whether an episode has terminated.
self.terminated = tf.placeholder(
    tf.float32, shape=[None], name='terminated')

# Rank-2 float indicator marking steps after an episode has ended (padding).
self.pads = tf.placeholder(
    tf.float32, shape=[None, None], name='pads')

Define the computation graph for policy evaluation:
* Internal states are the states of the RNN network
* Logits are the output of the neural network (mu, sigma)
* Log probs are the log probabilities of the policy at state "obs" --> log(N(mu,sigma))
* Entropy is only used by the actor-critic objective, as a regularizer
* KL divergence is only used by TRPO

In [None]:
# Policy network: evaluate the policy over the full observation/action
# sequences inside the 'policy_net' variable scope.
with tf.variable_scope('policy_net'):
    policy_outputs = self.policy.multi_step(
        self.observations, self.internal_state, self.actions)
    # multi_step yields five results, stored in this order: RNN internal
    # states, logits, log-probabilities, entropies and self-KL terms.
    (self.policy_internal_states,
     self.logits,
     self.log_probs,
     self.entropies,
     self.self_kls) = policy_outputs

To obtain the required value functions we need to define a graph for them:
* Option 1 - a separate neural network
* Option 2 - computed by the same network as the actions

Unified PCL:
* regression_input and regression_weight are not used, since they are only required when a separate NN is used for the value function
* policy_internal_states are likewise only used as input to the value function when the value function is recurrent and we use separate models
* The values are calculated from the logits only

Separate value function:
* regression_input and regression_weight are used to update the model
* The policy_internal_states are used as input if we have a recurrent model
* The actions are also used as input if required
* We can also consider the time step as an input to the model

In [None]:
# Value network: ask the baseline for value estimates inside the 'value_net'
# variable scope.
with tf.variable_scope('value_net'):
    baseline_outputs = self.baseline.get_values(
        self.observations,
        self.actions,
        self.policy_internal_states,
        self.logits)
    # get_values yields three results: the values plus the regression input
    # and regression weight.
    self.values, self.regression_input, self.regression_weight = \
        baseline_outputs

Evaluate the objective based on the following quantities:
* rewards of the rollouts of the current policy
* value function evaluations of the current value function
* pads and terminated indicator
* log probs of the policy at state and action t

The following input is required if we consider Trust-PCL:
* Target log probs --> the log probs computed by the target network, whose parameters are updated as $\tilde{\theta} \leftarrow \alpha \tilde{\theta} + (1 - \alpha) \theta$

Inputs never used:
* logits

Inputs used by other algorithms:
 * entropy --> regularizer (A3C)
 * prev_log_probs --> TRPO

In [None]:
# Evaluate the objective.
# Every entry of self.values except the last one along axis 0.
values_before_last = self.values[:-1, :]
# The last entry along axis 0, multiplied by (1 - terminated) so that it
# becomes zero wherever `terminated` equals 1.
final_values = self.values[-1, :] * (1 - self.terminated)

objective_outputs = self.objective.get(
    self.rewards, self.pads,
    values_before_last,
    final_values,
    self.log_probs, self.prev_log_probs, self.target_log_probs,
    self.entropies,
    self.logits)

# The objective yields five results, stored in this order: loss, raw loss,
# regression target, gradient ops and summary.
(self.loss,
 self.raw_loss,
 self.regression_target,
 self.gradient_ops,
 self.summary) = objective_outputs

Define the sampling operations:
* We can sample from the current policy "policy_net" or from our target policy "target_policy_net"

The policy's `sample_step` function calls the `single_step` function and returns the next internal state of the RNN and the sampled
actions. There are two options for sampling the actions:
* Greedy --> take the mean value as the action
* More exploration --> take the mean perturbed by scaled Gaussian noise as the next action (means + std * tf.random_normal([batch_size, act_dim]))

In [None]:
# Sampling operations. They are built with reuse=True so that no new variables
# are created; existing ones under 'model/<scope>' are looked up instead.
# NOTE(review): this requires those variables to exist already; the cells
# above open 'policy_net' without a visible enclosing 'model' scope --
# confirm they are nested in one in the full code.
with tf.variable_scope('model', reuse=True):
    # Sample from the target policy only when explicitly requested.
    if self.sample_from == 'target':
        scope = 'target_policy_net'
    else:
        scope = 'policy_net'

    with tf.variable_scope(scope):
        # Default sampling step.
        (self.next_internal_state,
         self.sampled_actions) = self.policy.sample_step(
             self.single_observation,
             self.internal_state,
             self.single_action)

        # Same step with greedy=True.
        (self.greedy_next_internal_state,
         self.greedy_sampled_actions) = self.policy.sample_step(
             self.single_observation,
             self.internal_state,
             self.single_action,
             greedy=True)