In [3]:
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import matplotlib.pyplot as plt

From scratch - PCL

In [4]:
# Gym environment id; passed to gym.make(GAME) below.
GAME = 'Pendulum-v0'
# NOTE(review): not referenced in the visible cells — presumably toggles writing the TF graph to LOG_DIR; confirm.
OUTPUT_GRAPH = True
# NOTE(review): absolute, machine-specific path; it will not exist on other machines.
LOG_DIR = '/home/adrian/Schreibtisch/Uni/Data-Innovation-Lab/tensorflowlogs'
# Number of CPUs reported by the operating system.
N_WORKERS = multiprocessing.cpu_count()
# NOTE(review): the following training constants are not referenced in the visible cells.
MAX_EP_STEP = 200
MAX_GLOBAL_EP = 500
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9    # presumably the reward discount factor — confirm
ENTROPY_BETA = 0.01
LR_A = 0.0001    # learning rate for actor
LR_C = 0.001    # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

# PCL specific
TAU = 0.1
ROLLOUT = 10

# Single shared environment instance used by every sampling cell below.
env = gym.make(GAME)

# First axis length of the observation / action space shapes.
N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]
# [lower bounds, upper bounds] of the action space.
A_BOUND = [env.action_space.low, env.action_space.high]

In [873]:
class spaces(object):
    """Integer tags used to label the kind of a gym space."""
    # discrete spaces are tagged 0, box (continuous) spaces are tagged 1
    discrete, box = 0, 1

def get_space(space):
    """Describe one space as a ``(dim, type_tag, bounds)`` triple.

    A space exposing ``n`` is reported as ``(space.n, spaces.discrete, None)``.
    Otherwise, a space exposing ``shape`` is reported as
    ``(prod(shape), spaces.box, (low, high))``.
    Anything else falls through and yields ``None``.
    """
    if hasattr(space, 'n'):
        return space.n, spaces.discrete, None
    if not hasattr(space, 'shape'):
        # Neither discrete nor box-like: same implicit result as falling off the end.
        return None
    flat_dim = np.prod(space.shape)
    kind = spaces.box
    limits = (space.low, space.high)
    return flat_dim, kind, limits

def get_spaces(spaces):
    """Describe a (possibly composite) space as parallel sequences.

    Note that the parameter name ``spaces`` shadows the module-level
    ``spaces`` class inside this function.

    Args:
        spaces: a single space, or a composite object exposing a ``spaces``
            attribute that iterates over sub-spaces.

    Returns:
        A list ``[dims, types, bounds]`` where each element is a tuple with one
        entry per (sub-)space, as produced by ``get_space``. For a single space
        every tuple has exactly one entry. An empty composite yields ``[]``.
    """
    if hasattr(spaces, 'spaces'):
        # Materialise the transposed result: on Python 3 a bare zip() is a
        # one-shot iterator, which would be inconsistent with the list returned
        # by the single-space branch (no indexing, no second iteration).
        return list(zip(*[get_space(space) for space in spaces.spaces]))
    return [(ret,) for ret in get_space(spaces)]

def sampling_dim(dim, typ):
    """Return how many distribution parameters are needed to sample ``dim`` values.

    Args:
        dim: dimensionality reported for the space.
        typ: type tag, one of ``spaces.discrete`` or ``spaces.box``.

    Returns:
        ``dim`` for discrete spaces, ``2 * dim`` for box spaces.

    Raises:
        AssertionError: if ``typ`` is neither of the known tags.
    """
    if typ == spaces.discrete:
        return dim
    elif typ == spaces.box:
        return 2 * dim  # Gaussian mean and std
    else:
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would make this silently return None.
        raise AssertionError('unknown space type: %r' % (typ,))

In [885]:
# Describe the observation space as three parallel sequences: dims, type tags, bounds.
obs_space = env.observation_space
obs_dims, obs_types, obs_info = get_spaces(obs_space)
# Same description for the action space.
act_space = env.action_space
act_dims, act_types, act_info = get_spaces(act_space)
# Totals are the sums of the per-space dimensions.
total_obs_dim = sum(obs_dims)
total_sampled_act_dim = sum(act_dims)
# Print the observation-space description next to N_S for a visual comparison.
print(obs_dims)
print(obs_types)
print(obs_info)
print(N_S)

(3,)
(1,)
((array([-1., -1., -8.]), array([ 1.,  1.,  8.])),)
3


In [886]:
[total_obs_dim, total_sampled_act_dim]

[3, 1]

In [851]:
# Pair every dimension with its type tag.
# NOTE: on Python 3 zip() returns a one-shot iterator, so each of these can be consumed only once.
obs_dims_and_types = zip(obs_dims, obs_types)
act_dims_and_types = zip(act_dims, act_types)

In [853]:
# Total number of distribution parameters over all action spaces (see sampling_dim).
# NOTE: this consumes act_dims_and_types if it is a Python 3 zip iterator.
total_sampling_act_dim = sum(sampling_dim(dim, typ) for dim, typ in act_dims_and_types)

In [882]:
get_spaces(act_space)

[(1,), (1,), ((array([-2.]), array([ 2.])),)]

Hierarchy of calls to collect data
* sample_episodes
* sample_single_episode
* sample_step

The output of sample_episodes is a batch of batchsize_episodes many episodes
* This batch is then fed to update_model

update_model itself is divided into two parts
* update_on_policy --> Using the current batch
* update_off_policy --> Using a batch drawn from a replay buffer

update_on_policy and update_off_policy take this batch of episodes and call
* train_step

Train step is receiving data of the form
* observations (steps, batch_size, obs_dim)
* internal_state ()
* actions (steps, batch_size, act_dim)
* rewards (steps, batch_size)
* terminated (batch_size)
* pads (steps, batch_size)

Output of this call is
* raw_loss
* gradient_ops
* summary

All of these outputs are obtained by evaluating the objective via objective.get()
* gradient_ops = self.training_ops(loss, learning_rate=self.learning_rate)
* loss = (self.policy_weight * policy_loss + self.critic_weight * critic_loss)

We need to define Batches in Terms of Batch of episodes not in terms of single steps  
* First dimension --> Length of episode --> Steps performed  
* Second dimension --> Batch_Size (Count of episodes)

In [287]:
# Placeholders for single steps: rows of N_S-dimensional states and N_A-dimensional actions.
single_state = tf.placeholder(tf.float32, [None, N_S], 'S')
single_action = tf.placeholder(tf.float32, [None, N_A], 'A')
# NOTE(review): the internal-state width of 10 is hard-coded; where it comes from is not visible here.
internal_state_tf = tf.placeholder(tf.float32, [None, 10],'internal_state')

# Those are needed to perform update of neural network
# Rank-3 inputs; per the notes above the intended layout is (steps, batch_size, dim) — confirm.
states_episodic = tf.placeholder(tf.float32, [None, None, N_S], 'all_obs_1')
actions_episodic = tf.placeholder(tf.float32, [None, None, N_A], 'all_act_1')

# Rank-2 rewards and pads, rank-1 terminated flags (fed as float32).
rewards_episodic = tf.placeholder(tf.float32, [None, None], 'rewards')
terminated_episodic = tf.placeholder(tf.float32, [None], 'terminated')
pads_episodic = tf.placeholder(tf.float32, [None, None], 'pads')

Start with setting up data pipeline to sample episodes

Define the batch size (episodes) and the number of steps to perform in every episode

In [5]:
# Step budget handed to sample_episodes and per-rollout step cap handed to sample_single_episode.
max_steps_in_env = 150
episode_length = 10

# NOTE: a `global` statement at module level has no effect; these names are module globals anyway.
global INTERNAL_STATE, LAST_OBS, LAST_ACT, LAST_PAD
# Seed the rollout state with random samples from the environment's spaces.
INTERNAL_STATE = list(env.observation_space.sample())
# NOTE: this is one flat observation, whereas sample_single_episode later stores a
# pair of observations in LAST_OBS, so the first stored observation has a different shape.
LAST_OBS =  list(env.observation_space.sample())
# Two entries each — presumably one per element of the simulated batch of two; confirm.
LAST_ACT = [env.action_space.sample().tolist(),  env.action_space.sample().tolist()]
LAST_PAD = [0,0]

1. sample_step

In [8]:
# Given an observation get new action
def sample_step(single_observation):
    """Return a random action; ``single_observation`` is currently ignored.

    Placeholder for the real policy call, which would normally look
    something like::

        s = s[np.newaxis, :]
        return SESS.run(self.A, {self.s: s})[0]

    NOTE(review): another cell defines ``sample_step(self, sess, ...)`` under
    the same name; whichever cell was executed last is the one in effect.
    """
    # Return a random action drawn from the environment's action space.
    return env.action_space.sample()

Calls model.sample_step

In [7]:
def sample_step(self, sess,
              single_observation, internal_state, single_action,
              greedy=False):
    """Run one sampling step of the policy in ``sess``.

    Fetches the (next internal state, sampled actions) pair — the greedy
    variants when ``greedy`` is true — while feeding the internal state and
    pairing each action / observation placeholder on ``self`` with the
    corresponding entry of ``single_action`` / ``single_observation``.

    Returns:
        Whatever ``sess.run`` returns for the two fetched tensors.
    """
    fetches = (
        [self.greedy_next_internal_state, self.greedy_sampled_actions]
        if greedy
        else [self.next_internal_state, self.sampled_actions]
    )

    # Same insertion order as before: internal state, then actions, then observations.
    feeds = {self.internal_state: internal_state}
    feeds.update(zip(self.single_action, single_action))
    feeds.update(zip(self.single_observation, single_observation))

    return sess.run(fetches, feed_dict=feeds)

2. sample_single_episode --> requires global variables internal_state, last_act
* internal_state allows to not reset the environment after one episode got collected

In [9]:
def sample_single_episode(episode_length):
    """Collect one rollout of at most ``episode_length`` loop iterations.

    The rollout state is kept in the module globals LAST_OBS, LAST_ACT and
    LAST_PAD, which are read at the start and overwritten on every iteration
    (INTERNAL_STATE is declared global but not used here).

    Points worth knowing when reading the code:
      * ``env.reset()`` is called, but its return value is discarded; the
        first recorded observation is the LAST_OBS left by the previous call.
      * Each iteration calls ``env.step`` twice on the same environment with
        the same action and packs the two results as a "batch" of two.
      * The loop ends as soon as either of the two done flags is true, or
        after ``episode_length`` iterations.

    Returns:
        (all_obs, all_act, rewards, all_pad) where, for ``n`` iterations,
        ``all_obs`` has ``n + 1`` entries (the final observation is appended),
        ``all_act`` has ``n + 1`` entries (it starts with the previous
        LAST_ACT), and ``rewards`` / ``all_pad`` have ``n`` entries each.
    """
    
    global INTERNAL_STATE, LAST_OBS, LAST_ACT, LAST_PAD
    env.reset()
    
    # TODO: Figure out how the initial state is generated by the recurrent neural network
    """
    next_internal_state, sampled_actions = self.model.sample_step(
          sess, self.last_obs, self.internal_state, self.last_act,
          greedy=greedy)
    """
    #initial_state = INTERNAL_STATE
    all_obs = [] 
    all_act = [LAST_ACT]
    all_pad = []
    rewards = [] 
    done = [False, False]
    
    step = 0
    while not done[0] and not done[1]:
        
        sampled_action = sample_step(LAST_OBS)
        # Convert action to gym --> (see env_spec.convert_action_to_gym)
        #env_actions = sampled_action[0]
        
        # Two consecutive steps of the same env with the same action.
        next_obs, reward, next_dones, tt1 = env.step(sampled_action)
        next_obs = next_obs.tolist()
        next_obs1, reward1, next_dones1, tt2 = env.step(sampled_action)
        next_obs1 = next_obs1.tolist()
        
        outputs = [[next_obs, reward, next_dones, tt1], [next_obs1, reward1, next_dones1, tt2]]
        # Convert observations to list
        
        
        # Transpose: each name now holds a 2-tuple, one entry per step result.
        next_obs, reward, next_dones, tt = zip(*outputs)     
        done = next_dones
        
        # Record the state *before* this step together with the action taken.
        all_obs.append(LAST_OBS)
        all_act.append([sampled_action.tolist(), sampled_action.tolist()])
        all_pad.append(LAST_PAD)
        rewards.append(reward)
        
        # Carry the new state over to the next iteration / next call.
        LAST_OBS = next_obs
        LAST_ACT = [sampled_action.tolist(), sampled_action.tolist()]
        LAST_PAD = np.array(next_dones).astype('float32')
        
        step +=1
        # Required for unsolved environments like Pendulum
        # Done escape will trigger if the environment needs to reset at some time e.g. Cart-Pole
        if episode_length <= step:
            break
            
    # append final observation
    all_obs.append(LAST_OBS)
    
    return  all_obs, all_act, rewards, all_pad

In [10]:
# Collect one rollout capped at 16 steps and inspect the shapes of its parts.
observations, actions, rewards, pads = sample_single_episode(16)
# Both batch entries are flagged as not terminated.
terminated = np.array([False, False])
print(np.shape(observations))
print(np.shape(actions))
print(np.shape(rewards))
print(np.shape(pads))

(17,)
(17, 2, 1)
(16, 2)
(16, 2)


3. sample_episodes
* Output is a list of dimension (episodes, 4)
* With 4 = observations, actions, rewards, terminated indicator

In [11]:
def sample_episodes(max_steps_in_env, episode_length):
    """Collect rollouts until at least ``max_steps_in_env`` unpadded steps exist.

    Each rollout comes from ``sample_single_episode(episode_length)``, is
    split into per-episode records by ``convert_from_batched_episodes`` (with
    both batch entries flagged as not terminated) and appended to the result.
    The step counter grows by the number of non-padded entries of the rollout.

    Returns:
        The list of per-episode records gathered so far.
    """
    collected = []
    steps_so_far = 0

    while steps_so_far < max_steps_in_env:
        obs_seq, act_seq, reward_seq, pad_seq = sample_single_episode(episode_length)
        not_terminated = np.array([False, False])

        collected += convert_from_batched_episodes(
            obs_seq, act_seq, reward_seq, not_terminated, pad_seq)

        # Count only the steps that are not marked as padding.
        steps_so_far += np.sum(1 - np.array(pad_seq))

    return collected

In [15]:
episodes = sample_episodes(max_steps_in_env, episode_length)

In [18]:
print(np.shape(episodes))
episodes[0]

(16, 4)


[[array([ 0.20545786, -0.97866596,  3.64683662]),
  array([ 0.46616799, -0.88469622, -0.73742387]),
  array([ 0.28114144, -0.95966634, -2.30974903]),
  array([-0.04467913, -0.99900139, -3.63462885]),
  array([-0.48737655, -0.8731919 , -4.97115319]),
  array([-0.88093592, -0.47323557, -5.81918304]),
  array([-0.99503088,  0.09956684, -5.94598188]),
  array([-0.78630792,  0.61783481, -5.56286559]),
  array([-0.4027277 ,  0.91531983, -4.62252086]),
  array([-0.05963747,  0.9982201 , -3.12262975])],
 [array([ 0.2737358]),
  array([-1.9248408]),
  array([ 0.47054199]),
  array([ 0.44838289]),
  array([ 0.46773599]),
  array([ 1.77499231]),
  array([ 0.7272812]),
  array([-0.5619684]),
  array([-0.25187218]),
  array([ 0.79052478]),
  array([-1.75909811])],
 array([ -1.1081043 ,  -1.65538253,  -2.93138487,  -5.21727999,
         -8.64856242, -12.15809796, -10.90376566,  -7.59973161,
         -4.77168498,  -2.815184  ]),
 False]

In [16]:
# Re-pack the per-episode list into batched arrays and inspect their shapes.
observations, actions, rewards, terminated, pads = convert_to_batched_episodes(episodes)
print(np.shape(observations))
print(np.shape(actions))
print(np.shape(rewards))
print(np.shape(terminated))
print(np.shape(pads))

(10, 16, 3)
(11, 16, 1)
(10, 16)
(16,)
(10, 16)


Training is then called with the output of the convert_to_batched_episodes function  
* train(sess, observations, initial_state, actions, rewards, terminated, pads)
* This function then calls model.train_step(sess, observations, initial_state, actions, rewards, terminated, pads, avg_episode_reward=np.mean(self.episode_rewards))

In [None]:
# Excerpt of the model's train step (relies on `self` and the batch variables
# from the surrounding method; not runnable as a stand-alone cell).
# Maps every training placeholder to the corresponding batch input.
feed_dict = {self.internal_state: internal_state,
             self.rewards: rewards,
             self.terminated: terminated,
             self.pads: pads,
             self.avg_episode_reward: avg_episode_reward,
             self.actions: actions,  # the comma was missing here (SyntaxError)
             self.observations: observations}

# Tensors / ops requested for one training step.
outputs = [self.raw_loss, self.gradient_ops, self.summary]

All TensorFlow objects get created by the call to objective.get, which computes the loss function. This function takes the following elements:
* rewards
* pads
* values
* final_values
* log_probs
* prev_log_probs --> Used by TRPO
* target_log_probs --> Only used by Trust-PCL

The loss is calculated and then passed on to the training_ops
* gradient_ops = self.training_ops(loss, learning_rate=self.learning_rate)
* gradient_ops is the apply_gradients op; running it updates the model
* Needs:
* params = tf.trainable_variables()
* grads = tf.gradients(loss, params)
* loss --> list of tensors to be differentiated
* params --> list of tensors to be used for differentiation
* Brings objective and model specification together

In [843]:
 def training_ops(self, loss, learning_rate=None):
    """Build the op that applies the gradients of ``loss`` to all trainable variables.

    The optimizer comes from ``self.get_optimizer(learning_rate)``. When
    ``self.clip_norm`` is truthy the gradients are clipped by global norm and
    that norm is recorded as the 'grad_global_norm' scalar summary.

    Returns:
        The result of the optimizer's ``apply_gradients`` call.
    """
    optimizer = self.get_optimizer(learning_rate)
    variables = tf.trainable_variables()
    gradients = tf.gradients(loss, variables)

    if self.clip_norm:
        gradients, norm_before_clip = tf.clip_by_global_norm(gradients, self.clip_norm)
        tf.summary.scalar('grad_global_norm', norm_before_clip)

    grads_and_vars = zip(gradients, variables)
    return optimizer.apply_gradients(grads_and_vars)

So far we used the following as our input to train_step:
* rewards
* pads 


Now we evaluate where the following come from:
* values --> Input observations, actions, 
* log_probs --> Input observations, internal_state, actions

In [None]:
# Get log_probs
# Excerpt from the model code (relies on `self`): policy.multi_step is called with the
# observations, the internal state and the actions; its five results are stored on self.
(self.policy_internal_states, self.logits, self.log_probs, self.entropies, self.self_kls) = \
                    self.policy.multi_step(self.observations,
                                           self.internal_state,
                                           self.actions)

In [None]:
# Get values
# Excerpt from the model code (relies on `self`): baseline.get_values is called with the
# observations and actions plus self.policy_internal_states and self.logits; its three
# results are stored on self.
(self.values, self.regression_input, self.regression_weight) = \
                    self.baseline.get_values(
                            self.observations,
                            self.actions,
                            self.policy_internal_states, 
                            self.logits)

Help function to bundle observations, actions, rewards and pads into one object

In [13]:
def convert_from_batched_episodes(observations, actions, rewards, terminated, pads):
    """Convert time-major batch of episodes to batch-major list of episodes.

    Args:
        observations: sequence over time steps. Either one observation vector
            per step (single episode) or, per step, one observation per
            episode in the batch.
        actions: sequence over time steps, laid out like ``observations``.
        rewards: array-like of shape (steps,) or (steps, num_episodes).
        terminated: termination flag(s). Indexed per episode in the batched
            case and passed through unchanged in the single-episode case.
        pads: array-like shaped like ``rewards``; 1 marks a padded step and
            0 a real step.

    Returns:
        A list with one ``[ep_obs, ep_act, ep_rewards, terminated]`` entry per
        episode. With ``length`` the number of unpadded steps of that episode,
        ``ep_obs`` holds the first ``length`` observations, ``ep_act`` the
        first ``length + 1`` actions and ``ep_rewards`` the first ``length``
        rewards.
    """
    rewards = np.array(rewards)
    if rewards.ndim == 1:
        rewards = np.reshape(rewards, [rewards.shape[0], 1])

    pads = np.array(pads)
    if pads.ndim == 1:
        # Bug fix: the reshaped pads used to be assigned to `rewards`, which
        # overwrote the rewards with the pad mask and left `pads` 1-D.
        pads = np.reshape(pads, [pads.shape[0], 1])

    observations = [np.array(obs) for obs in observations]
    actions = [np.array(act) for act in actions]

    # Number of real (unpadded) steps per episode; always at least 1-D so it
    # can be indexed by episode below.
    total_length = np.atleast_1d(np.sum(1 - pads, axis=0).astype('int32'))

    episodes = []

    # Single episode: every step holds one observation vector, so the stacked
    # observations are rank 2.
    if len(np.shape(observations)) == 2:
        length = total_length[0]
        ep_obs = observations[:length]
        ep_act = actions[:length + 1]
        ep_rewards = rewards[:length, 0]
        episodes.append([ep_obs, ep_act, ep_rewards, terminated])
        return episodes

    num_episodes = rewards.shape[1]
    for i in range(num_episodes):
        length = total_length[i]
        ep_obs = [obs[i] for obs in observations][:length]
        ep_act = [act[i] for act in actions][:length + 1]
        ep_rewards = rewards[:length, i]

        episodes.append([ep_obs, ep_act, ep_rewards, terminated[i]])

    return episodes

Helper function to convert a batch-major list of episodes to a time-major batch of episodes

In [14]:
def convert_to_batched_episodes(episodes, max_length=None):
    """Convert batch-major list of episodes to time-major batch of episodes.

    Each episode is a ``[observations, actions, rewards, terminated]`` record.
    Its length is the number of rewards. ``max_length`` defaults to the
    longest episode. Rewards are resized to ``max_length`` and zeroed where
    the pad mask is 1. Observations and actions are only stacked, not padded.

    Returns:
        ``(observations, actions, rewards, terminated, pads)`` where the array
        outputs have time as their first axis and ``terminated`` is a tuple
        with one flag per episode.
    """
    step_counts = [len(episode[-2]) for episode in episodes]
    max_length = max_length or max(step_counts)

    def _pack(episode, n_steps):
        # Stack one episode and build its pad mask / masked rewards.
        ep_obs, ep_act, ep_rewards, ep_terminated = episode
        stacked_obs = np.vstack(ep_obs)
        stacked_act = np.vstack(ep_act)
        mask = np.array([0] * n_steps + [1] * (max_length - n_steps))
        masked_rewards = np.resize(ep_rewards, [max_length]) * (1 - mask)
        return [stacked_obs, stacked_act, masked_rewards, ep_terminated, mask]

    packed = [_pack(episode, n_steps)
              for episode, n_steps in zip(episodes, step_counts)]
    obs_cols, act_cols, reward_cols, terminated, pad_cols = zip(*packed)

    # Move the time axis to the front.
    return (np.swapaxes(obs_cols, 0, 1),
            np.swapaxes(act_cols, 0, 1),
            np.transpose(reward_cols),
            terminated,
            np.transpose(pad_cols))

Tensorflow code to evaluate rolling discount on rewards

In [809]:
# NOTE(review): the window length 5, the batch width 16 and the discount 0.9 are
# hard-coded here instead of using ROLLOUT / GAMMA — confirm this is intentional.
# Prepend 5 - 1 rows of zeros (16 columns each) to the rewards along axis 0.
rewards_1 = tf.concat([tf.zeros([5 - 1, 16]), rewards_episodic], 0)

# Filter holding the powers 0.9**0 .. 0.9**4, reshaped to (5, 1, 1) for conv1d.
discount_filter = tf.reshape(0.9 ** tf.range(float(5)), [-1, 1, 1])
# Append another 5 - 1 rows of zeros after the rewards.
expanded_values = tf.concat([rewards_1, tf.zeros([5 - 1, tf.shape(rewards_1)[1]])], 0)
# Transpose and add a trailing axis of size 1, as required by conv1d.
expanded_dims = tf.expand_dims(tf.transpose(expanded_values), -1)
# Slide the discount filter along each row with stride 1 and no implicit padding.
conv1 = tf.nn.conv1d(expanded_dims, discount_filter, stride=1, padding='VALID')
# Drop the trailing axis again and transpose back.
conv1_sq = tf.squeeze(conv1, -1)
conv_values = tf.transpose(conv1_sq)

In [810]:
# Session used to evaluate the discount-convolution graph.
SESS = tf.Session()
# Feeds whatever `rewards` is currently bound to in the kernel.
feed_dict = {rewards_episodic: rewards}
# NOTE(review): the results are assigned back to conv1_sq, expanded_dims and conv1, which
# rebinds those names from graph tensors to evaluated values. Re-running this cell without
# re-running the graph-building cell first will no longer fetch the tensors.
conv1_sq, expanded_dims, conv1, filter_, catched_discounted_sum, expanded_ = SESS.run([conv1_sq, expanded_dims, conv1, discount_filter, conv_values, expanded_values], feed_dict)