In [11]:
import tensorflow as tf
import numpy as np
import random
import os
import pickle

import controller
import model
import policy
import baseline
from objective import PCL
import optimizers
import replay_buffer
import expert_paths
#import gym_wrapper
import env_spec

import gym
import numpy as np
import random

In [2]:
def get_env(env_str):
  return gym.make(env_str)


class GymWrapper(object):

  def __init__(self, env_str, distinct=1, count=1, seeds=None):
    self.distinct = distinct
    self.count = count
    self.total = self.distinct * self.count
    self.seeds = seeds or [random.randint(0, 1e12)
                           for _ in range(self.distinct)]

    self.envs = []
    for seed in self.seeds:
      for _ in range(self.count):
        env = get_env(env_str)
        env.seed(seed)
        if hasattr(env, 'last'):
          env.last = 100  # for algorithmic envs
        self.envs.append(env)

    self.dones = [True] * self.total
    self.num_episodes_played = 0

    one_env = self.get_one()
    self.use_action_list = hasattr(one_env.action_space, 'spaces')
    self.env_spec = env_spec.EnvSpec(self.get_one())

  def get_seeds(self):
    return self.seeds

  def reset(self):
    self.dones = [False] * self.total
    self.num_episodes_played += len(self.envs)

    # reset seeds to be synchronized
    self.seeds = [random.randint(0, 1e12) for _ in range(self.distinct)]
    counter = 0
    for seed in self.seeds:
      for _ in range(self.count):
        self.envs[counter].seed(seed)
        counter += 1

    return [self.env_spec.convert_obs_to_list(env.reset())
            for env in self.envs]

  def reset_if(self, predicate=None):
    if predicate is None:
      predicate = self.dones
    if self.count != 1:
      assert np.all(predicate)
      return self.reset()
    self.num_episodes_played += sum(predicate)
    output = [self.env_spec.convert_obs_to_list(env.reset())
              if pred else None
              for env, pred in zip(self.envs, predicate)]
    for i, pred in enumerate(predicate):
      if pred:
        self.dones[i] = False
    return output

  def all_done(self):
    return all(self.dones)

  def step(self, actions):

    def env_step(action):
      action = self.env_spec.convert_action_to_gym(action)
      obs, reward, done, tt = env.step(action)
      obs = self.env_spec.convert_obs_to_list(obs)
      return obs, reward, done, tt

    actions = zip(*actions)
    outputs = [env_step(action)
               if not done else (self.env_spec.initial_obs(None), 0, True, None)
               for action, env, done in zip(actions, self.envs, self.dones)]
    for i, (_, _, done, _) in enumerate(outputs):
      self.dones[i] = self.dones[i] or done
    obs, reward, done, tt = zip(*outputs)
    obs = [list(oo) for oo in zip(*obs)]
    return [obs, reward, done, tt]

  def get_one(self):
    return random.choice(self.envs)

  def __len__(self):
    return len(self.envs)

In [3]:
batch_size = 1 # FLAGS.batch_size
replay_batch_size = 25 # FLAGS.replay_batch_size
num_samples = 1 # FLAGS.num_samples # number of samples from each random seed initialization
env_str = 'HalfCheetah-v1' # FLAGS.env
env = GymWrapper(env_str, distinct=1 // 1, count=1)
env_spec = env_spec.EnvSpec(env.get_one())
max_step = 100 # FLAGS.max_step
cutoff_agent = 1000 # FLAGS.cutoff_agent
num_steps = 100000 # FLAGS.num_steps
validation_frequency = 50 # FLAGS.validation_frequency
target_network_lag = 0.99 # FLAGS.target_network_lag
sample_from = 'target' # FLAGS.sample_from
critic_weight = 0.0 # FLAGS.critic_weight
objective = 'pcl' # FLAGS.objective
trust_region_p = False # FLAGS.trust_region_p
value_opt = 'grad' # FLAGS.value_opt
max_divergence = 0.001 # FLAGS.max_divergence
learning_rate = 0.002 # FLAGS.learning_rate
clip_norm = 40 # FLAGS.clip_norm
clip_adv = 1.0 # FLAGS.clip_adv
tau = 0.0 # FLAGS.tau
tau_decay = None # FLAGS.tau_decay # decay tau by this much every 100 steps
tau_start = 0.1 # FLAGS.tau_start
eps_lambda = 0.0 # FLAGS.eps_lambda # relative entropy regularizer
update_eps_lambda = True # FLAGS.update_eps_lambda
gamma = 0.995 # FLAGS.gamma
rollout = 10 # FLAGS.rollout
fixed_std = True # FLAGS.fixed_std # fix the std in Gaussian distributions
input_prev_actions = True # FLAGS.input_prev_actions # input previous actions to policy network
recurrent = False # FLAGS.recurrent
input_time_step = False # FLAGS.input_time_step # input time step into value calucations
use_online_batch = False # FLAGS.use_online_batch
batch_by_steps = True # FLAGS.batch_by_steps
unify_episodes = True # FLAGS.unify_episodes
replay_buffer_size = 20000 # FLAGS.replay_buffer_size
replay_buffer_alpha = 0.1 # FLAGS.replay_buffer_alpha
replay_buffer_freq = 1 # FLAGS.replay_buffer_freq
eviction = 'fifo' # FLAGS.eviction
prioritize_by = 'step' # FLAGS.prioritize_by
num_expert_paths = 0 # FLAGS.num_expert_paths
internal_dim = 64 # FLAGS.internal_dim
value_hidden_layers = 2 # FLAGS.value_hidden_layers
tf_seed = 42 # FLAGS.tf_seed # random seed for tensorflow
save_trajectories_dir = None # FLAGS.save_trajectories_dir # directory to save trajectories to, if desired
load_trajectories_dir = None # FLAGS.load_trajectories_dir # file to load expert trajectories from

In [19]:
class Trainer2(object):
    
    def __init__(self, batch_size = 1, # FLAGS.batch_size
                replay_batch_size = 25, # FLAGS.replay_batch_size
                num_samples = 1, # FLAGS.num_samples # number of samples from each random seed initialization
                env_str = 'HalfCheetah-v1', # FLAGS.env
                max_step = 100, # FLAGS.max_step
                cutoff_agent = 1000, # FLAGS.cutoff_agent
                num_steps = 100000, # FLAGS.num_steps
                validation_frequency = 50, # FLAGS.validation_frequency
                target_network_lag = 0.99, # FLAGS.target_network_lag
                sample_from = 'target', # FLAGS.sample_from
                critic_weight = 0.0, # FLAGS.critic_weight
                objective = 'pcl', # FLAGS.objective
                trust_region_p = False, # FLAGS.trust_region_p
                value_opt = 'grad', # FLAGS.value_opt
                max_divergence = 0.001, # FLAGS.max_divergence
                learning_rate = 0.002, # FLAGS.learning_rate
                clip_norm = 40, # FLAGS.clip_norm
                clip_adv = 1.0, # FLAGS.clip_adv
                tau = 0.0, # FLAGS.tau
                tau_decay = None, # FLAGS.tau_decay # decay tau by this much every 100 steps
                tau_start = 0.1, # FLAGS.tau_start
                eps_lambda = 0.0, # FLAGS.eps_lambda # relative entropy regularizer
                update_eps_lambda = True, # FLAGS.update_eps_lambda
                gamma = 0.995, # FLAGS.gamma
                rollout = 10, # FLAGS.rollout
                fixed_std = True, # FLAGS.fixed_std # fix the std in Gaussian distributions
                input_prev_actions = True, # FLAGS.input_prev_actions # input previous actions to policy network
                recurrent = False, # FLAGS.recurrent
                input_time_step = False, # FLAGS.input_time_step # input time step into value calucations
                use_online_batch = False, # FLAGS.use_online_batch
                batch_by_steps = True, # FLAGS.batch_by_steps
                unify_episodes = True, # FLAGS.unify_episodes
                replay_buffer_size = 20000, # FLAGS.replay_buffer_size
                replay_buffer_alpha = 0.1, # FLAGS.replay_buffer_alpha
                replay_buffer_freq = 1, # FLAGS.replay_buffer_freq
                eviction = 'fifo', # FLAGS.eviction
                prioritize_by = 'step', # FLAGS.prioritize_by
                num_expert_paths = 0, # FLAGS.num_expert_paths
                internal_dim = 64, # FLAGS.internal_dim
                value_hidden_layers = 2, # FLAGS.value_hidden_layers
                tf_seed = 42, # FLAGS.tf_seed # random seed for tensorflow
                save_trajectories_dir = None, # FLAGS.save_trajectories_dir # directory to save trajectories to, if desired
                load_trajectories_dir = None # FLAGS.load_trajectories_dir # file to load expert trajectories from):
                ):
        
        self.batch_size = batch_size
        self.replay_batch_size = replay_batch_size
        if self.replay_batch_size is None:
          self.replay_batch_size = self.batch_size
        self.num_samples = num_samples

        self.env_str = env_str
        self.env = GymWrapper(self.env_str,
                              distinct=batch_size // self.num_samples,
                              count=self.num_samples)
        self.env_spec = env_spec.EnvSpec(self.env.get_one())

        self.max_step = max_step
        self.cutoff_agent = cutoff_agent
        self.num_steps = num_steps
        self.validation_frequency = validation_frequency

        self.target_network_lag = target_network_lag
        self.sample_from = sample_from
        assert self.sample_from in ['online', 'target']

        self.critic_weight = critic_weight
        self.objective = objective
        self.trust_region_p = False
        self.value_opt = value_opt
        assert not self.trust_region_p or self.objective in ['pcl', 'trpo']
        assert self.objective != 'trpo' or self.trust_region_p
        assert self.value_opt is None or self.critic_weight == 0.0
        self.max_divergence = max_divergence

        self.learning_rate = learning_rate
        self.clip_norm = clip_norm
        self.clip_adv = clip_adv
        self.tau = tau
        self.tau_decay = tau_decay
        self.tau_start = tau_start
        self.eps_lambda = eps_lambda
        self.update_eps_lambda = update_eps_lambda
        self.gamma = gamma
        self.rollout = rollout
        self.fixed_std = fixed_std
        self.input_prev_actions = input_prev_actions
        self.recurrent = recurrent
        assert not self.trust_region_p or not self.recurrent
        self.input_time_step = input_time_step
        assert not self.input_time_step or (self.cutoff_agent <= self.max_step)

        self.use_online_batch = use_online_batch
        self.batch_by_steps = batch_by_steps
        self.unify_episodes = unify_episodes
        if self.unify_episodes:
          assert self.batch_size == 1

        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer_alpha = replay_buffer_alpha
        self.replay_buffer_freq = replay_buffer_freq
        assert self.replay_buffer_freq in [-1, 0, 1]
        self.eviction = eviction
        self.prioritize_by = prioritize_by
        assert self.prioritize_by in ['rewards', 'step']
        self.num_expert_paths = num_expert_paths

        self.internal_dim = internal_dim
        self.value_hidden_layers = value_hidden_layers
        self.tf_seed = tf_seed
        self.global_step = tf.train.get_or_create_global_step()
        #self.save_trajectories_dir = (
        #    save_trajectories_dir or save_dir)
        #self.save_trajectories_file = (
        #    os.path.join(
        #        self.save_trajectories_dir, self.env_str.replace('-', '_'))
        #    if self.save_trajectories_dir else None)
        #self.load_trajectories_file = load_trajectories_file
        
    def get_objective(self):
        tau = self.tau
        if self.tau_decay is not None:
            assert self.tau_start >= self.tau
            tau = tf.maximum(
                  tf.train.exponential_decay(
                  self.tau_start, self.global_step, 100, self.tau_decay),
                  self.tau)

        if self.objective in ['pcl', 'a3c', 'trpo', 'upcl']:
            cls = (objective.PCL if self.objective in ['pcl', 'upcl'] else
                 objective.TRPO if self.objective == 'trpo' else
                 objective.ActorCritic)
        policy_weight = 1.0

        return cls(self.learning_rate,
                 clip_norm=self.clip_norm,
                 policy_weight=policy_weight,
                 critic_weight=self.critic_weight,
                 tau=tau, gamma=self.gamma, rollout=self.rollout,
                 eps_lambda=self.eps_lambda, clip_adv=self.clip_adv)


    def get_policy(self):
        if self.recurrent:
            cls = policy.Policy
        else:
            cls = policy.MLPPolicy
        return cls(self.env_spec, self.internal_dim,
                   fixed_std=self.fixed_std,
                   recurrent=self.recurrent,
                   input_prev_actions=self.input_prev_actions)

    def get_baseline(self):
        cls = (baseline.UnifiedBaseline if self.objective == 'upcl' else
               baseline.Baseline)
        return cls(self.env_spec, self.internal_dim,
                   input_prev_actions=self.input_prev_actions,
                   input_time_step=self.input_time_step,
                   input_policy_state=self.recurrent,  # may want to change this
                   n_hidden_layers=self.value_hidden_layers,
                   hidden_dim=self.internal_dim,
                   tau=self.tau)

    def get_trust_region_p_opt(self):
          return None

    def get_value_opt(self):
        if self.value_opt == 'grad':
            return optimizers.GradOptimization(
              learning_rate=self.learning_rate, max_iter=5, mix_frac=0.05)
        elif self.value_opt == 'lbfgs':
            return optimizers.LbfgsOptimization(max_iter=25, mix_frac=0.1)
        elif self.value_opt == 'best_fit':
            return optimizers.BestFitOptimization(mix_frac=1.0)
        else:
            return None
    
    def get_model(self):
        cls = model.Model
        return cls(self.env_spec, self.global_step,
                   target_network_lag=self.target_network_lag,
                   sample_from=self.sample_from,
                   get_policy=self.get_policy,
                   get_baseline=self.get_baseline,
                   get_objective=self.get_objective,
                   get_trust_region_p_opt=self.get_trust_region_p_opt,
                   get_value_opt=self.get_value_opt)

In [20]:
trainer = Trainer2()

In [21]:
model_PCL = trainer.get_model()

AttributeError: 'Trainer2' object has no attribute 'global_step'

In [15]:
env = GymWrapper('HalfCheetah-v1',
                 distinct=1 // 1,
                 count=1)


Get objective of PCL

In [37]:
policy_weight = 1.0
objective_PCL = PCL(learning_rate,
                    clip_norm=clip_norm,
                    policy_weight=policy_weight,
                    critic_weight=critic_weight,
                    tau=tau, gamma=gamma, rollout=rollout,
                    eps_lambda=eps_lambda, clip_adv=clip_adv)

Get Policy setup of PCL

In [40]:
cls = policy.MLPPolicy
policy_PCL = cls(env_spec, internal_dim,
                 fixed_std=fixed_std,
                 recurrent=recurrent,
                 input_prev_actions=input_prev_actions)

Get baseline/value Function

In [45]:
cls = (baseline.UnifiedBaseline if objective == 'upcl' else baseline.Baseline)
baseline_PCL = cls(env_spec, internal_dim,
                   input_prev_actions=input_prev_actions,
                   input_time_step=input_time_step,
                   input_policy_state=recurrent,  # may want to change this
                   n_hidden_layers=value_hidden_layers,
                   hidden_dim=internal_dim,
                   tau=tau)

Get value function optimizer

In [50]:
if value_opt == 'grad':
  optimizer_PCL =  optimizers.GradOptimization(learning_rate=learning_rate, max_iter=5, mix_frac=0.05)
elif value_opt == 'lbfgs':
  optimizer_PCL = optimizers.LbfgsOptimization(max_iter=25, mix_frac=0.1)
elif value_opt == 'best_fit':
  optimizer_PCL = optimizers.BestFitOptimization(mix_frac=1.0)


Get model object that is responsible to set up all required optimization
ops, including gradient ops, trust region ops, and value optimizers  

In [55]:
global_step = tf.train.get_or_create_global_step()
cls = model.Model
model_PCL = cls(env_spec, global_step,
               target_network_lag=target_network_lag,
               sample_from=sample_from,
               get_policy= policy_PCL,
               get_baseline=baseline_PCL,
               get_objective=objective_PCL,
               get_trust_region_p_opt=None,
               get_value_opt=optimizer_PCL)

TypeError: 'MLPPolicy' object is not callable

<tf.Variable 'global_step:0' shape=() dtype=int64_ref>