In [1]:
import numpy as np 
import time
import random
import mujoco_py
import gym
import os

from mjremote import mjremote
from mujoco_py import MjSim

from gym_hand_sim.envs import mpl_thumb_grasp_env

import tensorflow as tf 
# Ask TensorFlow to grow GPU memory on demand. The setting is applied to every
# detected GPU (TensorFlow requires memory growth to be the same on all GPUs),
# and a machine without a GPU simply skips the loop instead of failing with
# IndexError on physical_devices[0].
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
import tf_agents
from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment, batched_py_environment
from tf_agents.networks.actor_distribution_rnn_network import ActorDistributionRnnNetwork
from tf_agents.networks.value_rnn_network import ValueRnnNetwork
from tf_agents.agents.ppo import ppo_clip_agent
from tf_agents.utils import common

# Presumably the per-episode step limit.
# NOTE(review): not read by any visible cell — confirm it is still needed.
MAX_EPISODE_STEPS = 50

# Notebook-level hyperparameters; the `# @param` annotations follow the Colab
# form-field convention.
# NOTE(review): among the visible cells only `learning_rate` is read at notebook
# scope (by the DQN scratch cell). train_eval_mpl() declares its own
# `learning_rate`, `eval_interval` and `log_interval` parameters, which shadow
# these globals inside that function; the remaining names are not read anywhere
# in the visible cells.
num_iterations = 20000 # @param {type:"integer"}
initial_collect_steps = 100  # @param {type:"integer"} 
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}
batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}
num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}

# Switch TF-Agents' multiprocessing support to interactive mode for this session.
# NOTE(review): no visible cell uses TF-Agents multiprocessing — confirm this
# call is still required.
tf_agents.system.multiprocessing.enable_interactive_mode()



running build_ext


In [50]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Average the summed episode reward obtained by `policy` in `environment`.

    Args:
        environment: Object exposing `reset()` and `step(action)`; both must
            return a time step that provides `is_last()` and `reward`.
        policy: Object whose `action(time_step)` result carries an `.action`
            attribute that is forwarded to `environment.step`.
        num_episodes: Number of episodes to roll out.

    Returns:
        Element 0 of `.numpy()` called on the mean of the per-episode reward
        sums.
    """
    def run_episode():
        # Roll out a single episode and hand back its accumulated reward.
        step = environment.reset()
        reward_sum = 0.0
        while True:
            if step.is_last():
                return reward_sum
            chosen = policy.action(step)
            step = environment.step(chosen.action)
            reward_sum += step.reward

    returns_sum = 0.0
    for _ in range(num_episodes):
        returns_sum += run_episode()

    return (returns_sum / num_episodes).numpy()[0]

def create_nets(observation_spec, action_spec):
    """Construct the recurrent actor and value networks.

    Both networks receive the same preprocessing objects: one `Normalization`
    layer per observation key ('achieved_goal', 'desired_goal', 'observation')
    and a `Concatenate(axis=-1)` combiner. They also share the input
    fully-connected sizes, LSTM size and ReLU activation; only
    `output_fc_layer_params` differs.

    Args:
        observation_spec: Observation spec handed to both networks.
        action_spec: Action spec handed to the actor network.

    Returns:
        Tuple `(policy_net, value_net)`: an `ActorDistributionRnnNetwork` and a
        `ValueRnnNetwork`.
    """
    observation_keys = ('achieved_goal', 'desired_goal', 'observation')

    # Constructor arguments common to the actor and the critic. The very same
    # layer instances are passed to both constructors.
    shared_kwargs = dict(
        conv_layer_params=None,
        preprocessing_layers={
            key: tf.keras.layers.experimental.preprocessing.Normalization()
            for key in observation_keys
        },
        preprocessing_combiner=tf.keras.layers.Concatenate(axis=-1),
        input_fc_layer_params=(256,),
        lstm_size=(128,),
        activation_fn=tf.keras.activations.relu,
    )

    actor = ActorDistributionRnnNetwork(
        observation_spec,
        action_spec,
        output_fc_layer_params=(4,),
        **shared_kwargs)

    critic = ValueRnnNetwork(
        observation_spec,
        output_fc_layer_params=(128,),
        **shared_kwargs)

    return actor, critic

from tf_agents.distributions.utils import DistributionSpecV2
from tf_agents.utils import nest_utils
def assert_specs_are_compatible(
    network_output_spec,
    spec,
    message_prefix: str):
    """Validate a network's output spec against a target spec.

    Every `DistributionSpecV2` found in `network_output_spec` is replaced by its
    `event_spec`. The resulting nest must have the same structure as `spec`, and
    each of its leaves must report `is_compatible_with` the matching leaf of
    `spec`.

    Args:
        network_output_spec: The output of `network.create_variables`.
        spec: The spec to match against.
        message_prefix: Text placed at the start of the error message when the
            specs don't match.

    Raises:
        ValueError: If the specs don't match.
    """
    event_spec = tf.nest.map_structure(
        lambda item: item.event_spec if isinstance(item, DistributionSpecV2) else item,
        network_output_spec)

    # The structure check and the per-leaf check report the same text.
    mismatch_text = "{}:\n{}\nvs.\n{}".format(message_prefix, event_spec, spec)

    nest_utils.assert_same_structure(event_spec, spec, message=mismatch_text)

    def check_leaf(produced, expected):
        if produced.is_compatible_with(expected):
            return
        raise ValueError(mismatch_text)

    tf.nest.map_structure(check_leaf, event_spec, spec)

def train_eval_mpl(
    n_env_steps = 3e7,
    collect_episode_per_iteration = 32,
    num_parallel_environments = 8,
    replay_buffer_capacity = 301,
    n_epochs = 25,
    learning_rate = 4e-4,
    eval_interval = 500,
    log_interval = 50):
    """Set up environments, networks and a PPO-Clip agent for MplThumbGraspBall-v0.

    Only the setup stage exists so far: the function builds the training and
    evaluation environments, creates the actor/value networks, checks the actor
    output spec against the action spec, and initializes the agent. There is no
    collection, training or evaluation loop yet, and nothing is returned.

    Args:
        n_env_steps: Not used yet.
        collect_episode_per_iteration: Not used yet.
        num_parallel_environments: Number of independent environment instances
            batched together for training.
        replay_buffer_capacity: Not used yet.
        n_epochs: Passed to `PPOClipAgent` as `num_epochs`.
        learning_rate: Learning rate of the Adam optimizer.
        eval_interval: Not used yet.
        log_interval: Not used yet.

    Raises:
        ValueError: From `assert_specs_are_compatible` when the actor network's
            event spec does not match the environment's action spec. The run
            recorded in this notebook stops here (event shape `()` vs. action
            shape `(4,)`).
    """
    env_name = 'gym_hand_sim:MplThumbGraspBall-v0'

    # Load one independent environment per batch slot. Repeating a single
    # instance (`[env] * n`) would place the SAME object in every slot, so each
    # batched step would advance one shared simulator n times instead of
    # stepping n separate simulations.
    train_py_envs = [
        suite_mujoco.load(env_name) for _ in range(num_parallel_environments)
    ]
    eval_py_env = suite_mujoco.load(env_name)
    train_env = tf_py_environment.TFPyEnvironment(
        batched_py_environment.BatchedPyEnvironment(train_py_envs))
    # Built for a future evaluation loop; not used by the setup code below.
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    policy_net, value_net = create_nets(train_env.observation_spec(), train_env.action_spec())

    # Debug aid: create the actor's variables, print the resulting output spec,
    # and compare it with the action spec before the agent is constructed.
    net_spec = policy_net.create_variables(train_env.time_step_spec().observation)
    print("-----------")
    print(net_spec)
    print(net_spec.event_spec)
    assert_specs_are_compatible(net_spec, train_env.action_spec(),'fail')

    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate, epsilon = 1e-5)

    agent = ppo_clip_agent.PPOClipAgent(
                train_env.time_step_spec(),
                train_env.action_spec(),
                optimizer = optimizer,
                actor_net = policy_net,
                value_net = value_net,
                num_epochs = n_epochs,
                train_step_counter = global_step,
                discount_factor = 0.998,
                gradient_clipping = 0.5,
                entropy_regularization = 1e-2,
                importance_ratio_clipping = 0.2,
                use_gae = True,
                use_td_lambda_return = True)
    agent.initialize()

In [51]:
# Run the setup end to end. In the recorded output below this stops with a
# ValueError raised by the actor/action spec check inside train_eval_mpl().
train_eval_mpl()

c:\users\xieji\repos\gym-hand-sim\gym_hand_sim\envs
BoundedArraySpec(shape=(4,), dtype=dtype('int32'), name='action', minimum=0, maximum=10)
c:\users\xieji\repos\gym-hand-sim\gym_hand_sim\envs
-----------
<DistributionSpecV2: event_shape=(), dtype=<dtype: 'int32'>, parameters=<Params: type=<class 'tensorflow_probability.python.distributions.categorical.Categorical'>, params={'logits': TensorSpec(shape=(4, 11), dtype=tf.float32, name=None)}>>
TensorSpec(shape=(), dtype=tf.int32, name=None)


ValueError: fail:
TensorSpec(shape=(), dtype=tf.int32, name=None)
vs.
BoundedTensorSpec(shape=(4,), dtype=tf.int32, name='action', minimum=array(0), maximum=array(10))

In [None]:


'''
Init policy (actor) network and value (critic) network

for k = 0, 1, 2, ...:
    collect set of trajectories {tau_i} by running policy pi_k = pi(theta_k) in env
    compute rewards-to-go R_t_hat
    compute advantage estimates A_t_hat based on current value function V_phi_k
    update policy (pi(theta_k+1)) via PPO-Clip - stochastic gradient ascent (SGA) w/ Adam
    fit value function (V_phi_k+1) by regression
'''

In [4]:
# NOTE(review): leftover DQN scratch cell. It depends on names that no visible
# cell defines at notebook scope: `train_py_env`, `train_env` and `eval_env`
# exist only as locals of train_eval_mpl(), and `q_network`, `dqn_agent` and
# `random_tf_policy` are never imported. The recorded output of this cell is a
# NameError.
time_step = train_py_env.reset()

# Passed to QNetwork as `fc_layer_params`; the original author left a "??"
# marker here, so the values may be placeholders.
fc_layer_params = (100,100)

q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

# Uses the notebook-level `learning_rate` from the configuration cell.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()
# NOTE(review): `eval_polivy` looks like a typo for `eval_policy`; no visible
# cell reads either name.
eval_polivy = agent.policy
collect_policy = agent.collect_policy

random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# Average return of the random policy over the default 10 episodes.
compute_avg_return(eval_env, random_policy)
# The string literal below holds a disabled manual step/render loop. It opens
# with four quote characters, so the string's text starts with a stray `"`.
""""
while True:
    next_time_step = train_py_env.step(np.array([1.,1.,1.,1.]))
    train_py_env.render()
    print(next_time_step)"""

NameError: name 'agent' is not defined

NameError: name 'eval_env' is not defined