In [1]:
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
import numpy as np
import ray
from ray.rllib import agents
from tqdm.notebook import tqdm
import random
from ray.rllib.policy.policy import Policy
from gym.spaces import Discrete, Box
from ray.rllib.agents.ppo import PPOTrainer
from functools import partial
from ray.tune.registry import register_env, _global_registry, ENV_CREATOR
from ray.tune.logger import pretty_print
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.models import ModelCatalog


import ray
from ray import tune
from ray.tune import track

import math
import gym

from gym_compete_to_rllib import GymCompeteToRLLibAdapter

from ray.rllib.models.tf.tf_modelv2 import TFModelV2

from load_gym_compete_policy import get_policy_value_nets
from ray.tune.registry import register_env

from ray.rllib.models.tf.tf_action_dist import DiagGaussian 
import datetime, uuid

In [2]:
class KerasModelModel(TFModelV2):
    """RLlib ``TFModelV2`` backed by separate Keras policy and value networks.

    Args:
        *args: Positional arguments forwarded unchanged to ``TFModelV2.__init__``.
        policy_net: Keras model called on the observation batch; its output is
            returned from ``forward`` as the model output.
        value_net: Keras model called on the observation batch; its flattened
            output is what ``value_function`` returns.
        **kwargs: Keyword arguments forwarded unchanged to ``TFModelV2.__init__``.

    Raises:
        ValueError: If ``policy_net`` or ``value_net`` is not supplied.
    """

    def __init__(self, *args, policy_net=None, value_net=None, **kwargs):
        # Fail early with a readable message instead of an AttributeError on None.
        if policy_net is None or value_net is None:
            raise ValueError("KerasModelModel requires both policy_net and value_net")
        super(KerasModelModel, self).__init__(*args, **kwargs)
        self.policy_net = policy_net
        self.value_net = value_net
        # Register the variables of both networks with this model.
        self.register_variables(policy_net.variables + value_net.variables)

    def forward(self, input_dict, state, seq_lens):
        """Run both networks on ``input_dict["obs"]``.

        Returns:
            ``(policy_net(obs), state)``; ``state`` is passed through untouched
            and ``seq_lens`` is not used.
        """
        obs = input_dict["obs"]
        model_out = self.policy_net(obs)
        # Flatten to one value per observation. Indexing the output with [0]
        # kept only the first row, which is only right for a batch of one.
        # Assumes value_net emits one scalar per observation (e.g. shape
        # (batch, 1)) -- TODO confirm against get_policy_value_nets.
        self._value_out = tf.reshape(self.value_net(obs), [-1])
        return model_out, state

    def value_function(self):
        """Return the value estimates cached by the most recent ``forward`` call."""
        return self._value_out
    
class GymCompetePretrainedModel(KerasModelModel):
    """``KerasModelModel`` whose networks come from ``get_policy_value_nets``.

    The environment name and agent id are read from the ``custom_model_config``
    entry of the model config. The model config is taken from the
    ``model_config`` keyword argument when present, otherwise from the fourth
    positional argument.
    """
    def __init__(self, *args, **kwargs):
        model_config = kwargs['model_config'] if 'model_config' in kwargs else args[3]
        custom_config = model_config['custom_model_config']
        nets = get_policy_value_nets(custom_config['env_name'], custom_config['agent_id'])
        super(GymCompetePretrainedModel, self).__init__(
            *args,
            policy_net=nets['policy_mean_logstd_flat'],
            value_net=nets['value'],
            **kwargs)


In [3]:
# Preview of a "<YYYYmmdd-HHMMSS>-<uuid1>" tag.
f"{datetime.datetime.now():%Y%m%d-%H%M%S}-{uuid.uuid1()}"

'20200717-030329-5395ac7e-c7c9-11ea-bad7-00155d2cc007'

In [4]:
# Make the custom model and the action distribution addressable by name.
ModelCatalog.register_custom_model(
    "GymCompetePretrainedModel",
    GymCompetePretrainedModel,
)
ModelCatalog.register_custom_action_dist(
    "DiagGaussian",
    DiagGaussian,
)

In [5]:
def gym_compete_env_with_video(env_name, directory=None):
    """Build a gym_compete environment that records annotated videos via aprl.

    Args:
        env_name: Gym environment id, passed to ``gym.make`` and to
            ``AnnotatedGymCompete``.
        directory: Output directory handed to ``VideoWrapper``. When ``None``,
            a name of the form ``video-<YYYYmmdd-HHMMSS>-<uuid1>`` is generated.

    Returns:
        The ``VideoWrapper`` around the annotated environment.

    Side effects:
        Replaces ``sys.modules['stable_baselines']`` and ``tf.Session`` with
        ``Mock`` objects. Nothing restores them, so the replacement lasts for
        the rest of the process.
    """
    import sys
    from unittest.mock import Mock
    import tensorflow as tf

    # Best-effort first import of the aprl modules before the mocks exist. A
    # failure here is ignored because the same imports are repeated below,
    # after mocking. Catch Exception (not everything) so that KeyboardInterrupt
    # and SystemExit still propagate.
    # NOTE(review): presumably this warm-up is needed for the second import to
    # succeed -- confirm whether it can be dropped.
    try:
        from aprl.envs.wrappers import VideoWrapper
        from aprl.visualize.annotated_gym_compete import AnnotatedGymCompete
        from aprl.score_agent import default_score_config
    except Exception:
        pass

    # hacks to make it work with tf2
    sys.modules['stable_baselines'] = Mock()
    tf.Session = Mock()

    if directory is None:
        directory = 'video-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '-' + str(uuid.uuid1())

    from aprl.envs.wrappers import VideoWrapper
    from aprl.visualize.annotated_gym_compete import AnnotatedGymCompete
    from aprl.score_agent import default_score_config

    config = default_score_config()
    # All annotation settings live under the same nested key; look it up once.
    annotation_params = config['video_params']['annotation_params']

    env = gym.make(env_name)
    env = AnnotatedGymCompete(
        env=env,
        env_name=env_name,
        agent_a_type=config['agent_a_type'],
        agent_b_type=config['agent_b_type'],
        agent_a_path=config['agent_a_path'],
        agent_b_path=config['agent_b_path'],
        mask_agent_index=config['mask_agent_index'],
        resolution=annotation_params['resolution'],
        font=annotation_params['font'],
        font_size=annotation_params['font_size'],
        short_labels=annotation_params['short_labels'],
        camera_config=annotation_params['camera_config'],
    )
    env = VideoWrapper(env=env, directory=directory)

    # The mocks are intentionally left installed (see "Side effects").
    return env

In [6]:
# Shared settings for the environment factory and trainer configuration.
env_name = 'multicomp/YouShallNotPassHumans-v0'
# Registry key: the part of the id after the "/" with "_rllib" appended.
env_name_rllib = '{}_rllib'.format(env_name.split('/')[1])
env_config = dict()
# Every environment built by create_env is appended here so it can be closed later.
created_envs = list()
def create_env(config=None, env_name=env_name):
    """Create a video-recording environment wrapped for RLlib.

    Args:
        config: Accepted for the env-creator calling convention; not used.
        env_name: Gym id of the environment (defaults to the module-level
            ``env_name`` as it was when this function was defined).

    Returns:
        A ``GymCompeteToRLLibAdapter`` built around the new environment. The
        underlying environment is also recorded in ``created_envs``.
    """
    video_env = gym_compete_env_with_video(env_name)
    created_envs.append(video_env)

    def _existing_env():
        # The adapter takes a factory; hand back the already-built instance.
        return video_env

    return GymCompeteToRLLibAdapter(_existing_env)
# The factory doubles as the "environment class" used by the cells below,
# and is registered under the RLlib-specific name.
env_cls = create_env
register_env(env_name_rllib, env_cls)


def build_trainer_config(restore_state=None, train_policies=None, config=None):
    """Build the trainer configuration for one two-player run.

    Args:
        restore_state: Not used by this function.
        train_policies: Policy ids placed under ``"policies_to_train"``.
            ``None`` selects every defined policy. Each id must be one of the
            defined policies (checked with ``assert``).
        config: Run settings. Required in practice: ``config['train_batch_size']``
            is read unconditionally.

    Returns:
        dict: Configuration with the keys ``"env"``, ``"num_workers"``,
        ``"train_batch_size"``, ``"multiagent"`` and ``"framework"``.
    """
    # Build one probe environment and read both spaces from it. Every env_cls
    # call constructs a complete environment, so one per space is wasteful.
    probe_env = env_cls(env_config)
    obs_space = probe_env.observation_space
    act_space = probe_env.action_space

    policy_template = "player_%d"
    N_POLICIES = 2

    def get_agent_config(agent_id):
        """Return the policy spec tuple for the 1-based ``agent_id``."""
        # The custom model config uses 0-based agent ids.
        model_index = agent_id - 1
        return (PPOTFPolicy, obs_space, act_space, {
            "model": {
                "custom_model": "GymCompetePretrainedModel",
                "custom_model_config": {
                    "agent_id": model_index,
                    "env_name": env_name,
                    "model_config": {},
                    "name": "model_%s" % model_index,
                },
            },
            "framework": "tfe",
        })

    policies = {policy_template % i: get_agent_config(i) for i in range(1, 1 + N_POLICIES)}
    policies_keys = sorted(policies)

    def select_policy(agent_id):
        """Map an agent id to the policy at the same position (two agents only)."""
        agent_ids = ["player_1", "player_2"]
        assert agent_id in agent_ids
        return policies_keys[agent_ids.index(agent_id)]

    if train_policies is None:
        train_policies = list(policies)

    for policy_id in train_policies:
        assert policy_id in policies, "unknown policy id: %r" % (policy_id,)

    # Returned directly so the ``config`` argument is not rebound.
    return {
        "env": env_name_rllib,
        "num_workers": 0,
        "train_batch_size": config['train_batch_size'],
        "multiagent": {
            "policies_to_train": train_policies,
            "policies": policies,
            "policy_mapping_fn": select_policy,
        },
        "framework": "tfe",
    }


def build_trainer(restore_state=None, train_policies=None, config=None):
    """Create a ``PPOTrainer`` from ``config`` and optionally restore its state.

    Args:
        restore_state: Object passed to ``trainer.restore_from_object``; skipped
            when ``None``.
        train_policies: Not used by this function.
        config: Trainer configuration passed to ``PPOTrainer(config=...)``; it is
            also printed.

    Returns:
        The constructed trainer.
    """
    print("Using config")
    print(config)
    trainer = PPOTrainer(config=config)
    if restore_state is not None:
        trainer.restore_from_object(restore_state)
    return trainer

def train(trainer, stop_iters, do_track=True):
    """Run ``stop_iters`` training iterations and return the saved trainer state.

    Each iteration's results are pretty-printed. When ``do_track`` is true they
    are also passed to ``track.log`` as keyword arguments.
    """
    for _ in range(stop_iters):
        iteration_results = trainer.train()
        print(pretty_print(iteration_results))
        if not do_track:
            continue
        track.log(**iteration_results)
    return trainer.save_to_object()

# Most recently built trainer. train_one rebinds it so that later cells can
# inspect the trainer interactively.
trainer = None

def train_one(config, restore_state=None, do_track=True):
    """Build a trainer from ``config`` and train it.

    Args:
        config: Run settings. Reads ``config['train_policies']`` and
            ``config['train_steps']`` here, and is passed on to
            ``build_trainer_config``.
        restore_state: Optional trainer state, forwarded to both
            ``build_trainer_config`` and ``build_trainer``.
        do_track: Forwarded to ``train``.

    Returns:
        None. The trainer is published through the module-level ``trainer``.
    """
    global trainer
    print(config)
    rl_config = build_trainer_config(restore_state=restore_state,
                                     train_policies=config['train_policies'],
                                     config=config)
    # Forward the caller's state; a hard-coded None would discard it.
    trainer = build_trainer(restore_state=restore_state, config=rl_config)
    train(trainer, config['train_steps'], do_track=do_track)


# Run settings for train_one.
# Idea to explore: try changing learning rate.
config = {
    'train_batch_size': 128,
    'train_steps': 10,
    # Passed through as "policies_to_train"; alternative: ['player_1', 'player_2'].
    'train_policies': [],
    # NOTE(review): build_trainer_config sets "num_workers": 0 itself and does
    # not read this key.
    'num_workers': 3,
}

In [None]:
# Build the trainer and run config['train_steps'] iterations, without track.log.
train_one(config, do_track=False)

{'train_batch_size': 128, 'train_steps': 10, 'train_policies': [], 'num_workers': 3}


GLFW error: 65544, desc: b'Linux: Failed to watch for joystick connections in /dev/input: No such file or directory'
GLFW error: 65544, desc: b'Linux: Failed to open joystick device directory /dev/input: No such file or directory'


Creating agent humanoid_blocker
Reading agent XML from: /home/sergei/git/chai/multiagent-competition/gym_compete/new_envs/assets/humanoid_body.xml
Creating agent humanoid
Reading agent XML from: /home/sergei/git/chai/multiagent-competition/gym_compete/new_envs/assets/humanoid_body.xml
Scene XML path: /home/sergei/git/chai/multiagent-competition/gym_compete/new_envs/assets/world_body.humanoid_body.humanoid_body.xml
Created Scene with agents
Creating agent humanoid_blocker
Reading agent XML from: /home/sergei/git/chai/multiagent-competition/gym_compete/new_envs/assets/humanoid_body.xml
Creating agent humanoid
Reading agent XML from: /home/sergei/git/chai/multiagent-competition/gym_compete/new_envs/assets/humanoid_body.xml
Scene XML path: /home/sergei/git/chai/multiagent-competition/gym_compete/new_envs/assets/world_body.humanoid_body.humanoid_body.xml
Created Scene with agents


pip install 'ray[tune]' to see TensorBoard files.
Could not instantiate TBXLogger: No module named 'tensorboardX'.


Using config
{'env': 'YouShallNotPassHumans-v0_rllib', 'num_workers': 0, 'train_batch_size': 128, 'multiagent': {'policies_to_train': [], 'policies': {'player_1': (<class 'ray.rllib.policy.tf_policy_template.PPOTFPolicy'>, Box(380,), Box(17,), {'model': {'custom_model': 'GymCompetePretrainedModel', 'custom_model_config': {'agent_id': 0, 'env_name': 'multicomp/YouShallNotPassHumans-v0', 'model_config': {}, 'name': 'model_0'}}, 'framework': 'tfe'}), 'player_2': (<class 'ray.rllib.policy.tf_policy_template.PPOTFPolicy'>, Box(380,), Box(17,), {'model': {'custom_model': 'GymCompetePretrainedModel', 'custom_model_config': {'agent_id': 1, 'env_name': 'multicomp/YouShallNotPassHumans-v0', 'model_config': {}, 'name': 'model_1'}}, 'framework': 'tfe'})}, 'policy_mapping_fn': <function build_trainer_config.<locals>.select_policy at 0x7fcb88247790>}, 'framework': 'tfe'}
Creating agent humanoid_blocker
Reading agent XML from: /home/sergei/git/chai/multiagent-competition/gym_compete/new_envs/assets/h

Install gputil for GPU system monitoring.


Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 380)]        0                                            
__________________________________________________________________________________________________
observation_preprocessing_layer (None, 380)          761         input_2[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           24384       observation_preprocessing_layer_2
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 64)           4160        dense_2[0][0]                    
____________________________________________________________________________________________

In [None]:
# Reset a newly created environment; the result is indexed by 'player_1' in later cells.
obs = env_cls().reset()

In [None]:
# Query the trainer's player_1 policy 1000 times on the same observation.
acts = [trainer.compute_action(obs['player_1'], policy_id='player_1') for _ in range(1000)]

In [None]:
# Mean over the collected actions (axis 0 = the 1000 queries).
np.mean(acts, axis=0)

In [None]:
# Load the networks for agent id 0 directly, independent of the trainer.
nets = get_policy_value_nets(env_name, 0)
policy_net_orig = nets['policy_mean_logstd_flat']

In [None]:
# Model held by the trainer's player_1 policy, evaluated on the observation
# reshaped to a single row.
m = trainer.get_policy('player_1').model
m.policy_net(obs['player_1'].reshape(1, -1))

In [None]:
# The same single-row observation through the directly loaded 'policy_mean_logstd' network.
nets['policy_mean_logstd'](obs['player_1'].reshape(1, -1))

In [None]:
# Split the flat policy output into two equal parts along axis 1.
mean, logstd = tf.split(policy_net_orig(obs['player_1'].reshape(1, -1)), 2, axis=1)

In [None]:
# Mean over 1000 draws from Normal(mean, exp(logstd)), each draw clipped to [-0.4, 0.4].
# NOTE(review): 17 and 0.4 are hard-coded; presumably the action dimension and
# action bound -- confirm against the environment's action space.
np.mean([np.clip(np.random.normal(loc=mean[0], scale=np.exp(logstd[0]), size=(17,)), -0.4, 0.4) for _ in range(1000)], axis=0)

In [None]:
# Mean of 1000 fresh actions from the trainer's player_1 policy on the same observation.
np.mean([trainer.compute_action(obs['player_1'], policy_id='player_1') for _ in range(1000)], axis=0)

In [None]:
# A single action from the trainer's player_1 policy.
trainer.compute_action(obs['player_1'], policy_id='player_1')

In [None]:
# Upper bounds of the action space.
# NOTE: env_cls() builds a whole new environment just for this lookup.
env_cls().action_space.high

In [None]:
# Compare the directly loaded weights with the trainer model's weights, array by array:
# (allclose?, ord=1 norm of the difference, default norm of the loaded array).
# For 2-D arrays NumPy's ord=1 is the maximum absolute column sum, not the entrywise L1 sum.
[(np.allclose(x, y), np.linalg.norm(x-y, ord=1), np.linalg.norm(x)) for x, y in zip(policy_net_orig.get_weights(), m.policy_net.get_weights())]

In [None]:
# Display the policy network held by the trainer's model.
m.policy_net

In [None]:
# Interactive help lookup (IPython-only syntax).
?trainer.compute_action

In [None]:
# Display the environments recorded by create_env so far.
created_envs

In [None]:
# Close all environments tracked in created_envs.

In [None]:
# Close every environment recorded in created_envs. A plain loop is used
# because the calls are made for their side effect only; there is no need to
# build and display a list of return values.
for created_env in created_envs:
    created_env.close()

In [None]:
# List the trainer's attribute names (interactive exploration).
dir(trainer)

In [None]:
# Interactive help lookup (IPython-only syntax).
?trainer.step

In [None]:
# Display the trainer's config attribute.
trainer.config