In [33]:
import ray
import time
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.dqn import DQNTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from rlcard.rllib_utils.random_policy import RandomPolicy
from rlcard.rllib_utils.model import ParametricActionsModel
from ray.rllib.models import ModelCatalog
from rlcard.rllib_utils.rlcard_wrapper import RLCardWrapper
from ray.tune.registry import register_env

In [2]:
# Decide which RLcard environment to use
# rlcard_env_id = 'blackjack'
# rlcard_env_id = 'doudizhu'
# rlcard_env_id = 'gin-rummy'
rlcard_env_id = 'leduc-holdem'
# rlcard_env_id = 'limit-holdem'
# rlcard_env_id = 'mahjong'
# rlcard_env_id = 'no-limit-holdem'
# rlcard_env_id = 'simple-doudizhu'
# rlcard_env_id = 'uno'

In [27]:
env_config = {
    "rlcard_env_id": rlcard_env_id,
    "randomize_agents_eval": [1]
}

In [28]:
ray.init(num_cpus=4)

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

In [29]:
# Register env and model to be used by rllib
rlcard_environment = lambda _: RLCardWrapper(env_config)
register_env(rlcard_env_id, rlcard_environment)
ModelCatalog.register_custom_model("parametric_model_tf", ParametricActionsModel)

In [30]:
env_tmp = rlcard_environment(None)
policies = {
    "ppo_policy_1": (PPOTFPolicy,
                     env_tmp.observation_space,
                     env_tmp.action_space,
                     ppo_trainer_config),
    "rand_policy": (RandomPolicy,
                    env_tmp.observation_space,
                    env_tmp.action_space,
                    {}),
}

In [31]:
# Define the trainer
ppo_trainer_config = {
    # "env": rlcard_env_id,
    "model": {
        "custom_model": "parametric_model_tf",
    },
}

trainer = PPOTrainer(config={
    "env": rlcard_env_id,
    "multiagent": {
        "policies_to_train": ['ppo_policy_1'],
        "policies": policies,
        "policy_mapping_fn": lambda agent_id: "ppo_policy_1",
    },
    # "num_gpus": 0.5,
    # "num_gpus_per_worker": 0,
})

trainer_eval = PPOTrainer(config={
    "env": rlcard_env_id,
    "multiagent": {
        "policies_to_train": ['ppo_policy_1'],
        "policies": policies,
        "policy_mapping_fn": lambda agent_id: "ppo_policy_1" if agent_id == "player_1" else "rand_policy",
    },
    # "num_gpus": 0.5,
})

2020-11-17 00:29:01,570	ERROR syncer.py:46 -- Log sync requires rsync to be installed.
2020-11-17 00:29:04,349	ERROR syncer.py:46 -- Log sync requires rsync to be installed.


In [34]:
start = time.time()

for i in range(20):
    trainer.train()

    trainer_eval.set_weights(trainer.get_weights(["ppo_policy_1"]))
    res = trainer_eval.train()

    policy_rewards = sorted(['{}: {}'.format(k, v) for k, v in res['policy_reward_mean'].items()])
    print("Iteration {}. policy_reward_mean: {}".format(i, policy_rewards))

stop = time.time()
train_duration = time.strftime('%H:%M:%S', time.gmtime(stop-start))
print('Training finished ({}), check the results in ~/ray_results/<dir>/'.format(train_duration))

Iteration 0. policy_reward_mean: ['ppo_policy_1: 1.21875', 'rand_policy: -1.21875']
Iteration 1. policy_reward_mean: ['ppo_policy_1: 1.1794171220400729', 'rand_policy: -1.1794171220400729']
Iteration 2. policy_reward_mean: ['ppo_policy_1: 1.2281938325991189', 'rand_policy: -1.2281938325991189']
Iteration 3. policy_reward_mean: ['ppo_policy_1: 1.2985803016858917', 'rand_policy: -1.2985803016858917']
Iteration 4. policy_reward_mean: ['ppo_policy_1: 1.269642857142857', 'rand_policy: -1.269642857142857']
Iteration 5. policy_reward_mean: ['ppo_policy_1: 1.2637867647058822', 'rand_policy: -1.2637867647058822']
Iteration 6. policy_reward_mean: ['ppo_policy_1: 1.400699912510936', 'rand_policy: -1.400699912510936']
Iteration 7. policy_reward_mean: ['ppo_policy_1: 1.2151060070671378', 'rand_policy: -1.2151060070671378']
Iteration 8. policy_reward_mean: ['ppo_policy_1: 1.3214928057553956', 'rand_policy: -1.3214928057553956']
Iteration 9. policy_reward_mean: ['ppo_policy_1: 1.1961400359066428', 'r