# Learning card games with RLLib

In [1]:
import ray
import time
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.dqn import DQNTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from rlcard.rllib_utils.random_policy import RandomPolicy
from rlcard.rllib_utils.model import ParametricActionsModel
from ray.rllib.models import ModelCatalog
from rlcard.rllib_utils.rlcard_wrapper import RLCardWrapper
from rlcard.rllib_utils.custom_metrics import PlayerScoreCallbacks
from ray.tune.registry import register_env

### RLCard environments

RLCard is a Python library implementing some of the most popular card games, including Blackjack and some different flavours of Poker. In our fork of this library you can also experiment with a popular Italian game, *Scopone*.
Card games are a nice playground for Reinforcement Learning because the reward is often straightforward, while the size of the state space can become huge quite fast, depending on the game. Furthermore, besides single-player games like Blackjack, there are many multiplayer games in which the agents have to compete or collaborate.
We implemented an RLLib wrapper for RLCard environments, which gives the researcher a lot of flexibility in assigning the same policy or different ones to each agent, and in experimenting with different techniques for solving Multi-Agent Reinforcement Learning (MARL) problems.
For example, in a game like *Scopone*, in which 2 pairs of players play against each other, one can try to assign the same policy to each player, 2 different policies (one per team or one per player of the team) or 4 different policies, while also deciding whether to train all of them together or to freeze some of them while training the others.

In [2]:
# Pick the RLCard environment used throughout this notebook by editing the
# assignment at the bottom. The tuple lists the ids this notebook offers as
# alternatives; it is informational only and nothing reads it.
RLCARD_ENV_CHOICES = (
    'blackjack',
    'doudizhu',
    'gin-rummy',
    'leduc-holdem',
    'limit-holdem',
    'mahjong',
    'no-limit-holdem',
    'simple-doudizhu',
    'uno',
    'scopone',
)
rlcard_env_id = 'scopone'

### Define the environment, the policies, the trainer

Here we prepare the configuration of the training and the evaluation environment. The only difference is that we set some agents to behave randomly during the evaluation, so that we can estimate the increase in performance during the training. A random agent is most likely not a good benchmark, but we do not have any better deterministic baseline. One might want to use a pre-trained agent as a baseline, but we have not implemented this feature in this notebook. Please see *policy_arena.py* to get an idea of how to do it.

In [3]:
# Environment settings for training: only the selected game id.
env_config = dict(rlcard_env_id=rlcard_env_id)

# Environment settings for evaluation: the same game id plus "explore" set to
# False. An optional extra key is currently left disabled:
#     "randomize_agents_eval": ['player_2', 'player_4']
env_config_eval = dict(env_config, explore=False)

In [4]:
# Start a local Ray instance capped at 4 CPUs. `ignore_reinit_error=True` keeps
# this cell safe to re-run in a live kernel: without it a second call raises
# because Ray has already been initialised.
ray.init(num_cpus=4, ignore_reinit_error=True)

2020-11-19 08:41:30,714	INFO resource_spec.py:212 -- Starting Ray with 4.3 GiB memory available for workers and up to 2.17 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-11-19 08:41:31,593	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '10.92.120.113',
 'raylet_ip_address': '10.92.120.113',
 'redis_address': '10.92.120.113:6379',
 'object_store_address': 'tcp://127.0.0.1:62370',
 'raylet_socket_name': 'tcp://127.0.0.1:62237',
 'webui_url': 'localhost:8265',
 'session_dir': 'C:\\Users\\chiappal\\AppData\\Local\\Temp\\ray\\session_2020-11-19_08-41-30_707813_21380'}

In [5]:
# Register env and model to be used by rllib
def rlcard_environment(config):
    """Create the RLCard environment for the configuration passed by the caller.

    RLlib calls the registered creator with the `env_config` of the worker it
    is building, so honouring `config` is what lets the evaluation workers use
    their own settings instead of always getting the training ones.

    Args:
        config: dict-like environment settings; `None` or an empty mapping
            falls back to the notebook-level `env_config`.

    Returns:
        A new `RLCardWrapper` instance.
    """
    return RLCardWrapper(config if config else env_config)


register_env(rlcard_env_id, rlcard_environment)
ModelCatalog.register_custom_model("parametric_model_tf", ParametricActionsModel)

Traceback (most recent call last):
  File "c:\users\chiappal\appdata\local\continuum\miniconda3\envs\rl\lib\site-packages\ray\dashboard/dashboard.py", line 960, in <module>
    metrics_export_address=metrics_export_address)
  File "c:\users\chiappal\appdata\local\continuum\miniconda3\envs\rl\lib\site-packages\ray\dashboard/dashboard.py", line 513, in __init__
    build_dir = setup_static_dir(self.app)
  File "c:\users\chiappal\appdata\local\continuum\miniconda3\envs\rl\lib\site-packages\ray\dashboard/dashboard.py", line 414, in setup_static_dir
    "&& npm run build)", build_dir)
FileNotFoundError: [Errno 2] Dashboard build directory not found. If installing from source, please follow the additional steps required to build the dashboard(cd python/ray/dashboard/client && npm ci && npm run build): 'c:\\users\\chiappal\\appdata\\local\\continuum\\miniconda3\\envs\\rl\\lib\\site-packages\\ray\\dashboard\\client/build'



In [6]:
# Throwaway environment instance, built only to read the observation and
# action spaces that every policy spec below needs.
env_tmp = rlcard_environment(None)
obs_space = env_tmp.observation_space
act_space = env_tmp.action_space

policy_class = PPOTFPolicy

# Model settings shared (same dict object) by both PPO policies.
model_settings = {
    "custom_model": "parametric_model_tf",
    "fcnet_hiddens": [256, 256],
    "fcnet_activation": "relu"
}
policy_config = {"model": model_settings}

# Each entry is (policy class, observation space, action space, config).
policies = {
    "ppo_policy_1": (policy_class, obs_space, act_space, policy_config),
    "ppo_policy_2": (policy_class, obs_space, act_space, policy_config),
    "rand_policy": (RandomPolicy, obs_space, act_space, {}),
}

In [7]:
trainer_class = PPOTrainer

# Players 1 and 3 use "ppo_policy_1"; players 2 and 4 use "ppo_policy_2".
agent_to_policy_dict = dict(
    player_1="ppo_policy_1",
    player_2="ppo_policy_2",
    player_3="ppo_policy_1",
    player_4="ppo_policy_2",
)


def policy_mapping_fn(agent_id):
    """Return the policy id assigned to `agent_id`; unknown ids raise KeyError."""
    return agent_to_policy_dict[agent_id]


# Multi-agent section: all policies are available, but only "ppo_policy_1"
# is listed for training.
multiagent_config = {
    "policies": policies,
    "policy_mapping_fn": policy_mapping_fn,
    "policies_to_train": ['ppo_policy_1'],
}

# Evaluation workers get their own environment settings.
evaluation_overrides = {"env_config": env_config_eval}

trainer_config = {
    "env": rlcard_env_id,
    "env_config": env_config,
    "multiagent": multiagent_config,
    "num_workers": 3,
    "timesteps_per_iteration": 10000,
    "callbacks": PlayerScoreCallbacks,
    "evaluation_interval": 1,
    "evaluation_num_episodes": 100,
    "evaluation_num_workers": 0,
    "evaluation_config": evaluation_overrides,
}

# Number of training iterations for this quick, hand-driven run.
num_iterations = 20

start = time.time()
trainer = trainer_class(trainer_config)
try:
    for i in range(num_iterations):
        res = trainer.train()

        # One "policy: mean reward" entry per policy, sorted for stable output.
        policy_rewards = sorted('{}: {}'.format(k, v) for k, v in res['policy_reward_mean'].items())
        print("Iteration {}. policy_reward_mean: {}".format(i, policy_rewards))
finally:
    # Always release the trainer's rollout workers, even if training fails.
    # Otherwise they keep their Ray CPUs reserved and later cells that start
    # their own trainers cannot obtain the resources they need.
    trainer.stop()

stop = time.time()
train_duration = time.strftime('%H:%M:%S', time.gmtime(stop-start))
print('Training finished ({}), check the results in ~/ray_results/<dir>/'.format(train_duration))

In [None]:
# Let Tune drive the training: stop after 1000 iterations, checkpoint every
# 100 iterations and once more at the end, writing everything under ./outputs.
res = tune.run(
    trainer_class,
    # Logging directory name. It is derived from the selected game so that runs
    # for another environment are not stored in a folder labelled "scopone".
    name="2020-11-18-{}".format(rlcard_env_id),
    stop={
        "training_iteration": 1000,
        # "episodes_total": 10000
    },
    verbose=0,
    config=trainer_config,
    local_dir="./outputs",
    checkpoint_freq=100,
    checkpoint_at_end=True,
    restore=None
)

2020-11-19 08:41:58,005	ERROR syncer.py:46 -- Log sync requires rsync to be installed.


[2m[36m(pid=18672)[0m 2020-11-19 08:42:02,158	INFO trainer.py:585 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=18672)[0m 2020-11-19 08:42:02,158	INFO trainer.py:612 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=16544)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims)
[2m[36m(pid=18672)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims)


2020-11-19 08:42:42,118	ERROR trial_runner.py:350 -- Trial Runner checkpointing failed.
Traceback (most recent call last):
  File "c:\users\chiappal\appdata\local\continuum\miniconda3\envs\rl\lib\site-packages\ray\tune\trial_runner.py", line 348, in step
    self.checkpoint()
  File "c:\users\chiappal\appdata\local\continuum\miniconda3\envs\rl\lib\site-packages\ray\tune\trial_runner.py", line 279, in checkpoint
    os.rename(tmp_file_name, self.checkpoint_file)
FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'C:\\Users\\chiappal\\Documents\\rl_project\\rlcard\\outputs\\2020-11-18-scopone\\.tmp_checkpoint' -> 'C:\\Users\\chiappal\\Documents\\rl_project\\rlcard\\outputs\\2020-11-18-scopone\\experiment_state-2020-11-19_08-41-57.json'
2020-11-19 08:43:06,534	ERROR trial_runner.py:350 -- Trial Runner checkpointing failed.
Traceback (most recent call last):
  File "c:\users\chiappal\appdata\local\continuum\miniconda3\envs\rl\lib\site-packages\ray\tune\tria

To visualize tensorboard: tensorboard --logdir=./outputs