In [12]:
"""A simple multi-agent env with two agents playing rock paper scissors.

This demonstrates running the following policies in competition:
    (1) heuristic policy of repeating the same move
    (2) heuristic policy of beating the last opponent move
    (3) LSTM/feedforward PG policies
    (4) LSTM policy with custom entropy loss
"""

import argparse
import random
from gym.spaces import Discrete

from ray import tune
from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import try_import_tf

# Command-line options. --stop is used below as the "timesteps_total" stopping
# criterion for every Tune run.
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=1000)

tf = try_import_tf()

# Integer encodings of the three moves (ROCK=0, PAPER=1, SCISSORS=2).
ROCK, PAPER, SCISSORS = range(3)

In [13]:
class RockPaperScissorsEnv(MultiAgentEnv):
    """Rock-paper-scissors as a two-agent environment.

    Each agent's observation is the move its opponent played on the
    previous step. An episode ends for all agents after 10 moves."""

    def __init__(self, _):
        # The single positional argument is accepted but never read.
        self.action_space = Discrete(3)
        self.observation_space = Discrete(3)
        self.player1 = "player1"
        self.player2 = "player2"
        self.last_move = None
        self.num_moves = 0

    def reset(self):
        """Begin a new episode and return the initial observation dict.

        No move has been played yet, so both agents observe 0 (the value
        used to encode ROCK)."""
        self.num_moves = 0
        self.last_move = (0, 0)
        first, second = self.last_move
        return {self.player1: second, self.player2: first}

    def step(self, action_dict):
        """Play one round.

        Args:
            action_dict: maps each player id to its chosen move.

        Returns:
            (obs, rewards, dones, infos): per-player opponent moves,
            per-player rewards (+1 win, -1 loss, 0 draw), a dict whose
            "__all__" entry is True once 10 moves have been played, and
            an empty info dict.

        Raises:
            KeyError: if a player is missing from action_dict or the pair
                of moves is not in the payoff table.
        """
        first = action_dict[self.player1]
        second = action_dict[self.player2]
        self.last_move = (first, second)

        # (player1 move, player2 move) -> (player1 reward, player2 reward)
        payoffs = {
            (ROCK, ROCK): (0, 0),
            (ROCK, PAPER): (-1, 1),
            (ROCK, SCISSORS): (1, -1),
            (PAPER, ROCK): (1, -1),
            (PAPER, PAPER): (0, 0),
            (PAPER, SCISSORS): (-1, 1),
            (SCISSORS, ROCK): (-1, 1),
            (SCISSORS, PAPER): (1, -1),
            (SCISSORS, SCISSORS): (0, 0),
        }
        reward1, reward2 = payoffs[first, second]

        self.num_moves += 1

        # Each player sees what the *other* player just did.
        observations = {self.player1: second, self.player2: first}
        rewards = {self.player1: reward1, self.player2: reward2}
        dones = {"__all__": self.num_moves >= 10}
        return observations, rewards, dones, {}


class AlwaysSameHeuristic(Policy):
    """Heuristic policy that commits to one random move per episode.

    The chosen move is stored as the policy's (single-element) state, so it
    is carried unchanged from step to step."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    def get_initial_state(self):
        """Draw the move to repeat; returned as a one-element state list."""
        moves = [ROCK, PAPER, SCISSORS]
        return [random.choice(moves)]

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Return the stored moves as actions and pass the state through.

        Observations are ignored; the first state batch holds the move(s)
        picked by get_initial_state."""
        remembered_moves = state_batches[0]
        actions = list(remembered_moves)
        return actions, state_batches, {}

    def learn_on_batch(self, samples):
        """No-op: this heuristic has nothing to learn."""
        return None

    def get_weights(self):
        """No-op: this heuristic has no weights (returns None)."""
        return None

    def set_weights(self, weights):
        """No-op: this heuristic has no weights to set."""
        return None


class BeatLastHeuristic(Policy):
    """Heuristic policy that counters the opponent's previous move."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Return, for each observation, the move that beats the observed one.

        Each observation is indexed by move and compared against 1
        (presumably a one-hot encoding of the opponent's last move -- confirm
        against the preprocessor in use). The policy is stateless, so the
        returned state list is empty."""
        # (observed move, move that beats it), checked in this order.
        counters = (
            (ROCK, PAPER),
            (PAPER, SCISSORS),
            (SCISSORS, ROCK),
        )

        def successor(x):
            for observed, winning_reply in counters:
                if x[observed] == 1:
                    return winning_reply
            # No slot equal to 1: fall through with None.
            return None

        actions = [successor(x) for x in obs_batch]
        return actions, [], {}

    def learn_on_batch(self, samples):
        """No-op: this heuristic has nothing to learn."""
        return None

    def get_weights(self):
        """No-op: this heuristic has no weights (returns None)."""
        return None

    def set_weights(self, weights):
        """No-op: this heuristic has no weights to set."""
        return None


def run_same_policy(args):
    """Train "PG" with one policy shared by both agents (trivial case).

    Args:
        args: parsed options; args.stop is the "timesteps_total" limit.
    """
    stop_criteria = {"timesteps_total": args.stop}
    env_config = {"env": RockPaperScissorsEnv}
    tune.run("PG", stop=stop_criteria, config=env_config)


def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
    """Train one learned agent against heuristic opponents.

    "player1" is always mapped to the trainable "learned" policy. Any other
    agent id is mapped to "always_same" or "beat_last", picked at random each
    time the mapping function is called.

    The learned agent should eventually reach a reward of ~5 with
    use_lstm=False and ~7 with use_lstm=True: with an LSTM it can tell the
    always_same and beat_last opponents apart.

    Args:
        args: parsed options; args.stop is the "timesteps_total" limit.
        use_lstm: value of the learned policy's model "use_lstm" setting.
        trainer: trainable (name or class) handed to tune.run.
    """

    def select_policy(agent_id):
        if agent_id != "player1":
            return random.choice(["always_same", "beat_last"])
        return "learned"

    learned_overrides = {"model": {"use_lstm": use_lstm}}

    # policy id -> (policy class, observation space, action space, config).
    # NOTE(review): None as the class presumably selects the trainer's
    # default policy -- confirm against the RLlib version in use.
    policies = {
        "always_same": (AlwaysSameHeuristic, Discrete(3), Discrete(3), {}),
        "beat_last": (BeatLastHeuristic, Discrete(3), Discrete(3), {}),
        "learned": (None, Discrete(3), Discrete(3), learned_overrides),
    }

    multiagent = {
        "policies_to_train": ["learned"],
        "policies": policies,
        "policy_mapping_fn": select_policy,
    }

    config = {
        "env": RockPaperScissorsEnv,
        "gamma": 0.9,
        "num_workers": 0,
        "num_envs_per_worker": 4,
        "rollout_fragment_length": 10,
        "train_batch_size": 200,
        "multiagent": multiagent,
    }

    tune.run(trainer, stop={"timesteps_total": args.stop}, config=config)


def run_with_custom_entropy_loss(args):
    """Example of customizing the loss function of an existing policy.

    Builds a PG policy/trainer pair whose loss adds an entropy bonus, then
    runs the heuristic-vs-learned experiment with an LSTM model.

    This performs about the same as the default loss does.

    Args:
        args: parsed options; forwarded to run_heuristic_vs_learned.
    """

    def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
        """Policy-gradient loss minus 0.1 times the mean action entropy."""
        logits, _ = model.from_batch(train_batch)
        action_dist = dist_class(logits, model)
        # Reduce BOTH terms to scalars. Leaving the entropy unreduced yields a
        # per-sample loss vector; TF differentiates the *sum* of a non-scalar
        # loss, so the entropy weight would grow with the batch size while the
        # policy-gradient term stays a batch mean.
        mean_entropy = tf.reduce_mean(action_dist.entropy())
        pg_objective = tf.reduce_mean(
            action_dist.logp(train_batch["actions"]) *
            train_batch["advantages"])
        return -0.1 * mean_entropy - pg_objective

    EntropyPolicy = PGTFPolicy.with_updates(
        loss_fn=entropy_policy_gradient_loss)
    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
    run_heuristic_vs_learned(args, use_lstm=True, trainer=EntropyLossPG)

In [18]:
# Parse an explicit empty argument list so only the parser defaults apply.
args = parser.parse_args([])

# (experiment launcher, message printed once it returns), run in order.
experiments = (
    (run_same_policy,
     "run_same_policy: ok."),
    (lambda parsed: run_heuristic_vs_learned(parsed, use_lstm=True),
     "run_heuristic_vs_learned(w/ lstm): ok."),
    (lambda parsed: run_heuristic_vs_learned(parsed, use_lstm=False),
     "run_heuristic_vs_learned (w/o lstm): ok."),
    (run_with_custom_entropy_loss,
     "run_with_custom_entropy_loss: ok."),
)
for launch, done_message in experiments:
    launch(args)
    print(done_message)

2020-06-23 17:15:45,379	INFO resource_spec.py:204 -- Starting Ray with 2.83 GiB memory available for workers and up to 1.42 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-23 17:15:46,904	INFO services.py:1168 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


Trial name,status,loc
PG_RockPaperScissorsEnv_00000,RUNNING,


[2m[36m(pid=3903)[0m E0623 17:15:49.548642600    3903 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1592925349.548612900","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=3903)[0m E0623 17:15:49.549015700    3903 socket_utils_common_posix.cc:313] setsockopt(TCP_USER_TIMEOUT) Protocol not available
[2m[36m(pid=3896)[0m E0623 17:15:50.038192800    3896 socket_utils_common_posix.cc:208] check for SO_REUSEPORT: {"created":"@1592925350.038172900","description":"Protocol not available","errno":92,"file":"external/com_github_grpc_grpc/src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":185,"os_error":"Protocol not available","syscall":"getsockopt(SO_REUSEPORT)"}
[2m[36m(pid=3896)[0m E0623 17:15:50.038433600    3896 socket_utils_common_posix.cc:313] setsockopt(T

Trial name,status,loc,iter,total time (s),ts,reward
PG_RockPaperScissorsEnv_00000,RUNNING,192.168.1.45:3899,1,0.452963,400,0


Result for PG_RockPaperScissorsEnv_00000:
  custom_metrics: {}
  date: 2020-06-23_17-15-56
  done: true
  episode_len_mean: 10.0
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 20
  episodes_total: 60
  experiment_id: c3dcbbcb5122435a84e9038466325533
  experiment_tag: '0'
  hostname: Sergei
  info:
    learner:
      model: {}
    num_steps_sampled: 1200
    num_steps_trained: 1200
  iterations_since_restore: 3
  node_ip: 192.168.1.45
  num_healthy_workers: 0
  off_policy_estimator: {}
  perf: {}
  pid: 3899
  policy_reward_max: {}
  policy_reward_mean: {}
  policy_reward_min: {}
  sampler_perf:
    mean_env_wait_ms: 0.034886251661536256
    mean_inference_ms: 1.3272085008224939
    mean_processing_ms: 0.491137815304975
  time_since_restore: 1.1745717525482178
  time_this_iter_s: 0.3583505153656006
  time_total_s: 1.1745717525482178
  timers:
    learn_throughput: 14988.579
    learn_time_ms: 26.687
    sample_throughput: 1098.766
  

Trial name,status,loc,iter,total time (s),ts,reward
PG_RockPaperScissorsEnv_00000,TERMINATED,,3,1.17457,1200,0


run_same_policy: ok.


Trial name,status,loc
PG_RockPaperScissorsEnv_00000,RUNNING,


[2m[36m(pid=3900)[0m 2020-06-23 17:16:01,383	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=3900)[0m 2020-06-23 17:16:01,390	INFO trainer.py:578 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=3900)[0m 2020-06-23 17:16:01,912	INFO trainable.py:217 -- Getting current IP.
Result for PG_RockPaperScissorsEnv_00000:
  custom_metrics: {}
  date: 2020-06-23_17-16-02
  done: false
  episode_len_mean: 10.0
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 20
  episodes_total: 20
  experiment_id: 08a38c276f904a1689c9e39d980a76e8
  experiment_tag: '0'
  hostname: Sergei
  info:
    learner:
      learned:
        model: {}
    num_steps_sampled: 200
    num_steps_trained: 200
  iterations_since_restore: 1
  node_ip: 192.168.1.45
  num_healthy_workers: 0
  off_policy_estimator: {}
  perf:
    c

Trial name,status,loc,iter,total time (s),ts,reward
PG_RockPaperScissorsEnv_00000,RUNNING,192.168.1.45:3900,1,0.283674,200,0


Result for PG_RockPaperScissorsEnv_00000:
  custom_metrics: {}
  date: 2020-06-23_17-16-02
  done: true
  episode_len_mean: 10.0
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 20
  episodes_total: 100
  experiment_id: 08a38c276f904a1689c9e39d980a76e8
  experiment_tag: '0'
  hostname: Sergei
  info:
    learner:
      learned:
        model: {}
    num_steps_sampled: 1000
    num_steps_trained: 1000
  iterations_since_restore: 5
  node_ip: 192.168.1.45
  num_healthy_workers: 0
  off_policy_estimator: {}
  perf: {}
  pid: 3900
  policy_reward_max:
    always_same: 4.0
    beat_last: 5.0
    learned: 6.0
  policy_reward_mean:
    always_same: -0.07692307692307693
    beat_last: 0.3958333333333333
    learned: -0.15
  policy_reward_min:
    always_same: -6.0
    beat_last: -4.0
    learned: -5.0
  sampler_perf:
    mean_env_wait_ms: 0.0792821452293502
    mean_inference_ms: 1.7834565792182753
    mean_processing_ms: 1.3736229573026144
 

Trial name,status,loc,iter,total time (s),ts,reward
PG_RockPaperScissorsEnv_00000,TERMINATED,,5,0.818723,1000,0


run_heuristic_vs_learned(w/ lstm): ok.


Trial name,status,loc
PG_RockPaperScissorsEnv_00000,RUNNING,


[2m[36m(pid=3903)[0m 2020-06-23 17:16:06,486	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=3903)[0m 2020-06-23 17:16:06,493	INFO trainer.py:578 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=3903)[0m 2020-06-23 17:16:06,947	INFO trainable.py:217 -- Getting current IP.
Result for PG_RockPaperScissorsEnv_00000:
  custom_metrics: {}
  date: 2020-06-23_17-16-07
  done: false
  episode_len_mean: 10.0
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 20
  episodes_total: 20
  experiment_id: 18b020a609bf4380afaf3b7751edaeab
  experiment_tag: '0'
  hostname: Sergei
  info:
    learner:
      learned:
        model: {}
    num_steps_sampled: 200
    num_steps_trained: 200
  iterations_since_restore: 1
  node_ip: 192.168.1.45
  num_healthy_workers: 0
  off_policy_estimator: {}
  perf:
    c

Trial name,status,loc,iter,total time (s),ts,reward
PG_RockPaperScissorsEnv_00000,TERMINATED,,5,0.718626,1000,0


run_heuristic_vs_learned (w/o lstm): ok.


Trial name,status,loc
EntropyPG_RockPaperScissorsEnv_00000,RUNNING,


[2m[36m(pid=3902)[0m 2020-06-23 17:16:11,042	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=3902)[0m 2020-06-23 17:16:11,047	INFO trainer.py:578 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=3902)[0m 2020-06-23 17:16:11,537	INFO trainable.py:217 -- Getting current IP.
Result for EntropyPG_RockPaperScissorsEnv_00000:
  custom_metrics: {}
  date: 2020-06-23_17-16-11
  done: false
  episode_len_mean: 10.0
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 20
  episodes_total: 20
  experiment_id: 706e86afc4174bc4a67f74d4e41de712
  experiment_tag: '0'
  hostname: Sergei
  info:
    learner:
      learned:
        model: {}
    num_steps_sampled: 200
    num_steps_trained: 200
  iterations_since_restore: 1
  node_ip: 192.168.1.45
  num_healthy_workers: 0
  off_policy_estimator: {}
  perf

Trial name,status,loc,iter,total time (s),ts,reward
EntropyPG_RockPaperScissorsEnv_00000,TERMINATED,,5,0.702607,1000,0


run_with_custom_entropy_loss: ok.


In [7]:
import ray.rllib.examples as e

In [8]:
# Scratch inspection: list the names exposed by the ray.rllib.examples package.
# NOTE(review): exploratory leftover; nothing else in the notebook uses it.
dir(e)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__']