In [None]:
import gym, ray
import numpy as np
import copy
import requests
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
# Start Ray for this notebook session. `ignore_reinit_error=True` makes the
# cell safe to re-run in a live kernel: without it a second call to
# `ray.init` raises because Ray is already initialised.
ray.init(local_mode=True, ignore_reinit_error=True)

Note:

- This is a two-player game, so one option is to fix one player to always play the same move and train an agent against that fixed opponent.
- Or we could train two agents against each other?
- Or ???

In [None]:
class IteratedRPSEnv(gym.Env):
    """Rock-Paper-Scissors environment whose payoffs come from an HTTP server.

    The learning agent is player 1. Player 2 plays a fixed move on every step
    ("Rock" unless overridden). Each call to ``step`` posts both moves to the
    game server, uses the ``"player1Payoff"`` field of the JSON reply as the
    reward, and ends the episode: one round of the game is one episode.

    Optional ``env_config`` keys (all have defaults, so ``{}`` works):
        server_url (str): endpoint the moves are POSTed to.
        opponent_action (str): player 2's fixed move; one of ``ACTION_NAMES``.
        request_timeout (float): seconds to wait for the server before failing.
    """

    # Defaults used when ``env_config`` does not override them.
    DEFAULT_SERVER_URL = "http://localhost:3000/play"
    DEFAULT_OPPONENT_ACTION = "Rock"
    DEFAULT_REQUEST_TIMEOUT = 10  # seconds

    # Integer action i corresponds to ACTION_NAMES[i]; the names are what is
    # sent to the server.
    ACTION_NAMES = ("Rock", "Paper", "Scissors")
    _ACTION_TO_INT = dict(zip(ACTION_NAMES, range(len(ACTION_NAMES))))
    _INT_TO_ACTION = dict(enumerate(ACTION_NAMES))

    done = False

    def __init__(self, env_config):
        """Set up the spaces and read optional settings from ``env_config``.

        Args:
            env_config: dict-like configuration (may be empty or ``None``).

        Raises:
            ValueError: if ``opponent_action`` is not one of ``ACTION_NAMES``.
        """
        config = env_config or {}
        self.server_url = config.get("server_url", self.DEFAULT_SERVER_URL)
        self.opponent_action = config.get(
            "opponent_action", self.DEFAULT_OPPONENT_ACTION
        )
        self.request_timeout = config.get(
            "request_timeout", self.DEFAULT_REQUEST_TIMEOUT
        )
        if self.opponent_action not in self._ACTION_TO_INT:
            raise ValueError(
                f"opponent_action must be one of {self.ACTION_NAMES}, "
                f"got {self.opponent_action!r}"
            )

        # Note: Our action space is for ONE player; namely, exactly three
        # choices: Rock, Paper, Scissors.
        self.action_space = gym.spaces.Discrete(3)

        # Observe the last moves of the two players.
        self.observation_space = gym.spaces.Tuple(
            (gym.spaces.Discrete(3), gym.spaces.Discrete(3))
        )

        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation ``(0, 0)``."""
        self.done = False
        return (0, 0)

    def action_to_int(self, action):
        """Return the integer code for an action name, or ``None`` if unknown."""
        return self._ACTION_TO_INT.get(action)

    def int_to_action(self, i):
        """Return the action name for an integer code, or ``None`` if unknown."""
        return self._INT_TO_ACTION.get(i)

    def step(self, action):
        """Play one round against the fixed opponent and end the episode.

        Args:
            action: player 1's move; must be 0, 1 or 2.

        Returns:
            A list ``[obs, reward, done, info]`` where ``obs`` is the pair of
            integer moves (player 1, player 2), ``reward`` is the server's
            ``"player1Payoff"``, ``done`` is always ``True`` and ``info`` is
            the full decoded JSON reply.

        Raises:
            AssertionError: if ``action`` is not 0, 1 or 2.
            requests.RequestException: if the server cannot be reached in
                time or answers with an HTTP error status.
        """
        # Validate before the action is used for anything else.
        assert action in [0, 1, 2], "Unknown action!"

        player1Action = self.int_to_action(action)
        player2Action = self.opponent_action

        # We are done once "step" is called; a round of the game is a single
        # episode.
        self.done = True

        data = {
            "player1Action": player1Action,
            "player2Action": player2Action,
        }

        # Post the moves to the server and get the payoffs. The timeout keeps
        # training from hanging forever if the server is down, and
        # raise_for_status surfaces HTTP errors instead of a confusing
        # JSON/KeyError further down.
        http_response = requests.post(
            self.server_url, json=data, timeout=self.request_timeout
        )
        http_response.raise_for_status()
        response = http_response.json()

        reward = response["player1Payoff"]

        obs = (
            self.action_to_int(player1Action),
            self.action_to_int(player2Action),
        )

        return [obs, reward, self.done, response]

In [None]:
# Build one environment instance by hand (empty config) for a quick manual check.
env = IteratedRPSEnv(env_config={})

In [None]:
# Smoke test: play action 0 once; the cell displays [obs, reward, done, info].
# This posts to the game server, so the server must be running.
env.step(0)

### Let's try training it!

In [None]:
# Settings handed to RLlib's PPO trainer; "env_config" is what gets passed to
# IteratedRPSEnv.__init__.
ppo_config = {
    "framework": "tf2",
    "num_workers": 1,
    "env_config": {},
    "create_env_on_driver": True,
}

trainer = PPOTrainer(env=IteratedRPSEnv, config=ppo_config)

In [None]:
# Run a fixed number of PPO training iterations, reporting progress as we go.
for i in range(10):
    print(f"Training loop {i}")
    result = trainer.train()
    # train() returns a metrics dict; show the mean episode reward so it is
    # visible whether the policy is improving (prints None if the key is absent).
    print(f"  episode_reward_mean: {result.get('episode_reward_mean')}")

In [None]:
# Evaluate the trained policy; the result is shown as the cell's output.
evaluation_results = trainer.evaluate()
evaluation_results

In [None]:
# Then, open TensorBoard:
# cd ~/ray_results && conda activate rlib-client && tensorboard --logdir .