In [None]:
import gym, ray
import numpy as np
import copy
import requests
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.env.multi_agent_env import MultiAgentEnv

In [None]:
# Start Ray in local mode (everything runs in this process, which makes
# debugging the environment easier). `ignore_reinit_error=True` keeps this
# cell re-runnable: without it, executing the cell a second time on a live
# kernel raises because Ray is already initialised.
ray.init(local_mode=True, ignore_reinit_error=True)

In [None]:
# Integer codes for the two moves a player can make in one round.
# COOPERATE is 0 and DEFECT is 1.
COOPERATE, DEFECT = range(2)

In [None]:
class IteratedPDEnv(MultiAgentEnv):
    """Two-player Prisoner's Dilemma whose payoffs come from an HTTP server.

    Agents are identified by the integers ``0`` and ``1``. On every step each
    agent picks ``0`` (cooperate) or ``1`` (defect); both choices are POSTed
    to the payoff server and the payoffs in its JSON reply become the rewards.
    Every agent observes the same pair: ``(player 1's move, player 2's move)``.

    Each episode is a single round: ``step`` marks every agent as done.

    Recognised ``env_config`` keys (all optional):
        server_url: endpoint the moves are POSTed to
            (default ``DEFAULT_SERVER_URL``).
        request_timeout: seconds to wait for the server
            (default ``DEFAULT_REQUEST_TIMEOUT``).
    """

    DEFAULT_SERVER_URL = "http://localhost:3000/play"
    DEFAULT_REQUEST_TIMEOUT = 10.0

    # Names the payoff server expects for each integer action.
    _ACTION_NAMES = {0: "Cooperate", 1: "Defect"}

    def __init__(self, env_config=None):
        """Set up agent ids, spaces and the payoff-server connection settings.

        Args:
            env_config: optional mapping of settings (see the class
                docstring). ``None`` or an empty mapping selects the defaults.
        """
        super().__init__()

        # A fresh dict per instance instead of a shared mutable default.
        env_config = env_config or {}
        self.server_url = env_config.get("server_url", self.DEFAULT_SERVER_URL)
        self.request_timeout = env_config.get(
            "request_timeout", self.DEFAULT_REQUEST_TIMEOUT
        )

        # Exactly two agents. The ids are the integers 0 and 1 so that they
        # match the keys used by reset() and step().
        self.num_agents = 2
        self._agent_ids = set(range(self.num_agents))

        # Note: the action space is for ONE player; namely, exactly two
        # choices:
        #  - Cooperate
        #  - Defect
        self.action_space = gym.spaces.Discrete(2)

        # The observation is simply the last move of both players.
        self.observation_space = gym.spaces.Tuple(
            (gym.spaces.Discrete(2), gym.spaces.Discrete(2))
        )

    def reset(self):
        """Start a new episode.

        Returns:
            dict mapping each agent id to the initial observation, which
            pretends that both players cooperated in the previous round.
        """
        opening = (COOPERATE, COOPERATE)
        return {agent_id: opening for agent_id in range(self.num_agents)}

    def int_to_action(self, i):
        """Translate an integer action into the server's action name.

        Returns:
            ``"Cooperate"`` for 0, ``"Defect"`` for 1, and ``None`` for any
            other value.
        """
        # Equality comparison (rather than a dict lookup) so that numpy
        # integer scalars are handled the same way as plain ints.
        for code, name in self._ACTION_NAMES.items():
            if i == code:
                return name
        return None

    def step(self, action_dict):
        """Play one round and fetch the payoffs from the server.

        Args:
            action_dict: mapping from agent id (0 and 1) to that agent's
                action (0 or 1).

        Returns:
            ``(obs, rewards, dones, infos)``, each a dict keyed by agent id;
            ``dones`` additionally carries the ``"__all__"`` key.

        Raises:
            AssertionError: if an action is neither 0 nor 1.
            KeyError: if an agent's action is missing from ``action_dict`` or
                a payoff is missing from the server's reply.
            requests.RequestException: if the request fails, times out, or
                the server answers with an HTTP error status.
        """
        moves = []
        for agent_id in range(self.num_agents):
            # action is either 0 or 1.
            assert action_dict[agent_id] in [0, 1], "Unknown action!"
            # Normalise to a plain int so the observation built below is a
            # tuple of ints.
            moves.append(int(action_dict[agent_id]))

        payload = {
            "player1Action": self.int_to_action(moves[0]),
            "player2Action": self.int_to_action(moves[1]),
        }

        # Do a post to the server; get the payoffs. The timeout stops a dead
        # server from hanging the caller forever, and raise_for_status()
        # surfaces HTTP errors instead of failing later on a missing key.
        response = requests.post(
            self.server_url, json=payload, timeout=self.request_timeout
        )
        response.raise_for_status()
        payoffs = response.json()

        # Every agent sees both players' moves, matching observation_space
        # (a 2-tuple) and the shape of the observations returned by reset().
        joint_moves = tuple(moves)

        obs, rew, dones, infos = {}, {}, {}, {}
        for agent_id in range(self.num_agents):
            obs[agent_id] = joint_moves
            # The server numbers players from 1.
            rew[agent_id] = payoffs[f"player{agent_id + 1}Payoff"]
            dones[agent_id] = True
            infos[agent_id] = {}
        dones["__all__"] = True

        return obs, rew, dones, infos

In [None]:
from ray.tune import register_env

def env_creator(env_config):
    """Build an IteratedPDEnv, forwarding the config the caller supplies.

    Args:
        env_config: the environment configuration passed to the creator.

    Returns:
        A new IteratedPDEnv instance.
    """
    return IteratedPDEnv(env_config)


# A throwaway instance used only to read the spaces and the agent count.
single_env = IteratedPDEnv()

# Register the creator under the name that the trainer config refers to.
env_name = "IteratedPDEnv"
register_env(env_name, env_creator)

In [None]:
# Spaces and agent count are read from the throwaway env instance.
obs_space = single_env.observation_space
act_space = single_env.action_space
num_agents = single_env.num_agents


def gen_policy():
    """Return one policy spec tuple: (policy class, obs space, action space, config).

    The policy class is left as ``None`` and the per-policy config is empty.
    """
    return (None, obs_space, act_space, {})


# One separate policy per agent, named "agent-0", "agent-1", ...
policy_graphs = {'agent-' + str(i): gen_policy() for i in range(num_agents)}


def policy_mapping_fn(agent_id, *args, **kwargs):
    """Map agent id ``n`` to the policy named ``"agent-n"``.

    Any further positional or keyword arguments are accepted and ignored, so
    the function works whether the caller passes only the agent id or also
    passes extra context (e.g. the episode and worker).
    """
    return 'agent-' + str(agent_id)

In [None]:
# Trainer configuration passed to tune.run: resource settings, learning rate,
# a small fully-connected model, the multi-agent policy setup, and the name of
# the registered environment.
config = dict(
    log_level="WARN",
    num_workers=3,
    num_cpus_for_driver=1,
    num_cpus_per_worker=1,
    lr=5e-3,
    model=dict(fcnet_hiddens=[8, 8]),
    multiagent=dict(
        policies=policy_graphs,
        policy_mapping_fn=policy_mapping_fn,
    ),
    env="IteratedPDEnv",
)

In [None]:
from ray import tune

# Experiment definition: run the 'PG' algorithm for 100 training iterations,
# checkpointing every 20 iterations.
exp_name = 'more_jail_time_yay'
exp_dict = dict(
    name=exp_name,
    run_or_experiment='PG',
    stop=dict(training_iteration=100),
    checkpoint_freq=20,
    config=config,
)

# The dict's keys are passed to tune.run as keyword arguments.
tune.run(**exp_dict)

In [None]:
# Stand-alone environment instance (empty config) for manual experiments.
env = IteratedPDEnv({})

In [None]:
# Manual smoke test: play one round in which both players cooperate.
# step() takes a dict keyed by agent id (its parameter is `action_dict`), so
# the previous call `env.step(action=0)` raised a TypeError.
env.reset()
env.step({0: COOPERATE, 1: COOPERATE})

### Let's try training it!

In [None]:
# PPO trainer built directly on the environment class: tf2 framework, one
# worker, an empty env config, and an environment created on the driver too.
trainer = PPOTrainer(
    env=IteratedPDEnv,
    config=dict(
        framework="tf2",
        num_workers=1,
        env_config={},
        create_env_on_driver=True,
    ),
)

In [None]:
# Run ten training iterations, announcing each one before it starts.
# The value returned by trainer.train() is not used.
for i in range(0, 10):
    print(f"Training loop {i}")
    trainer.train()

In [None]:
# trainer.evaluate()

In [None]:
# Then, open TensorBoard:
# cd ~/ray_results && conda activate rlib-client && tensorboard --logdir .