In [26]:

from pettingzoo.mpe import simple_tag_v3
from pettingzoo import ParallelEnv
import numpy as np
import functools
import os
import sys
sys.path.append('/home/mariusvaardal/AAMAS_project/AAMAS_project')

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold


In [27]:
# Scenario configuration forwarded to CustomEnvironment / simple_tag_v3.parallel_env.
NUM_GOOD = 1      # passed as num_good
NUM_ADV = 3       # passed as num_adversaries
NUM_OBST = 0      # passed as num_obstacles
MAX_CYCLES = 50   # passed as max_cycles
CONTINUOUS_ACTIONS = False  # passed as continuous_actions
# Misspelled name kept as an alias so cells that still reference it keep working.
CONTINOUS_ACTIONS = CONTINUOUS_ACTIONS
RENDER_MODE = None  # passed as render_mode; a later cell sets it to "human"

In [28]:
def remove_agent_0_from_dicts(dicts):
    """Return copies of the given per-agent dicts without the 'agent_0' entry.

    Args:
        dicts: Iterable of dict-like objects keyed by agent name (for example
            the observations / rewards / infos dicts returned by the wrapped
            environment).

    Returns:
        A list with one shallow copy per input dict, in the same order, each
        with the 'agent_0' key removed. The inputs themselves are left
        untouched, and a dict that has no 'agent_0' key is copied as-is
        instead of raising KeyError.
    """
    cleaned = []
    for agent_dict in dicts:
        # .copy() keeps the concrete mapping type (e.g. defaultdict) intact.
        without_agent_0 = agent_dict.copy()
        without_agent_0.pop('agent_0', None)
        cleaned.append(without_agent_0)
    return cleaned

In [31]:
# from agent_types.AvoidingAgent import AvoidingAgent
# from agent_types.AvoidingNearestAdversaryAgent import AvoidingNearestAdversaryAgent
from agent_types.ImmobileAgent import ImmobileAgent

class CustomEnvironment(ParallelEnv):
    """Adversary-only view of the ``simple_tag_v3`` parallel environment.

    The wrapped environment still contains the agent named ``agent_0``, but it
    is driven internally by an ``ImmobileAgent`` and removed from every dict
    handed back to the caller. Only agents whose name starts with
    ``"adversary"`` are listed in ``agents`` / ``possible_agents``.
    """

    metadata = {
        "name": "custom_environment_v0",
    }

    def __init__(self, num_good, num_adversaries, num_obstacles, max_cycles, continuous_actions, render_mode):
        """Create the wrapped simple_tag_v3 environment and the scripted agent_0.

        All arguments are forwarded unchanged to ``simple_tag_v3.parallel_env``.
        """
        self.env = simple_tag_v3.parallel_env(
            num_good=num_good,
            num_adversaries=num_adversaries,
            num_obstacles=num_obstacles,
            max_cycles=max_cycles,
            continuous_actions=continuous_actions,
            render_mode=render_mode,
        )
        # Reset once before reading self.env.agents below.
        self.env.reset()
        # Setting all the required attributes: only adversaries are exposed.
        self.agents = [agent for agent in self.env.agents if agent.startswith("adversary")]
        self.possible_agents = [adv for adv in self.env.possible_agents if adv.startswith("adversary")]
        self.render_mode = render_mode
        # agent_0 is part of the environment and is not meant to be trained.
        # It is configured from the constructor arguments (not module-level
        # constants) so it always matches the wrapped environment.
        self.agent_0 = ImmobileAgent('agent_0', num_adversaries=num_adversaries, num_landmarks=num_obstacles)

    def reset(self, seed=None, options=None):
        """Reset the wrapped env and return (observations, infos) without agent_0."""
        observations, infos = self.env.reset(seed=seed, options=options)
        # Let the scripted agent see its own observation before it is stripped.
        self.agent_0.see(observations[self.agent_0.name])
        observations, infos = remove_agent_0_from_dicts([observations, infos])
        return observations, infos

    def step(self, actions):
        """Step the wrapped env with the given adversary actions plus agent_0's action.

        Args:
            actions: dict mapping adversary name to action. It is not modified.

        Returns:
            (observations, rewards, terminations, truncations, infos) with the
            'agent_0' entry removed. When the wrapped env returns empty
            observations, its dicts are passed through unchanged.
        """
        # Copy so that the caller's dict does not silently gain an 'agent_0' key.
        joint_actions = dict(actions)
        joint_actions['agent_0'] = self.agent_0.get_action()
        observations, rewards, terminations, truncations, infos = self.env.step(joint_actions)
        if observations:
            self.agent_0.see(observations[self.agent_0.name])
            observations, rewards, terminations, truncations, infos = remove_agent_0_from_dicts(
                [observations, rewards, terminations, truncations, infos]
            )
        return observations, rewards, terminations, truncations, infos

    def render(self):
        """Render the wrapped env and return whatever its render() returns."""
        return self.env.render()

    def close(self):
        """Close the wrapped env so its resources are released."""
        self.env.close()

    # Observation space should be defined here.
    # lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def observation_space(self, agent):
        """Return the wrapped env's observation space for ``agent``."""
        return self.env.observation_space(agent)

    # Action space should be defined here.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def action_space(self, agent):
        """Return the wrapped env's action space for ``agent``."""
        return self.env.action_space(agent)


In [32]:
# Relative directory for logs; later cells pass it to PPO (tensorboard_log) and
# EvalCallback (log_path). The PPO cell assigns the same value again.
log_path = "../../logs/log"

In [33]:
# Sanity check: play one rendered episode in which every adversary samples a
# random action from its action space.
env = CustomEnvironment(num_good=NUM_GOOD, num_adversaries=NUM_ADV, num_obstacles=NUM_OBST, max_cycles=MAX_CYCLES, continuous_actions=CONTINOUS_ACTIONS, render_mode='human')
try:
    observations, infos = env.reset()
    # step() hands back empty observations once the episode is over.
    while observations:
        # this is where you would insert your policy
        actions = {agent: env.action_space(agent).sample() for agent in env.agents}
        observations, rewards, terminations, truncations, infos = env.step(actions)
finally:
    # Always release the environment, even if the cell is interrupted.
    env.close()

In [34]:
# Build the training environment; rendering is controlled by RENDER_MODE.
env = CustomEnvironment(num_good=NUM_GOOD, num_adversaries=NUM_ADV, num_obstacles=NUM_OBST, max_cycles=MAX_CYCLES, continuous_actions=CONTINOUS_ACTIONS, render_mode=RENDER_MODE)
env.reset(seed=45)
# Convert the PettingZoo parallel env into a vector env with SuperSuit, then
# concatenate 2 copies of it in the form Stable-Baselines3 expects.
# NOTE(review): presumably each adversary becomes one sub-environment (the
# training log advances 12288 timesteps per iteration) — confirm against SuperSuit docs.
conv_env = ss.pettingzoo_env_to_vec_env_v1(env)
conv_env = ss.concat_vec_envs_v1(conv_env, 2, num_cpus=0, base_class="stable_baselines3")


In [35]:
# Directory passed to PPO as tensorboard_log (the run below reports
# "Logging to ../../logs/log/PPO_1").
log_path = "../../logs/log"

# One PPO model with an MLP policy is trained on the vectorised adversary env.
# NOTE(review): Stable-Baselines3 documents verbose levels 0-2; 3 presumably
# behaves like 2 — confirm.
model = PPO(
        MlpPolicy,
        conv_env,
        verbose=3,
        learning_rate=1e-3,
        batch_size=256,
        tensorboard_log=log_path,
    )

Using cpu device


In [24]:
best_model_save_path = "../../best_model/best_model"

# Evaluate on a dedicated environment instead of the training env: running
# evaluation episodes on `conv_env` would reset it in the middle of PPO's
# rollout collection. The eval env is built like the training env,
# with rendering disabled and a single copy.
eval_env = CustomEnvironment(num_good=NUM_GOOD, num_adversaries=NUM_ADV, num_obstacles=NUM_OBST, max_cycles=MAX_CYCLES, continuous_actions=CONTINOUS_ACTIONS, render_mode=None)
eval_env = ss.pettingzoo_env_to_vec_env_v1(eval_env)
eval_env = ss.concat_vec_envs_v1(eval_env, 1, num_cpus=0, base_class="stable_baselines3")

# Stop training as soon as a new best evaluation reaches the reward threshold.
threshold_callback = StopTrainingOnRewardThreshold(reward_threshold=300, verbose=1)
# Periodically evaluate the current policy and save the best model so far.
eval_callback = EvalCallback(eval_env,
                             eval_freq=10_000,
                             best_model_save_path=best_model_save_path,
                             callback_on_new_best=threshold_callback,
                             log_path=log_path,
                             n_eval_episodes=10,
                             verbose=1)
model.learn(total_timesteps=5_000_000, callback=eval_callback)

Logging to ../../logs/log/PPO_1


------------------------------
| time/              |       |
|    fps             | 4386  |
|    iterations      | 1     |
|    time_elapsed    | 2     |
|    total_timesteps | 12288 |
------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 3762        |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.009332046 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.6        |
|    explained_variance   | -0.00683    |
|    learning_rate        | 0.001       |
|    loss                 | 6.18        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00498    |
|    value_loss           | 13.9        |
-----------------------------------------
---------------------------



New best mean reward!
------------------------------
| time/              |       |
|    fps             | 3360  |
|    iterations      | 5     |
|    time_elapsed    | 18    |
|    total_timesteps | 61440 |
------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 3326         |
|    iterations           | 6            |
|    time_elapsed         | 22           |
|    total_timesteps      | 73728        |
| train/                  |              |
|    approx_kl            | 0.0074867937 |
|    clip_fraction        | 0.0768       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.55        |
|    explained_variance   | 0.154        |
|    learning_rate        | 0.001        |
|    loss                 | 15           |
|    n_updates            | 50           |
|    policy_gradient_loss | -0.00412     |
|    value_loss           | 31           |
------------------------------

KeyboardInterrupt: 

In [32]:
# Drops the `model` name from the kernel namespace.
# NOTE(review): later cells call evaluate_policy(model=model, ...) and
# model.predict(...); after this cell runs they fail with NameError unless the
# model is recreated first.
del model

In [59]:
# Evaluate the policy for 10 episodes on the vectorised env; the displayed
# tuple is presumably (mean reward, standard deviation) — see the
# Stable-Baselines3 evaluate_policy docs.
# NOTE(review): needs `model` to exist; the preceding cell deletes it.
evaluate_policy(model=model, env=conv_env, n_eval_episodes=10)

(167.0, 48.590122453025366)

In [90]:
# Play one rendered episode in which every adversary acts according to the
# trained model, and report the reward collected over the episode.
RENDER_MODE = "human"
env = CustomEnvironment(num_good=NUM_GOOD, num_adversaries=NUM_ADV, num_obstacles=NUM_OBST, max_cycles=MAX_CYCLES, continuous_actions=CONTINOUS_ACTIONS, render_mode=RENDER_MODE)

# Total reward of the episode, summed over all adversaries and all steps.
episode_reward = 0
try:
    observations, infos = env.reset()
    # step() hands back empty observations once the episode is over.
    while observations:
        # The same model picks the action for every adversary from that
        # adversary's own observation.
        actions = {agent: model.predict(observations[agent])[0] for agent in env.agents}
        observations, rewards, terminations, truncations, infos = env.step(actions)
        # Previously the counter was never updated, so the cell always printed 0.
        episode_reward += sum(rewards.values())
finally:
    # Always release the environment, even if the cell is interrupted.
    env.close()

print(f"Episode reward: {episode_reward}")

: 

In [19]:
# Ad-hoc check: query the policy with a hand-written 12-element observation.
# NOTE(review): presumably 12 matches the adversary observation size for this
# scenario — confirm. predict() returns a tuple; [0] is the chosen action.
obs = [1, 2, 2, 1,1, 2, 2, 1,1, 2, 2, 1]
model.predict(obs)[0]

array(2)

In [25]:
# Debugging aid: print the IPython input history for session lines 1-20 with
# line numbers (-n). Not needed for the analysis itself.
%history -n 1-20

   1:

from pettingzoo.mpe import simple_tag_v3
from pettingzoo import ParallelEnv
import numpy as np
import functools
import os
import sys
sys.path.append('/home/mariusvaardal/AAMAS_project/AAMAS_project')

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

from stable_baselines3.common.evaluation import evaluate_policy
   2:
NUM_GOOD = 1
NUM_ADV = 3
NUM_OBST = 0
MAX_CYCLES = 50
CONTINOUS_ACTIONS = False
   3:
def remove_agent_0_from_dicts(dicts):
    ret = []
    for dict in dicts:
        del dict['agent_0']
        ret.append(dict)
    return ret
   4:
# from agent_types.AvoidingAgent import AvoidingAgent
# from agent_types.AvoidingNearestAdversaryAgent import AvoidingNearestAdversaryAgent
from agent_types.ImmobileAgent import ImmobileAgent

class CustomEnvironment(ParallelEnv):
    metadata = {
        "name": "custom_environment_v0",
    }

    def __init__(self, num_good, num_adversaries, num_obstacles, max_cycles, continuous_ac

In [None]:
# from agent_types.AvoidingAgent import AvoidingAgent
# from agent_types.AvoidingNearestAdversaryAgent import AvoidingNearestAdversaryAgent
from agent_types.ImmobileAgent import ImmobileAgent

# NOTE(review): CustomEnvironment is also defined in an earlier cell of this
# notebook; whichever definition is executed last is the one in effect.
class CustomEnvironment(ParallelEnv):
    """Adversary-only view of the ``simple_tag_v3`` parallel environment.

    The wrapped environment still contains the agent named ``agent_0``, but it
    is driven internally by an ``ImmobileAgent`` and removed from every dict
    handed back to the caller. Only agents whose name starts with
    ``"adversary"`` are listed in ``agents`` / ``possible_agents``.
    """

    metadata = {
        "name": "custom_environment_v0",
    }

    def __init__(self, num_good, num_adversaries, num_obstacles, max_cycles, continuous_actions, render_mode):
        """Create the wrapped simple_tag_v3 environment and the scripted agent_0.

        All arguments are forwarded unchanged to ``simple_tag_v3.parallel_env``.
        """
        self.env = simple_tag_v3.parallel_env(
            num_good=num_good,
            num_adversaries=num_adversaries,
            num_obstacles=num_obstacles,
            max_cycles=max_cycles,
            continuous_actions=continuous_actions,
            render_mode=render_mode,
        )
        # Reset once before reading self.env.agents below.
        self.env.reset()
        # Setting all the required attributes: only adversaries are exposed.
        self.agents = [agent for agent in self.env.agents if agent.startswith("adversary")]
        self.possible_agents = [adv for adv in self.env.possible_agents if adv.startswith("adversary")]
        self.render_mode = render_mode
        # agent_0 is part of the environment and is not meant to be trained.
        # It is configured from the constructor arguments (not module-level
        # constants) so it always matches the wrapped environment.
        self.agent_0 = ImmobileAgent('agent_0', num_adversaries=num_adversaries, num_landmarks=num_obstacles)

    def reset(self, seed=None, options=None):
        """Reset the wrapped env and return (observations, infos) without agent_0."""
        observations, infos = self.env.reset(seed=seed, options=options)
        # Let the scripted agent see its own observation before it is stripped.
        self.agent_0.see(observations[self.agent_0.name])
        observations, infos = remove_agent_0_from_dicts([observations, infos])
        return observations, infos

    def step(self, actions):
        """Step the wrapped env with the given adversary actions plus agent_0's action.

        Args:
            actions: dict mapping adversary name to action. It is not modified.

        Returns:
            (observations, rewards, terminations, truncations, infos) with the
            'agent_0' entry removed. When the wrapped env returns empty
            observations, its dicts are passed through unchanged.
        """
        # Copy so that the caller's dict does not silently gain an 'agent_0' key.
        joint_actions = dict(actions)
        joint_actions['agent_0'] = self.agent_0.get_action()
        observations, rewards, terminations, truncations, infos = self.env.step(joint_actions)
        if observations:
            self.agent_0.see(observations[self.agent_0.name])
            observations, rewards, terminations, truncations, infos = remove_agent_0_from_dicts(
                [observations, rewards, terminations, truncations, infos]
            )
        return observations, rewards, terminations, truncations, infos

    def render(self):
        """Render the wrapped env and return whatever its render() returns."""
        return self.env.render()

    def close(self):
        """Close the wrapped env so its resources are released."""
        self.env.close()

    # Observation space should be defined here.
    # lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def observation_space(self, agent):
        """Return the wrapped env's observation space for ``agent``."""
        return self.env.observation_space(agent)

    # Action space should be defined here.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def action_space(self, agent):
        """Return the wrapped env's action space for ``agent``."""
        return self.env.action_space(agent)
