In [23]:

from pettingzoo.mpe import simple_tag_v3
from pettingzoo import ParallelEnv
import numpy as np
import functools
import os
import sys
sys.path.append('/home/mariusvaardal/AAMAS_project/AAMAS_project')

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold


In [24]:
# Scenario configuration. Later cells forward these values to
# CustomEnvironment(...) as the keyword arguments named in the trailing comments.
NUM_GOOD = 1  # -> num_good
NUM_ADV = 4  # -> num_adversaries
NUM_OBST = 0  # -> num_obstacles
MAX_CYCLES = 200  # -> max_cycles
CONTINOUS_ACTIONS = False  # -> continuous_actions (name is misspelled; kept because other cells use it)
RENDER_MODE = None  # -> render_mode (one rollout cell passes 'human' directly instead)

In [25]:
def remove_agent_0_from_dicts(dicts):
    """Strip the ``'agent_0'`` entry from every dictionary in ``dicts``.

    The dictionaries are modified in place (callers receive the very same
    objects back), and a new list holding them in their original order is
    returned so the result can be tuple-unpacked.

    Args:
        dicts: Iterable of mutable mappings keyed by agent name.

    Returns:
        list: The same mapping objects, each without an ``'agent_0'`` key.
    """
    stripped = []
    for agent_dict in dicts:
        # pop() with a default tolerates mappings that never had (or no longer
        # have) an 'agent_0' entry instead of raising KeyError.
        agent_dict.pop('agent_0', None)
        stripped.append(agent_dict)
    return stripped

In [1]:
from agent_types.AvoidingAgent import AvoidingAgent
# from agent_types.AvoidingNearestAdversaryAgent import AvoidingNearestAdversaryAgent
# from agent_types.ImmobileAgent import ImmobileAgent

class CustomEnvironment(ParallelEnv):
    """Adversary-only view of the ``simple_tag_v3`` parallel environment.

    The wrapped environment still contains the good agent ``agent_0``, but its
    action is produced internally by a scripted ``AvoidingAgent``. Callers only
    supply actions for, and only receive data about, the ``adversary_*`` agents.
    """

    metadata = {
        "name": "custom_environment_v0",
    }

    def __init__(self, num_good, num_adversaries, num_obstacles, max_cycles, continuous_actions, render_mode):
        """Build the wrapped environment and the scripted good agent.

        All arguments are forwarded unchanged to ``simple_tag_v3.parallel_env``.
        ``num_adversaries`` and ``num_obstacles`` are additionally used to
        configure the scripted agent.
        """
        self.env = simple_tag_v3.parallel_env(
            num_good=num_good,
            num_adversaries=num_adversaries,
            num_obstacles=num_obstacles,
            max_cycles=max_cycles,
            continuous_actions=continuous_actions,
            render_mode=render_mode,
        )
        # Reset once up front; presumably needed so that `self.env.agents`
        # is populated before it is read below -- TODO confirm.
        self.env.reset()
        # Setting all the required attributes: only adversaries are exposed.
        self.agents = [agent for agent in self.env.agents if agent.startswith("adversary")]
        self.possible_agents = [adv for adv in self.env.possible_agents if adv.startswith("adversary")]
        self.render_mode = render_mode
        # agent_0 is part of the environment and is not meant to be trained.
        # It is configured from the constructor arguments (not from notebook
        # globals) so it always matches the environment that was actually built.
        self.agent_0 = AvoidingAgent('agent_0', num_adversaries=num_adversaries, num_landmarks=num_obstacles)

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Returns:
            ``(observations, infos)`` with the ``agent_0`` entries removed.
        """
        observations, infos = self.env.reset(seed=seed, options=options)
        # Let the scripted agent see its first observation before hiding it.
        self.agent_0.see(observations[self.agent_0.name])
        observations, infos = remove_agent_0_from_dicts([observations, infos])
        return observations, infos

    def step(self, actions):
        """Advance one step using the adversaries' ``actions``.

        The scripted agent's action is merged into a copy of ``actions`` so the
        caller's dictionary is left untouched.

        Returns:
            ``(observations, rewards, terminations, truncations, infos)``.
            When the wrapped environment returns non-empty observations, the
            ``agent_0`` entries are removed from all five dictionaries;
            otherwise the dictionaries are returned as received.
        """
        joint_actions = dict(actions)
        joint_actions['agent_0'] = self.agent_0.get_action()
        observations, rewards, terminations, truncations, infos = self.env.step(joint_actions)
        if observations:
            self.agent_0.see(observations[self.agent_0.name])
            observations, rewards, terminations, truncations, infos = remove_agent_0_from_dicts(
                [observations, rewards, terminations, truncations, infos]
            )
        return observations, rewards, terminations, truncations, infos

    def render(self):
        """Render the wrapped environment and pass through whatever it returns."""
        return self.env.render()

    def close(self):
        """Close the wrapped environment so its resources are released."""
        self.env.close()

    # Observation space should be defined here.
    # lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def observation_space(self, agent):
        """Return the wrapped environment's observation space for ``agent``."""
        return self.env.observation_space(agent)

    # Action space should be defined here.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def action_space(self, agent):
        """Return the wrapped environment's action space for ``agent``."""
        return self.env.action_space(agent)


ModuleNotFoundError: No module named 'agent_types'

In [33]:
# Visual sanity check: run a single rendered episode in which every
# adversary acts uniformly at random.
env = CustomEnvironment(
    num_good=NUM_GOOD,
    num_adversaries=NUM_ADV,
    num_obstacles=NUM_OBST,
    max_cycles=MAX_CYCLES,
    continuous_actions=CONTINOUS_ACTIONS,
    render_mode='human',
)
observations, infos = env.reset()

timestep = 1
terminated = False
while not terminated:
    # this is where you would insert your policy
    actions = {
        adversary: env.action_space(adversary).sample()
        for adversary in env.agents
    }
    observations, rewards, terminations, truncations, infos = env.step(actions)
    # An empty observation dict is treated as the end of the episode.
    terminated = not observations
env.close()

In [27]:
# Build the (non-rendered, unless RENDER_MODE says otherwise) training
# environment and seed its first reset.
env = CustomEnvironment(
    num_good=NUM_GOOD,
    num_adversaries=NUM_ADV,
    num_obstacles=NUM_OBST,
    max_cycles=MAX_CYCLES,
    continuous_actions=CONTINOUS_ACTIONS,
    render_mode=RENDER_MODE,
)
env.reset(seed=45)
# Convert the multi-agent env to a vector env, then stack 2 copies of it in
# the current process in a form Stable-Baselines3 accepts.
conv_env = ss.concat_vec_envs_v1(
    ss.pettingzoo_env_to_vec_env_v1(env),
    2,
    num_cpus=0,
    base_class="stable_baselines3",
)


In [53]:
log_path = "./logs/4_adv_0_to_50M_steps"

# Fresh PPO learner on the vectorised environment, with TensorBoard
# summaries written under `log_path`.
model = PPO(
    policy=MlpPolicy,
    env=conv_env,
    learning_rate=1e-3,
    batch_size=256,
    verbose=3,
    tensorboard_log=log_path,
)

Using cpu device


In [48]:
# Alternative to training from scratch: resume from the "106M_ANAA"
# checkpoint of the 3-adversary runs, attached to the current environment.
model_path = "./models/3_adv"
model = PPO.load(
    os.path.join(model_path, "106M_ANAA"),
    env=conv_env,
)

In [9]:
# Alternative to training from scratch: resume from the "best_model"
# checkpoint of the 3-adversary runs, attached to the current environment.
model_path = "./models/3_adv"
model = PPO.load(
    os.path.join(model_path, "best_model"),
    env=conv_env,
)

In [49]:
# Values handed to PPO.load via `custom_objects`; presumably this replaces
# the learning rate stored in the checkpoint -- TODO confirm against SB3 docs.
custom_object = dict(learning_rate=1e-4)

model = PPO.load(
    './models/4_adv/best_model_30M/best_model',
    env=conv_env,
    custom_objects=custom_object,
)

In [54]:
# Fresh PPO learner on the vectorised environment (no TensorBoard logging).
model = PPO(
    policy=MlpPolicy,
    env=conv_env,
    learning_rate=1e-3,
    batch_size=256,
    verbose=3,
)

Using cpu device


In [55]:
# Short training run followed by saving the resulting model.
# NOTE(review): the captured log for this cell reports total_timesteps = 16384
# although only 1_000 were requested -- presumably PPO always completes a full
# rollout across all vectorised envs before checking the budget; confirm
# against the SB3 docs. The "1k_steps" file name is therefore approximate.
model.learn(total_timesteps=1_000)
model.save("./models/4_adv/4_adv_1k_steps")

------------------------------
| time/              |       |
|    fps             | 4772  |
|    iterations      | 1     |
|    time_elapsed    | 3     |
|    total_timesteps | 16384 |
------------------------------


In [52]:
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

# Target directory for EvalCallback's best model (only used by the disabled
# callback set-up below).
best_model_save_path = "./models/4_adv/"

# Callback set-up, currently disabled.
# # Save a checkpoint every 500_000 callback steps
# checkpoint_callback = CheckpointCallback(
#                              save_freq=500_000,
#                              save_path="./models/4_adv",
#                              name_prefix="4_adv",
#                             )
# eval_callback = EvalCallback(conv_env,
#                              eval_freq=10_000,
#                              best_model_save_path=best_model_save_path,
#                              log_path=log_path,
#                              n_eval_episodes=100,
#                              verbose=1)

# callback = CallbackList([eval_callback])

# Short smoke-test run (1000 requested timesteps, no callbacks attached).
# Kept as the last expression so the returned model is displayed.
model.learn(total_timesteps=1000)

Logging to ./logs/4_adv_0_to_50M_steps/PPO_7


------------------------------
| time/              |       |
|    fps             | 4383  |
|    iterations      | 1     |
|    time_elapsed    | 3     |
|    total_timesteps | 16384 |
------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7eff5ec730d0>

In [48]:
# Development aid (IPython-only syntax): display the signature/source of PPO.load.
PPO.load??

[0;31mSignature:[0m
[0mPPO[0m[0;34m.[0m[0mload[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mpathlib[0m[0;34m.[0m[0mPath[0m[0;34m,[0m [0mio[0m[0;34m.[0m[0mBufferedIOBase[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mgymnasium[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mEnv[0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'VecEnv'[0m[0;34m)[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mdevice[0m[0;34m,[0m [0mstr[0m[0;34m][0m [0;34m=[0m [0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcustom_objects[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0

In [56]:

# Score the current `model` over 500 episodes on `conv_env`, the same
# vectorised environment object the model was built/loaded with.
# The displayed result is a 2-tuple -- presumably (mean episode reward,
# standard deviation); confirm against the SB3 evaluation docs.
evaluate_policy(model=model, env=conv_env, n_eval_episodes=500)

(841.34, 95.57617067030881)

In [28]:
# Roll out one episode with the current `model` choosing every adversary's
# action, and accumulate the per-step reward averaged over NUM_ADV.
RENDER_MODE = None
MAX_CYCLES = 200
env = CustomEnvironment(
    num_good=NUM_GOOD,
    num_adversaries=NUM_ADV,
    num_obstacles=NUM_OBST,
    max_cycles=MAX_CYCLES,
    continuous_actions=CONTINOUS_ACTIONS,
    render_mode=RENDER_MODE,
)
observations, infos = env.reset()

timestep = 1
episode_reward = 0
terminated = False
print(env.agents)
while not terminated:
    # this is where you would insert your policy
    # model.predict returns a tuple; its first element is used as the action.
    actions = {
        adversary: model.predict(observations[adversary])[0]
        for adversary in env.agents
    }

    observations, rewards, terminations, truncations, infos = env.step(actions)

    episode_reward += sum(rewards.values()) / NUM_ADV

    # An empty observation dict is treated as the end of the episode.
    terminated = not observations

print(f"Episode reward: {episode_reward}")
env.close()

['adversary_0', 'adversary_1', 'adversary_2']
Episode reward: 710.0


In [25]:
# Development aid: print the numbered IPython input history for inputs 1-20.
%history -n 1-20

   1:

from pettingzoo.mpe import simple_tag_v3
from pettingzoo import ParallelEnv
import numpy as np
import functools
import os
import sys
sys.path.append('/home/mariusvaardal/AAMAS_project/AAMAS_project')

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

from stable_baselines3.common.evaluation import evaluate_policy
   2:
NUM_GOOD = 1
NUM_ADV = 3
NUM_OBST = 0
MAX_CYCLES = 50
CONTINOUS_ACTIONS = False
   3:
def remove_agent_0_from_dicts(dicts):
    ret = []
    for dict in dicts:
        del dict['agent_0']
        ret.append(dict)
    return ret
   4:
# from agent_types.AvoidingAgent import AvoidingAgent
# from agent_types.AvoidingNearestAdversaryAgent import AvoidingNearestAdversaryAgent
from agent_types.ImmobileAgent import ImmobileAgent

class CustomEnvironment(ParallelEnv):
    metadata = {
        "name": "custom_environment_v0",
    }

    def __init__(self, num_good, num_adversaries, num_obstacles, max_cycles, continuous_ac

In [None]:
# from agent_types.AvoidingAgent import AvoidingAgent
# from agent_types.AvoidingNearestAdversaryAgent import AvoidingNearestAdversaryAgent
from agent_types.ImmobileAgent import ImmobileAgent

class CustomEnvironment(ParallelEnv):
    """Adversary-only view of the ``simple_tag_v3`` parallel environment.

    The wrapped environment still contains the good agent ``agent_0``, but its
    action is produced internally by a scripted ``ImmobileAgent``. Callers only
    supply actions for, and only receive data about, the ``adversary_*`` agents.
    """

    metadata = {
        "name": "custom_environment_v0",
    }

    def __init__(self, num_good, num_adversaries, num_obstacles, max_cycles, continuous_actions, render_mode):
        """Build the wrapped environment and the scripted good agent.

        All arguments are forwarded unchanged to ``simple_tag_v3.parallel_env``.
        ``num_adversaries`` and ``num_obstacles`` are additionally used to
        configure the scripted agent.
        """
        self.env = simple_tag_v3.parallel_env(
            num_good=num_good,
            num_adversaries=num_adversaries,
            num_obstacles=num_obstacles,
            max_cycles=max_cycles,
            continuous_actions=continuous_actions,
            render_mode=render_mode,
        )
        # Reset once up front; presumably needed so that `self.env.agents`
        # is populated before it is read below -- TODO confirm.
        self.env.reset()
        # Setting all the required attributes: only adversaries are exposed.
        self.agents = [agent for agent in self.env.agents if agent.startswith("adversary")]
        self.possible_agents = [adv for adv in self.env.possible_agents if adv.startswith("adversary")]
        self.render_mode = render_mode
        # agent_0 is part of the environment and is not meant to be trained.
        # It is configured from the constructor arguments (not from notebook
        # globals) so it always matches the environment that was actually built.
        self.agent_0 = ImmobileAgent('agent_0', num_adversaries=num_adversaries, num_landmarks=num_obstacles)

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Returns:
            ``(observations, infos)`` with the ``agent_0`` entries removed.
        """
        observations, infos = self.env.reset(seed=seed, options=options)
        # Let the scripted agent see its first observation before hiding it.
        self.agent_0.see(observations[self.agent_0.name])
        observations, infos = remove_agent_0_from_dicts([observations, infos])
        return observations, infos

    def step(self, actions):
        """Advance one step using the adversaries' ``actions``.

        The scripted agent's action is merged into a copy of ``actions`` so the
        caller's dictionary is left untouched.

        Returns:
            ``(observations, rewards, terminations, truncations, infos)``.
            When the wrapped environment returns non-empty observations, the
            ``agent_0`` entries are removed from all five dictionaries;
            otherwise the dictionaries are returned as received.
        """
        joint_actions = dict(actions)
        joint_actions['agent_0'] = self.agent_0.get_action()
        observations, rewards, terminations, truncations, infos = self.env.step(joint_actions)
        if observations:
            self.agent_0.see(observations[self.agent_0.name])
            observations, rewards, terminations, truncations, infos = remove_agent_0_from_dicts(
                [observations, rewards, terminations, truncations, infos]
            )
        return observations, rewards, terminations, truncations, infos

    def render(self):
        """Render the wrapped environment and pass through whatever it returns."""
        return self.env.render()

    def close(self):
        """Close the wrapped environment so its resources are released."""
        self.env.close()

    # Observation space should be defined here.
    # lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def observation_space(self, agent):
        """Return the wrapped environment's observation space for ``agent``."""
        return self.env.observation_space(agent)

    # Action space should be defined here.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def action_space(self, agent):
        """Return the wrapped environment's action space for ``agent``."""
        return self.env.action_space(agent)
