In [17]:

from pettingzoo.mpe import simple_tag_v3
from pettingzoo import ParallelEnv
import numpy as np
import functools
import os
import sys
from utils.utils import PROJECT_PATH
sys.path.append(PROJECT_PATH)

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold, BaseCallback


In [2]:
# Scenario configuration. These values are passed to CustomEnvironment, which
# forwards them to simple_tag_v3.parallel_env under the lower-case keyword of
# the same meaning (num_good, num_adversaries, num_obstacles, max_cycles,
# continuous_actions, render_mode).
NUM_GOOD = 1
NUM_ADV = 2
NUM_OBST = 0
MAX_CYCLES = 200
# NOTE: the name is spelled "CONTINOUS" (sic); it is kept because other cells
# reference it by this exact name.
CONTINOUS_ACTIONS = False
RENDER_MODE = None

In [3]:
def remove_agent_0_from_dicts(dicts):
    """Return copies of the given per-agent dicts without the ``'agent_0'`` entry.

    Args:
        dicts: Iterable of dict-like objects keyed by agent name (for example
            the observations / rewards / terminations / truncations / infos
            dicts returned by a parallel environment).

    Returns:
        A list with one shallow copy per input dict, in the same order, each
        lacking the ``'agent_0'`` key. The input dicts are left untouched, and
        a dict that has no ``'agent_0'`` entry is simply copied as-is.
    """
    stripped = []
    for agent_dict in dicts:
        # .copy() keeps the mapping type (e.g. a defaultdict stays a
        # defaultdict) while protecting the caller's object from mutation.
        trimmed = agent_dict.copy()
        trimmed.pop('agent_0', None)
        stripped.append(trimmed)
    return stripped

In [7]:
from agent_types.AvoidingAgent import AvoidingAgent

class CustomEnvironment(ParallelEnv):
    """Adversary-only view of the ``simple_tag_v3`` parallel environment.

    The wrapped environment still contains the good agent ``agent_0``, but its
    actions are produced internally by a scripted ``AvoidingAgent``. Only the
    agents whose name starts with ``"adversary"`` are exposed through
    ``agents`` / ``possible_agents``, and the ``agent_0`` entries are stripped
    from every dict returned by ``reset`` and ``step``.

    NOTE(review): only ``agent_0`` is scripted and stripped; with
    ``num_good > 1`` any further good agents would be neither driven nor
    removed here.
    """

    metadata = {
        "name": "custom_environment_v0",
    }

    def __init__(self, num_good, num_adversaries, num_obstacles, max_cycles, continuous_actions, render_mode):
        """Create the wrapped ``simple_tag_v3`` env and the scripted ``agent_0``.

        All arguments are forwarded to ``simple_tag_v3.parallel_env`` under the
        same names; ``num_adversaries`` and ``num_obstacles`` are also used to
        configure the scripted ``AvoidingAgent``.
        """
        self.env = simple_tag_v3.parallel_env(
            num_good=num_good,
            num_adversaries=num_adversaries,
            num_obstacles=num_obstacles,
            max_cycles=max_cycles,
            continuous_actions=continuous_actions,
            render_mode=render_mode,
        )
        # Reset once up front: `self.env.agents` is read right below.
        self.env.reset()
        # Setting all the required attributes
        self.agents = self._active_adversaries()
        self.possible_agents = [adv for adv in self.env.possible_agents if adv.startswith("adversary")]
        self.render_mode = render_mode
        # agent_0 is part of the environment itself and is not meant to be
        # included in the training. Its landmark count comes from the
        # constructor argument (not a notebook global) so it always matches
        # the wrapped environment.
        self.agent_0 = AvoidingAgent('agent_0', num_adversaries=num_adversaries, num_landmarks=num_obstacles)

    def _active_adversaries(self):
        """Return the adversaries currently listed in the wrapped env's ``agents``."""
        return [agent for agent in self.env.agents if agent.startswith("adversary")]

    def reset(self, seed=None, options=None):
        """Reset the wrapped env and return adversary-only ``(observations, infos)``.

        The fresh observation of ``agent_0`` is handed to the scripted agent
        before its entries are stripped from the returned dicts.
        """
        observations, infos = self.env.reset(seed=seed, options=options)
        # Keep the exposed agent list in step with the wrapped environment.
        self.agents = self._active_adversaries()
        self.agent_0.see(observations[self.agent_0.name])
        observations, infos = remove_agent_0_from_dicts([observations, infos])
        return observations, infos

    def step(self, actions):
        """Advance one step using the adversaries' ``actions`` plus agent_0's own action.

        Args:
            actions: Mapping from adversary name to action. It is not modified.

        Returns:
            ``(observations, rewards, terminations, truncations, infos)``. The
            ``agent_0`` entries are removed whenever the wrapped env returned a
            non-empty observation dict; otherwise the wrapped env's dicts are
            returned unchanged.
        """
        # Build a new dict instead of inserting into the caller's one.
        joint_actions = {**actions, 'agent_0': self.agent_0.get_action()}
        observations, rewards, terminations, truncations, infos = self.env.step(joint_actions)
        # Keep the exposed agent list in step with the wrapped environment.
        self.agents = self._active_adversaries()
        if observations:
            self.agent_0.see(observations[self.agent_0.name])
            observations, rewards, terminations, truncations, infos = remove_agent_0_from_dicts(
                [observations, rewards, terminations, truncations, infos]
            )
        return observations, rewards, terminations, truncations, infos

    def render(self):
        """Render the wrapped environment and pass its result through to the caller."""
        return self.env.render()

    # Observation space should be defined here.
    # lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def observation_space(self, agent):
        """Return the wrapped environment's observation space for ``agent``."""
        return self.env.observation_space(agent)

    # Action space should be defined here.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def action_space(self, agent):
        """Return the wrapped environment's action space for ``agent``."""
        return self.env.action_space(agent)


In [77]:
# Build the adversary-only environment from the notebook-level configuration.
env = CustomEnvironment(
    num_good=NUM_GOOD,
    num_adversaries=NUM_ADV,
    num_obstacles=NUM_OBST,
    max_cycles=MAX_CYCLES,
    continuous_actions=CONTINOUS_ACTIONS,
    render_mode=RENDER_MODE,
)
env.reset(seed=45)

# Convert the PettingZoo env into a vector env, then concatenate 2 copies of it
# into a single Stable-Baselines3-compatible vector env (num_cpus=0).
conv_env = ss.concat_vec_envs_v1(
    ss.pettingzoo_env_to_vec_env_v1(env),
    2,
    num_cpus=0,
    base_class="stable_baselines3",
)


In [79]:
def learning_rate_schedule():
    """Build a three-tier, piecewise-constant learning-rate schedule.

    Returns:
        A function taking ``progress_remaining`` and returning
        1e-4 while it is above 0.5, 1e-5 while it is above 0.25,
        and 1e-6 otherwise.
    """
    # (exclusive lower bound on progress_remaining, learning rate), checked in order.
    tiers = ((0.5, 1e-4), (0.25, 1e-5))
    final_rate = 1e-6

    def func(progress_remaining):
        for lower_bound, rate in tiers:
            if progress_remaining > lower_bound:
                return rate
        return final_rate

    return func

In [71]:
# Directory for evaluation logs (and for TensorBoard logs, if enabled below).
log_path = "./logs/2_adv_0_to_50M_steps"

# PPO with an MLP policy on the vectorized adversary environment, using the
# stepwise learning-rate schedule defined above.
model = PPO(
    policy=MlpPolicy,
    env=conv_env,
    learning_rate=learning_rate_schedule(),
    batch_size=256,
    verbose=3,
    # tensorboard_log=log_path,  # TensorBoard logging is currently disabled
)

Using cpu device


In [None]:
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

best_model_save_path = "./models/2_adv/"

# Save a checkpoint every 500_000 callback steps.
# NOTE(review): SB3 counts save_freq / eval_freq in callback calls (one per
# vectorized env.step()), not in total timesteps; with several sub-environments
# the total timesteps between saves/evaluations is correspondingly larger.
# Confirm this is the intended cadence.
checkpoint_callback = CheckpointCallback(
    save_freq=500_000,
    save_path="./models/2_adv",
    name_prefix="2_adv_after_15M_steps",
)

# Evaluate on a dedicated environment, built exactly like the training one.
# Evaluating on `conv_env` itself would reset and step the training environment
# in the middle of rollout collection, leaving the learner with a stale
# observation when training resumes.
eval_env = ss.concat_vec_envs_v1(
    ss.pettingzoo_env_to_vec_env_v1(
        CustomEnvironment(
            num_good=NUM_GOOD,
            num_adversaries=NUM_ADV,
            num_obstacles=NUM_OBST,
            max_cycles=MAX_CYCLES,
            continuous_actions=CONTINOUS_ACTIONS,
            render_mode=RENDER_MODE,
        )
    ),
    2,
    num_cpus=0,
    base_class="stable_baselines3",
)
eval_callback = EvalCallback(
    eval_env,
    eval_freq=10_000,
    best_model_save_path=best_model_save_path,
    log_path=log_path,
    n_eval_episodes=100,
    verbose=1,
)

callback_list = CallbackList([eval_callback, checkpoint_callback])

model.learn(total_timesteps=30_000_000, callback=callback_list)

In [67]:

# Run the current policy for 500 evaluation episodes on conv_env. The call is
# the cell's last expression, so its return value (a 2-tuple in the recorded
# output; presumably (mean_reward, std_reward) per the SB3 docs) is displayed.
# NOTE(review): the recorded output carries execution count In [67], i.e. it
# predates the model cell (In [71]); re-run the notebook top to bottom to refresh it.
evaluate_policy(model=model, env=conv_env, n_eval_episodes=500)



(2.32, 7.497839688870388)