# AgileRL Speaker-Listener with MATD3
https://docs.agilerl.com/en/latest/tutorials/pettingzoo/matd3.html

In [1]:
!pip install --upgrade pip



In [2]:
!pip install pettingzoo[mpe]
!pip install agilerl
!pip install imageio

zsh:1: no matches found: pettingzoo[mpe]




"""
This tutorial shows how to train an MATD3 agent on the simple speaker listener multi-particle environment.

Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a)
"""

In [3]:
import os

import numpy as np
import torch
from pettingzoo.mpe import simple_speaker_listener_v4
from tqdm import trange

from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation


###
import random

In [4]:
device = torch.device("mps")

In [5]:
# Define the network configuration
def define_network_config():
    return {
        "arch": "mlp",  # Network architecture
        "h_size": [32, 32],  # Actor hidden size
    }

# Define the initial hyperparameters
def initialize_hyperparameters():
    return {
        "POPULATION_SIZE": 4,
        "ALGO": "MATD3",  # Algorithm
        # Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
        "CHANNELS_LAST": False,
        "BATCH_SIZE": 32,  # Batch size
        "LR": 0.01,  # Learning rate
        "GAMMA": 0.95,  # Discount factor
        "MEMORY_SIZE": 100000,  # Max memory buffer size
        "LEARN_STEP": 5,  # Learning frequency
        "TAU": 0.01,  # For soft update of target parameters
        "POLICY_FREQ": 2,  # Policy frequnecy
        # Instantiate a tournament selection object (used for HPO)
        'TOURNAMENT_SIZE': 2,
        'ELITISM': True,
        # Instantiate a mutations object (used for HPO)
        'NO_MUTATION': 0.2,
        'ARCHITECTURE_MUTATION': 0.2,
        'NEW_LAYER_MUTATION': 0.2,
        'PARAMETER_MUTATION': 0.2,
        'ACTIVATION_MUTATION': 0,
        'RL_HP_MUTATION': 0.2,
        'RL_HP_SELECTION': ["lr", "learn_step", "batch_size"], # RL hyperparams selected for mutation
        'MUTATION_SD': 0.1,

        
    }

In [6]:
# Define the simple speaker listener environment as a parallel environment
def initialize_environment():
    env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
    env.reset()
    return env

# Configure the multi-agent algo input arguments
def set_action_and_state_dimensions(env, init_hp):
    """
    環境から行動次元と状態次元を設定し、初期ハイパーパラメータを更新する。
    env: 学習環境
    init_hp: 初期ハイパーパラメータの辞書
    """
    try:
        # まず、状態次元を設定する
        # 状態空間が離散的か連続的かに基づいて状態次元を取得する
        state_dim = [env.observation_space(agent).n for agent in env.agents]
        one_hot = True
    except Exception:
        # 連続的な状態空間の場合
        state_dim = [env.observation_space(agent).shape for agent in env.agents]
        one_hot = False

    try:
        # 次に、行動次元を設定する
        # 行動空間が離散的か連続的かに基づいて行動次元を取得する
        action_dim = [env.action_space(agent).n for agent in env.agents]
        init_hp["DISCRETE_ACTIONS"] = True
        init_hp["MAX_ACTION"] = None
        init_hp["MIN_ACTION"] = None
    except Exception:
        # 連続的な行動空間の場合
        action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
        init_hp["DISCRETE_ACTIONS"] = False
        init_hp["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
        init_hp["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]

    # 状態次元の調整（CHANNELS_LAST オプションが True の場合）
    if init_hp["CHANNELS_LAST"]:
        state_dim = [
            (state_dim[2], state_dim[0], state_dim[1]) for state_dim in state_dim
        ]

    return state_dim, action_dim, init_hp, one_hot


def create_initial_population(algo, state_dim, action_dim, one_hot, net_config, init_hp, population_size, device):
    """
    初期人口を生成する。
    algo: 使用する強化学習アルゴリズム
    state_dim: 状態次元
    action_dim: 行動次元
    one_hot: 状態がワンホットエンコードされているかどうか
    net_config: ネットワーク構成
    init_hp: 初期ハイパーパラメータ
    device: 使用するデバイス（例: "cuda"、"mps"、"cpu"）
    """
    #pop = []
    #for _ in range(init_hp["POPULATION_SIZE"]):
    #    agent = initialPopulation(algo, state_dim, action_dim, one_hot, net_config, init_hp, population_size, device)
    #    pop.append(agent)

    #return pop
    pop = initialPopulation(
        init_hp["ALGO"],
        state_dim,
        action_dim,
        one_hot,
        net_config,
        init_hp,
        population_size=init_hp["POPULATION_SIZE"],
        device=device,
    )


def configure_replay_buffer(init_hp, field_names, device):
    """
    リプレイバッファを設定する。
    init_hp: 初期ハイパーパラメータ
    agent_ids: エージェントのIDリスト
    device: 使用するデバイス（例: "cuda"、"mps"、"cpu"）
    """
    # リプレイバッファを格納するためのデータ構造を定義
    field_names = ["state", "action", "reward", "next_state", "done"]

    # リプレイバッファのインスタンスを作成
    memory = MultiAgentReplayBuffer(
        init_hp["MEMORY_SIZE"],  # バッファの最大サイズ
        field_names=field_names,  # 格納するフィールド名
        agent_ids=init_hp["AGENT_IDS"],      # エージェントのID
        device=device,             # 使用するデバイス
    )

    return memory


def tournament_selection(init_hp):
    """
    トーナメント選択の設定を行う。
    init_hp: 初期ハイパーパラメータ
    """
    tournament = TournamentSelection(
        tournament_size=init_hp['TOURNAMENT_SIZE'],
        elitism=init_hp['ELITISM'],
        population_size=init_hp['POPULATION_SIZE'],
        evo_step =1,
    )
    return tournament


def mutations_config(init_hp, net_config):
    """
    突然変異の設定を行う。
    init_hp: 初期ハイパーパラメータ
    net_config: ネットワーク構成
    """
    mutations = Mutations(
        algo=init_hp["ALGO"],
        no_mutation=init_hp['NO_MUTATION'],
        architecture=init_hp['ARCHITECTURE_MUTATION'],
        new_layer_prob=init_hp['NEW_LAYER_MUTATION'],
        parameters=init_hp['PARAMETER_MUTATION'],
        activation=init_hp['ACTIVATION_MUTATION'],
        rl_hp=init_hp['RL_HP_MUTATION'],
        rl_hp_selection=init_hp['RL_HP_SELECTION'],
        mutation_sd=init_hp['MUTATION_SD'],
        agent_ids=init_hp["AGENT_IDS"],
        arch=net_config["arch"],
        rand_seed=1,
        device=device
    )
    return mutations


def training_loop(env, pop, memory, tournament, mutations, init_hp, net_config, max_episodes, max_steps):
    epsilon = 1.0
    eps_end = 0.1
    eps_decay = 0.995
    evo_epochs = 20
    evo_loop = 1
    elite = pop[0]

    for idx_epi in range(max_episodes):
        for agent in pop:
            state, info = env.reset()
            agent_reward = {agent_id: 0 for agent_id in env.agents}
            if init_hp["CHANNELS_LAST"]:
                state = {
                    agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
                    for agent_id, s in state.items()
                }

            for _ in range(max_steps):
                agent_mask = info.get("agent_mask")
                env_defined_actions = info.get("env_defined_actions")

                cont_actions, discrete_action = agent.getAction(
                    state, epsilon, agent_mask, env_defined_actions
                )
                action = discrete_action if agent.discrete_actions else cont_actions

                next_state, reward, termination, truncation, info = env.step(action)

                if init_hp["CHANNELS_LAST"]:
                    state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
                    next_state = {
                        agent_id: np.moveaxis(ns, [-1], [-3])
                        for agent_id, ns in next_state.items()
                    }

                memory.save2memory(state, cont_actions, reward, next_state, termination)

                for agent_id, r in reward.items():
                    agent_reward[agent_id] += r

                if (memory.counter % agent.learn_step == 0) and (len(memory) >= agent.batch_size):
                    experiences = memory.sample(agent.batch_size)
                    agent.learn(experiences)

                if init_hp["CHANNELS_LAST"]:
                    next_state = {
                        agent_id: np.expand_dims(ns, 0)
                        for agent_id, ns in next_state.items()
                    }
                state = next_state

                if any(truncation.values()) or any(termination.values()):
                    break

            score = sum(agent_reward.values())
            agent.scores.append(score)

        epsilon = max(eps_end, epsilon * eps_decay)

        if (idx_epi + 1) % evo_epochs == 0:
            fitnesses = [
                agent.test(
                    env,
                    swap_channels=init_hp["CHANNELS_LAST"],
                    max_steps=max_steps,
                    loop=evo_loop,
                )
                for agent in pop
            ]

            print(f"Episode {idx_epi + 1}/{max_episodes}")
            print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
            print(
                f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
            )

            elite, pop = tournament.select(pop)
            pop = mutations.mutation(pop)

    save_trained_model(elite, './models/MATD3', "MATD3_trained_agent.pt")

def save_trained_model(elite, path, filename):
    os.makedirs(path, exist_ok=True)
    save_path = os.path.join(path, filename)
    elite.saveCheckpoint(save_path)



In [7]:
# Main code
if __name__ == "__main__":
    #device = torch.device("mps")
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("===== AgileRL Online Multi-Agent Demo =====")

    net_config = define_network_config()
    init_hp = initialize_hyperparameters()
    env = initialize_environment()

    # Set the number of agents in the INIT_HP dictionary
    init_hp["N_AGENTS"] = env.num_agents  # Assuming env.agents gives the list of agents
    init_hp["AGENT_IDS"] = env.agents  # エージェントIDのリストを設定


    state_dim, action_dim, init_hp, one_hot= set_action_and_state_dimensions(env, init_hp)
    pop = create_initial_population(init_hp["ALGO"], state_dim, action_dim, one_hot, net_config, init_hp, init_hp["POPULATION_SIZE"], device)
    memory = configure_replay_buffer(init_hp, env.agents, device=device)
    tournament = tournament_selection(init_hp)
    mutations = mutations_config(init_hp, net_config)
    
    training_loop(env, pop, memory, tournament, mutations, init_hp, net_config, max_episodes=6000, max_steps=25)
    
    save_trained_model(pop[0], "./models/MATD3", "MATD3_trained_agent.pt")

===== AgileRL Online Multi-Agent Demo =====


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


TypeError: 'NoneType' object is not subscriptable

In [None]:
if __name__ == "__main__":
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("mps")
    print("===== AgileRL Online Multi-Agent Demo =====")

    # Define the network configuration
    NET_CONFIG = {
        "arch": "mlp",  # Network architecture
        "h_size": [32, 32],  # Actor hidden size
    }

    # Define the initial hyperparameters
    INIT_HP = {
        "POPULATION_SIZE": 4,
        "ALGO": "MATD3",  # Algorithm
        # Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
        "CHANNELS_LAST": False,
        "BATCH_SIZE": 32,  # Batch size
        "LR": 0.01,  # Learning rate
        "GAMMA": 0.95,  # Discount factor
        "MEMORY_SIZE": 100000,  # Max memory buffer size
        "LEARN_STEP": 5,  # Learning frequency
        "TAU": 0.01,  # For soft update of target parameters
        "POLICY_FREQ": 2,  # Policy frequnecy
    }

    # Define the simple speaker listener environment as a parallel environment
    env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
    env.reset()

    # Configure the multi-agent algo input arguments
    try:
        state_dim = [env.observation_space(agent).n for agent in env.agents]
        one_hot = True
    except Exception:
        state_dim = [env.observation_space(agent).shape for agent in env.agents]
        one_hot = False
    try:
        action_dim = [env.action_space(agent).n for agent in env.agents]
        INIT_HP["DISCRETE_ACTIONS"] = True
        INIT_HP["MAX_ACTION"] = None
        INIT_HP["MIN_ACTION"] = None
    except Exception:
        action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
        INIT_HP["DISCRETE_ACTIONS"] = False
        INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
        INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]

    # Not applicable to MPE environments, used when images are used for observations (Atari environments)
    if INIT_HP["CHANNELS_LAST"]:
        state_dim = [
            (state_dim[2], state_dim[0], state_dim[1]) for state_dim in state_dim
        ]

    # Append number of agents and agent IDs to the initial hyperparameter dictionary
    INIT_HP["N_AGENTS"] = env.num_agents
    INIT_HP["AGENT_IDS"] = env.agents

    # Create a population ready for evolutionary hyper-parameter optimisation
    pop = initialPopulation(
        INIT_HP["ALGO"],
        state_dim,
        action_dim,
        one_hot,
        NET_CONFIG,
        INIT_HP,
        population_size=INIT_HP["POPULATION_SIZE"],
        device=device,
    )

    # Configure the multi-agent replay buffer
    field_names = ["state", "action", "reward", "next_state", "done"]
    memory = MultiAgentReplayBuffer(
        INIT_HP["MEMORY_SIZE"],
        field_names=field_names,
        agent_ids=INIT_HP["AGENT_IDS"],
        device=device,
    )

    # Instantiate a tournament selection object (used for HPO)
    tournament = TournamentSelection(
        tournament_size=2,  # Tournament selection size
        elitism=True,  # Elitism in tournament selection
        population_size=INIT_HP["POPULATION_SIZE"],  # Population size
        evo_step=1,
    )  # Evaluate using last N fitness scores

    # Instantiate a mutations object (used for HPO)
    mutations = Mutations(
        algo=INIT_HP["ALGO"],
        no_mutation=0.2,  # Probability of no mutation
        architecture=0.2,  # Probability of architecture mutation
        new_layer_prob=0.2,  # Probability of new layer mutation
        parameters=0.2,  # Probability of parameter mutation
        activation=0,  # Probability of activation function mutation
        rl_hp=0.2,  # Probability of RL hyperparameter mutation
        rl_hp_selection=[
            "lr",
            "learn_step",
            "batch_size",
        ],  # RL hyperparams selected for mutation
        mutation_sd=0.1,  # Mutation strength
        agent_ids=INIT_HP["AGENT_IDS"],
        arch=NET_CONFIG["arch"],
        rand_seed=1,
        device=device,
    )

    # Define training loop parameters
    max_episodes = 6000 #500  # Total episodes (default: 6000)
    max_steps = 25  # Maximum steps to take in each episode
    epsilon = 1.0  # Starting epsilon value
    eps_end = 0.1  # Final epsilon value
    eps_decay = 0.995  # Epsilon decay
    evo_epochs = 20  # Evolution frequency
    evo_loop = 1  # Number of evaluation episodes
    elite = pop[0]  # Assign a placeholder "elite" agent

    # Training loop
    for idx_epi in trange(max_episodes):
        for agent in pop:  # Loop through population
            state, info = env.reset()  # Reset environment at start of episode
            agent_reward = {agent_id: 0 for agent_id in env.agents}
            if INIT_HP["CHANNELS_LAST"]:
                state = {
                    agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
                    for agent_id, s in state.items()
                }

            for _ in range(max_steps):
                agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
                env_defined_actions = (
                    info["env_defined_actions"]
                    if "env_defined_actions" in info.keys()
                    else None
                )

                # Get next action from agent
                cont_actions, discrete_action = agent.getAction(
                    state, epsilon, agent_mask, env_defined_actions
                )
                if agent.discrete_actions:
                    action = discrete_action
                else:
                    action = cont_actions

                next_state, reward, termination, truncation, info = env.step(
                    action
                )  # Act in environment

                # Image processing if necessary for the environment
                if INIT_HP["CHANNELS_LAST"]:
                    state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
                    next_state = {
                        agent_id: np.moveaxis(ns, [-1], [-3])
                        for agent_id, ns in next_state.items()
                    }

                # Save experiences to replay buffer
                memory.save2memory(state, cont_actions, reward, next_state, termination)

                # Collect the reward
                for agent_id, r in reward.items():
                    agent_reward[agent_id] += r

                # Learn according to learning frequency
                if (memory.counter % agent.learn_step == 0) and (
                    len(memory) >= agent.batch_size
                ):
                    experiences = memory.sample(
                        agent.batch_size
                    )  # Sample replay buffer
                    agent.learn(experiences)  # Learn according to agent's RL algorithm

                # Update the state
                if INIT_HP["CHANNELS_LAST"]:
                    next_state = {
                        agent_id: np.expand_dims(ns, 0)
                        for agent_id, ns in next_state.items()
                    }
                state = next_state

                # Stop episode if any agents have terminated
                if any(truncation.values()) or any(termination.values()):
                    break

            # Save the total episode reward
            score = sum(agent_reward.values())
            agent.scores.append(score)

        # Update epsilon for exploration
        epsilon = max(eps_end, epsilon * eps_decay)

        # Now evolve population if necessary
        if (idx_epi + 1) % evo_epochs == 0:
            # Evaluate population
            fitnesses = [
                agent.test(
                    env,
                    swap_channels=INIT_HP["CHANNELS_LAST"],
                    max_steps=max_steps,
                    loop=evo_loop,
                )
                for agent in pop
            ]

            print(f"Episode {idx_epi + 1}/{max_episodes}")
            print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
            print(
                f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
            )

            # Tournament selection and population mutation
            elite, pop = tournament.select(pop)
            pop = mutations.mutation(pop)

    # Save the trained algorithm
    path = "./models/MATD3"
    filename = "MATD3_trained_agent.pt"
    os.makedirs(path, exist_ok=True)
    save_path = os.path.join(path, filename)
    elite.saveCheckpoint(save_path)

===== AgileRL Online Multi-Agent Demo =====


  0%|          | 19/6000 [00:25<2:12:15,  1.33s/it]

Episode 20/6000
Fitnesses: ['-21.67', '-358.43', '-4.85', '-768.18']
100 fitness avgs: ['-21.67', '-358.43', '-4.85', '-768.18']


  1%|          | 39/6000 [00:53<2:01:39,  1.22s/it]

Episode 40/6000
Fitnesses: ['-48.85', '-2.93', '-30.58', '-12.27']
100 fitness avgs: ['-26.85', '-3.89', '-17.71', '-8.56']


  1%|          | 59/6000 [01:27<2:21:12,  1.43s/it]

Episode 60/6000
Fitnesses: ['-177.63', '-7.01', '-27.21', '-56.99']
100 fitness avgs: ['-61.80', '-4.93', '-20.88', '-21.59']


  1%|▏         | 79/6000 [01:59<2:13:20,  1.35s/it]

Episode 80/6000
Fitnesses: ['-57.80', '-213.39', '-39.10', '-122.29']
100 fitness avgs: ['-18.15', '-57.05', '-13.47', '-76.92']


  2%|▏         | 99/6000 [02:24<1:53:24,  1.15s/it]

Episode 100/6000
Fitnesses: ['-90.52', '-67.29', '-103.76', '-56.34']
100 fitness avgs: ['-28.88', '-24.24', '-31.53', '-25.79']


  2%|▏         | 119/6000 [03:00<2:01:27,  1.24s/it]

Episode 120/6000
Fitnesses: ['-15.90', '-6.89', '-67.35', '-153.51']
100 fitness avgs: ['-24.14', '-22.64', '-32.71', '-49.65']


  2%|▏         | 139/6000 [03:33<2:15:50,  1.39s/it]

Episode 140/6000
Fitnesses: ['-25.22', '-6.81', '-29.97', '-4.51']
100 fitness avgs: ['-23.01', '-29.01', '-23.69', '-20.05']


  3%|▎         | 159/6000 [04:06<2:34:55,  1.59s/it]

Episode 160/6000
Fitnesses: ['-21.57', '-72.43', '-14.16', '-4.99']
100 fitness avgs: ['-20.24', '-26.60', '-19.31', '-26.01']


  3%|▎         | 179/6000 [04:47<3:32:48,  2.19s/it]

Episode 180/6000
Fitnesses: ['-76.56', '-16.50', '-42.88', '-52.62']
100 fitness avgs: ['-31.63', '-19.82', '-27.89', '-28.97']


  3%|▎         | 199/6000 [05:39<5:00:19,  3.11s/it]

Episode 200/6000
Fitnesses: ['-15.26', '-39.73', '-44.94', '-10.48']
100 fitness avgs: ['-19.37', '-21.81', '-29.59', '-29.51']


  4%|▎         | 219/6000 [06:57<3:17:03,  2.05s/it] 

Episode 220/6000
Fitnesses: ['-15.44', '-30.34', '-112.36', '-35.51']
100 fitness avgs: ['-28.23', '-29.59', '-27.82', '-23.06']


  4%|▍         | 239/6000 [07:42<3:31:18,  2.20s/it]

Episode 240/6000
Fitnesses: ['-7.84', '-9.65', '-46.54', '-10.90']
100 fitness avgs: ['-26.54', '-26.69', '-29.38', '-26.79']


  4%|▍         | 259/6000 [08:43<3:50:23,  2.41s/it] 

Episode 260/6000
Fitnesses: ['-38.54', '-4.58', '-17.37', '-33.03']
100 fitness avgs: ['-27.46', '-24.85', '-25.83', '-27.04']


  5%|▍         | 279/6000 [09:46<2:51:04,  1.79s/it] 

Episode 280/6000
Fitnesses: ['-6.16', '-17.73', '-66.82', '-10.54']
100 fitness avgs: ['-23.51', '-26.37', '-28.76', '-25.86']


  5%|▍         | 299/6000 [10:50<3:07:18,  1.97s/it] 

Episode 300/6000
Fitnesses: ['-6.21', '-9.78', '-32.42', '-32.72']
100 fitness avgs: ['-22.36', '-22.60', '-26.29', '-26.31']


  5%|▌         | 319/6000 [11:26<2:54:52,  1.85s/it]

Episode 320/6000
Fitnesses: ['-54.88', '-42.31', '-23.23', '-52.64']
100 fitness avgs: ['-24.39', '-23.83', '-22.41', '-27.94']


  6%|▌         | 339/6000 [12:37<2:56:51,  1.87s/it] 

Episode 340/6000
Fitnesses: ['-20.12', '-9.61', '-122.10', '-26.78']
100 fitness avgs: ['-22.28', '-22.99', '-28.28', '-22.67']


  6%|▌         | 359/6000 [13:20<3:12:58,  2.05s/it]

Episode 360/6000
Fitnesses: ['-90.42', '-7.38', '-40.40', '-30.82']
100 fitness avgs: ['-26.74', '-21.82', '-23.28', '-23.43']


  6%|▋         | 379/6000 [13:55<2:48:38,  1.80s/it]

Episode 380/6000
Fitnesses: ['-1.88', '-17.53', '-46.50', '-6.58']
100 fitness avgs: ['-20.77', '-21.59', '-23.12', '-22.41']


  7%|▋         | 399/6000 [14:30<2:44:43,  1.76s/it]

Episode 400/6000
Fitnesses: ['-25.35', '-23.57', '-39.84', '-11.91']
100 fitness avgs: ['-21.00', '-21.69', '-21.72', '-21.88']


  7%|▋         | 419/6000 [15:03<2:41:59,  1.74s/it]

Episode 420/6000
Fitnesses: ['-12.07', '-9.85', '-12.02', '-25.79']
100 fitness avgs: ['-21.41', '-21.31', '-21.41', '-21.23']


  7%|▋         | 439/6000 [15:38<2:40:58,  1.74s/it]

Episode 440/6000
Fitnesses: ['-24.61', '-51.42', '-48.42', '-4.38']
100 fitness avgs: ['-21.46', '-22.77', '-22.54', '-20.54']


  8%|▊         | 459/6000 [17:02<3:19:22,  2.16s/it] 

Episode 460/6000
Fitnesses: ['-24.69', '-8.78', '-11.33', '-59.55']
100 fitness avgs: ['-20.72', '-20.03', '-20.14', '-24.37']


  8%|▊         | 479/6000 [18:18<2:17:07,  1.49s/it] 

Episode 480/6000
Fitnesses: ['-90.96', '-34.13', '-17.93', '-54.08']
100 fitness avgs: ['-22.98', '-21.28', '-20.60', '-21.45']


  8%|▊         | 499/6000 [18:50<2:26:36,  1.60s/it]

Episode 500/6000
Fitnesses: ['-2.38', '-16.04', '-19.02', '-59.25']
100 fitness avgs: ['-19.87', '-20.42', '-20.54', '-22.96']


  9%|▊         | 519/6000 [20:24<2:53:44,  1.90s/it] 

Episode 520/6000
Fitnesses: ['-53.33', '-32.61', '-26.26', '-18.66']
100 fitness avgs: ['-21.16', '-20.89', '-20.12', '-19.83']


  9%|▉         | 539/6000 [22:07<2:57:23,  1.95s/it] 

Episode 540/6000
Fitnesses: ['-13.11', '-10.53', '-38.97', '-20.72']
100 fitness avgs: ['-19.58', '-19.76', '-20.54', '-19.86']


  9%|▉         | 559/6000 [22:43<2:36:51,  1.73s/it]

Episode 560/6000
Fitnesses: ['-58.02', '-53.77', '-22.84', '-50.36']
100 fitness avgs: ['-21.13', '-20.98', '-19.87', '-20.86']


 10%|▉         | 579/6000 [23:18<2:22:34,  1.58s/it]

Episode 580/6000
Fitnesses: ['-73.96', '-14.52', '-10.72', '-42.04']
100 fitness avgs: ['-21.74', '-19.69', '-20.62', '-21.59']


 10%|▉         | 599/6000 [24:07<2:13:31,  1.48s/it] 

Episode 600/6000
Fitnesses: ['-23.98', '-14.08', '-69.82', '-28.58']
100 fitness avgs: ['-20.74', '-21.48', '-22.26', '-19.99']


 10%|█         | 619/6000 [25:02<2:23:15,  1.60s/it] 

Episode 620/6000
Fitnesses: ['-57.87', '-54.78', '-57.24', '-36.66']
100 fitness avgs: ['-22.66', '-21.83', '-21.91', '-20.52']


 11%|█         | 639/6000 [28:12<1:44:09,  1.17s/it] 

Episode 640/6000
Fitnesses: ['-84.12', '-15.50', '-65.66', '-40.20']
100 fitness avgs: ['-22.51', '-21.64', '-21.93', '-22.41']


 11%|█         | 659/6000 [30:22<2:27:13,  1.65s/it] 

Episode 660/6000
Fitnesses: ['-9.11', '-12.76', '-27.33', '-16.80']
100 fitness avgs: ['-21.26', '-21.37', '-22.56', '-21.78']


 11%|█▏        | 679/6000 [31:44<2:33:05,  1.73s/it] 

Episode 680/6000
Fitnesses: ['-9.46', '-85.99', '-24.67', '-3.16']
100 fitness avgs: ['-20.91', '-23.27', '-21.86', '-20.72']


 11%|█▏        | 679/6000 [33:50<4:25:14,  2.99s/it]


KeyboardInterrupt: 

In [None]:
import os

import imageio
import numpy as np
import torch
from pettingzoo.mpe import simple_speaker_listener_v4
from PIL import Image, ImageDraw

from agilerl.algorithms.matd3 import MATD3


# Define function to return image
def _label_with_episode_number(frame, episode_num):
    im = Image.fromarray(frame)

    drawer = ImageDraw.Draw(im)

    if np.mean(frame) < 128:
        text_color = (255, 255, 255)
    else:
        text_color = (0, 0, 0)
    drawer.text(
        (im.size[0] / 20, im.size[1] / 18), f"Episode: {episode_num+1}", fill=text_color
    )

    return im


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Configure the environment
    env = simple_speaker_listener_v4.parallel_env(
        continuous_actions=True, render_mode="rgb_array"
    )
    env.reset()
    try:
        state_dim = [env.observation_space(agent).n for agent in env.agents]
        one_hot = True
    except Exception:
        state_dim = [env.observation_space(agent).shape for agent in env.agents]
        one_hot = False
    try:
        action_dim = [env.action_space(agent).n for agent in env.agents]
        discrete_actions = True
        max_action = None
        min_action = None
    except Exception:
        action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
        discrete_actions = False
        max_action = [env.action_space(agent).high for agent in env.agents]
        min_action = [env.action_space(agent).low for agent in env.agents]

    # Append number of agents and agent IDs to the initial hyperparameter dictionary
    n_agents = env.num_agents
    agent_ids = env.agents

    # Instantiate an MADDPG object
    matd3 = MATD3(
        state_dim,
        action_dim,
        one_hot,
        n_agents,
        agent_ids,
        max_action,
        min_action,
        discrete_actions,
        device=device,
    )

    # Load the saved algorithm into the MADDPG object
    path = "./models/MATD3/MATD3_trained_agent.pt"
    matd3.loadCheckpoint(path)

    # Define test loop parameters
    episodes = 10  # Number of episodes to test agent on
    max_steps = 25  # Max number of steps to take in the environment in each episode

    rewards = []  # List to collect total episodic reward
    frames = []  # List to collect frames
    indi_agent_rewards = {
        agent_id: [] for agent_id in agent_ids
    }  # Dictionary to collect inidivdual agent rewards

    rewards = []  # List to collect total episodic reward
    frames = []  # List to collect frames
    indi_agent_rewards = {
        agent_id: [] for agent_id in agent_ids
    }  # Dictionary to collect inidivdual agent rewards

    # Test loop for inference
    for ep in range(episodes):
        state, info = env.reset()
        agent_reward = {agent_id: 0 for agent_id in agent_ids}
        score = 0
        for _ in range(max_steps):
            agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
            env_defined_actions = (
                info["env_defined_actions"]
                if "env_defined_actions" in info.keys()
                else None
            )

            # Get next action from agent
            cont_actions, discrete_action = matd3.getAction(
                state,
                epsilon=0,
                agent_mask=agent_mask,
                env_defined_actions=env_defined_actions,
            )
            if matd3.discrete_actions:
                action = discrete_action
            else:
                action = cont_actions

            # Save the frame for this step and append to frames list
            frame = env.render()
            frames.append(_label_with_episode_number(frame, episode_num=ep))

            # Take action in environment
            state, reward, termination, truncation, info = env.step(action)

            # Save agent's reward for this step in this episode
            for agent_id, r in reward.items():
                agent_reward[agent_id] += r

            # Determine total score for the episode and then append to rewards list
            score = sum(agent_reward.values())

            # Stop episode if any agents have terminated
            if any(truncation.values()) or any(termination.values()):
                break

        rewards.append(score)

        # Record agent specific episodic reward
        for agent_id in agent_ids:
            indi_agent_rewards[agent_id].append(agent_reward[agent_id])

        print("-" * 15, f"Episode: {ep}", "-" * 15)
        print("Episodic Reward: ", rewards[-1])
        for agent_id, reward_list in indi_agent_rewards.items():
            print(f"{agent_id} reward: {reward_list[-1]}")
    env.close()

    # Save the gif to specified path
    gif_path = "./videos/"
    os.makedirs(gif_path, exist_ok=True)
    imageio.mimwrite(
        os.path.join("./videos/", "speaker_listener.gif"), frames, duration=10
    )

--------------- Episode: 0 ---------------
Episodic Reward:  -43.835148054556576
speaker_0 reward: -21.917574027278288
listener_0 reward: -21.917574027278288
--------------- Episode: 1 ---------------
Episodic Reward:  -23.326940956588846
speaker_0 reward: -11.663470478294423
listener_0 reward: -11.663470478294423
--------------- Episode: 2 ---------------
Episodic Reward:  -10.317120117451974
speaker_0 reward: -5.158560058725987
listener_0 reward: -5.158560058725987
--------------- Episode: 3 ---------------
Episodic Reward:  -23.893555654319197
speaker_0 reward: -11.946777827159599
listener_0 reward: -11.946777827159599
--------------- Episode: 4 ---------------
Episodic Reward:  -41.38620294840652
speaker_0 reward: -20.69310147420326
listener_0 reward: -20.69310147420326
--------------- Episode: 5 ---------------
Episodic Reward:  -39.988109501088644
speaker_0 reward: -19.994054750544322
listener_0 reward: -19.994054750544322
--------------- Episode: 6 ---------------
Episodic Rewar

In [None]:
# 音を鳴らす

import numpy as np
import IPython

rate = 48000
duration = 30.0

angle_list = np.arange(0, rate * duration)
sound = np.sin(2 * np.pi * 523 / rate * angle_list)


IPython.display.Audio(sound, rate=rate, autoplay=True)