In [None]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from pettingzoo.mpe import simple_adversary_v3



In [None]:
def calculate_tick_range(values, interval=20):
    """Return evenly spaced axis ticks that bracket ``values``.

    The first tick is the largest multiple of ``interval`` that is not above
    ``min(values)``; the last tick is the smallest multiple of ``interval``
    that is not below ``max(values)``.

    Args:
        values: Non-empty iterable of numbers.
        interval: Positive spacing between consecutive ticks.

    Returns:
        numpy.ndarray of tick positions with both end points included.

    Raises:
        ValueError: If ``interval`` is not positive, or ``values`` is empty
            (the latter is raised by ``min``).
    """
    if interval <= 0:
        raise ValueError("interval must be positive")

    start = np.floor(min(values) / interval) * interval
    end = np.ceil(max(values) / interval) * interval
    # np.arange excludes its stop value. Half a step of slack keeps `end` in
    # the result without admitting an extra tick; the previous `end + 1`
    # produced one tick too many whenever interval <= 1.
    return np.arange(start, end + interval / 2, interval)


# Create a function to save plots
def save_plot(plt, filename, output_dir):
    """Write a figure to ``output_dir/filename`` through ``plt.savefig``.

    Args:
        plt: Object exposing ``savefig(path)``; callers in this file pass the
            ``matplotlib.pyplot`` module.
        filename: Name of the image file, including its extension.
        output_dir: Destination directory; created with its parents if missing.
    """
    # exist_ok=True replaces the separate os.path.exists() test, which could
    # race with another process creating the directory in between.
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, filename))

def plot_average_episode_rewards(average_rewards, scenario, output_dir):
    """Draw the average-episode-reward curve and save it as a PNG.

    Args:
        average_rewards: Sequence of reward values, one per episode.
        scenario: Label inserted into the title and the output file name.
        output_dir: Directory passed on to ``save_plot``.
    """
    plt.figure(figsize=(12, 6))
    plt.title(f'Average Episode Reward Progress - {scenario}')
    plt.xlabel('Episode')
    plt.ylabel('Average Episode Reward')

    # Episodes are numbered from 1 on the x axis.
    episode_numbers = range(1, len(average_rewards) + 1)
    plt.plot(episode_numbers, average_rewards)

    y_ticks = calculate_tick_range(average_rewards)
    plt.yticks(y_ticks)
    plt.grid()

    image_name = f'Average_Episode_Reward_Progress_{scenario}.png'
    save_plot(plt, image_name, output_dir)
    
def plot_average_episode_rewards_rolling(average_rewards, scenario, output_dir, window=100):
    """Plot raw episode rewards together with their rolling mean and save the figure.

    Args:
        average_rewards: Sequence of reward values, one per episode.
        scenario: Label inserted into the title and the output file name.
        output_dir: Directory passed on to ``save_plot``.
        window: Number of episodes averaged by the rolling mean (default 100,
            the value that used to be hard-coded).
    """
    plt.figure(figsize=(12, 8))
    plt.xlabel('Episode')
    plt.ylabel('Average Episode Reward')
    plt.title(f'Average Episode Reward Progress - {scenario}')

    episode_numbers = range(1, len(average_rewards) + 1)
    plt.plot(episode_numbers, average_rewards, alpha=0.3, label='Original')

    # Entries before the window is full are NaN and are simply not drawn.
    smoothed_rewards = pd.Series(average_rewards).rolling(window=window).mean()
    plt.plot(episode_numbers, smoothed_rewards, color='red', label='Smoothed (Rolling Mean)')

    # With fewer than `window` episodes every smoothed value is NaN. Asking for
    # ticks over the empty remainder raised ValueError, so in that case keep
    # matplotlib's automatic ticks instead.
    finite_smoothed = smoothed_rewards.dropna()
    if not finite_smoothed.empty:
        plt.yticks(calculate_tick_range(finite_smoothed))

    plt.legend()
    plt.grid()
    save_plot(plt, f'Average_Episode_Reward_Progress_Rolling_Mean{scenario}.png', output_dir)

def plot_all_agents_rewards(agent_rewards, scenario, output_dir):
    """Plot one reward curve per agent on a shared figure and save it.

    Args:
        agent_rewards: Mapping of agent name -> sequence of per-episode rewards.
        scenario: Label inserted into the title and the output file name.
        output_dir: Directory passed on to ``save_plot``.
    """
    plt.figure(figsize=(12, 6))
    plt.xlabel('Episode')
    plt.ylabel('Agent Reward')
    plt.title(f'All Agents Reward Progress (agent + adversary) - {scenario}')

    for agent_name, rewards in agent_rewards.items():
        plt.plot(range(1, len(rewards) + 1), rewards, label=f'Agent {agent_name}')

    # Pool every agent's rewards so the ticks span all curves. Empty series are
    # skipped: np.concatenate rejects an empty list and min()/max() reject an
    # empty array, both of which crashed here when no episode had been recorded.
    non_empty = [rewards for rewards in agent_rewards.values() if len(rewards)]
    if non_empty:
        plt.yticks(calculate_tick_range(np.concatenate(non_empty)))

    # Only request a legend when at least one labelled curve exists.
    if agent_rewards:
        plt.legend()
    plt.grid()
    save_plot(plt, f'All_Agents_Reward_Progress_{scenario}.png', output_dir)

def plot_individual_agent_rewards(epsiode_mean_agent_rewards, agent_name, scenario, output_dir):
    """Plot the per-episode reward series of a single agent and save it as a PNG.

    Args:
        epsiode_mean_agent_rewards: Sequence of reward values for this agent,
            one per episode.
        agent_name: Agent identifier used in the title, legend and file name.
        scenario: Label inserted into the title and the output file name.
        output_dir: Directory passed on to ``save_plot``.
    """
    plt.figure(figsize=(12, 6))
    plt.title(f'Mean Agent Reward Progress - {scenario} (Agent {agent_name})')
    plt.xlabel('Episode')
    plt.ylabel('Mean Agent Reward')

    # Episodes are numbered from 1 on the x axis.
    episode_numbers = range(1, len(epsiode_mean_agent_rewards) + 1)
    curve_label = f'Agent {agent_name}'
    plt.plot(episode_numbers, epsiode_mean_agent_rewards, label=curve_label)

    y_ticks = calculate_tick_range(epsiode_mean_agent_rewards)
    plt.yticks(y_ticks)
    plt.legend()
    plt.grid()

    image_name = f'Individual_Agent_Reward_Progress_{scenario}_Agent_{agent_name}.png'
    save_plot(plt, image_name, output_dir)

def plot_everything(output_dir, scenario, k, score_history_100, score_history, epsiode_mean_agent_rewards):
    """Produce every plot for one (scenario, k) run.

    All images are written below ``output_dir/scenario_<scenario>/k_<k>``.

    Args:
        output_dir: Root directory for the plots.
        scenario: Scenario name; part of the sub-directory and of each title.
        k: Number of sub-policies; part of the sub-directory and of each title.
        score_history_100: Series handed to ``plot_average_episode_rewards``.
        score_history: Series handed to ``plot_average_episode_rewards_rolling``.
        epsiode_mean_agent_rewards: Mapping of agent name -> per-episode rewards.
    """
    run_label = f"{scenario} - {k}"
    output_subdir = os.path.join(output_dir, f'scenario_{scenario}', f'k_{k}')
    os.makedirs(output_subdir, exist_ok=True)

    # Pre-averaged score curve.
    plot_average_episode_rewards(score_history_100, run_label, output_subdir)

    # Raw scores overlaid with their rolling mean.
    plot_average_episode_rewards_rolling(score_history, run_label, output_subdir)

    # One curve per agent on a shared figure.
    plot_all_agents_rewards(epsiode_mean_agent_rewards, run_label, output_subdir)

    # A dedicated figure for 'agent_0', when that agent is present.
    agent_name = 'agent_0'
    if agent_name in epsiode_mean_agent_rewards:
        plot_individual_agent_rewards(
            epsiode_mean_agent_rewards[agent_name], agent_name, run_label, output_subdir
        )

In [None]:

# this class is used to store the experiences for each agent's subpolicy
class MultiAgentReplayBuffer:
    """Circular replay memory holding joint transitions of several agents.

    For every stored step the buffer keeps the global state and next state
    (width ``critic_dims``), one reward and one terminal flag per agent and,
    for each agent, its own observation, next observation and action.

    Args:
        max_size: Number of transitions kept before the oldest are overwritten.
        critic_dims: Width of the flattened global state vector.
        actor_dims: Per-agent observation widths, indexed by agent position.
        n_actions: Per-agent action widths, indexed by agent position.
        n_agents: Number of agents.
        batch_size: Number of transitions returned by ``sample_buffer``.
        agent_names: Agent keys; their order defines each agent's index.
    """

    def __init__(self, max_size, critic_dims, actor_dims, 
            n_actions, n_agents, batch_size,agent_names):
        self.mem_size = max_size
        self.mem_cntr = 0  # total number of transitions ever stored
        self.n_agents = n_agents
        self.actor_dims = actor_dims
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.agent_names = agent_names

        self.state_memory = np.zeros((self.mem_size, critic_dims))
        self.new_state_memory = np.zeros((self.mem_size, critic_dims))
        self.reward_memory = np.zeros((self.mem_size, n_agents))
        self.terminal_memory = np.zeros((self.mem_size, n_agents), dtype=bool)

        self.init_actor_memory()

    def init_actor_memory(self):
        """(Re)allocate the per-agent observation, next-observation and action arrays."""
        self.actor_state_memory = []
        self.actor_new_state_memory = []
        self.actor_action_memory = []

        for i in range(self.n_agents):
            self.actor_state_memory.append(
                            np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_new_state_memory.append(
                            np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_action_memory.append(
                            np.zeros((self.mem_size, self.n_actions[i])))

    def store_transition(self, raw_obs, state, action, reward, 
                               raw_obs_, state_, done):
        """Store one joint transition, overwriting the oldest slot once full.

        Args:
            raw_obs: Mapping agent name -> observation before the step.
            state: Flattened global state before the step.
            action: Mapping agent name -> action taken.
            reward: Mapping agent name -> reward received.
            raw_obs_: Mapping agent name -> observation after the step.
            state_: Flattened global state after the step.
            done: Per-agent terminal flags (length ``n_agents``).
        """
        # Circular buffer: once full, the oldest entry is replaced.
        index = self.mem_cntr % self.mem_size

        for agent_idx, agent_name in enumerate(self.agent_names):
            self.actor_state_memory[agent_idx][index] = raw_obs[agent_name]
            self.actor_new_state_memory[agent_idx][index] = raw_obs_[agent_name]
            self.actor_action_memory[agent_idx][index] = action[agent_name]

        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        # Look rewards up by agent name so column i always belongs to
        # agent_names[i], whatever order the reward dict was built in.
        self.reward_memory[index] = [reward[name] for name in self.agent_names]
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(actor_states, states, actions, rewards, actor_new_states,
            states_, terminal)``; ``actor_states``, ``actions`` and
            ``actor_new_states`` are lists with one array per agent.

        Raises:
            ValueError: From ``np.random.choice`` when fewer than ``batch_size``
                transitions are stored; check ``ready()`` first.
        """
        max_mem = min(self.mem_cntr, self.mem_size)

        # Sample without replacement among the slots that have been filled.
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        states = self.state_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        actor_states = []
        actor_new_states = []
        actions = []
        for agent_idx in range(self.n_agents):
            actor_states.append(self.actor_state_memory[agent_idx][batch])
            actor_new_states.append(self.actor_new_state_memory[agent_idx][batch])
            actions.append(self.actor_action_memory[agent_idx][batch])

        return actor_states, states, actions, rewards, \
               actor_new_states, states_, terminal

    def ready(self):
        """Return True once at least ``batch_size`` transitions have been stored."""
        # Always a real bool; the previous version fell through and returned
        # None when the buffer was not ready.
        return self.mem_cntr >= self.batch_size


In [None]:
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Critic Network:
# used to approximate the Q-value function given the state of the environment 
# and the actions of all agents for each agent
class CriticNetwork(nn.Module):
    """Two-hidden-layer MLP that maps a (state, action) pair to a scalar Q-value.

    Args:
        beta: Learning rate of the Adam optimizer.
        input_dims: Width of the concatenated ``[state, action]`` input.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_agents: Accepted for interface compatibility; not used by this class.
        n_actions: Accepted for interface compatibility; not used by this class.
        name: File-name prefix of the checkpoints.
        chkpt_dir: Directory of the checkpoints.
    """

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims, 
                    n_agents, n_actions, name, chkpt_dir):
        super(CriticNetwork, self).__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)

        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.q = nn.Linear(fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        """Return Q-values of shape ``(batch, 1)`` for batched ``state`` and ``action``."""
        x = F.relu(self.fc1(T.cat([state, action], dim=1)))
        x = F.relu(self.fc2(x))
        q = self.q(x)

        return q

    def _checkpoint_path(self, type):
        """Build ``<chkpt_dir>/<name><type>.pt``, the path shared by save and load."""
        return self.chkpt_file + type + ".pt"

    def save_checkpoint(self,type):
        """Write the network weights to the checkpoint identified by ``type``.

        Args:
            type: Suffix that distinguishes checkpoint flavours (e.g. "Best").
        """
        checkpoint_path = self._checkpoint_path(type)
        directory = os.path.dirname(checkpoint_path)
        if directory:  # os.makedirs('') raises, so skip it for bare file names
            os.makedirs(directory, exist_ok=True)
        # The path now includes `type`. It used to be dropped here, so every
        # flavour overwrote the same file and load_checkpoint looked elsewhere.
        T.save(self.state_dict(), checkpoint_path)

    def load_checkpoint(self,type):
        """Restore the network weights from the checkpoint identified by ``type``.

        Args:
            type: Suffix that was passed to ``save_checkpoint``.
        """
        checkpoint_path = self._checkpoint_path(type)
        # A stray `return` used to make this load unreachable. map_location
        # lets a checkpoint written on a GPU be restored on a CPU-only machine.
        self.load_state_dict(T.load(checkpoint_path, map_location=self.device))

# Actor Network:
# used to approximate the policy function for each agent
class ActorNetwork(nn.Module):
    """Two-hidden-layer MLP policy whose output is a softmax over ``n_actions``.

    Args:
        alpha: Learning rate of the Adam optimizer.
        input_dims: Width of the observation vector.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Width of the output vector.
        name: File-name prefix of the checkpoints.
        chkpt_dir: Directory of the checkpoints.
    """

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, 
                 n_actions, name, chkpt_dir):
        super(ActorNetwork, self).__init__()
        self.chkpt_file = os.path.join(chkpt_dir, name)

        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        """Return a ``(batch, n_actions)`` tensor whose rows sum to 1 (softmax over dim 1)."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = T.softmax(self.pi(x), dim=1)

        return pi

    def _checkpoint_path(self, type):
        """Build ``<chkpt_dir>/<name><type>.pt``, the path shared by save and load."""
        return self.chkpt_file + type + ".pt"

    def save_checkpoint(self,type):
        """Write the network weights to the checkpoint identified by ``type``.

        Args:
            type: Suffix that distinguishes checkpoint flavours (e.g. "Best").
        """
        checkpoint_path = self._checkpoint_path(type)
        directory = os.path.dirname(checkpoint_path)
        if directory:  # os.makedirs('') raises, so skip it for bare file names
            os.makedirs(directory, exist_ok=True)
        T.save(self.state_dict(), checkpoint_path)

    def load_checkpoint(self,type):
        """Restore the network weights from the checkpoint identified by ``type``.

        Args:
            type: Suffix that was passed to ``save_checkpoint``.
        """
        checkpoint_path = self._checkpoint_path(type)
        # map_location lets a checkpoint written on a GPU be restored on a
        # CPU-only machine (and vice versa).
        self.load_state_dict(T.load(checkpoint_path, map_location=self.device))



In [None]:

# Each agent has k instances of the subpolicy class
class SubPolicy:
    """One actor/critic pair, plus target copies of both, for a single agent.

    Args:
        actor_dims: Width of this agent's observation vector.
        critic_dims: Input width of the critic networks.
        n_actions: Width of this agent's action vector.
        n_agents: Number of agents; forwarded to the critic networks.
        chkpt_dir: Directory for the networks' checkpoints.
        agent_name: Prefix of the four network names.
        alpha: Actor learning rate.
        beta: Critic learning rate.
        fc1: Width of the first hidden layer of every network.
        fc2: Width of the second hidden layer of every network.
        gamma: Stored as ``self.gamma``; not used inside this class.
        tau: Default blending factor of ``update_network_parameters``.
    """

    def __init__(self, actor_dims, critic_dims, n_actions, n_agents,  chkpt_dir,agent_name,
                    alpha=0.01, beta=0.01, fc1=32,
                    fc2=32, gamma=0.95, tau=0.01 ,
                    ):
        self.gamma = gamma
        self.tau = tau
        self.n_actions = n_actions
        self.agent_name = agent_name
        self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                  chkpt_dir=chkpt_dir,  name=self.agent_name+'_actor')
        self.critic = CriticNetwork(beta, critic_dims,
                            fc1, fc2, n_agents, n_actions,
                            chkpt_dir=chkpt_dir, name=self.agent_name+'_critic')
        self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                        chkpt_dir=chkpt_dir,
                                        name=self.agent_name+'_target_actor')
        self.target_critic = CriticNetwork(beta, critic_dims,
                                            fc1, fc2, n_agents, n_actions,
                                            chkpt_dir=chkpt_dir,
                                            name=self.agent_name+'_target_critic')

        # tau=1 copies the freshly initialised online weights into the targets.
        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        """Blend the online networks into their target networks.

        Every target parameter becomes ``tau * online + (1 - tau) * target``,
        so ``tau=1`` is a hard copy.

        NOTE(review): the constructor called this method but the class did not
        define it, so construction raised AttributeError. This is the usual
        soft target update; confirm it matches what the rest of the project
        expects.

        Args:
            tau: Blending factor in [0, 1]; defaults to ``self.tau``.
        """
        if tau is None:
            tau = self.tau

        network_pairs = (
            (self.target_actor, self.actor),
            (self.target_critic, self.critic),
        )
        with T.no_grad():  # plain weight copy, must not be tracked by autograd
            for target_net, online_net in network_pairs:
                for target_param, online_param in zip(target_net.parameters(),
                                                      online_net.parameters()):
                    target_param.copy_(tau * online_param + (1 - tau) * target_param)

    def choose_action(self, observation):
        """Return the actor's output for ``observation`` plus uniform noise.

        Args:
            observation: A list or numpy array; a 1-D input is treated as a
                single observation.

        Returns:
            numpy array with the noisy action for the first observation.
        """
        # Lists are converted; anything else is used as passed in.
        if isinstance(observation, list):
            observations = np.array(observation)
        else:
            observations = observation

        # The actor applies its softmax over dim=1, so add a batch dimension.
        if observations.ndim == 1:
            observations = observations.reshape(1, -1)

        state = T.tensor(observations, dtype=T.float).to(self.actor.device)

        # Action selection needs no gradients; skip building the autograd graph.
        with T.no_grad():
            actions = self.actor.forward(state)
            # Scale the noise by the smallest (1 - value) so that no component
            # can be pushed above 1.
            min_v = (1 - actions).min()
            # Generate the noise on the same device as the state.
            noise = T.rand(self.n_actions, device=state.device) * min_v
            action = actions + noise

        return action.detach().cpu().numpy()[0]




In [None]:

class MADDPG:
    """Coordinator for a set of agents, each owning ``k`` sub-policies.

    Holds one replay buffer per (agent, sub-policy) pair and fans the
    environment-level calls (act, store, learn, checkpoint) out to the agents.

    Args:
        actor_dims: Per-agent observation widths.
        critic_dims: Input width of the critics.
        whole_state_observation_dims: Width of the joint state kept in the buffers.
        n_agents: Number of agents.
        n_actions: Per-agent action widths.
        env: Environment exposing ``agents`` and ``possible_agents``.
        scenario: Name appended to ``chkpt_dir``.
        alpha: Actor learning rate, forwarded to every ``Agent``.
        beta: Critic learning rate, forwarded to every ``Agent``.
        fc1, fc2, gamma, tau: Accepted but not forwarded to ``Agent``.
            NOTE(review): confirm whether they should be passed on.
        chkpt_dir: Checkpoint directory prefix.
        k: Number of sub-policies per agent.
        buffer_size: Capacity of each replay buffer (previously hard-coded).
        batch_size: Sample size of each replay buffer (previously hard-coded).
    """

    def __init__(self, actor_dims, critic_dims,whole_state_observation_dims, n_agents, n_actions, env,
                 scenario='simple',  alpha=0.01, beta=0.01, fc1=32,
                 fc2=32, gamma=0.95, tau=0.01, chkpt_dir='tmp/maddpg/',k=1,
                 buffer_size=1_000_000, batch_size=32):

        # One memory buffer per agent and per sub-policy: memory[agent_idx][subpolicy_idx]
        self.memory = [[MultiAgentReplayBuffer(buffer_size, whole_state_observation_dims, actor_dims,
                                               n_actions, n_agents, batch_size=batch_size,
                                               agent_names=env.agents)
                        for _ in range(k)] for _ in range(n_agents)]
        self.k = k
        self.n_agents = n_agents
        self.n_actions = n_actions
        chkpt_dir += scenario
        self.agents = {}
        for agent_idx, agent_name in enumerate(env.possible_agents):
            self.agents[agent_name] = Agent(actor_dims[agent_idx],
                                            critic_dims,
                                            n_actions[agent_idx], n_agents,
                                            agent_name = agent_name,
                                            alpha=alpha,
                                            beta=beta,
                                            chkpt_dir=chkpt_dir,k=k,
                                            agent_type=self._agent_type_from_name(agent_name))

        # Set of agent types, e.g. {'adversary', 'agent'}
        self.agent_types= set([agent.agent_type for agent in self.agents.values()])

    @staticmethod
    def _agent_type_from_name(agent_name):
        """Strip a trailing ``_<index>`` from an agent name ('adversary_0' -> 'adversary').

        The previous ``agent_name[0:-2]`` slice assumed a one-digit index and
        returned 'agent_' for 'agent_10'.
        """
        prefix, separator, suffix = agent_name.rpartition('_')
        if separator and suffix.isdigit():
            return prefix
        return agent_name

    def save_checkpoint(self,type):
        """Ask every agent to save its models under the checkpoint flavour ``type``."""
        print('... saving checkpoint ...')
        for agent in self.agents.values():
            agent.save_models(type)

    def load_checkpoint(self,type):
        """Ask every agent to load its models from the checkpoint flavour ``type``.

        Restored from the commented-out original because the training driver
        calls it in evaluation mode.
        NOTE(review): relies on ``Agent.load_models``, which is not visible in
        this file; confirm it exists.
        """
        print('... loading checkpoint ...')
        for agent in self.agents.values():
            agent.load_models(type)

    def choose_action(self, raw_obs):
        """Return ``{agent_name: action}`` for the observations in ``raw_obs``.

        Args:
            raw_obs: Mapping agent name -> observation.
        """
        # A list-based definition of this method used to precede this one. It
        # was shadowed, and it iterated the dict's keys, so it could not work.
        actions = {agent.agent_name: agent.choose_action(raw_obs[agent.agent_name]) for agent in self.agents.values()}
        return actions

    def randomly_choose_subpolicy(self):
        """Draw one random sub-policy index per agent type and activate it on every agent."""
        # One draw per agent type, so agents of the same type share an index.
        random_subpolicies = np.random.randint(0, self.k, size=len(self.agent_types))

        # Mapping of agent type -> chosen sub-policy index.
        self.type_to_subpolicy = dict(zip(self.agent_types, random_subpolicies))

        for agent in self.agents.values():
            agent.update_subpolicy(self.type_to_subpolicy[agent.agent_type])

    def store_transition(self, obs, state, action, reward, obs_, state_, done):
        """Store the joint transition in each agent's buffer for its active sub-policy."""
        for agent_idx, agent in enumerate(self.agents.values()):
            self.memory[agent_idx][agent.current_subpolicy_idx].store_transition(obs, state, action, reward, obs_, state_, done)

    def learn(self):
        """Run one learning step per agent on the buffer of its active sub-policy."""
        for agent_idx,agent in enumerate(self.agents.values()):
            agent.learn(self.memory[agent_idx][agent.current_subpolicy_idx],self.agents)


In [None]:
import os
import numpy as np
import pandas as pd
from utils_plots import plot_everything
from maddpg import MADDPG
from pettingzoo.mpe import simple_adversary_v3, simple_speaker_listener_v4, simple_spread_v3, simple_reference_v3, simple_tag_v3, simple_crypto_v3,simple_push_v3
import warnings
import matplotlib.pyplot as plt
from IPython import display
from tqdm import tqdm
from pettingzoo.sisl import waterworld_v4


def obs_list_to_state_vector(observation):
    """
    Join the 1-D observations in ``observation`` end to end into one vector.

    An empty float array is used as the seed, so an empty input gives an
    empty float64 array.
    """
    seed = np.array([])
    return np.concatenate([seed, *observation])


def visualize_agents(agents, env, n_episodes=20, speed=0.1):
    """
    Visualize the agents' behavior in the environment.

    Each step is rendered, annotated with whether the summed reward rose
    ("Right direction") or not ("Wrong direction") compared with the previous
    step, and displayed in the notebook output.

    Args:
        agents: Object whose ``choose_action(obs)`` returns the joint action.
        env: Environment with ``reset``, ``step``, ``render`` and ``num_agents``;
            ``render`` is expected to return an image array.
        n_episodes: Number of episodes to play.
        speed: Playback speed, clamped to [0.01, 1]; the delay between frames
            is ``0.1 / speed`` seconds.
    """
    # The lower bound must stay above zero: the frame delay divides by `speed`,
    # and the previous bound of 0 produced an infinite pause for speed=0.
    speed = float(np.clip(speed, 0.01, 1))
    frame_delay = 0.1 / speed

    # One figure for the whole run, released in `finally` even if the
    # environment raises mid-episode. It used to be closed after the first
    # episode, so later episodes silently opened new figures.
    plt.figure()
    try:
        for episode in range(n_episodes):
            prev_reward = -np.inf
            obs, _ = env.reset()
            terminal = [False] * env.num_agents

            while not any(terminal):
                actions = agents.choose_action(obs)
                obs, rewards, done, truncation, _ = env.step(actions)

                # Team reward of this step
                step_reward = sum(rewards.values())

                # Compare with the previous step's team reward
                direction = "Right direction" if step_reward > prev_reward else "Wrong direction"
                prev_reward = step_reward

                # Render the current frame
                img = env.render()

                # Clear the figure and draw the new frame
                plt.clf()
                plt.imshow(img)

                # Horizontal centre of the frame, used to position the text
                center_x = img.shape[1] / 2
                plt.text(center_x, 20, direction, fontsize=12, color='white', bbox=dict(facecolor='black', alpha=1), ha='center')

                # Replace the previous notebook output with the updated figure
                display.clear_output(wait=True)
                display.display(plt.gcf())
                plt.pause(frame_delay)

                terminal = [d or t for d, t in zip(done.values(), truncation.values())]

            print(f'Episode {episode + 1} completed')
    finally:
        plt.close()


def solve_env_with_subpolicies(env, scenario, N_GAMES, evaluate, k_values=(1,), plot=True, output_dir=None):
    """
    Solve the environment using subpolicies.

    For every ``k`` in ``k_values`` a fresh ``MADDPG`` instance is built. In
    evaluation mode a checkpoint is loaded and a few episodes are visualised.
    Otherwise the agents are trained for ``N_GAMES`` episodes, checkpoints are
    saved along the way and, if requested, the training curves are plotted.

    Args:
        env: Parallel multi-agent environment.
        scenario: Scenario name used for checkpoints, log output and plots.
        N_GAMES: Number of training episodes per ``k``.
        evaluate: If true, load a checkpoint and visualise instead of training.
        k_values: Iterable of sub-policy counts to run.
        plot: Whether to plot the training curves (training mode only).
        output_dir: Root directory for the plots; required when plotting.

    Raises:
        ValueError: If plots are requested for a training run without
            ``output_dir``.
    """
    # Fail before training rather than after it: plotting joins paths onto
    # output_dir, which raised TypeError for None once all episodes were done.
    if plot and not evaluate and output_dir is None:
        raise ValueError("output_dir must be provided when plot=True")

    LOAD_TYPE = ["Regular", "Best"]  # Regular: saved every SAVE_INTERVAL episodes, Best: saved when avg_score improves
    PRINT_INTERVAL = 500  # episodes to wait before "Best" checkpoints may be written
    SAVE_INTERVAL = 5000
    MAX_STEPS = 25

    for k in k_values:
        print(f"Solving env {scenario} with k={k}")
        # Reset before the agent list and space sizes are read from the env.
        env.reset()

        n_agents = env.num_agents
        actor_dims = [env.observation_spaces[agent_name].shape[0] for agent_name in env.agents]
        n_actions = [env.action_spaces[agent_name].shape[0] for agent_name in env.agents]
        critic_dims = sum(actor_dims) + sum(n_actions)
        # Width of the joint observation (what everyone is seeing)
        whole_state_observation_dims = sum(actor_dims)

        maddpg_agents = MADDPG(actor_dims, critic_dims, whole_state_observation_dims, n_agents, n_actions,
                               fc1=32, fc2=32,
                               alpha=0.01, beta=0.01, scenario=scenario,
                               chkpt_dir=f'tmp/maddpg/k_{k}/', env=env, k=k)

        if evaluate:
            # NOTE(review): the original comment said "load best", but index 0
            # is the "Regular" checkpoint. The code's behaviour is kept;
            # confirm which one is intended.
            maddpg_agents.load_checkpoint(LOAD_TYPE[0])
            visualize_agents(maddpg_agents, env, n_episodes=5, speed=10)
            # Nothing was trained, so the histories are empty; plotting them
            # used to crash on min() of an empty sequence.
            continue

        total_steps = 0
        score_history = []
        score_history_100 = []
        best_score = - np.inf  # the first score will always be better than this
        epsiode_mean_agent_rewards = {agent_name: [] for agent_name in env.agents}

        for i in tqdm(range(N_GAMES), desc=f"Training with k={k}"):
            obs, _ = env.reset()
            score = 0
            done = [False] * n_agents
            episode_step = 0
            agent_rewards = {agent_name: [] for agent_name in env.agents}

            # Each episode, randomly choose a subpolicy
            maddpg_agents.randomly_choose_subpolicy()
            while not any(done):
                actions = maddpg_agents.choose_action(obs)

                obs_, reward, termination, truncation, _ = env.step(actions)
                state = np.concatenate(list(obs.values()))
                state_ = np.concatenate(list(obs_.values()))

                # The episode ends for everyone as soon as any agent terminates
                # or is truncated, or the step budget is used up.
                if any(termination.values()) or any(truncation.values()) or (episode_step >= MAX_STEPS):
                    done = [True] * n_agents

                maddpg_agents.store_transition(obs, state, actions, reward, obs_, state_, done)

                # Learn on every fifth environment step
                if total_steps % 5 == 0:
                    maddpg_agents.learn()

                obs = obs_
                for agent_name, r in reward.items():
                    agent_rewards[agent_name].append(r)

                score += sum(reward.values())
                total_steps += 1
                episode_step += 1

            score_history.append(score)
            avg_score = np.mean(score_history[-100:])
            score_history_100.append(avg_score)

            if (avg_score > best_score) and (i > PRINT_INTERVAL):
                print(" avg_score, best_score", avg_score, best_score)
                maddpg_agents.save_checkpoint(LOAD_TYPE[1])
                best_score = avg_score
            if i % SAVE_INTERVAL == 0 and i > 0:
                maddpg_agents.save_checkpoint(LOAD_TYPE[0])

            # Despite the dictionary's name, each entry is the SUM of the
            # agent's rewards over the episode, exactly as before.
            for agent_name, rewards in agent_rewards.items():
                episode_return = sum(rewards)
                epsiode_mean_agent_rewards[agent_name].append(episode_return)

        if plot:
            plot_everything(output_dir, scenario, k, score_history_100, score_history, epsiode_mean_agent_rewards)



# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used here.
# NOTE(review): the imports at the top of this cell pull `MADDPG` and `plot_everything`
# from the `maddpg` / `utils_plots` modules, which rebinds the names defined by the
# earlier cells -- confirm which implementation is meant to run.
warnings.filterwarnings('ignore')
# Root directory under which the training plots are written
output_dir = "plots"
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Environment / scenario-name pair. continuous_actions=True matters because the
# training function reads `action_spaces[...].shape[0]` for the action width.
env6, scenario6 = simple_adversary_v3.parallel_env(N=1, max_cycles=25, continuous_actions=True, render_mode='rgb_array'), "Keep_Away"
# Parallel lists: envs[i] is trained under the name scenarios[i]
envs = [env6]
scenarios = [scenario6]
k_values = [1]  # Add more values if needed
# Train (evaluate=False) each environment for 25,000 episodes per k and plot the results
for env, scenario in zip(envs, scenarios):
    solve_env_with_subpolicies(env, scenario, N_GAMES=25_000, evaluate=False, k_values=k_values, plot=True, output_dir=output_dir)