In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pettingzoo.mpe import simple_adversary_v3
from tqdm import tqdm



In [None]:

def plot_learning_curve(x, scores, figure_file, window=100):
    """Plot the trailing running average of ``scores`` and save the figure.

    Args:
        x: X-axis values, one per entry of ``scores``.
        scores: Sequence of scores to average.
        figure_file: Path handed to ``plt.savefig``.
        window: Number of most recent scores (the current one included) that
            are averaged for each point. Defaults to 100.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # Lower bound is i - window + 1 so the slice holds exactly ``window``
        # entries once enough scores are available.
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])
    plt.plot(x, running_avg)
    plt.title(f'Running average of previous {window} scores')
    plt.savefig(figure_file)



def calculate_tick_range(values, interval=20):
    """Return evenly spaced tick positions that bracket ``values``.

    The first tick is the minimum rounded down to a multiple of ``interval``
    and the last tick is the maximum rounded up to a multiple of ``interval``.
    NaN entries are ignored.

    Args:
        values: Numeric sequence (list, NumPy array or pandas Series).
        interval: Spacing between ticks; must be positive.

    Returns:
        numpy.ndarray: Tick positions, or an empty array when ``values``
        holds no finite-or-infinite numbers (empty or all NaN).

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError("interval must be positive")
    data = np.asarray(values, dtype=float).ravel()
    data = data[~np.isnan(data)]
    if data.size == 0:
        return np.array([])
    start = np.floor(data.min() / interval) * interval
    end = np.ceil(data.max() / interval) * interval
    # Half a step past ``end`` keeps ``end`` itself in the result without
    # adding extra ticks when ``interval`` is smaller than 1.
    return np.arange(start, end + interval / 2, interval)


def save_plot(plt, filename, output_dir):
    """Save a figure as ``output_dir/filename``, creating the directory first.

    Args:
        plt: Object exposing ``savefig(path)``; callers in this notebook pass
            the ``matplotlib.pyplot`` module.
        filename: File name of the image inside ``output_dir``.
        output_dir: Target directory; created (with parents) when missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, filename))

def plot_average_episode_rewards(average_rewards, scenario, output_dir):
    """Draw the average-reward-per-episode curve and save it as a PNG.

    Episodes are numbered from 1 on the x-axis, y ticks come from
    ``calculate_tick_range`` and the image is written through ``save_plot``.

    Args:
        average_rewards: Sequence of per-episode average rewards.
        scenario: Label used in the title and the output file name.
        output_dir: Directory that receives the image.
    """
    episode_numbers = range(1, len(average_rewards) + 1)
    image_name = f'Average_Episode_Reward_Progress_{scenario}.png'

    plt.figure(figsize=(12, 6))
    plt.plot(episode_numbers, average_rewards)
    plt.title(f'Average Episode Reward Progress - {scenario}')
    plt.xlabel('Episode')
    plt.ylabel('Average Episode Reward')
    plt.yticks(calculate_tick_range(average_rewards))
    plt.grid()
    save_plot(plt, image_name, output_dir)
    
def plot_average_episode_rewards_rolling(average_rewards, scenario, output_dir):
    """Plot raw episode rewards together with their 100-episode rolling mean.

    The raw series is drawn semi-transparent and the rolling mean in red.
    Y ticks are derived from the non-NaN part of the rolling mean, and the
    image is written through ``save_plot``.

    Args:
        average_rewards: Sequence of per-episode average rewards.
        scenario: Label used in the title and the output file name.
        output_dir: Directory that receives the image.
    """
    episode_numbers = range(1, len(average_rewards) + 1)
    rolling_mean = pd.Series(average_rewards).rolling(window=100).mean()

    plt.figure(figsize=(12, 8))
    plt.title(f'Average Episode Reward Progress - {scenario}')
    plt.xlabel('Episode')
    plt.ylabel('Average Episode Reward')

    plt.plot(episode_numbers, average_rewards, alpha=0.3, label='Original')
    plt.plot(episode_numbers, rolling_mean, color='red',
             label='Smoothed (Rolling Mean)')

    plt.yticks(calculate_tick_range(rolling_mean.dropna()))
    plt.legend()
    plt.grid()

    image_name = f'Average_Episode_Reward_Progress_Rolling_Mean{scenario}.png'
    save_plot(plt, image_name, output_dir)

def plot_all_agents_rewards(agent_rewards, scenario, output_dir):
    """Plot every agent's per-episode reward on one figure and save it.

    Args:
        agent_rewards: Mapping of agent name to a sequence of rewards, one
            entry per episode.
        scenario: Label used in the title and the output file name.
        output_dir: Directory that receives the image.
    """
    plt.figure(figsize=(12, 6))
    plt.xlabel('Episode')
    plt.ylabel('Agent Reward')
    plt.title(f'All Agents Reward Progress (agent + adversary) - {scenario}')

    for agent_name, rewards in agent_rewards.items():
        plt.plot(range(1, len(rewards) + 1), rewards, label=f'Agent {agent_name}')

    # np.concatenate raises on an empty list, so ticks and legend are only
    # configured when there is at least one agent series.
    if agent_rewards:
        all_rewards = np.concatenate(list(agent_rewards.values()))
        if all_rewards.size:
            plt.yticks(calculate_tick_range(all_rewards))
        plt.legend()
    plt.grid()
    save_plot(plt, f'All_Agents_Reward_Progress_{scenario}.png', output_dir)


def smooth_data(data, window_size):
    """Apply a moving average to smooth the data.

    Args:
        data: One-dimensional numeric sequence.
        window_size: Number of consecutive samples averaged per output point.
            When it exceeds ``len(data)`` it is reduced to ``len(data)``, so
            the result is the single mean of all samples.

    Returns:
        numpy.ndarray: ``len(data) - window + 1`` averaged values (``window``
        being the possibly reduced window size), or an empty array for empty
        input.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values
    # np.convolve(mode='valid') swaps its operands when the kernel is longer
    # than the data, which would return values that are not moving averages.
    window = min(int(window_size), values.size)
    return np.convolve(values, np.ones(window) / window, mode='valid')



def plot_all_agents_rewards_smooth(agent_rewards, scenario, output_dir):
    """Plot each agent's rewards smoothed by a 100-sample moving average.

    Y ticks are computed from the smoothed series that are actually drawn,
    so the axis range matches the plotted curves.

    Args:
        agent_rewards: Mapping of agent name to a sequence of rewards, one
            entry per episode.
        scenario: Label used in the title and the output file name.
        output_dir: Directory that receives the image.
    """
    plt.figure(figsize=(12, 6))
    plt.xlabel('Episode')
    plt.ylabel('Agent Reward')
    plt.title(f'All Agents Reward Progress (agent + adversary) - {scenario} smooth')

    smoothed = {agent_name: smooth_data(rewards, window_size=100)
                for agent_name, rewards in agent_rewards.items()}
    for agent_name, plot_data in smoothed.items():
        plt.plot(range(1, len(plot_data) + 1), plot_data, label=f'Agent {agent_name}')

    # np.concatenate raises on an empty list, so ticks and legend are only
    # configured when there is at least one agent series.
    if smoothed:
        plotted_values = np.concatenate([np.asarray(series) for series in smoothed.values()])
        if plotted_values.size:
            plt.yticks(calculate_tick_range(plotted_values))
        plt.legend()
    plt.grid()
    save_plot(plt, f'All_Agents_Reward_Progress_{scenario}_smooth.png', output_dir)




def plot_individual_agent_rewards(epsiode_mean_agent_rewards, agent_name, scenario, output_dir):
    """Plot the per-episode reward series of a single agent and save it.

    Args:
        epsiode_mean_agent_rewards: Sequence of reward values for the agent,
            one entry per episode.
        agent_name: Name shown in the legend, title and output file name.
        scenario: Label used in the title and the output file name.
        output_dir: Directory that receives the image.
    """
    series = epsiode_mean_agent_rewards
    episode_numbers = range(1, len(series) + 1)
    image_name = f'Individual_Agent_Reward_Progress_{scenario}_Agent_{agent_name}.png'

    plt.figure(figsize=(12, 6))
    plt.plot(episode_numbers, series, label=f'Agent {agent_name}')
    plt.title(f'Mean Agent Reward Progress - {scenario} (Agent {agent_name})')
    plt.xlabel('Episode')
    plt.ylabel('Mean Agent Reward')

    plt.yticks(calculate_tick_range(series))
    plt.legend()
    plt.grid()
    save_plot(plt, image_name, output_dir)

def plot_everything(output_dir, scenario, k, score_history_100, score_history, epsiode_mean_agent_rewards):
    """Produce the full set of reward plots for one scenario / ``k`` pair.

    Images are written to ``output_dir/scenario_<scenario>/k_<k>``, which is
    created when missing.

    Args:
        output_dir: Root directory for all plots.
        scenario: Scenario label.
        k: Value combined with ``scenario`` in plot labels and the
            sub-directory name.
        score_history_100: Series passed to ``plot_average_episode_rewards``.
        score_history: Series passed to
            ``plot_average_episode_rewards_rolling``.
        epsiode_mean_agent_rewards: Mapping of agent name to reward series.
    """
    plot_label = f"{scenario} - {k}"
    target_dir = os.path.join(output_dir, f'scenario_{scenario}', f'k_{k}')
    os.makedirs(target_dir, exist_ok=True)

    # Average episode reward, as given.
    plot_average_episode_rewards(score_history_100, plot_label, target_dir)

    # Average episode reward with a rolling mean overlay.
    plot_average_episode_rewards_rolling(score_history, plot_label, target_dir)

    # Every agent on one figure.
    plot_all_agents_rewards(epsiode_mean_agent_rewards, plot_label, target_dir)

    # Dedicated figure for 'agent_0', when that agent is present.
    focus_agent = 'agent_0'
    if focus_agent in epsiode_mean_agent_rewards:
        plot_individual_agent_rewards(epsiode_mean_agent_rewards[focus_agent],
                                      focus_agent, plot_label, target_dir)

In [None]:

class CriticNetwork(nn.Module):
    """Q-value network over a concatenated (state, action) input.

    Two ReLU hidden layers followed by a linear layer with a single output.
    The module owns its Adam optimizer and moves itself to CUDA device 0
    when available, otherwise to the CPU.

    Args:
        beta: Learning rate of the Adam optimizer.
        input_dims: Size of the state vector.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Size of the action vector.
        name: Checkpoint file name inside ``chkpt_dir``.
        chkpt_dir: Directory used by ``save_checkpoint``/``load_checkpoint``.
    """

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims,
                 n_actions, name, chkpt_dir='tmp/ddpg'):
        super().__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)
        self.fc1 = nn.Linear(input_dims + n_actions, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.q = nn.Linear(fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        """Return the Q estimate for each (state, action) row of the batch."""
        x = F.relu(self.fc1(T.cat([state, action], dim=1)))
        x = F.relu(self.fc2(x))
        q = self.q(x)

        return q

    def save_checkpoint(self):
        """Write the state dict to ``self.chkpt_file``, creating its directory."""
        chkpt_dir = os.path.dirname(self.chkpt_file)
        if chkpt_dir:
            # T.save does not create missing parent directories.
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        """Load the state dict from ``self.chkpt_file`` onto ``self.device``."""
        # map_location lets a checkpoint saved on a GPU be restored on a
        # machine without one.
        self.load_state_dict(T.load(self.chkpt_file, map_location=self.device))


class ActorNetwork(nn.Module):
    """Deterministic policy network mapping a state to an action vector.

    Two ReLU hidden layers followed by a sigmoid output layer, so every
    action component lies between 0 and 1. The module owns its Adam
    optimizer and moves itself to CUDA device 0 when available, otherwise
    to the CPU.

    Args:
        alpha: Learning rate of the Adam optimizer.
        input_dims: Size of the state vector.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Size of the action vector.
        name: Checkpoint file name inside ``chkpt_dir``.
        chkpt_dir: Directory used by ``save_checkpoint``/``load_checkpoint``.
    """

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims,
                 n_actions, name, chkpt_dir='tmp/ddpg'):
        super().__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)

        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)

        self.pi = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        """Return the action for each state row of the batch."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = T.sigmoid(self.pi(x))

        return pi

    def save_checkpoint(self):
        """Write the state dict to ``self.chkpt_file``, creating its directory."""
        chkpt_dir = os.path.dirname(self.chkpt_file)
        if chkpt_dir:
            # T.save does not create missing parent directories.
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        """Load the state dict from ``self.chkpt_file`` onto ``self.device``."""
        # map_location lets a checkpoint saved on a GPU be restored on a
        # machine without one.
        self.load_state_dict(T.load(self.chkpt_file, map_location=self.device))

In [None]:

class ReplayBuffer:
    """Fixed-capacity circular store of transitions backed by NumPy arrays.

    Once ``max_size`` transitions have been written, new ones overwrite the
    oldest slots.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate zero-filled storage for ``max_size`` transitions."""
        self.mem_size = max_size
        self.mem_cntr = 0  # total number of transitions stored so far

        observation_shape = (self.mem_size, input_shape)
        self.state_memory = np.zeros(observation_shape)
        self.new_state_memory = np.zeros(observation_shape)
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.mem_cntr % self.mem_size
        pending_writes = (
            (self.state_memory, state),
            (self.action_memory, action),
            (self.reward_memory, reward),
            (self.new_state_memory, state_),
            (self.terminal_memory, done),
        )
        for storage, value in pending_writes:
            storage[slot] = value

        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            tuple: ``(states, actions, rewards, next_states, dones)`` arrays
            indexed by the same random slots.
        """
        filled = min(self.mem_cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size)

        fields = (self.state_memory, self.action_memory, self.reward_memory,
                  self.new_state_memory, self.terminal_memory)
        return tuple(field[picks] for field in fields)

In [None]:

class Agent:
    """DDPG agent: actor, critic, their target copies and a replay buffer.

    Args:
        alpha: Actor learning rate.
        beta: Critic learning rate.
        input_dims: Size of the observation vector.
        tau: Soft-update coefficient for the target networks.
        n_actions: Size of the action vector.
        gamma: Discount factor.
        max_size: Replay buffer capacity.
        fc1_dims: Width of the first hidden layer of every network.
        fc2_dims: Width of the second hidden layer of every network.
        batch_size: Number of transitions sampled per learning step.
    """

    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 max_size=1000000, fc1_dims=400, fc2_dims=300,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta
        self.n_actions = n_actions

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        # NOTE(review): the checkpoint names below are fixed, so every Agent
        # instance reads and writes the same four files in the networks'
        # default checkpoint directory.
        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                  n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                    n_actions=n_actions, name='critic')

        self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                         n_actions=n_actions,
                                         name='target_actor')

        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims,
                                           fc2_dims, n_actions=n_actions,
                                           name='target_critic')

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation, eval=False):
        """Return an action for a single observation as a NumPy array.

        Args:
            observation: One-dimensional NumPy array.
            eval: When true, no exploration noise is added.
        """
        state = T.tensor(observation[np.newaxis, :], dtype=T.float,
                         device=self.actor.device)
        # Acting never needs gradients, so no autograd graph is built here.
        with T.no_grad():
            mu = self.actor.forward(state)
            if not eval:
                # NOTE(review): T.rand draws from [0, 1), so this exploration
                # noise is never negative -- confirm that is intended.
                mu = mu + T.rand(self.n_actions, device=self.actor.device)
            mu_prime = T.clamp(mu, 0., 1.)

        return mu_prime.cpu().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def save_models(self):
        """Checkpoint all four networks."""
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        """Restore all four networks from their checkpoints."""
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing until the buffer has seen at least ``batch_size``
        transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = \
            self.memory.sample_buffer(self.batch_size)

        device = self.actor.device
        states = T.tensor(states, dtype=T.float).to(device)
        states_ = T.tensor(states_, dtype=T.float).to(device)
        actions = T.tensor(actions, dtype=T.float).to(device)
        rewards = T.tensor(rewards, dtype=T.float).to(device)
        done = T.tensor(done).to(device)

        # The TD target is a constant for the critic update: computing it
        # without autograd keeps gradients off the target networks.
        with T.no_grad():
            target_actions = self.target_actor.forward(states_)
            critic_value_ = self.target_critic.forward(states_, target_actions)
            critic_value_[done] = 0.0  # no bootstrap from terminal transitions
            target = rewards + self.gamma * critic_value_.view(-1)
            target = target.view(self.batch_size, 1)

        critic_value = self.critic.forward(states, actions)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        # The actor is trained to maximize the critic's value of its actions.
        # This backward pass also writes gradients into the critic; they are
        # cleared by the critic's zero_grad() on the next call.
        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Blend online weights into the targets.

        Each target parameter becomes ``tau * online + (1 - tau) * target``.

        Args:
            tau: Blend coefficient; ``None`` selects ``self.tau``. An explicit
                ``0`` is honored and leaves the targets unchanged.
        """
        if tau is None:
            tau = self.tau
        network_pairs = ((self.actor, self.target_actor),
                         (self.critic, self.target_critic))
        for src, dest in network_pairs:
            for param, target in zip(src.parameters(), dest.parameters()):
                target.data.copy_(tau * param.data + (1 - tau) * target.data)

In [None]:



def obs_list_to_state_vector(observation):
    """Flatten a sequence of 1-D observations into a single 1-D array.

    The empty float array at the front means an empty ``observation`` yields
    an empty array and the result is promoted exactly as if the pieces had
    been appended to an empty float array one by one.
    """
    pieces = [np.array([])]
    pieces.extend(observation)
    return np.concatenate(pieces)


# NOTE: ``smooth_data`` is also defined earlier in this notebook (plotting
# helpers); when cells run top to bottom this later definition is the one in
# effect.
def smooth_data(data, window_size):
    """Apply a moving average to smooth the data.

    Args:
        data: One-dimensional numeric sequence.
        window_size: Number of consecutive samples averaged per output point.
            When it exceeds ``len(data)`` it is reduced to ``len(data)``, so
            the result is the single mean of all samples.

    Returns:
        numpy.ndarray: ``len(data) - window + 1`` averaged values (``window``
        being the possibly reduced window size), or an empty array for empty
        input.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values
    # np.convolve(mode='valid') swaps its operands when the kernel is longer
    # than the data, which would return values that are not moving averages.
    window = min(int(window_size), values.size)
    return np.convolve(values, np.ones(window) / window, mode='valid')


def train_DDPG(parallel_env, N_GAMES, scenario, output_dir):
    """Train one independent DDPG ``Agent`` per environment agent.

    Runs episodes until ``N_GAMES * 25`` environment steps have been taken,
    evaluates with ``evaluate`` before training and whenever the step count
    is a multiple of 1000 at the end of an episode, writes the evaluation
    history to ``data/ddpg_scores.npy`` and ``data/ddpg_steps.npy``, and
    finally saves the reward plots to ``output_dir``.

    Args:
        parallel_env: PettingZoo-style parallel environment.
        N_GAMES: Number of episodes used to size the step budget.
        scenario: Label used in plot titles and file names.
        output_dir: Directory that receives the plots.

    Returns:
        list: Evaluation scores, one per evaluation.
    """
    STEPS_PER_EPISODE = 25  # assumes the env was built with max_cycles=25 -- TODO confirm
    LEARN_INTERVAL = 125    # environment steps between learning updates
    EVAL_INTERVAL = 1000    # environment steps between evaluations
    MAX_STEPS = N_GAMES * STEPS_PER_EPISODE

    _, _ = parallel_env.reset()
    n_agents = parallel_env.max_num_agents

    agents = []
    for agent in parallel_env.agents:
        input_dims = parallel_env.observation_space(agent).shape[0]
        n_actions = parallel_env.action_space(agent).shape[0]

        agents.append(Agent(input_dims=input_dims, n_actions=n_actions,
                            gamma=0.95, tau=0.01, alpha=1e-4, beta=1e-3))

    total_steps = 0
    episode = 0

    episode_rewards = []  # summed reward of all agents, one entry per episode
    # Per-agent episode returns (sum of the agent's rewards over the episode).
    episode_agent_returns = {agent_name: [] for agent_name in parallel_env.agents}
    eval_scores = []
    eval_steps = []

    os.makedirs('data', exist_ok=True)

    def record_evaluation():
        """Evaluate the agents and persist the evaluation history."""
        eval_scores.append(evaluate(agents, parallel_env, episode, total_steps))
        eval_steps.append(total_steps)
        # The history only changes here, so this is the only place it is saved.
        np.save('data/ddpg_scores.npy', np.array(eval_scores))
        np.save('data/ddpg_steps.npy', np.array(eval_steps))

    record_evaluation()

    pbar = tqdm(total=MAX_STEPS, desc="Training DDPG")
    try:
        while total_steps < MAX_STEPS:
            obs, _ = parallel_env.reset()
            terminal = [False] * n_agents
            obs = list(obs.values())
            episode_reward = 0
            agent_returns = {agent_name: 0 for agent_name in parallel_env.agents}

            while not any(terminal):
                action = [agent.choose_action(obs[idx])
                          for idx, agent in enumerate(agents)]
                action = dict(zip(parallel_env.agents, action))
                obs_, reward, done, truncated, info = parallel_env.step(action)

                list_done = list(done.values())
                list_reward = list(reward.values())
                list_action = list(action.values())
                obs_ = list(obs_.values())
                list_trunc = list(truncated.values())

                # NOTE(review): termination and truncation are merged, so the
                # replay buffer marks time-limit cut-offs as terminal and
                # Agent.learn drops the bootstrap term for them -- confirm
                # this is intended.
                terminal = [d or t for d, t in zip(list_done, list_trunc)]

                for idx, agent in enumerate(agents):
                    agent.remember(obs[idx], list_action[idx],
                                   list_reward[idx], obs_[idx], terminal[idx])

                if total_steps % LEARN_INTERVAL == 0:
                    for agent in agents:
                        agent.learn()
                obs = obs_

                for agent_name, r in reward.items():
                    agent_returns[agent_name] += r

                episode_reward += sum(reward.values())
                total_steps += 1
                pbar.update(1)

            if total_steps % EVAL_INTERVAL == 0 and total_steps > 0:
                record_evaluation()

            episode += 1

            episode_rewards.append(episode_reward)
            for agent_name, agent_return in agent_returns.items():
                episode_agent_returns[agent_name].append(agent_return)
    finally:
        # Release the progress bar even when training is interrupted.
        pbar.close()

    smoothed_rewards = smooth_data(episode_rewards, window_size=100)  # Adjust the window size as needed

    plot_average_episode_rewards(smoothed_rewards, scenario, output_dir)
    plot_all_agents_rewards(episode_agent_returns, scenario, output_dir)
    plot_all_agents_rewards_smooth(episode_agent_returns, scenario, output_dir)

    return eval_scores


def evaluate(agents, env, ep, step):
    """Run three evaluation episodes and return their mean total reward.

    Agents act with ``eval=True``. An episode stops as soon as any agent is
    reported done or truncated; its score is the sum of all agents' rewards
    over every step.

    Args:
        agents: Objects exposing ``choose_action(observation, eval=True)``,
            in the same order as the environment's agents.
        env: PettingZoo-style parallel environment.
        ep: Not used by this function.
        step: Not used by this function.

    Returns:
        Mean of the three episode scores.
    """
    episode_scores = []
    for _ in range(3):
        observations, _ = env.reset()
        observations = list(observations.values())
        finished = [False] * env.max_num_agents
        episode_score = 0

        while not any(finished):
            chosen = [policy.choose_action(observations[position], eval=True)
                      for position, policy in enumerate(agents)]
            joint_action = dict(zip(env.agents, chosen))

            next_obs, rewards, dones, truncations, _ = env.step(joint_action)

            finished = [is_done or is_truncated
                        for is_done, is_truncated
                        in zip(list(dones.values()), list(truncations.values()))]
            episode_score += sum(list(rewards.values()))
            observations = list(next_obs.values())

        episode_scores.append(episode_score)

    return np.mean(episode_scores)


# ---------------------------------------------------------------------------
# Run configuration: builds the environment and starts DDPG training as soon
# as this cell executes.
# ---------------------------------------------------------------------------
# Root directory for every plot produced by this notebook
output_dir = "ddpg_plots"
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Change this line to change the environment
# (the commented alternative needs `simple_tag_v3`, which the import cell does
# not import)
#  parallel_env, scenario = simple_tag_v3.parallel_env(max_cycles=25, continuous_actions=True, render_mode="rgb_array"), "predator_prey"
# NOTE(review): the label below says "Cooperative_Communication" while the
# environment is simple_adversary_v3 -- confirm the label, it names the output
# folder and appears in plot titles.
parallel_env, scenario =  simple_adversary_v3.parallel_env(N=1,max_cycles=25, continuous_actions=True, render_mode="rgb_array"), "Cooperative_Communication"
print(scenario)
# Number of episodes; train_DDPG turns this into a step budget of N_GAMES * 25
N_GAMES = 25_000
# Create a subfolder with the name of the scenario
scenario_dir = os.path.join(output_dir, scenario)
if not os.path.exists(scenario_dir):
    os.makedirs(scenario_dir)
# Starts the full training run; the returned evaluation scores are discarded.
train_DDPG(parallel_env=parallel_env, N_GAMES=N_GAMES, scenario=scenario, output_dir=scenario_dir)