# In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from NaturalEnv import natural_env_v0

# Initialize the environment
# Parallel environment configured with a single prey agent, one food source and
# one water source; predators, obstacles and forests are all set to zero.
env = natural_env_v0.parallel_env(render_mode=None, max_cycles=100, num_predators=0, num_prey=1,
                                  num_obstacles=0, num_food=1, num_water=1, num_forests=0)

# Initial reset with a fixed environment seed; the returned observations are
# used further down to size the policy network input.
observations, infos = env.reset(seed=42)

# Learning parameters
learning_rate = 0.001  # Step size handed to the Adam optimizer
discount_factor = 0.9  # NOTE(review): not referenced anywhere else in the visible file
clip_epsilon = 0.2  # Default clipping range of the PPO probability ratio
action_space = 5  # Number of discrete actions (output size of the policy network)
epsilon = 0.9  # High exploration rate

# Policy network definition
class PolicyNetwork(nn.Module):
    """Three-layer MLP that maps an observation vector to action probabilities.

    The architecture is Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout
    -> Linear -> softmax over the last dimension.

    Args:
        input_dim: Size of the observation vector.
        output_dim: Number of discrete actions.
        hidden_dim: Width of both hidden layers (default 64).
        dropout_p: Dropout probability applied after each hidden layer
            (default 0.5). Dropout is only active while the module is in
            training mode; call ``.eval()`` for deterministic outputs.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=64, dropout_p=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor of probabilities summing to 1 along the last dimension.
        """
        x = self.dropout1(torch.relu(self.fc1(x)))
        x = self.dropout2(torch.relu(self.fc2(x)))
        logits = self.fc3(x)
        return torch.softmax(logits, dim=-1)

# Create the network for the prey
# The input size is read from the observation of agent 'prey_0' returned by the
# initial environment reset; the output size is the number of discrete actions.
input_dim = observations['prey_0'].shape[0]
policy_prey = PolicyNetwork(input_dim, action_space)
# Adam optimizer over all policy parameters
optimizer_prey = optim.Adam(policy_prey.parameters(), lr=learning_rate)

# Helper functions
def get_action(observation, policy_net, exploration=True, epsilon=0.9):
    """Pick an action with an epsilon-random / policy-sampling scheme.

    Args:
        observation: Observation vector (array-like or tensor) fed to the policy.
        policy_net: Callable returning action probabilities along the last
            dimension of its output.
        exploration: When True, a uniformly random action is chosen with
            probability ``epsilon``.
        epsilon: Probability of taking the uniformly random action.

    Returns:
        Tuple ``(action, log_prob)``: ``action`` is a Python int and
        ``log_prob`` a scalar tensor. For a random action the log-probability
        is that of the uniform distribution, ``log(1 / num_actions)``;
        otherwise it is the policy's log-probability of the sampled action.
    """
    observation = torch.as_tensor(observation, dtype=torch.float32)
    action_probs = policy_net(observation)
    action_dist = torch.distributions.Categorical(action_probs)
    # Take the action count from the network output itself so the random branch
    # can never disagree with the policy's real action set.
    num_actions = action_probs.shape[-1]

    if exploration and random.random() < epsilon:
        action = random.randrange(num_actions)
        log_prob = torch.log(torch.tensor(1.0 / num_actions, dtype=torch.float32))
        return action, log_prob

    action = action_dist.sample()
    return action.item(), action_dist.log_prob(action)

def ppo_update(experience, policy_net, optimizer, clip_eps=clip_epsilon):
    """Run one clipped-surrogate (PPO-style) gradient step on a batch of transitions.

    Args:
        experience: Sequence of ``(state, action, reward, log_prob_old)``
            tuples, where ``log_prob_old`` is a tensor holding the
            log-probability recorded when the action was taken.
        policy_net: Policy returning action probabilities for a state.
        optimizer: Optimizer that owns the parameters of ``policy_net``.
        clip_eps: Half-width of the clipping interval around a ratio of 1.

    Returns:
        None. The parameters of ``policy_net`` are updated in place. An empty
        ``experience`` is a no-op.
    """
    # Nothing to learn from; torch.stack would raise on an empty list.
    if not experience:
        return

    log_probs_old = []
    rewards = []
    log_probs_new = []

    for state, action, reward, log_prob_old in experience:
        state = torch.as_tensor(state, dtype=torch.float32)

        # Re-evaluate the stored action under the current policy.
        action_dist = torch.distributions.Categorical(policy_net(state))
        log_probs_new.append(action_dist.log_prob(torch.tensor(action)))

        # The old log-probability is a constant in the surrogate objective.
        log_probs_old.append(log_prob_old.clone().detach())
        rewards.append(reward)

    log_probs_old = torch.stack(log_probs_old)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    log_probs_new = torch.stack(log_probs_new)

    # Advantage estimate: per-step reward centred on the batch mean
    # (no discounting and no learned value baseline).
    advantage = rewards - rewards.mean()

    ratios = torch.exp(log_probs_new - log_probs_old)
    clipped_ratios = torch.clamp(ratios, 1 - clip_eps, 1 + clip_eps)
    # Pessimistic (element-wise minimum) surrogate, negated for gradient descent.
    loss = -torch.min(ratios * advantage, clipped_ratios * advantage).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Function to identify Pareto-optimal points
def is_pareto_optimal(points, maximize=False):
    """Return a boolean array flagging which points are Pareto-optimal.

    A point is Pareto-optimal when no other point is at least as good in every
    objective and strictly better in at least one. Identical points do not
    dominate each other.

    Args:
        points: Array-like of shape ``(n_points, n_objectives)``.
        maximize: When False (default) smaller values are better in every
            objective; when True larger values are better.

    Returns:
        Boolean NumPy array of length ``n_points``.
    """
    points = np.asarray(points)
    # Choose the comparison direction instead of negating the data so unsigned
    # and boolean arrays work too.
    if maximize:
        no_worse, strictly_better = np.greater_equal, np.greater
    else:
        no_worse, strictly_better = np.less_equal, np.less

    is_optimal = np.ones(points.shape[0], dtype=bool)
    for i, point in enumerate(points):
        dominates_point = (
            np.all(no_worse(points, point), axis=1)
            & np.any(strictly_better(points, point), axis=1)
        )
        is_optimal[i] = not dominates_point.any()
    return is_optimal

# Episode loop
# Each episode: roll out the prey policy, accumulate the two reward components
# separately, run one policy update on the collected transitions, then decay
# the exploration rate.
num_episodes = 100
rewards_multiobjective = []

for episode in range(num_episodes):
    observations, infos = env.reset()
    experience_prey = []
    total_reward_resources = 0
    total_reward_survival = 0

    print(f"\nInicio del Episodio {episode + 1}")

    while env.agents:
        actions = {}
        # Per-agent record of what the policy saw and the log-probability of the
        # action it actually chose, so the stored transition is self-consistent.
        step_inputs = {}

        for agent in env.agents:
            if "prey" not in agent:
                continue
            obs = observations[agent]
            # Standardize the observation; the small constant avoids division by zero.
            obs = (obs - np.mean(obs)) / (np.std(obs) + 1e-5)
            action, log_prob = get_action(obs, policy_prey, epsilon=epsilon)
            actions[agent] = action
            # Detach so the rollout does not keep autograd graphs alive.
            step_inputs[agent] = (obs, log_prob.detach())

        observations, rewards, terminations, truncations, infos = env.step(actions)

        # Iterate over the agents that acted rather than env.agents: after the
        # step, env.agents may already exclude agents that just finished, which
        # would silently drop their last reward (TODO confirm for NaturalEnv).
        for agent, action in actions.items():
            if agent not in rewards:
                continue
            # Assumes a reward vector ordered as (resources, survival).
            reward_vector = rewards[agent]
            total_reward_resources += reward_vector[0]
            total_reward_survival += reward_vector[1]
            obs, log_prob = step_inputs[agent]
            # The objectives are scalarized by an unweighted sum for the update.
            experience_prey.append((obs, action, sum(reward_vector), log_prob))

        env.agents = [
            agent for agent in env.agents
            if not (terminations.get(agent, False) or truncations.get(agent, False))
        ]

    rewards_multiobjective.append([total_reward_resources, total_reward_survival])
    # Skip the update when the episode produced no transitions.
    if experience_prey:
        ppo_update(experience_prey, policy_prey, optimizer_prey)

    # Exponential decay of the exploration rate, floored at 0.1.
    epsilon = max(0.1, epsilon * 0.99)

    print(f"Recompensas acumuladas: Recursos={total_reward_resources}, Supervivencia={total_reward_survival}")

env.close()

# Pareto front visualization
rewards_array = np.array(rewards_multiobjective)
# Rewards are to be maximized, whereas is_pareto_optimal treats smaller values
# as better by default; negating the rewards selects the maximal front.
pareto_optimal = is_pareto_optimal(-rewards_array)

plt.figure(figsize=(10, 6))
# Plot dominated episodes only, so the label matches the points it covers.
plt.scatter(rewards_array[~pareto_optimal, 0], rewards_array[~pareto_optimal, 1],
            label='No Óptimo de Pareto', alpha=0.5)
plt.scatter(rewards_array[pareto_optimal, 0], rewards_array[pareto_optimal, 1], color='red', label='Óptimo de Pareto')
plt.xlabel('Recompensa por Recursos (Comida/Agua)')
plt.ylabel('Recompensa por Supervivencia')
plt.title('Frente de Pareto para la Presa')
plt.legend()
plt.grid()
plt.show()
