In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from NaturalEnv import natural_env_v0
import matplotlib.pyplot as plt

# Learning hyperparameters
learning_rate = 0.001   # step size passed to the Adam optimizer
discount_factor = 0.9   # NOTE(review): not referenced by the code visible in this cell
clip_epsilon = 0.2      # default PPO clipping range used by ppo_update
action_space = 5        # number of discrete actions the policy outputs
epsilon = 0.9  # High initial exploration rate

# Build the environment with a single prey, one food source and one water
# source; predators, obstacles and forests are disabled.
# Alternative that relies on the environment's own entity-count defaults:
# env = natural_env_v0.parallel_env(render_mode=None, max_cycles=100)
env = natural_env_v0.parallel_env(
    render_mode=None,
    max_cycles=100,
    num_predators=0,
    num_prey=1,
    num_obstacles=0,
    num_food=1,
    num_water=1,
    num_forests=0,
)

# Seeded reset so the first observations (used below to size the network) are
# reproducible.
observations, infos = env.reset(seed=42)

# Definir la red neuronal para la política
class PolicyNetwork(nn.Module):
    """Feed-forward policy that maps an observation to action probabilities.

    The network has two 64-unit hidden layers with ReLU activations, each
    followed by dropout (p=0.3), and a linear output layer whose logits are
    normalised with a softmax over the last dimension.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layers.

        Args:
            input_dim: Size of the observation vector.
            output_dim: Number of discrete actions.
        """
        super().__init__()
        hidden_units = 64
        drop_prob = 0.3
        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return action probabilities (summing to 1 on the last axis)."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return torch.softmax(self.fc3(hidden), dim=-1)


# Build the prey policy network and its optimizer. The predator counterparts
# are kept commented out.
input_dim = observations["prey_0"].shape[0]  # observation length of the prey agent
policy_prey = PolicyNetwork(input_dim=input_dim, output_dim=action_space)
# policy_predator = PolicyNetwork(input_dim, action_space)

optimizer_prey = optim.Adam(params=policy_prey.parameters(), lr=learning_rate)
# optimizer_predator = optim.Adam(policy_predator.parameters(), lr=learning_rate)

# Funciones de ayuda
def get_action(observation, policy_net, exploration=True, epsilon=0.9):
    """Pick an action for one observation, with optional random exploration.

    Args:
        observation: Observation for a single agent; anything that
            ``torch.as_tensor`` can convert to a float32 tensor.
        policy_net: Callable returning action probabilities for the
            observation (last dimension indexes the actions).
        exploration: When True, a uniformly random action is taken with
            probability ``epsilon`` instead of sampling from the policy.
        epsilon: Probability of taking the random branch.

    Returns:
        A ``(action, log_prob)`` tuple: the action index as a Python int and
        a 0-dim tensor holding its log-probability. On the random branch the
        log-probability is that of the uniform distribution over the actions;
        otherwise it is the policy's log-probability of the sampled action.
    """
    # as_tensor accepts lists, NumPy arrays and existing tensors alike.
    observation = torch.as_tensor(observation, dtype=torch.float32)
    action_probs = policy_net(observation)
    action_dist = torch.distributions.Categorical(action_probs)

    # Take the number of actions from the policy output rather than from a
    # module-level constant, so exploration always matches the network.
    num_actions = action_probs.shape[-1]

    if exploration and random.uniform(0, 1) < epsilon:
        action = random.randint(0, num_actions - 1)
        log_prob = torch.log(torch.tensor(1.0 / num_actions, dtype=torch.float32))
        return action, log_prob

    action = action_dist.sample()
    return action.item(), action_dist.log_prob(action)


def ppo_update(experience, policy_net, optimizer, clip_eps=clip_epsilon):
    """Apply one clipped-surrogate (PPO-style) gradient step to the policy.

    Args:
        experience: Sequence of ``(state, action, reward, log_prob_old)``
            tuples, where ``log_prob_old`` is a tensor recorded when the
            action was chosen.
        policy_net: Policy returning action probabilities for a state.
        optimizer: Optimizer holding the parameters of ``policy_net``.
        clip_eps: Half-width of the clipping interval around a ratio of 1.

    Returns:
        None. Does nothing when ``experience`` is empty.
    """
    # torch.stack raises on an empty list, so an episode that produced no
    # transitions must skip the update instead of crashing the training run.
    if not experience:
        return

    log_probs_old = []
    rewards = []
    log_probs_new = []

    # Re-evaluate every stored transition under the current policy.
    for state, action, reward, log_prob_old in experience:
        state = torch.tensor(state, dtype=torch.float32)
        # The old log-prob is a constant: cut it off from any autograd graph.
        log_prob_old = log_prob_old.clone().detach()

        # Log-probability of the stored action under the current policy.
        action_probs = policy_net(state)
        action_dist = torch.distributions.Categorical(action_probs)
        log_prob_new = action_dist.log_prob(torch.tensor(action))

        log_probs_old.append(log_prob_old)
        rewards.append(reward)
        log_probs_new.append(log_prob_new)

    log_probs_old = torch.stack(log_probs_old)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    log_probs_new = torch.stack(log_probs_new)

    # Advantage estimate: per-step reward centred on the batch mean.
    advantage = rewards - rewards.mean()

    # Probability ratio between new and old policy, clipped to
    # [1 - clip_eps, 1 + clip_eps]; the loss takes the more pessimistic term.
    ratios = torch.exp(log_probs_new - log_probs_old)
    clipped_ratios = torch.clamp(ratios, 1 - clip_eps, 1 + clip_eps)
    loss = -torch.min(ratios * advantage, clipped_ratios * advantage).mean()

    # Gradient step on the policy parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


# Episode loop: collect one episode of prey experience, then run one PPO
# update and decay the exploration rate.
num_episodes = 100
for episode in range(num_episodes):
    observations, infos = env.reset()
    experience_prey = []
    total_rewards = {agent: 0 for agent in env.agents}

    print(f"\nInicio del Episodio {episode + 1}")

    while env.agents:
        actions = {}
        # agent -> (normalised observation, log-prob) captured at the moment
        # the action is chosen, so the stored transition matches what the
        # policy actually saw and did.
        pending = {}

        # Select an action for every live agent.
        for agent in env.agents:
            obs = observations[agent]
            # Standardise the observation before feeding it to the network.
            obs = (obs - np.mean(obs)) / (np.std(obs) + 1e-5)
            if "prey" in agent:
                action, log_prob = get_action(obs, policy_prey, epsilon=epsilon)
                actions[agent] = action
                pending[agent] = (obs, log_prob)
            # Predator handling is disabled; the environment above is created
            # with num_predators=0.

        # Advance the environment by one step.
        observations, rewards, terminations, truncations, infos = env.step(actions)

        # Record the transition for every agent that received a reward.
        # Iterating over the returned rewards (rather than env.agents) keeps
        # the final reward of agents the environment has already removed.
        for agent, reward in rewards.items():
            if agent in pending:
                obs, log_prob = pending[agent]
                experience_prey.append((obs, actions[agent], reward, log_prob))
            total_rewards[agent] = total_rewards.get(agent, 0) + reward

        # Drop agents that have terminated or been truncated.
        env.agents = [
            agent
            for agent in env.agents
            if not (terminations.get(agent, False) or truncations.get(agent, False))
        ]

    # Update the policy with PPO; skip episodes that yielded no transitions.
    if experience_prey:
        ppo_update(experience_prey, policy_prey, optimizer_prey)

    # Gradually reduce exploration, with a floor of 0.1.
    epsilon = max(0.1, epsilon * 0.99)

    print(f"\nResumen del Episodio {episode + 1}")
    print(f"Recompensas acumuladas: {total_rewards}")

env.close()



Inicio del Episodio 1

Resumen del Episodio 1
Recompensas acumuladas: {'prey_0': 49.244069622005014}

Inicio del Episodio 2

Resumen del Episodio 2
Recompensas acumuladas: {'prey_0': 44.00076519537924}

Inicio del Episodio 3

Resumen del Episodio 3
Recompensas acumuladas: {'prey_0': 49.5}

Inicio del Episodio 4

Resumen del Episodio 4
Recompensas acumuladas: {'prey_0': 36.99873162377422}

Inicio del Episodio 5

Resumen del Episodio 5
Recompensas acumuladas: {'prey_0': 43.341301278324}

Inicio del Episodio 6

Resumen del Episodio 6
Recompensas acumuladas: {'prey_0': 49.43802417048589}

Inicio del Episodio 7

Resumen del Episodio 7
Recompensas acumuladas: {'prey_0': 41.21247185031807}

Inicio del Episodio 8

Resumen del Episodio 8
Recompensas acumuladas: {'prey_0': 8.044957538509262}

Inicio del Episodio 9

Resumen del Episodio 9
Recompensas acumuladas: {'prey_0': 40.71632382073294}

Inicio del Episodio 10

Resumen del Episodio 10
Recompensas acumuladas: {'prey_0': 47.200976161746986}

