In [1]:
import numpy as np
import random
from NaturalEnv import natural_env_v0

# Create the parallel environment and perform an initial seeded reset
env = natural_env_v0.parallel_env(render_mode=None, max_cycles=100)
observations, infos = env.reset(seed=42)

# Learning hyperparameters
learning_rate, discount_factor = 0.01, 0.9
action_space = 5
epsilon, clip_epsilon = 0.5, 0.2

# Table dimensions: rows = length of prey_0's observation along its first
# axis, columns = number of actions
total_states = (observations['prey_0'].shape[0], action_space)

# One value table per species, all entries starting at zero
policy_prey = np.zeros(total_states)
policy_predator = np.zeros(total_states)

# Snapshots of the tables, passed to ppo_update as the "old" parameters
old_policy_prey, old_policy_predator = policy_prey.copy(), policy_predator.copy()

# Funciones de ayuda
def calc_state(observation):
    """Collapse an observation into a row index of the policy tables.

    The observation's ``sum()`` is converted to ``int`` and wrapped modulo
    ``total_states[0]`` (the number of table rows).
    """
    observation_total = int(observation.sum())
    row_count = total_states[0]
    return observation_total % row_count

def get_action(state, model_params, exploration=True):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Row index into ``model_params``.
        model_params: Table indexed as ``model_params[state]``; the row must
            provide ``argmax()``.
        exploration: When true, a random action is drawn with probability
            ``epsilon`` (module-level); when false the choice is always greedy.

    Returns:
        The index of the chosen action.
    """
    # Greedy branch: exploration disabled, or the random draw did not fall
    # below epsilon. The draw only happens when exploration is enabled.
    if not exploration or random.uniform(0, 1) >= epsilon:
        return model_params[state].argmax()
    return np.random.choice(action_space)

def ppo_update(experience, model_params, old_model_params, lr=learning_rate, clip_eps=clip_epsilon):
    """Apply a clipped-surrogate style update to a table, in place.

    Transitions are processed from last to first. For each one, the entry
    ``model_params[prev_state][action]`` is adjusted using the reward, the
    ratio between the current and old table entries, and a clipped version
    of that ratio.

    Args:
        experience: Sequence of 5-item transitions
            ``(prev_state, action, state, reward, final)``; only
            ``prev_state``, ``action`` and ``reward`` are used.
        model_params: Table that is modified in place.
        old_model_params: Table read for the denominator of the ratio.
        lr: Step size applied to the loss.
        clip_eps: Half-width of the clipping interval around 1.

    NOTE(review): the ratio is formed from raw table entries rather than
    action probabilities -- confirm this is the intended formulation.
    """
    for prev_state, action, _next_state, reward, _done in reversed(experience):
        row = model_params[prev_state]

        # Advantage estimate: reward minus the current entry, bounded to [-10, 10]
        adv = np.clip(reward - row[action], -10, 10)

        # Ratio of current to old entry; the small constant avoids dividing by zero
        denominator = old_model_params[prev_state][action] + 1e-5
        raw_ratio = np.clip(row[action] / denominator, 0.1, 10)
        bounded_ratio = np.clip(raw_ratio, 1 - clip_eps, 1 + clip_eps)

        # Pessimistic (minimum) surrogate, negated to form the loss
        surrogate = min(raw_ratio * adv, bounded_ratio * adv)
        loss = -surrogate
        row[action] -= lr * loss

# Episode loop: collect one episode of experience per agent, then update the
# prey and predator tables with ppo_update.
num_episodes = 100
for episode in range(num_episodes):
    observations, infos = env.reset()
    experience_prey = {agent: [] for agent in env.agents if "prey" in agent}
    experience_predator = {agent: [] for agent in env.agents if "predator" in agent}
    total_rewards = {agent: 0 for agent in env.agents}

    print(f"\nInicio del Episodio {episode + 1}")

    while env.agents:
        actions = {}
        # State each agent was in when it chose its action. This (not the
        # post-step state) is the table row the update must adjust.
        prev_states = {}
        for agent in env.agents:
            state = calc_state(observations[agent])
            if "prey" in agent:
                action = get_action(state, policy_prey)
            elif "predator" in agent:
                action = get_action(state, policy_predator)

            actions[agent] = action
            prev_states[agent] = state

        # Advance the environment one step
        observations, rewards, terminations, truncations, infos = env.step(actions)

        # Record a transition for every agent that acted. Iterating over
        # `actions` instead of `env.agents` keeps the last transition of agents
        # that the environment may already have dropped from `env.agents`
        # during this step (NOTE(review): confirm NaturalEnv's behaviour).
        for agent in actions:
            if agent not in rewards:
                continue

            if agent in observations:
                next_state = calc_state(observations[agent])
            else:
                # No observation reported for this agent; reuse the previous state
                next_state = prev_states[agent]

            reward = rewards[agent]
            total_rewards[agent] += reward
            done = terminations.get(agent, False) or truncations.get(agent, False)
            transition = [prev_states[agent], actions[agent], next_state, reward, done]

            if "prey" in agent:
                experience_prey[agent].append(transition)
            elif "predator" in agent:
                experience_predator[agent].append(transition)

        # Drop agents that finished this step
        env.agents = [agent for agent in env.agents if not (terminations[agent] or truncations[agent])]

    # Update both tables from the collected experience
    for agent in experience_prey:
        ppo_update(experience_prey[agent], policy_prey, old_policy_prey)
    for agent in experience_predator:
        ppo_update(experience_predator[agent], policy_predator, old_policy_predator)

    # Snapshot the updated tables as the "old" parameters for the next episode
    old_policy_prey = policy_prey.copy()
    old_policy_predator = policy_predator.copy()

    print(f"\nResumen del Episodio {episode + 1}")
    print(f"Recompensas acumuladas: {total_rewards}")

env.close()



Inicio del Episodio 1

Resumen del Episodio 1
Recompensas acumuladas: {'prey_0': -182.18391919922004, 'prey_1': -86.16124176121001, 'prey_2': 18.90000000000003, 'prey_3': 9.89999999999998, 'prey_4': -0.10000000000001391, 'prey_5': 9.89999999999998, 'predator_0': -51.26886022559757, 'predator_1': -117.76718374647754}

Inicio del Episodio 2

Resumen del Episodio 2
Recompensas acumuladas: {'prey_0': -57.86275413308803, 'prey_1': 13.89999999999998, 'prey_2': -206.73391001749533, 'prey_3': -348.0799760111454, 'prey_4': -94.01842395722804, 'prey_5': -1193.5010441702848, 'predator_0': -54.989804529140756, 'predator_1': -60.58817707822509}

Inicio del Episodio 3

Resumen del Episodio 3
Recompensas acumuladas: {'prey_0': -582.9453827361011, 'prey_1': -968.6452801262739, 'prey_2': 23.900000000000063, 'prey_3': -46.06654246609793, 'prey_4': 16.899999999999984, 'prey_5': 2.6998597440840983, 'predator_0': -844.3812760491226, 'predator_1': -235.86994548817972}

Inicio del Episodio 4

Resumen del Ep