In [2]:
!pip install pettingzoo


In [1]:
import numpy as np
import random
from pettingzoo.mpe import simple_tag_v3

# Inicializar el entorno
env = simple_tag_v3.env(render_mode=None, max_cycles=100)
env.reset()

# Parámetros del aprendizaje
learning_rate = 0.01
discount_factor = 0.9
action_space = 5
epsilon = 0.5
clip_epsilon = 0.2

# Definir el total de estados y modelo
total_states = (env.observation_space(env.agents[0]).shape[0], action_space)
policy_prey = np.zeros((total_states[0], action_space))
policy_predator = np.zeros((total_states[0], action_space))
old_policy_prey = policy_prey.copy()
old_policy_predator = policy_predator.copy()

# Estado interno de las presas
prey_state = {
    "energy": 100,
    "thirst": 0
}

# Funciones de ayuda
def calc_state(observation):
    return int(observation.sum()) % total_states[0]

def get_action(state, model_params, exploration=True):
    if exploration and random.uniform(0, 1) < epsilon:
        return np.random.choice(action_space)
    else:
        return model_params[state].argmax()

def update_prey_state(prey_state, action):
    if action == 1:  # Buscar agua
        prey_state["thirst"] = max(0, prey_state["thirst"] - 10)
    elif action == 2:  # Buscar comida
        prey_state["energy"] = min(100, prey_state["energy"] + 10)
    elif action == 0:  # Huir
        prey_state["energy"] -= 5
        prey_state["thirst"] += 2
    else:  # Descansar
        prey_state["energy"] += 1

def calculate_reward(agent, prey_state, action, termination):
    if "agent" in agent:  # Presa
        if termination:
            return -10  # Muerte
        if prey_state["energy"] <= 0 or prey_state["thirst"] >= 100:
            return -1  # Hambre o sed
        if action in [1, 2]:
            return 1  # Encontrar comida o agua
        return 0
    elif "adversary" in agent:  # Depredador
        if termination:
            return -10  # Muerte
        if action == 3:  # Comer presa
            return 2
        return 0

def compute_advantage(rewards, values):
    return rewards - values

def ppo_update(experience, model_params, old_model_params, lr=learning_rate, clip_eps=clip_epsilon):
    """Actualiza el modelo utilizando PPO."""
    for prev_state, action, state, reward, final in reversed(experience):
        # Calcular ventaja y normalizar
        advantage = compute_advantage(reward, model_params[prev_state][action])
        advantage = np.clip(advantage, -10, 10)  # Evitar valores extremos

        # Calcular el ratio y aplicar corrección numérica
        ratio = model_params[prev_state][action] / (old_model_params[prev_state][action] + 1e-5)
        ratio = np.clip(ratio, 0.1, 10)  # Limitar el ratio

        # Aplicar clipping al ratio
        clipped_ratio = np.clip(ratio, 1 - clip_eps, 1 + clip_eps)

        # Calcular la pérdida y actualizar los parámetros
        loss = -min(ratio * advantage, clipped_ratio * advantage)
        model_params[prev_state][action] -= lr * loss

# Bucle de episodios
num_episodes = 10
prey_survival_times = {}
agent_actions = {}

for episode in range(num_episodes):
    env.reset()
    experience_prey = {agent: [] for agent in env.agents if "agent" in agent}
    experience_predator = {agent: [] for agent in env.agents if "adversary" in agent}
    survival_time = {agent: 0 for agent in env.agents if "agent" in agent}
    actions_log = {agent: [] for agent in env.agents}
    total_rewards = {agent: 0 for agent in env.agents}

    print(f"\nInicio del Episodio {episode + 1}")

    while True:
        all_terminated = True
        for agent in env.agent_iter():
            observation, reward, termination, truncation, info = env.last()

            if termination or truncation:
                action = None
                print(f"{agent} ha terminado y no toma acción.")
            else:
                state = calc_state(observation)
                if "agent" in agent:  # Presa
                    action = get_action(state, policy_prey)
                    update_prey_state(prey_state, action)
                    survival_time[agent] += 1
                elif "adversary" in agent:  # Depredador
                    action = get_action(state, policy_predator)

                # Calcular recompensa personalizada
                custom_reward = calculate_reward(agent, prey_state, action, termination)
                total_rewards[agent] += custom_reward

                # Registrar la acción tomada y la experiencia
                actions_log[agent].append(action)
                if "agent" in agent:
                    experience_prey[agent].append([state, action, calc_state(observation), custom_reward, termination or truncation])
                else:
                    experience_predator[agent].append([state, action, calc_state(observation), custom_reward, termination or truncation])

            env.step(action)

            if not (termination or truncation):
                all_terminated = False

        if all_terminated:
            break

    # Actualizar la política utilizando PPO
    for agent in experience_prey:
        ppo_update(experience_prey[agent], policy_prey, old_policy_prey)
    for agent in experience_predator:
        ppo_update(experience_predator[agent], policy_predator, old_policy_predator)

    # Copiar política actual a política antigua
    old_policy_prey = policy_prey.copy()
    old_policy_predator = policy_predator.copy()

    print(f"\nResumen del Episodio {episode + 1}")
    print(f"Recompensas acumuladas: {total_rewards}")
    print(f"Tiempo de supervivencia de las presas: {survival_time}")

print("\nResultados Finales:")
for agent, times in prey_survival_times.items():
    promedio = np.mean(times)
    print(f"Presa {agent} sobrevivió en promedio {promedio:.2f} pasos.")

env.close()



Inicio del Episodio 1
adversary_0 ha terminado y no toma acción.
adversary_1 ha terminado y no toma acción.
adversary_2 ha terminado y no toma acción.
agent_0 ha terminado y no toma acción.

Resumen del Episodio 1
Recompensas acumuladas: {'adversary_0': 28, 'adversary_1': 20, 'adversary_2': 28, 'agent_0': -40}
Tiempo de supervivencia de las presas: {'agent_0': 100}

Inicio del Episodio 2
adversary_0 ha terminado y no toma acción.
adversary_1 ha terminado y no toma acción.
adversary_2 ha terminado y no toma acción.
agent_0 ha terminado y no toma acción.

Resumen del Episodio 2
Recompensas acumuladas: {'adversary_0': 38, 'adversary_1': 56, 'adversary_2': 26, 'agent_0': -100}
Tiempo de supervivencia de las presas: {'agent_0': 100}

Inicio del Episodio 3
adversary_0 ha terminado y no toma acción.
adversary_1 ha terminado y no toma acción.
adversary_2 ha terminado y no toma acción.
agent_0 ha terminado y no toma acción.

Resumen del Episodio 3
Recompensas acumuladas: {'adversary_0': 132, '