In [2]:
import numpy as np
import random
from NaturalEnv import natural_env_v0

# Learning hyperparameters (read as module-level globals by the helpers below)
learning_rate = 0.01
discount_factor = 0.9
action_space = 5
epsilon = 0.5

# Create the environment and reset it once so that an observation is
# available for every agent before the Q-tables are sized
env = natural_env_v0.parallel_env(render_mode=None)
observations, infos = env.reset()

# One zero-initialised Q-table per agent, with one row per entry along the
# first axis of that agent's observation and one column per action
model_params = {
    agent: np.zeros((observations[agent].shape[0], action_space))
    for agent in env.agents
}

# Funciones de ayuda
def calc_state(observation):
    """Map an observation to a discrete state index.

    The observation's elements are summed, the sum is truncated to an
    integer, and the result is reduced modulo the length of the
    observation's first axis, so the index always falls in
    ``[0, observation.shape[0])``.

    Args:
        observation: Object exposing ``.sum()`` and ``.shape``
            (presumably a NumPy array -- confirm against the environment).

    Returns:
        The integer state index.
    """
    truncated_total = int(observation.sum())
    n_states = observation.shape[0]
    return truncated_total % n_states

def get_action(state, model_params, exploration=True):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    When ``exploration`` is truthy, a uniform draw in [0, 1] is compared
    against the module-level ``epsilon``; if the draw is smaller, a random
    action index below the module-level ``action_space`` is returned.
    Otherwise the index of the largest value in ``model_params[state]`` is
    returned.

    Args:
        state: Row index into ``model_params``.
        model_params: Q-table indexable by state; each row must provide
            ``argmax()``.
        exploration: If falsy, always act greedily and draw no random number.

    Returns:
        The chosen action index.
    """
    # Short-circuit keeps the random draw (and the epsilon lookup) from
    # happening at all when exploration is disabled.
    explore = exploration and random.uniform(0, 1) < epsilon
    if not explore:
        return model_params[state].argmax()
    return np.random.choice(action_space)

def train(experience, model_params, lr=learning_rate, df=discount_factor):
    """Train the model by updating its parameters with Q-learning.

    The recorded transitions are replayed from last to first and the
    Q-table is modified in place.

    Args:
        experience: Sequence of ``(prev_state, action_taken, state, reward)``
            transitions.
        model_params: Q-table indexed as ``model_params[state][action]``.
        lr: Learning rate applied to the temporal-difference error.
        df: Discount factor applied to the best value of the next state.
    """
    for transition in reversed(experience):
        prev_state, action_taken, state, reward = transition
        current_q = model_params[prev_state][action_taken]
        # Bootstrapped target: immediate reward plus the discounted value of
        # the best action available in the next state.
        target = reward + df * model_params[state].max()
        model_params[prev_state][action_taken] = current_q + lr * (target - current_q)

# Episode loop: play `num_episodes` episodes, recording each agent's
# transitions during the episode and running the Q-learning update once the
# episode is over.
num_episodes = 10
for episode in range(num_episodes):
    observations, infos = env.reset()
    experience = {agent: [] for agent in env.agents}
    total_rewards = {agent: 0 for agent in env.agents}

    print(f"\nInicio del Episodio {episode + 1}")

    while env.agents:
        # Select an action for every agent and remember the state it was
        # selected in. That state must be captured BEFORE stepping, because
        # `observations` is overwritten by the step below.
        prev_states = {}
        actions = {}
        for agent in env.agents:
            prev_states[agent] = calc_state(observations[agent])
            actions[agent] = get_action(prev_states[agent], model_params[agent])

        # Advance the environment by one step
        observations, rewards, terminations, truncations, infos = env.step(actions)

        # Record experience and accumulate rewards. Iterate over a snapshot,
        # since finished agents are removed from `env.agents` inside the loop
        # and mutating a list while iterating it skips elements.
        # NOTE(review): if the environment already drops finished agents from
        # `env.agents` inside `step()`, their final transition is not recorded
        # here -- confirm against the NaturalEnv implementation.
        for agent in list(env.agents):
            state = calc_state(observations[agent])
            reward = rewards[agent]
            total_rewards[agent] += reward

            experience[agent].append([prev_states[agent], actions[agent], state, reward])

            # Drop the agent once it has terminated or been truncated
            if terminations[agent] or truncations[agent]:
                env.agents.remove(agent)

    # Train at the end of the episode
    for agent, transitions in experience.items():
        train(transitions, model_params[agent])

    # Show the accumulated rewards
    print(f"Recompensas acumuladas: {total_rewards}")

env.close()



Inicio del Episodio 1
Recompensas acumuladas: {'prey_0': -24.6245222857422, 'prey_1': 8.399999999999995, 'prey_2': -46.02781424473212, 'prey_3': -5.587157694768834, 'prey_4': 2.400000000000001, 'prey_5': 11.399999999999999, 'predator_0': -15.94555310376971, 'predator_1': -14.872268014593276}

Inicio del Episodio 2
Recompensas acumuladas: {'prey_0': 2.400000000000001, 'prey_1': 2.400000000000001, 'prey_2': 2.400000000000001, 'prey_3': 7.399999999999999, 'prey_4': -36.86832441538518, 'prey_5': -2.3958489991743046, 'predator_0': -20.91062022173938, 'predator_1': -10.20672414424847}

Inicio del Episodio 3
Recompensas acumuladas: {'prey_0': -27.65334663777228, 'prey_1': 2.400000000000001, 'prey_2': 2.400000000000001, 'prey_3': -35.31269661231628, 'prey_4': 13.399999999999995, 'prey_5': -13.377532925479219, 'predator_0': -23.811878842393764, 'predator_1': -15.783645919318353}

Inicio del Episodio 4
Recompensas acumuladas: {'prey_0': 2.400000000000001, 'prey_1': 2.400000000000001, 'prey_2': 