In [1]:
%pip install -q -e .

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import nmmo
from nmmo import config

from implementations.train_ppo import train_ppo, evaluate_agent
from implementations.SimplierInputAgentV2 import SimplierInputAgentV2
from implementations.RandomAgent import get_avg_lifetime_for_random_agent, get_avg_reward_for_random_agent
from implementations.Observations import Observations
from implementations.CustomRewardBase import LifetimeReward, \
    ResourcesAndGatheringReward, ExplorationReward, WeightedReward, ShiftingReward, CurriculumReward, ResourcesReward
from implementations.StayNearResourcesReward import StayNearResourcesReward
from implementations.SavingCallback import SavingCallback
from implementations.AnimationCallback import AnimationCallback
from implementations.PathTrackingCallback import PathTrackingCallback
from implementations.observations_to_inputs import observations_to_inputs_simplier
from implementations.jar import Jar

In [3]:
def get_all_observations_from_save(save_name: str, agent_ids: list[int]) -> list[Observations]:
    """Collect the stored observations of the given agents from a saved history.

    Args:
        save_name: Key of the history inside the "saves" jar.
        agent_ids: Ids of the agents whose observations should be returned.

    Returns:
        A flat list of observations ordered by episode, then by record within
        the episode, then by the order of ``agent_ids``. Agents that are absent
        from a record are skipped.
    """
    history = Jar("saves").get(save_name)

    collected = []
    for episode in history:
        # episode[0] is iterated as pairs; the first element maps agent id -> observation
        # (presumably one pair per environment step -- TODO confirm against SavingCallback).
        for observations_by_agent, _ in episode[0]:
            for wanted_id in agent_ids:
                if wanted_id in observations_by_agent:
                    collected.append(observations_by_agent[wanted_id])
    return collected

def get_entropies_from_save(save_name: str) -> dict[str, list[float]]:
    """Compute the mean entropy per entropy type for every saved episode.

    Args:
        save_name: Key of the history inside the "saves" jar.

    Returns:
        Mapping from entropy type to a list holding one mean value per episode.
        An empty dict is returned when the save contains no episodes.
    """
    history = Jar("saves").get(save_name)
    if not history:
        # Nothing recorded: avoid indexing the first episode below.
        return {}

    # episode[4] is a sequence of dicts whose values are {entropy type: value} dicts
    # (presumably keyed by agent id, one dict per step -- TODO confirm).
    samples_per_episode = [
        [sample for step in episode[4] for sample in step.values()]
        for episode in history
    ]

    # Average every entropy type over all samples of an episode; the set of
    # types is taken from the first sample of that episode.
    means_per_episode = [
        {name: np.mean([sample[name] for sample in samples]) for name in samples[0].keys()}
        for samples in samples_per_episode
    ]

    # Transpose: list of per-episode dicts -> dict of per-type series.
    return {
        name: [episode_means[name] for episode_means in means_per_episode]
        for name in means_per_episode[0].keys()
    }

In [4]:
def plot_losses(
    actor_losses: list[float], 
    critic_losses: list[float],
    window: int = 500
) -> None:
    """Plot raw and smoothed actor and critic losses side by side.

    The actor loss is drawn on a linear axis together with its running mean and
    a +/- one standard deviation band; the critic loss is drawn on a log axis
    with its running mean.

    Args:
        actor_losses: Actor loss values in chronological order.
        critic_losses: Critic loss values in chronological order.
        window: Number of consecutive values used for the running mean and the
            running standard deviation. The smoothed curves are only drawn for
            a series that contains at least ``window`` values.
    """
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

    ax1.plot(actor_losses, label="Actor Loss", color='blue', alpha=0.4)
    # np.convolve(mode='valid') swaps its operands when the kernel is longer than
    # the data, which yields a length that no longer matches the x-range below.
    if 1 <= window <= len(actor_losses):
        smooth_x = range(window - 1, len(actor_losses))
        actor_losses_smooth = np.convolve(actor_losses, np.ones(window) / window, mode='valid')
        # Use exactly the same `window` samples as the running mean at each position.
        actor_losses_std = np.array([np.std(actor_losses[i - window + 1:i + 1])
                                     for i in smooth_x])

        ax1.plot(smooth_x, actor_losses_smooth,
                 label=f"Running Mean (window={window})", color='red')
        ax1.fill_between(smooth_x,
                         actor_losses_smooth - actor_losses_std,
                         actor_losses_smooth + actor_losses_std,
                         alpha=0.2, color='red', label='Standard Deviation')

    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Loss")
    ax1.set_title("Actor Loss Over Time")
    ax1.legend()

    ax2.semilogy(critic_losses, label="Critic Loss", color='blue', alpha=0.4)
    if 1 <= window <= len(critic_losses):
        critic_losses_smooth = np.convolve(critic_losses, np.ones(window) / window, mode='valid')
        ax2.semilogy(range(window - 1, len(critic_losses)), critic_losses_smooth,
                     label=f"Running Mean (window={window})", color='red')
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Loss (log scale)")
    ax2.set_title("Critic Loss Over Time")
    ax2.legend()

    plt.tight_layout()
    plt.show()
    
def plot_losses_from_save(agent_name: str, window: int = 500) -> None:
    """Load a saved history and plot all actor and critic losses it contains.

    Args:
        agent_name: Key of the history inside the "saves" jar.
        window: Smoothing window forwarded to ``plot_losses``.
    """
    history = Jar("saves").get(agent_name)

    actor_losses = []
    critic_losses = []
    for episode in history:
        # episode[2] holds the losses of the episode: index 0 -> actor, index 1 -> critic.
        episode_losses = episode[2]
        actor_losses.extend(episode_losses[0])
        critic_losses.extend(episode_losses[1])

    plot_losses(actor_losses, critic_losses, window)

def plot_rewards(
    avg_rewards: list[float], 
    max_rewards: list[float], 
    min_rewards: list[float], 
    ninetieth_percentile_rewards: list[float] | None = None, 
    random_agent_reward: float | None = None,
    window: int = 50
) -> None:
    """Plot per-episode reward statistics and the running mean of the average.

    Args:
        avg_rewards: Average reward per episode.
        max_rewards: Maximum reward per episode.
        min_rewards: Minimum reward per episode.
        ninetieth_percentile_rewards: Optional 90th percentile reward per episode.
        random_agent_reward: Optional baseline drawn as a dashed horizontal line.
        window: Number of episodes averaged by the running mean. The running
            mean is only drawn when at least ``window`` episodes are available.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    if ninetieth_percentile_rewards is not None:
        ax.plot(ninetieth_percentile_rewards, label="90th Percentile Reward", color='purple', alpha=0.4)

    ax.plot(avg_rewards, label="Average Reward", color='red', alpha=0.4)
    ax.plot(max_rewards, label="Max Reward", color='pink', alpha=0.8)
    ax.plot(min_rewards, label="Min Reward", color='green', alpha=0.4)

    if random_agent_reward is not None:
        ax.axhline(y=random_agent_reward, label="Random Agent Reward", color='black', linestyle='--')

    # np.convolve(mode='valid') swaps its operands when the kernel is longer than
    # the data, so its output would not match the x-range; skip smoothing then.
    if 1 <= window <= len(avg_rewards):
        avg_rewards_smooth = np.convolve(avg_rewards, np.ones(window) / window, mode='valid')
        ax.plot(range(window - 1, len(avg_rewards)), avg_rewards_smooth,
                label=f"Running Mean (window={window})", color='blue')

    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.set_title("Rewards Over Time")
    ax.legend()
    plt.show()

def plot_rewards_from_save(agent_name: str, window: int = 50, random_agent_reward: float | None = None) -> None:
    """Load a saved history and plot its per-episode reward statistics.

    Args:
        agent_name: Key of the history inside the "saves" jar.
        window: Smoothing window forwarded to ``plot_rewards``.
        random_agent_reward: Optional baseline forwarded to ``plot_rewards``.
    """
    history = Jar("saves").get(agent_name)

    # episode[1] is a dict of rewards for that episode (one value per entry).
    rewards_per_episode = [episode[1] for episode in history]
    agent_count = len(rewards_per_episode[0])

    values_per_episode = [list(episode_rewards.values()) for episode_rewards in rewards_per_episode]
    avg_rewards = [np.mean(values) for values in values_per_episode]
    max_rewards = [np.max(values) for values in values_per_episode]
    min_rewards = [np.min(values) for values in values_per_episode]

    # The 90th percentile is only reported when the first episode has more than 30 entries.
    ninetieth_percentile_rewards = (
        [np.percentile(values, 90) for values in values_per_episode]
        if agent_count > 30
        else None
    )

    plot_rewards(avg_rewards, max_rewards, min_rewards, ninetieth_percentile_rewards, random_agent_reward, window)

def plot_lifetimes(
    avg_lifetimes: list[float], 
    max_lifetimes: list[float], 
    min_lifetimes: list[float], 
    ninetieth_percentile: list[float] | None = None,
    random_agent_lifetime: float | None = None,
    window: int = 50
) -> None:
    """Plot per-episode lifetime statistics and the running mean of the average.

    Args:
        avg_lifetimes: Average agent lifetime per episode.
        max_lifetimes: Maximum agent lifetime per episode.
        min_lifetimes: Minimum agent lifetime per episode.
        ninetieth_percentile: Optional 90th percentile lifetime per episode.
        random_agent_lifetime: Optional baseline drawn as a dashed horizontal line.
        window: Number of episodes averaged by the running mean. The running
            mean is only drawn when at least ``window`` episodes are available.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    if ninetieth_percentile is not None:
        ax.plot(ninetieth_percentile, label="90th Percentile", color='purple', alpha=0.4)

    ax.plot(avg_lifetimes, label="Average Lifetime", color='red', alpha=0.4)
    ax.plot(max_lifetimes, label="Max Lifetime", color='pink', alpha=0.8)
    ax.plot(min_lifetimes, label="Min Lifetime", color='green', alpha=0.4)

    if random_agent_lifetime is not None:
        ax.axhline(y=random_agent_lifetime, label="Random Agent Lifetime", color='black', linestyle='--')

    # np.convolve(mode='valid') swaps its operands when the kernel is longer than
    # the data, so its output would not match the x-range; skip smoothing then.
    if 1 <= window <= len(avg_lifetimes):
        avg_lifetimes_smooth = np.convolve(avg_lifetimes, np.ones(window) / window, mode='valid')
        ax.plot(range(window - 1, len(avg_lifetimes)), avg_lifetimes_smooth,
                label=f"Running Mean (window={window})", color='blue')

    ax.set_xlabel("Episode")
    ax.set_ylabel("Lifetime")
    ax.set_title("Agent Lifetime Over Time")
    ax.legend()
    plt.show()

def plot_lifetimes_from_save(agent_name: str, random_agent_lifetime: float | None = None, window: int = 50) -> None:
    """Load a saved history and plot its per-episode lifetime statistics.

    Args:
        agent_name: Key of the history inside the "saves" jar.
        random_agent_lifetime: Optional baseline forwarded to ``plot_lifetimes``.
        window: Smoothing window forwarded to ``plot_lifetimes``.
    """
    history = Jar("saves").get(agent_name)

    # episode[3] is a dict of lifetimes for that episode (one value per entry).
    lifetimes_per_episode = [episode[3] for episode in history]
    agent_count = len(lifetimes_per_episode[0])

    values_per_episode = [list(episode_lifetimes.values()) for episode_lifetimes in lifetimes_per_episode]
    avg_lifetimes = [np.mean(values) for values in values_per_episode]
    max_lifetimes = [np.max(values) for values in values_per_episode]
    min_lifetimes = [np.min(values) for values in values_per_episode]

    # The 90th percentile is only reported when the first episode has more than 30 entries.
    ninetieth_percentile = (
        [np.percentile(values, 90) for values in values_per_episode]
        if agent_count > 30
        else None
    )

    plot_lifetimes(avg_lifetimes, max_lifetimes, min_lifetimes, ninetieth_percentile, random_agent_lifetime, window)
    
def plot_entropies(
    means_per_type_per_episode: dict[str, list[float]], 
    window: int = 50
) -> None:
    """Plot the per-episode mean entropy of every entropy type in its own subplot.

    Args:
        means_per_type_per_episode: Mapping from entropy type to one mean
            entropy value per episode.
        window: Number of episodes averaged by the running mean. The running
            mean is only drawn for a series with at least ``window`` values.
    """
    # Two columns; keep the original 2x2 layout for up to four types and add
    # rows when more types are present so indexing the axes never overflows.
    n_types = len(means_per_type_per_episode)
    n_rows = max(2, (n_types + 1) // 2)
    _, axs = plt.subplots(n_rows, 2, figsize=(20, 6 * n_rows))
    axs = axs.flatten()

    for idx, (entropy_type, entropies) in enumerate(means_per_type_per_episode.items()):
        ax = axs[idx]
        ax.plot(entropies, label=f"{entropy_type} Entropy", alpha=0.4)

        # np.convolve(mode='valid') swaps its operands when the kernel is longer
        # than the data, so its output would not match the x-range; skip it then.
        if 1 <= window <= len(entropies):
            entropies_smooth = np.convolve(entropies, np.ones(window) / window, mode='valid')
            ax.plot(range(window - 1, len(entropies)), entropies_smooth,
                    label=f"Running Mean (window={window})")

        ax.set_xlabel("Episode")
        ax.set_ylabel("Entropy")
        ax.set_title(f"{entropy_type} Entropy Over Time")
        ax.set_ylim(bottom=0)  # Set y-axis minimum to 0
        ax.legend()

    plt.tight_layout()
    plt.show()
    
def plot_entropies_from_save(agent_name: str, window: int = 50) -> None:
    """Load the per-type entropy series of a saved history and plot them.

    Args:
        agent_name: Key of the history inside the "saves" jar.
        window: Smoothing window forwarded to ``plot_entropies``.
    """
    entropy_series = get_entropies_from_save(agent_name)
    plot_entropies(entropy_series, window)

In [5]:
# Environment configuration: the default NMMO config with 32 players.
conf = config.Default()
conf.set("PLAYER_N", 32)
#conf.set("NPC_N", 0)

# Earlier experiment kept for reference: a weighted mix of several rewards.
# reward = WeightedReward({
#     StayNearResourcesReward(1024, target_distance=1): 1,
#     ResourcesAndGatheringReward(1024, gathering_bonus=4, scale_with_resource_change=True): 1,
#     ExplorationReward(1024): 0.2
# })

# The three reward components used by the curriculum, all built with max_lifetime=1000.
lifetime_reward = LifetimeReward(max_lifetime=1000)
resources_reward = ResourcesReward(max_lifetime=1000)
exploration_reward = ExplorationReward(max_lifetime=1000, map_size=128, view_radius=7)

# Measure how a random agent scores under each component. Only the first element
# of the returned pair (the average) is kept here.
# (retries=20 is presumably the number of rollouts averaged -- TODO confirm.)
random_lifetime_reward, _ = get_avg_reward_for_random_agent(conf, reward=lifetime_reward, retries=20)
print(f"Random agent lifetime reward: {random_lifetime_reward:.6f}")
random_resources_reward, _ = get_avg_reward_for_random_agent(conf, reward=resources_reward, retries=20)
print(f"Random agent resources reward: {random_resources_reward:.6f}")
random_exploration_reward, _ = get_avg_reward_for_random_agent(conf, reward=exploration_reward, retries=20)
print(f"Random agent exploration reward: {random_exploration_reward:.6f}")

# Random-agent baselines keyed by reward class name, passed to CurriculumReward below.
random_baselines = {
    "LifetimeReward": random_lifetime_reward,
    "ResourcesReward": random_resources_reward,
    "ExplorationReward": random_exploration_reward,
}

# Curriculum stages as (reward threshold, reward) pairs, in the order they are used.
# NOTE(review): presumably the curriculum advances to the next stage once the running
# average reward reaches the threshold; the final stage uses 0 -- confirm in CurriculumReward.
reward_stages = [
    (0.03, resources_reward),
    (0.1, exploration_reward),
    (0, lifetime_reward)
]

curriculum_reward = CurriculumReward(reward_stages=reward_stages, random_agent_baselines=random_baselines)


# Alternative without random-agent baselines, kept for reference.
# curriculum_reward = CurriculumReward(reward_stages=reward_stages)

print(curriculum_reward.get_config())
# `reward` is the name the training cell and the baseline measurement below use.
reward = curriculum_reward


# Random-agent baseline under the full curriculum reward: mean and spread over the
# individual values returned as the second element.
random_reward, random_rewards = get_avg_reward_for_random_agent(conf, reward=reward, retries=20)
random_reward_std = np.std(random_rewards)
print(f"Random agent reward: {random_reward:.6f} ± {random_reward_std:.6f}")

# Random-agent lifetime baseline, used as the reference line in the lifetime plot.
random_lifetime, random_lifetimes = get_avg_lifetime_for_random_agent(conf, retries=20)
random_lifetime_std = np.std(random_lifetimes)
print(f"Random agent lifetime: {random_lifetime:4.2f} ± {random_lifetime_std:.2f}")

Random agent lifetime reward: 0.080481
Random agent resources reward: 0.069806
Random agent exploration reward: 0.014562
{'name': 'CurriculumReward', 'stages': [{'reward_threshold': 0.03, 'config': {'name': 'ResourcesReward', 'max_lifetime': 1000}}, {'reward_threshold': 0.1, 'config': {'name': 'ExplorationReward', 'max_lifetime': 1000, 'map_size': 128, 'view_radius': 7}}, {'reward_threshold': 0, 'config': {'name': 'LifetimeReward', 'max_lifetime': 1000}}]}
Running average reward: 0.0143 < 0.03 (stage 0)
Running average reward: 0.0141 < 0.03 (stage 0)
Running average reward: 0.0139 < 0.03 (stage 0)
Running average reward: 0.0137 < 0.03 (stage 0)
Running average reward: 0.0136 < 0.03 (stage 0)
Running average reward: 0.0136 < 0.03 (stage 0)
Running average reward: 0.0135 < 0.03 (stage 0)
Running average reward: 0.0134 < 0.03 (stage 0)
Running average reward: 0.0133 < 0.03 (stage 0)
Running average reward: 0.0132 < 0.03 (stage 0)
Running average reward: 0.0130 < 0.03 (stage 0)
Running ave

In [6]:
# Name of this run; the same string is used as the agent name for train_ppo and as
# the save key that the plotting cells read back.
agent_name = "curriculum_reward_42"
save_name = agent_name

# Train a SimplierInputAgentV2 with PPO on a fresh environment using the curriculum
# reward. The trailing "# N" comments on episodes/save_every/print_every/eval_every
# are presumably the values for a full-length run -- TODO confirm.
# NOTE(review): the recorded output of this cell ends with a KeyboardInterrupt.
train_ppo(nmmo.Env(conf),
          SimplierInputAgentV2(
            learning_rate=5e-5,
            lr_decay=0.999,
            min_lr=5e-7,
            critic_learning_rate=5e-6,
            critic_lr_decay=0.999,
            critic_min_lr=5e-7,
            epsilon=0.1,
            epochs=50,
            batch_size=256,
            entropy_loss_coef=3e-5,
            max_grad_norm=0.5,
            sample_weights_softmin_temp=-0.5,
            action_loss_weights={
                "Move": 2.5,
                "AttackStyle": 0.5,
                "AttackTargetPos": 0.5,
              }),
          episodes=40, # 400
          save_every=20, # 25
          print_every=1, # 5
          eval_every=20, # 50
          eval_episodes=5,
          custom_reward=reward,
          agent_name=agent_name,
          callbacks=[
            SavingCallback(save_name, reward_config=reward.get_config())])

Running average reward: 0.0143 < 0.03 (stage 0)
Running average reward: 0.0141 < 0.03 (stage 0)
Running average reward: 0.0139 < 0.03 (stage 0)
Running average reward: 0.0137 < 0.03 (stage 0)
Running average reward: 0.0135 < 0.03 (stage 0)
Running average reward: 0.0134 < 0.03 (stage 0)
Running average reward: 0.0132 < 0.03 (stage 0)
Running average reward: 0.0131 < 0.03 (stage 0)
Running average reward: 0.0130 < 0.03 (stage 0)
Running average reward: 0.0129 < 0.03 (stage 0)
Running average reward: 0.0126 < 0.03 (stage 0)
Running average reward: 0.0124 < 0.03 (stage 0)
Running average reward: 0.0122 < 0.03 (stage 0)
Running average reward: 0.0120 < 0.03 (stage 0)
Running average reward: 0.0118 < 0.03 (stage 0)
Running average reward: 0.0117 < 0.03 (stage 0)
Running average reward: 0.0115 < 0.03 (stage 0)
Running average reward: 0.0113 < 0.03 (stage 0)
Running average reward: 0.0111 < 0.03 (stage 0)
Running average reward: 0.0110 < 0.03 (stage 0)
Running average reward: 0.0108 < 0.03 (s

KeyboardInterrupt: 

In [None]:
# Reward statistics of the saved run against the random-agent baseline; a short
# window is used for this short run (the trailing "#50" is presumably the full-run value).
plot_rewards_from_save(save_name, random_agent_reward=random_reward, window=5) #50

In [None]:
# Lifetime statistics of the saved run against the random-agent lifetime baseline
# (the trailing "#50" is presumably the window for a full-length run).
plot_lifetimes_from_save(save_name, random_agent_lifetime=random_lifetime, window=5) #50

In [None]:
# Actor and critic losses of the saved run, smoothed over 1000 loss values.
plot_losses_from_save(save_name, window=1000)

In [None]:
# Per-type mean entropies of the saved run
# (the trailing "#50" is presumably the window for a full-length run).
plot_entropies_from_save(save_name, window=5) #50

In [None]:
# Evaluate the agent loaded from "<agent_name>_best" for 10 episodes while tracking
# paths and recording an animation.
# NOTE(review): the "_best" suffix presumably refers to the best checkpoint written
# during training -- confirm in train_ppo.
# NOTE(review): evaluation uses LifetimeReward(1024) whereas the training rewards were
# built with max_lifetime=1000 -- confirm the difference is intended.
path_callback = PathTrackingCallback()

evaluate_agent(
    nmmo.Env(conf),
    SimplierInputAgentV2.load(f"{agent_name}_best"),
    episodes=10,
    custom_reward=LifetimeReward(1024),
    callbacks=[
        path_callback,
        AnimationCallback(1, f"{agent_name}_best_animation")
    ]
)

In [None]:
# Plot the paths recorded by the tracking callback during evaluation
# (the argument 1 is presumably an agent id or episode index -- TODO confirm).
path_callback.plot_paths(1)

In [13]:
def verify_observations(save_name: str) -> None:
    """Run sanity checks on the network inputs built from saved observations.

    Loads the observations of agents 1..32 from the save, converts each to
    network inputs with ``observations_to_inputs_simplier`` and prints, for
    every check, how many inputs pass it. Finally prints every pair of agents
    whose mutual sighting counts disagree.

    Args:
        save_name: Key of the history inside the "saves" jar.
    """
    observations = get_all_observations_from_save(save_name, agent_ids=list(range(1, 33)))
    net_inputs = [observations_to_inputs_simplier(obs, device="cpu") for obs in observations]

    # Each input is indexed as inp[k][0]; presumably index 0 drops a batch dimension -- TODO confirm.
    tiles = [inp[0][0] for inp in net_inputs]
    # Non-zero rows of the last 9 of the 28 per-tile features.
    # NOTE(review): currently unused below; kept for the pending entity-data check.
    tile_features = [feature
                     for tile in tiles
                     for feature in tile.reshape(-1, 28)[:, -9:]
                     if not torch.all(feature == 0)]

    self_datas = [inp[1][0] for inp in net_inputs]
    move_masks = [inp[2][0] for inp in net_inputs]
    attack_masks = [inp[3][0] for inp in net_inputs]

    def assert_for_all(values, assertion_fn, description):
        # Coerce every result to a plain bool: several checks return 0-dim tensors,
        # and summing those would print the count as "tensor(N)" instead of "N".
        correct_count = sum(bool(assertion_fn(tensor)) for tensor in values)
        total_count = len(values)
        print(f"{(description+':'):<35}{correct_count}/{total_count} {('✅' if correct_count == total_count else '❌')}")

    assert_for_all(tiles, lambda x: x.shape == torch.Size([15, 15, 28]), "Tiles shape")
    assert_for_all(tiles, lambda x: torch.all(torch.sum(x[:, :, :16], dim=-1) == 1), "16 features one-hot encoded")
    assert_for_all(tiles, lambda x: torch.all(torch.sum(x[:, :, 16:18], dim=-1) == 1), "Each tile either passable or not")
    assert_for_all(tiles, lambda x: torch.all(torch.logical_or(x[:, :, 18] == 0, x[:, :, 18] == 1)), "Each tile harvestable or not")
    # TODO: Check seen entity data
    print()

    assert_for_all(self_datas, lambda x: x.shape == torch.Size([5]), "Self data shape")
    assert_for_all(self_datas, lambda x: torch.all((x >= 0) & (x <= 1)), "All values between 0 and 1")
    print()

    assert_for_all(attack_masks, lambda x: x.shape == torch.Size([3]), "Attack mask shape")
    assert_for_all(attack_masks, lambda x: torch.all(x == 1), "Every attack style valid")
    print()

    assert_for_all(move_masks, lambda x: x.shape == torch.Size([5]), "Move mask shape")
    assert_for_all(move_masks, lambda x: x[-1] == 1, "Can not move")
    assert_for_all(move_masks, lambda x: torch.any(x[:-1] == 1), "Can move somewhere")

    # seen_ids[observer][seen] = number of observations in which `observer` saw `seen`.
    seen_ids = {}
    for obs in observations:
        counts = seen_ids.setdefault(obs.agent_id, {})

        for seen_id in obs.entities.id:
            # Id 0 is skipped (presumably padding -- TODO confirm), as is the observer itself.
            if seen_id == 0 or seen_id == obs.agent_id:
                continue

            counts[seen_id] = counts.get(seen_id, 0) + 1

    # Check that if one agent sees another, the other agent sees the first agent
    # NOTE(review): entities that never appear as observers (e.g. ids outside 1..32)
    # always count as 0 on the reverse side and will therefore be reported.
    for agent_id, seen in seen_ids.items():
        for seen_id, count in seen.items():
            reverse_count = seen_ids.get(seen_id, {}).get(agent_id, 0)
            if reverse_count != count:
                print(f"Agent {agent_id} saw {seen_id} {count} times, but {seen_id} saw {agent_id} {reverse_count} times")