In [1]:
from magent2.environments import battle_v4
from MADDPG import MADDPG
from ReplayBuffer import MultiAgentReplayBuffer
import numpy as np

In [2]:
# Create the battle_v4 parallel environment: map_size 45, with max_cycles
# set to 300 per episode.
env = battle_v4.parallel_env(
    max_cycles=300,
    map_size=45,
)

In [3]:
# Hyper-parameters and network sizes shared by both teams.
#
# The flattened observation/state sizes are derived from the spaces the
# environment declares instead of being hard-coded (previously 13*13*5 and
# 45*45*5), so they stay correct if `map_size` or another environment option
# is changed where `env` is created.
config = {
    # Size of the first team; the same count is used for both learners below.
    "n_agents": env.team_sizes[0],
    # Number of discrete actions available to one agent.
    "n_actions": env.action_space("red_0").n,
    # Flattened per-agent observation size (input to each actor).
    "actor_dims": int(np.prod(env.observation_space("red_0").shape)),
    # Flattened global state size (input to the critics / state buffer).
    "critic_dims": int(np.prod(env.state_space.shape)),
    "n_episodes": 1000,
    # How often (in episodes) progress should be reported.
    "print_interval": 10,
    # Per-episode step cap for the training loop; equals the environment's
    # max_cycles setting (300).
    "max_steps": 300,
}

In [4]:
# MADDPG learner for the "blue" team; every sizing argument is read from
# `config` under the same key name as the keyword it is passed to.
blue_maddpg = MADDPG(
    handle="blue",
    **{
        key: config[key]
        for key in ("actor_dims", "critic_dims", "n_agents", "n_actions")
    },
)

In [5]:
# MADDPG learner for the "red" team, configured identically to the blue one
# apart from its handle.
red_maddpg = MADDPG(
    **{
        name: config[name]
        for name in ("actor_dims", "critic_dims", "n_agents", "n_actions")
    },
    handle="red",
)

In [6]:
# Replay buffer for the blue team.
# Positional arguments, in order: 10000 (presumably the buffer capacity --
# confirm against MultiAgentReplayBuffer), then the critic, actor, action and
# agent-count sizes from `config`.
blue_memory = MultiAgentReplayBuffer(
    10000,
    *(config[key] for key in ("critic_dims", "actor_dims", "n_actions", "n_agents")),
    batch_size=8,
    handle="blue",
)

In [7]:
# Replay buffer for the red team; same sizing as the blue buffer, only the
# handle differs.
# Positional arguments, in order: 10000 (presumably the buffer capacity --
# confirm against MultiAgentReplayBuffer), then the critic, actor, action and
# agent-count sizes from `config`.
red_memory = MultiAgentReplayBuffer(
    10000,
    *(config[name] for name in ("critic_dims", "actor_dims", "n_actions", "n_agents")),
    batch_size=8,
    handle="red",
)

In [8]:
# Per-team bookkeeping for the training loop: one list of episode scores per
# team, and the best running average seen so far (starts at -inf so the first
# computed average always counts as an improvement).
blue_score_history, red_score_history = [], []
blue_best_score = red_best_score = -np.inf

In [9]:
# Self-play training loop: both teams act in the same environment, each team
# stores its own transitions and is trained by its own MADDPG learner.
for episode in range(config["n_episodes"]):
    # The original code treats reset() as returning the observation dict
    # {agent_name: obs}. The current PettingZoo parallel API returns
    # (observations, infos) instead, so accept both shapes.
    reset_result = env.reset()
    obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = env.state()  # Initial global state
    blue_score = 0
    red_score = 0

    for step in range(config["max_steps"]):
        # Each learner is handed the full observation dict and returns a dict
        # of actions; the two dicts are merged into one joint action.
        blue_actions = blue_maddpg.choose_action(obs)
        red_actions = red_maddpg.choose_action(obs)
        actions = {**blue_actions, **red_actions}

        # The info dict is unused. It is deliberately not bound to `_`, which
        # would shadow IPython's "last output" variable in the notebook.
        obs_, reward, termination, truncation, step_info = env.step(actions)

        # Agents missing from the returned dicts get reward 0 and are
        # considered done.
        blue_reward = [reward.get(agent_name, 0) for agent_name in blue_actions.keys()]
        red_reward = [reward.get(agent_name, 0) for agent_name in red_actions.keys()]

        blue_done = [
            termination.get(agent_name, True) or truncation.get(agent_name, True)
            for agent_name in blue_actions.keys()
        ]
        red_done = [
            termination.get(agent_name, True) or truncation.get(agent_name, True)
            for agent_name in red_actions.keys()
        ]

        next_state = env.state()

        blue_memory.store_transition(obs, state, blue_actions, blue_reward, obs_, next_state, blue_done)
        red_memory.store_transition(obs, state, red_actions, red_reward, obs_, next_state, red_done)

        # Learn only every 100th environment step (including step 0).
        # NOTE(review): the RuntimeError recorded in this cell's output
        # (in-place modification of a tensor needed for backward) most likely
        # originates inside learn(), not in this loop -- confirm in MADDPG.
        if step % 100 == 0:
            blue_maddpg.learn(blue_memory)
            red_maddpg.learn(red_memory)

        obs = obs_
        state = next_state

        blue_score += sum(blue_reward)
        red_score += sum(red_reward)

        # Stop the episode as soon as either team is entirely done.
        if all(blue_done) or all(red_done):
            break

    blue_score_history.append(blue_score)
    red_score_history.append(red_score)

    # Running average over (at most) the last 100 episodes.
    blue_avg_score = np.mean(blue_score_history[-100:])
    red_avg_score = np.mean(red_score_history[-100:])

    # Checkpoint a team whenever its running average reaches a new best.
    if blue_avg_score > blue_best_score:
        blue_best_score = blue_avg_score
        blue_maddpg.save_checkpoint()

    if red_avg_score > red_best_score:
        red_best_score = red_avg_score
        red_maddpg.save_checkpoint()

    # Report progress every `print_interval` episodes; the setting was
    # previously declared in `config` but never used.
    if episode % config["print_interval"] == 0:
        print(f"Episode: {episode}, Blue Score: {blue_score}, Red Score: {red_score}, Blue Avg Score: {blue_avg_score}, Red Avg Score: {red_avg_score}")

  File "C:\Users\natsu\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\natsu\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\natsu\Desktop\Work\RL-final-project-AIT-3007\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\natsu\Desktop\Work\RL-final-project-AIT-3007\.venv\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\natsu\Desktop\Work\RL-final-project-AIT-3007\.venv\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\natsu\Desktop\Work\RL-final-project-AIT-3007\.venv\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\natsu\AppData\Local\Programs\Python\Python31

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 21]], which is output 0 of AsStridedBackward0, is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!