## Training part

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import numpy as np
import gymnasium as gym
import gfootball.env as football_env
from typing import List
import os

from models import AgentNN
from ippo_method import compute_gae, ppo_update
from envs_create import make_gfootball_env


# Directory where training checkpoints are written.
res_dir = "PPO_academy_5_vs_5"

# makedirs(exist_ok=True) avoids the check-then-create race of
# `os.path.exists` + `os.mkdir` and also works if res_dir becomes a nested path.
os.makedirs(res_dir, exist_ok=True)

# 1. Configuration and hyperparameters
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_ENVS = 12                 # number of environments stepped in lockstep
NUM_AGENTS = 5                # controlled players per environment, one network each
GAMMA = 0.99                  # discount factor passed to compute_gae
GAE_LAMBDA = 0.95             # GAE lambda passed to compute_gae
CLIP_EPS = 0.05               # clip range passed to ppo_update
PPO_EPOCHS = 4                # epoch count passed to ppo_update
MINIBATCH_SIZE = 2048         # minibatch size passed to ppo_update
LR = 5e-5                     # Adam learning rate
TOTAL_FRAMES = 10_000_000     # training stops once global_step reaches this
FRAMES_PER_BATCH = 8192 * 2   # agent-frames per rollout; also the global_step increment


# 2. Training loop
def compute_gae_time_major(next_values, rewards, dones, values, gamma, gae_lambda, gae_fn):
    """Run a single-trajectory GAE routine once per environment.

    The rollout is stored time-major (row = time step, column = environment).
    Each environment's column is handed to ``gae_fn`` as one contiguous
    trajectory, and the per-environment results are interleaved back into the
    same time-major order used for observations, actions and log-probs.

    Args:
        next_values: array-like of shape (num_envs,), bootstrap value per env.
        rewards: array of shape (T, num_envs).
        dones: array of shape (T, num_envs).
        values: array of shape (T, num_envs).
        gamma: forwarded unchanged to ``gae_fn``.
        gae_lambda: forwarded unchanged to ``gae_fn``.
        gae_fn: callable ``(next_value, rewards_1d, dones_1d, values_1d, gamma,
            gae_lambda) -> (advantages_1d, returns_1d)``.

    Returns:
        Tuple ``(advantages, returns)`` of 1-D arrays with length T * num_envs;
        element ``t * num_envs + e`` belongs to step ``t`` of environment ``e``.
    """
    # Transpose so every environment's trajectory is one contiguous row.
    rewards_nt = np.ascontiguousarray(np.asarray(rewards).T)
    dones_nt = np.ascontiguousarray(np.asarray(dones).T)
    values_nt = np.ascontiguousarray(np.asarray(values).T)

    adv_per_env, ret_per_env = [], []
    for env_idx in range(rewards_nt.shape[0]):
        adv, ret = gae_fn(next_values[env_idx], rewards_nt[env_idx], dones_nt[env_idx],
                          values_nt[env_idx], gamma, gae_lambda)
        adv_per_env.append(np.asarray(adv).reshape(-1))
        ret_per_env.append(np.asarray(ret).reshape(-1))

    # Stack as (T, num_envs) and flatten row-wise -> time-major.
    advantages = np.stack(adv_per_env, axis=1).reshape(-1)
    returns = np.stack(ret_per_env, axis=1).reshape(-1)
    return advantages, returns


if __name__ == '__main__':
    # Probe one environment to discover the per-agent observation shape.
    temp_env = make_gfootball_env(NUM_AGENTS)
    temp_obs = temp_env.reset()
    obs_shape = temp_obs.shape[1:]  # e.g. (72, 96, 4)
    temp_env.close()
    action_dim = 19
    envs = [make_gfootball_env(NUM_AGENTS) for _ in range(NUM_ENVS)]

    # One independent network per agent, all optimised by a single Adam instance.
    agents = [AgentNN(obs_shape, action_dim).to(DEVICE) for _ in range(NUM_AGENTS)]
    all_agent_parameters = [param for agent in agents for param in agent.parameters()]
    optimizer = optim.Adam(all_agent_parameters, lr=LR, weight_decay=1e-5)

    print(f"Запуск обучения на {DEVICE}, размерность SMM-представления на агента: {obs_shape}, размерность действия: {action_dim}. Используется 5 независимых CNN, 1 оптимизатор.")

    # Episode score statistics (updated whenever an environment reports done).
    scores_team_ema = 0
    ema_coef = 0.98
    scores_team, num_done = 0, 0

    global_step = 0
    N_steps = 0

    # Probability of skipping an agent's PPO update in a given batch.
    # NOTE(review): kept from the original experiment (it was flagged with "!!!").
    AGENT_UPDATE_SKIP_PROB = 0.5

    steps_per_env_agent = FRAMES_PER_BATCH // (NUM_ENVS * NUM_AGENTS)

    try:
        # Reset once. Episodes continue across batches; the bootstrap value
        # computed after each rollout accounts for the truncation. Resetting
        # every batch would cut every episode off after steps_per_env_agent steps.
        current_obs = [env.reset() for env in envs]  # NUM_ENVS arrays of (NUM_AGENTS, *obs_shape)

        while global_step < TOTAL_FRAMES:
            # ---- Rollout collection (all buffers are time-major lists) ----
            rollout_obs, rollout_actions, rollout_logprobs = [], [], []
            rollout_rewards, rollout_values, rollout_dones = [], [], []

            for _ in range(steps_per_env_agent):
                obs_stack = np.stack(current_obs, axis=0)  # (NUM_ENVS, NUM_AGENTS, *obs_shape)
                actions_np = np.zeros((NUM_ENVS, NUM_AGENTS), dtype=int)
                values_np = np.zeros((NUM_ENVS, NUM_AGENTS), dtype=np.float32)
                logprobs_np = np.zeros((NUM_ENVS, NUM_AGENTS), dtype=np.float32)

                for agent_idx in range(NUM_AGENTS):
                    # Observations of this agent across all environments.
                    agent_obs = torch.tensor(obs_stack[:, agent_idx], dtype=torch.float32).to(DEVICE)
                    with torch.no_grad():
                        action, logprob, _, value = agents[agent_idx].get_action_and_value(agent_obs)
                    actions_np[:, agent_idx] = action.cpu().numpy().astype(int)
                    values_np[:, agent_idx] = value.reshape(-1).cpu().numpy()
                    logprobs_np[:, agent_idx] = logprob.cpu().numpy()

                step_results = [env.step(a) for env, a in zip(envs, actions_np)]
                next_obs_list, rewards_list_step, dones_list_step, info_list_step = zip(*step_results)
                next_obs_list = list(next_obs_list)

                for env_idx, (env, done_val, info) in enumerate(zip(envs, dones_list_step, info_list_step)):
                    if done_val:
                        scores_team_ema = ema_coef * scores_team_ema + (1 - ema_coef) * info["score_reward"]
                        num_done += 1
                        scores_team += info["score_reward"]
                        # Continue from the fresh episode's first observation
                        # instead of the finished episode's terminal one.
                        next_obs_list[env_idx] = env.reset()

                rollout_obs.append(obs_stack)
                rollout_actions.append(actions_np)
                rollout_logprobs.append(logprobs_np)
                rollout_values.append(values_np)
                # The team reward / done flag of an environment is shared by all its agents.
                # Assumes env.step returns one scalar reward and one done flag per environment.
                rollout_rewards.append(np.array(rewards_list_step))
                rollout_dones.append(np.array(dones_list_step))
                current_obs = next_obs_list

            # NOTE(review): the number of agent-frames actually collected is
            # steps_per_env_agent * NUM_ENVS * NUM_AGENTS, which is slightly below
            # FRAMES_PER_BATCH when it does not divide evenly. The increment is
            # kept as-is so checkpoint file names stay unchanged.
            global_step += FRAMES_PER_BATCH
            print(f"Глобальные шаги: {global_step}. Сбор данных...")

            # ---- Bootstrap values for the observation following the rollout ----
            next_values_per_agent_structured = np.zeros((NUM_ENVS, NUM_AGENTS), dtype=np.float32)
            with torch.no_grad():
                next_obs_stack = np.stack(current_obs, axis=0)
                for agent_idx in range(NUM_AGENTS):
                    agent_next_obs = torch.tensor(next_obs_stack[:, agent_idx], dtype=torch.float32).to(DEVICE)
                    next_val = agents[agent_idx].get_value(agent_next_obs).reshape(-1).cpu().numpy()
                    next_values_per_agent_structured[:, agent_idx] = next_val

            rewards_tn = np.stack(rollout_rewards)      # (T, NUM_ENVS)
            dones_tn = np.stack(rollout_dones)          # (T, NUM_ENVS)
            values_tna = np.stack(rollout_values)       # (T, NUM_ENVS, NUM_AGENTS)
            actions_tna = np.stack(rollout_actions)     # (T, NUM_ENVS, NUM_AGENTS)
            logprobs_tna = np.stack(rollout_logprobs)   # (T, NUM_ENVS, NUM_AGENTS)

            # NOTE(review): ppo_update receives the optimizer, so it presumably
            # steps it internally; confirm that the clip + step after the agent
            # loop below is still intended.
            optimizer.zero_grad()
            avg_rewards_list = []

            # ---- Per-agent PPO update ----
            for agent_idx in range(NUM_AGENTS):
                avg_rewards_list.append(np.sum(rewards_tn))

                # Decide on skipping before building the (large) GPU batch.
                if np.random.uniform(0, 1) < AGENT_UPDATE_SKIP_PROB:
                    continue

                # GAE per environment trajectory, flattened time-major so that it
                # lines up element-for-element with obs / actions / logprobs below.
                all_advantages_flat, all_returns_flat = compute_gae_time_major(
                    next_values_per_agent_structured[:, agent_idx],
                    rewards_tn, dones_tn, values_tna[:, :, agent_idx],
                    GAMMA, GAE_LAMBDA, compute_gae,
                )

                # (T * NUM_ENVS, *obs_shape), time-major.
                all_obs = np.concatenate([step_obs[:, agent_idx] for step_obs in rollout_obs])
                all_actions = actions_tna[:, :, agent_idx].reshape(-1)
                all_logprobs = logprobs_tna[:, :, agent_idx].reshape(-1)

                batch_data = {
                    'obs': torch.tensor(all_obs, dtype=torch.float32).to(DEVICE),
                    'actions': torch.tensor(all_actions, dtype=torch.long).to(DEVICE),
                    'logprobs': torch.tensor(all_logprobs, dtype=torch.float32).to(DEVICE),
                    'returns': torch.tensor(all_returns_flat, dtype=torch.float32).to(DEVICE),
                    'advantages': torch.tensor(all_advantages_flat, dtype=torch.float32).to(DEVICE),
                }

                ppo_update(agents[agent_idx], optimizer=optimizer, batch_data=batch_data,
                           PPO_EPOCHS=PPO_EPOCHS, MINIBATCH_SIZE=MINIBATCH_SIZE, CLIP_EPS=CLIP_EPS)

            nn.utils.clip_grad_norm_(all_agent_parameters, max_norm=0.5)
            optimizer.step()

            avg_rewards_across_all_agents = np.mean(avg_rewards_list)
            print(f"Глобальные шаги: {global_step}, среднее вознаграждение за батч (команды): {avg_rewards_across_all_agents:.4f}, среднее кол-во очков: {scores_team} / {num_done}, {scores_team_ema:.4f}")

            N_steps += 1

            # Periodic checkpoint: every 20 batches.
            if N_steps % 20 == 0:
                torch.save({f'agent_{i}': agents[i].state_dict() for i in range(NUM_AGENTS)},
                           f"{res_dir}/ppo_gfootball_checkpoint_{global_step}.pt")
    finally:
        # Always release the environments, even if training is interrupted.
        for env in envs:
            env.close()


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  deprecation(


Запуск обучения на cuda, размерность SMM-представления на агента: (72, 96, 4), размерность действия: 19. Используется 5 независимых CNN, 1 оптимизатор.
Глобальные шаги: 16384. Сбор данных...
Глобальные шаги: 16384, среднее вознаграждение за батч (команды): 0.0000, среднее кол-во очков: 0 / 0, 0.0000


In [3]:
# Manual checkpoint: snapshot every agent's weights at the current global step.
# Relies on `agents`, `global_step`, `res_dir` and `NUM_AGENTS` left in the
# kernel by the training cell.
agent_states = {f'agent_{i}': agents[i].state_dict() for i in range(NUM_AGENTS)}
checkpoint_path = f"{res_dir}/ppo_gfootball_checkpoint_{global_step}.pt"
torch.save(agent_states, checkpoint_path)

## Drawing part

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import numpy as np
import gymnasium as gym
import gfootball.env as football_env
from typing import List
import os
import imageio.v2 as imageio

from models import AgentNN
from envs_create import make_gfootball_env


# Evaluation settings
# ---- compute device ----
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- model layout (must match the checkpoint being loaded) ----
NUM_AGENTS = 5            # one network per controlled player
OBS_SHAPE = (72, 96, 4)   # per-agent observation shape given to AgentNN
ACTION_DIM = 19           # size of the discrete action space given to AgentNN

# ---- input / output files ----
MODEL_PATH = '../PPO_academy_5_vs_5_TEST3/ppo_gfootball_checkpoint_2293760.pt'
VIDEO_FILENAME = "gfootball_replay_manual3v1.mp4"

def run_test_episode_and_record(model_path):
    """Play one episode with the saved agents and record it.

    Loads one ``AgentNN`` per agent from the checkpoint (keys ``agent_0`` ...
    ``agent_{NUM_AGENTS-1}``), creates a ``5_vs_5`` environment with the
    ``extracted`` representation, and steps it until the environment reports
    done. Each agent samples its action from its own slice of the observation.

    Args:
        model_path: path to a checkpoint written by ``torch.save`` with one
            state dict per agent.

    Returns:
        Tuple ``(frames, obs_list)``: ``frames`` holds the result of
        ``render(mode='rgb_array')`` taken before every step, ``obs_list``
        holds the observation returned by every step (the initial reset
        observation is not included).

    Raises:
        FileNotFoundError: if ``model_path`` does not exist. The previous
            behaviour (print + bare ``return``) made the caller's tuple
            unpacking fail with an unrelated ``TypeError``.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")

    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(model_path, map_location=DEVICE)
    agents = []
    for i in range(NUM_AGENTS):
        agent = AgentNN(OBS_SHAPE, ACTION_DIM).to(DEVICE)
        agent.load_state_dict(checkpoint[f'agent_{i}'])
        agent.eval()
        agents.append(agent)
    print(f"Models loaded successfully from {model_path}")

    # NOTE(review): training builds its environments through make_gfootball_env,
    # while this one is created directly - confirm the observations match what
    # the networks were trained on.
    test_env = football_env.create_environment(
        env_name='5_vs_5',
        stacked=False,
        representation='extracted',
        render=False,
        number_of_left_players_agent_controls=NUM_AGENTS,
    )

    print(f"Starting test episode and manually collecting SMM frames...")

    frames = []
    obs_list = []
    try:
        with torch.no_grad():
            obs = test_env.reset()
            done = False
            while not done:
                # Capture the frame for the state the agents are about to act on.
                frames.append(test_env.render(mode='rgb_array'))

                actions_np = np.zeros((NUM_AGENTS,), dtype=int)
                for agent_idx in range(NUM_AGENTS):
                    # Keep a leading batch dimension of 1 for the network.
                    agent_obs = torch.tensor(obs[agent_idx:agent_idx + 1], dtype=torch.float32).to(DEVICE)
                    action, _, _, _ = agents[agent_idx].get_action_and_value(agent_obs)
                    actions_np[agent_idx] = action.cpu().numpy().item()

                obs, reward, done_flag, info = test_env.step(actions_np)
                obs_list.append(obs)
                done = bool(done_flag)
    finally:
        # Release the environment even if inference or stepping fails.
        test_env.close()

    return frames, obs_list


# Run a single evaluation episode. `frames` and `obs_list` are left in the
# module namespace so the video-export cell can use them.
if __name__ == "__main__":
    (frames, obs_list) = run_test_episode_and_record(MODEL_PATH)


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Models loaded successfully from ../PPO_academy_5_vs_5_TEST3/ppo_gfootball_checkpoint_2949120.pt


  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Starting test episode and manually collecting SMM frames...


In [3]:
# Export the recorded episode as a video file.
# Relies on `frames` produced by the evaluation cell.
VIDEO_FILENAME = "gfootball_replay_manual5v5.mp4"
MAX_VIDEO_FRAMES = 1800   # only the first MAX_VIDEO_FRAMES frames are written
VIDEO_FPS = 30

video_frames = frames[:MAX_VIDEO_FRAMES]

# Report the number of frames actually written, not the full episode length.
print(f"Saving {len(video_frames)} of {len(frames)} frames to {VIDEO_FILENAME} using imageio...")

imageio.mimsave(VIDEO_FILENAME, video_frames, fps=VIDEO_FPS)

print(f"Successfully saved video to {os.path.abspath(VIDEO_FILENAME)}")


Saving 3001 frames to gfootball_replay_manual5v5.mp4 using imageio...
Successfully saved video to C:\Users\Ivan\Desktop\Текущие статьи ИТМО\1. NN_communication\Bisheaf autoencoder\gfootball_replay_manual5v5.mp4
