In [1]:
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image

# Gymnasium and ALE
import gymnasium as gym
import ale_py

# Stable Baselines3
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, VecMonitor
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

# Wandb Integration
import wandb
from wandb.integration.sb3 import WandbCallback

# Custom constants
import a2c_pacman_constants as constants

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_env(env_id, render_mode=None):
    """Return a zero-argument factory that builds one monitored environment.

    Vectorized-env constructors such as ``DummyVecEnv`` expect a list of
    callables rather than ready-made environments, so construction is
    deferred until the returned function is invoked.

    Args:
        env_id: Environment id forwarded to ``gym.make``.
        render_mode: Optional render mode forwarded to ``gym.make``.

    Returns:
        A callable taking no arguments which creates the environment and
        wraps it in ``Monitor`` with ``allow_early_resets=True``.
    """
    def _build_monitored_env():
        base_env = gym.make(env_id, render_mode=render_mode)
        # allow_early_resets=True is passed so Monitor accepts a reset()
        # before the current episode has terminated.
        return Monitor(base_env, allow_early_resets=True)

    return _build_monitored_env

In [None]:
# Start the Weights & Biases run. Every hyperparameter is read from the
# `constants` module so the logged config matches what the model receives.
run = wandb.init(
    project="A2C Pacman",
    config={
        "env_id": constants.env_id,
        "algorithm": constants.algorithm,
        "learning_rate": constants.learning_rate,
        "gamma": constants.gamma,
        "n_steps": constants.n_steps,
        "vf_coef": constants.vf_coef,
        "ent_coef": constants.ent_coef,
        "max_grad_norm": constants.max_grad_norm,
        "total_timesteps": constants.total_timesteps,
        "model_name": constants.model_name,
        "export_path": constants.export_path,
        "videos_path": constants.videos_path,
    },
    sync_tensorboard=True,  # mirror SB3's TensorBoard scalars into W&B
    save_code=True,
)

# Number of environment copies stepped in lock-step by DummyVecEnv.
N_TRAIN_ENVS = 8

# Train on the same environment id that is logged to W&B above, instead of a
# separately hard-coded string that could silently drift from the config.
# NOTE(review): the previous literal was "ALE/Pacman-v5" — confirm that
# constants.env_id holds the same value.
env_id = constants.env_id

env = None
eval_env = None
exit_code = 1  # reported to W&B unless the whole cell completes
try:
    # make_env() already wraps every copy in Monitor, which records the
    # episode statistics; adding VecMonitor on top would only overwrite
    # those statistics (SB3 emits a warning for that combination).
    env = DummyVecEnv([make_env(env_id) for _ in range(N_TRAIN_ENVS)])
    eval_env = DummyVecEnv([make_env(env_id)])

    # Create and configure the A2C model.
    model = A2C(
        policy=constants.policy,  # presumably a CNN policy for image observations — confirm in constants
        env=env,
        learning_rate=constants.learning_rate,
        gamma=constants.gamma,
        n_steps=constants.n_steps,
        vf_coef=constants.vf_coef,
        ent_coef=constants.ent_coef,
        max_grad_norm=constants.max_grad_norm,
        verbose=2,
        tensorboard_log=f"runs/{run.id}",
    )

    # Callback frequencies are counted in callback calls (one per vectorized
    # step), so with 8 training envs eval_freq=500 evaluates every 4000
    # timesteps and save_freq=1000 checkpoints every 8000 timesteps.
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path='./logs/',
        log_path='./logs/',
        eval_freq=500,
        deterministic=False,  # evaluation samples actions from the policy
        render=False,
    )

    checkpoint_callback = CheckpointCallback(
        save_freq=1000,
        save_path='./checkpoints/',
        name_prefix="a2c_pacman",
    )

    callback_list = CallbackList([
        WandbCallback(verbose=2, gradient_save_freq=10),
        eval_callback,
        checkpoint_callback,
    ])

    # Train the model.
    print("Training...")
    model.learn(
        total_timesteps=constants.total_timesteps,
        log_interval=100,
        callback=callback_list,
    )

    # Save the trained model and upload the archive to the W&B run.
    os.makedirs(constants.export_path, exist_ok=True)
    model_path = os.path.join(constants.export_path, constants.model_name)
    model.save(model_path)
    wandb.save(model_path + ".zip")
    exit_code = 0
finally:
    # Release resources even when training is interrupted (e.g. with
    # KeyboardInterrupt) or raises.
    if env is not None:
        env.close()
    if eval_env is not None:
        eval_env.close()
    wandb.finish(exit_code=exit_code)



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmarionapla[0m. Use [1m`wandb login --relogin`[0m to force relogin




Using cpu device
Wrapping the env in a VecTransposeImage.
Training...
Logging to runs/5duywz9g\A2C_1




Eval num_timesteps=4000, episode_reward=12.80 +/- 2.71
Episode length: 432.80 +/- 44.91
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 433      |
|    mean_reward        | 12.8     |
| time/                 |          |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -1       |
|    explained_variance | 0.00585  |
|    learning_rate      | 0.0005   |
|    n_updates          | 15       |
|    policy_loss        | -0.328   |
|    value_loss         | 1.51     |
------------------------------------
New best mean reward!
Eval num_timesteps=8000, episode_reward=18.20 +/- 8.86
Episode length: 516.00 +/- 80.23
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 516      |
|    mean_reward        | 18.2     |
| time/                 |          |
|    total_timesteps    | 8000     |
| train/                |          |
|    entropy_loss       |

KeyboardInterrupt: 