In [None]:
import os
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.monitor import Monitor
--
# --- Environment and training budget ---------------------------------------
ENV_ID = "PongNoFrameskip-v4"
N_ENVS = 4
TOTAL_TIMESTEPS = 4_000_000

# --- Output locations -------------------------------------------------------
# A single directory collects everything the run writes; it is created up
# front so later writes into it do not fail on a missing directory.
LOG_DIR = "pong_a2c_logs/"
os.makedirs(LOG_DIR, exist_ok=True)

# Model artifacts are stored inside LOG_DIR.
FINAL_MODEL_SAVE_PATH = os.path.join(LOG_DIR, "a2c_pong_final_model")
BEST_MODEL_SAVE_PATH = os.path.join(LOG_DIR, "best_model")

if __name__ == '__main__':
    # Train an A2C agent on ENV_ID with periodic evaluation and checkpointing.
    #
    # Flow: build the vectorised training and evaluation environments, set up
    # the evaluation/checkpoint callbacks, train for TOTAL_TIMESTEPS, and
    # always save the final model and close the environments, even when
    # training is interrupted or fails.

    # Callback frequencies are expressed per environment: the messages below
    # report both the per-environment value and the total across N_ENVS.
    # max(..., 1) keeps the frequency valid if N_ENVS exceeds the budget.
    EVAL_FREQ = max(100_000 // N_ENVS, 1)
    CHECKPOINT_FREQ = max(500_000 // N_ENVS, 1)

    print(f"Tworzenie {N_ENVS} środowisk treningowych dla {ENV_ID}...")
    train_env = make_atari_env(ENV_ID, n_envs=N_ENVS, seed=42, monitor_dir=LOG_DIR)
    train_env = VecFrameStack(train_env, n_stack=4)
    print("Środowiska treningowe utworzone.")

    # The evaluation environment uses a different seed and its own monitor
    # directory so its episode logs do not mix with the training ones.
    print(f"Tworzenie środowiska ewaluacyjnego dla {ENV_ID}...")
    eval_env = make_atari_env(ENV_ID, n_envs=1, seed=123, monitor_dir=os.path.join(LOG_DIR, "eval_monitor"))
    eval_env = VecFrameStack(eval_env, n_stack=4)
    print("Środowisko ewaluacyjne utworzone.")

    try:
        print(f"EvalCallback skonfigurowany z eval_freq = {EVAL_FREQ * N_ENVS} (co {EVAL_FREQ} kroków na środowisko)")
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=BEST_MODEL_SAVE_PATH,
                                     log_path=LOG_DIR,
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=5,
                                     deterministic=True,
                                     render=False,
                                     verbose=1)

        print(f"CheckpointCallback skonfigurowany z save_freq = {CHECKPOINT_FREQ * N_ENVS} (co {CHECKPOINT_FREQ} kroków na środowisko)")
        # NOTE(review): this script never wraps the envs in VecNormalize, so
        # save_vecnormalize presumably has no effect here; confirm before
        # relying on normalisation statistics being checkpointed.
        checkpoint_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ,
                                                 save_path=LOG_DIR,
                                                 name_prefix="a2c_pong_checkpoint",
                                                 save_replay_buffer=False,
                                                 save_vecnormalize=True,
                                                 verbose=1)

        # NOTE(review): the recorded run shows ep_rew_mean around -20.5 after
        # ~3.4M steps with these default hyperparameters; tuned Atari settings
        # may be needed for the agent to learn. Verify before long runs.
        print("Tworzenie modelu A2C...")
        model = A2C("CnnPolicy",
                    train_env,
                    verbose=1,
                    tensorboard_log=LOG_DIR,
                    device="auto")
        print("Model A2C utworzony.")

        print(f"Rozpoczynanie treningu na {TOTAL_TIMESTEPS} kroków...")
        try:
            model.learn(total_timesteps=TOTAL_TIMESTEPS,
                        callback=[eval_callback, checkpoint_callback],
                        tb_log_name="A2C_Pong")
        finally:
            # Save whatever has been learned so far, even on interruption.
            print(f"Zapisywanie finalnego modelu do: {FINAL_MODEL_SAVE_PATH}.zip")
            model.save(FINAL_MODEL_SAVE_PATH)
            print("Finalny model zapisany.")

        print("Trening zakończony.")
    finally:
        # Release the environments on every exit path, not only on success.
        train_env.close()
        eval_env.close()


Tworzenie 4 środowisk treningowych dla PongNoFrameskip-v4...
Środowiska treningowe utworzone.
Tworzenie środowiska ewaluacyjnego dla PongNoFrameskip-v4...
Środowisko ewaluacyjne utworzone.
EvalCallback skonfigurowany z eval_freq = 100000 (co 25000 kroków na środowisko)
CheckpointCallback skonfigurowany z save_freq = 500000 (co 125000 kroków na środowisko)
Tworzenie modelu A2C...
Using cuda device
Wrapping the env in a VecTransposeImage.
Model A2C utworzony.
Rozpoczynanie treningu na 4000000 kroków...
Logging to pong_a2c_logs/A2C_Pong_1




Strumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.
|    entropy_loss       | -1.21    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 170999   |
|    policy_loss        | 0.0686   |
|    value_loss         | 0.00444  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 3.34e+03 |
|    ep_rew_mean        | -20.5    |
| time/                 |          |
|    fps                | 438      |
|    iterations         | 171100   |
|    time_elapsed       | 7805     |
|    total_timesteps    | 3422000  |
| train/                |          |
|    entropy_loss       | -1.19    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 171099   |
|    policy_loss        | 0.0579   |
|    value_loss         | 0.00439  |
------------------------------------
---------------------------------