In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.atari_wrappers import AtariWrapper

from stable_baselines3.common.utils import get_schedule_fn

import numpy as np
import matplotlib.pyplot as plt
import json

# Callback for reward logging and early stopping
class RewardLoggerCallback(BaseCallback):
    """Collect finished-episode rewards and stop training at a target average.

    Every time an environment reports a finished episode (an ``"episode"``
    entry in its ``info`` dict), the episode reward is appended to
    ``episode_rewards``. Once at least ``check_freq`` episodes were recorded,
    the mean of the last ``check_freq`` rewards is printed after every new
    episode. When that mean reaches ``target_avg_reward`` the model and the
    reward history are written to disk and training is aborted.

    Args:
        check_freq: Number of most recent episodes used for the moving average.
        target_avg_reward: Moving-average reward at which training stops.
    """

    def __init__(self, check_freq=50, target_avg_reward=19.5):
        super().__init__()
        self.episode_rewards = []
        self.check_freq = check_freq
        self.target_avg_reward = target_avg_reward

    def _on_step(self) -> bool:
        """Record finished episodes; return ``False`` to abort training.

        Returns:
            ``False`` once the moving average reached the target (this is the
            signal Stable-Baselines3 uses to stop ``learn``), ``True`` otherwise.
        """
        target_reached = False
        for info in self.locals.get("infos", []):
            if "episode" not in info:
                continue

            # Store a plain float so the history stays JSON-serializable even
            # if the monitor wrapper hands back a NumPy scalar.
            self.episode_rewards.append(float(info["episode"]["r"]))

            if len(self.episode_rewards) < self.check_freq:
                continue

            recent_avg = np.mean(self.episode_rewards[-self.check_freq:])
            print(f"[INFO] Średnia z ostatnich {self.check_freq} epizodów: {recent_avg:.2f}")
            if recent_avg >= self.target_avg_reward:
                target_reached = True

        if not target_reached:
            return True

        # Save once per step, after all infos of this step were recorded, so
        # the dumped history includes every episode that finished here.
        print("Wczesne zatrzymanie")
        self.model.save("ppo_pong_model_early_stop")
        with open("pong_rewards_early_stop.json", "w") as f:
            json.dump(self.episode_rewards, f)

        # Setting an attribute on the model does not stop SB3 training;
        # returning False from `_on_step` is what aborts the learn loop.
        return False

# Hyperparameters
# -- optimisation: constant values wrapped by get_schedule_fn
learning_rate = get_schedule_fn(2.5e-4)
clip_range = get_schedule_fn(0.1)

# -- rollout collection and update sizes
n_envs = 8
n_steps = 128
batch_size = 256
n_epochs = 4

# -- loss coefficients
ent_coef = 0.01
vf_coef = 0.5

# -- observation stacking and training budget
frame_stack = 4
total_timesteps = 10_000_000

# Vectorized Atari environment (n_envs copies, seed 0) with the last
# `frame_stack` frames stacked into each observation.
env_id = "PongNoFrameskip-v4"
vec_env = VecFrameStack(
    make_atari_env(env_id, n_envs=n_envs, seed=0),
    n_stack=frame_stack,
)

# Policy parameters: orthogonal weight initialisation is switched off
policy_kwargs = {"ortho_init": False}

# Build the PPO agent with a CNN policy on the frame-stacked environment
model = PPO(
    "CnnPolicy",
    vec_env,
    # rollout / update sizes
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    # optimisation schedules
    learning_rate=learning_rate,
    clip_range=clip_range,
    # loss coefficients
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    policy_kwargs=policy_kwargs,
    verbose=1,
)

reward_callback = RewardLoggerCallback(check_freq=50, target_avg_reward=19.5)

# Training
# The run is long (see total_timesteps); if it is interrupted or fails, the
# `finally` clause still writes the model and the rewards collected so far.
# The original exception (e.g. KeyboardInterrupt) propagates afterwards.
try:
    model.learn(total_timesteps=total_timesteps, callback=reward_callback)
finally:
    # Save the model and the rewards
    model.save("ppo_pong_model")

    with open("pong_rewards.json", "w") as f:
        # Plain floats keep the dump valid even if rewards are NumPy scalars.
        json.dump([float(r) for r in reward_callback.episode_rewards], f)

Strumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.
|    clip_range           | 0.1         |
|    entropy_loss         | -1.77       |
|    explained_variance   | 0.00133     |
|    learning_rate        | 0.00025     |
|    loss                 | 0.0416      |
|    n_updates            | 68          |
|    policy_gradient_loss | -0.00168    |
|    value_loss           | 0.128       |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3.56e+03     |
|    ep_rew_mean          | -20.6        |
| time/                   |              |
|    fps                  | 109          |
|    iterations           | 19           |
|    time_elapsed         | 178          |
|    total_timesteps      | 19456        |
| train/                  |              |
|    approx_kl            | 0.0021789335 |
|    clip_fraction        | 0.141        |
|    clip_range           |

KeyboardInterrupt: 