In [None]:
import os
import re
import time
from pathlib import Path

os.environ["MPLBACKEND"] = "Agg"

import gym
import torch
import ale_py  # noqa: F401  (registra ALE envs)

from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, VecMonitor
from stable_baselines3.common.callbacks import BaseCallback, CallbackList


# =========================
# Callback: Heartbeat + log por episodio
# =========================
class LiveLoggerCallback(BaseCallback):
    """Heartbeat cada N steps + log al terminar episodios (si existe info['episode'])."""
    def __init__(self, every_steps=10_000, verbose=1):
        super().__init__(verbose)
        self.every_steps = int(every_steps)
        self.t0 = None

    def _on_training_start(self) -> None:
        self.t0 = time.time()
        print("[live] training started", flush=True)

    def _on_step(self) -> bool:
        if self.num_timesteps % self.every_steps == 0:
            elapsed = time.time() - self.t0 if self.t0 else 0.0
            print(f"[heartbeat] timesteps={self.num_timesteps} elapsed_s={elapsed:.1f}", flush=True)

        infos = self.locals.get("infos", [])
        for info in infos:
            ep = info.get("episode")
            if ep is None:
                continue
            ep_r = float(ep["r"])
            ep_l = int(ep["l"])
            print(f"[episode] r={ep_r:.1f} len={ep_l} | timesteps={self.num_timesteps}", flush=True)

        return True


# =========================
# Callback: objetivo 20+ por 100 episodios consecutivos
# =========================
class ConsecutiveRewardCallback(BaseCallback):
    """Cuenta episodios consecutivos con reward (clipped) > threshold."""
    def __init__(self, threshold=20.0, target_consecutive=100, save_path="model_reached_target", verbose=1):
        super().__init__(verbose)
        self.threshold = float(threshold)
        self.target_consecutive = int(target_consecutive)
        self.save_path = str(save_path)
        self.consecutive = 0
        self.best_consecutive = 0

    def _on_step(self) -> bool:
        infos = self.locals.get("infos", [])
        for info in infos:
            ep = info.get("episode")
            if ep is None:
                continue

            ep_r = float(ep["r"])

            if ep_r > self.threshold:
                self.consecutive += 1
            else:
                self.consecutive = 0

            self.best_consecutive = max(self.best_consecutive, self.consecutive)

            # Si tienes tensorboard_log instalado/activo, esto se registra en SB3
            self.logger.record("rollout/consecutive_over_20", self.consecutive)
            self.logger.record("rollout/best_consecutive_over_20", self.best_consecutive)
            self.logger.record("rollout/episode_reward", ep_r)

            if self.verbose and (self.consecutive == 1 or self.consecutive % 10 == 0):
                print(
                    f"[racha] ep_reward={ep_r:.1f} | consecutive>{self.threshold}={self.consecutive} "
                    f"(best={self.best_consecutive})",
                    flush=True
                )

            if self.consecutive >= self.target_consecutive:
                print(
                    f"\n✅ OBJETIVO CUMPLIDO: {self.consecutive} episodios seguidos con reward > {self.threshold}",
                    flush=True
                )
                self.model.save(self.save_path)
                return False  # detener entrenamiento

        return True


# =========================
# Callback: checkpoint + replay buffer (controlado)
# =========================
class SaveModelAndBufferCallback(BaseCallback):
    """
    Guarda:
      - modelo cada `save_freq` steps (como CheckpointCallback)
      - replay buffer en UN SOLO archivo (se sobreescribe) cada `buffer_freq` steps
        para NO llenar tu disco.
    """
    def __init__(self, ckpt_dir, name_prefix="dqn_si", save_freq=25_000,
                 save_replay_buffer=True, buffer_freq=100_000, verbose=1):
        super().__init__(verbose)
        self.ckpt_dir = Path(ckpt_dir)
        self.ckpt_dir.mkdir(parents=True, exist_ok=True)
        self.name_prefix = name_prefix
        self.save_freq = int(save_freq)
        self.save_replay_buffer = bool(save_replay_buffer)
        self.buffer_freq = int(buffer_freq)

        # archivo fijo (se sobreescribe)
        self.buffer_path = self.ckpt_dir / f"{self.name_prefix}_replay_buffer.pkl"

    def _on_step(self) -> bool:
        # 1) Modelo por checkpoints numerados
        if self.num_timesteps > 0 and self.num_timesteps % self.save_freq == 0:
            model_path = self.ckpt_dir / f"{self.name_prefix}_{self.num_timesteps}_steps"
            self.model.save(str(model_path))
            if self.verbose:
                print(f"[ckpt] saved model: {model_path}.zip", flush=True)

        # 2) Replay buffer (opcional) en archivo fijo para no llenar disco
        if self.save_replay_buffer and self.num_timesteps > 0 and self.num_timesteps % self.buffer_freq == 0:
            # DQN tiene save_replay_buffer/load_replay_buffer
            if hasattr(self.model, "save_replay_buffer"):
                self.model.save_replay_buffer(str(self.buffer_path))
                if self.verbose:
                    print(f"[buffer] saved replay buffer (overwrite): {self.buffer_path}", flush=True)

        return True


# =========================
# Utils: encontrar último checkpoint
# =========================
_ckpt_re = re.compile(r".*_(\d+)_steps\.zip$")

def find_latest_checkpoint(ckpt_dir: Path, prefix="dqn_si"):
    ckpts = sorted(ckpt_dir.glob(f"{prefix}_*_steps.zip"))
    best = None
    best_steps = -1
    for p in ckpts:
        m = _ckpt_re.match(str(p))
        if not m:
            continue
        steps = int(m.group(1))
        if steps > best_steps:
            best_steps = steps
            best = p
    return best, best_steps


def main():
    print("Gym:", gym.__version__)
    print("Torch:", torch.__version__, "CUDA:", torch.cuda.is_available(), flush=True)

    # ====== Config ======
    env_id = "ALE/SpaceInvaders-v5"
    n_envs = 4
    seed = 123
    total_timesteps = 5_000_000  # tu objetivo de entrenamiento total (puedes subirlo)

    base_dir = Path(os.getcwd())
    ckpt_dir = base_dir / "checkpoints"
    log_dir = base_dir / "logs_tensorboard"
    ckpt_dir.mkdir(exist_ok=True)
    log_dir.mkdir(exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"✅ Paths:\n  ckpt_dir={ckpt_dir}\n  log_dir={log_dir}\n  device={device}", flush=True)

    # ====== Env ======
    # noop_max=0 evita el bug de randint vs integers en wrappers antiguos
    env = make_atari_env(
        env_id,
        n_envs=n_envs,
        seed=seed,
        wrapper_kwargs=dict(clip_reward=True, noop_max=0),
    )
    # VecMonitor para que exista info["episode"] de forma consistente en VecEnv
    env = VecMonitor(env)
    env = VecFrameStack(env, n_stack=4)
    env = VecTransposeImage(env)

    # ====== Callbacks ======
    live_cb = LiveLoggerCallback(every_steps=10_000, verbose=1)

    goal_cb = ConsecutiveRewardCallback(
        threshold=20.0,
        target_consecutive=100,
        save_path=ckpt_dir / "dqn_si_reached_20x100",
        verbose=1,
    )

    # Guardado:
    # - modelo cada 25k (con n_envs=4 esto es “rápido”)
    # - replay buffer cada 100k y en archivo fijo (NO crece infinito)
    save_cb = SaveModelAndBufferCallback(
        ckpt_dir=ckpt_dir,
        name_prefix="dqn_si",
        save_freq=25_000,
        save_replay_buffer=True,
        buffer_freq=100_000,
        verbose=1
    )

    callbacks = CallbackList([save_cb, live_cb, goal_cb])

    # ====== Model (resume si hay ckpt) ======
    latest, latest_steps = find_latest_checkpoint(ckpt_dir, prefix="dqn_si")

    if latest is not None:
        print(f"✅ Reanudando desde: {latest} (steps={latest_steps})", flush=True)
        model = DQN.load(str(latest), env=env, device=device)

        # Replay buffer: usamos el archivo fijo (si existe)
        fixed_rb = ckpt_dir / "dqn_si_replay_buffer.pkl"
        if fixed_rb.exists():
            try:
                model.load_replay_buffer(str(fixed_rb))
                print(f"✅ Replay buffer cargado: {fixed_rb}", flush=True)
            except Exception as e:
                print(f"⚠️ No pude cargar replay buffer ({fixed_rb}): {e}", flush=True)
        else:
            print("⚠️ No hay replay buffer guardado. Continuarás con buffer vacío.", flush=True)

        reset_num_timesteps = False  # MUY importante al reanudar
    else:
        print("🆕 No hay checkpoints. Entrenando desde cero.", flush=True)

        # Nota importante:
        # - En SB3 2.x puedes usar optimize_memory_usage + replay_buffer_kwargs(handle_timeout_termination=False)
        # - En SB3 1.8.0 normalmente NO existe replay_buffer_kwargs -> entonces desactiva optimize_memory_usage
        try:
            model = DQN(
                "CnnPolicy",
                env,
                learning_rate=1e-4,
                buffer_size=300_000,
                learning_starts=10_000,
                batch_size=64,
                gamma=0.99,
                train_freq=4,
                gradient_steps=1,
                target_update_interval=10_000,
                exploration_fraction=0.25,
                exploration_initial_eps=1.0,
                exploration_final_eps=0.01,
                optimize_memory_usage=True,
                replay_buffer_kwargs={"handle_timeout_termination": False},
                tensorboard_log=str(log_dir),
                verbose=1,
                device=device,
            )
        except TypeError:
            # Compatibilidad SB3 1.8.0
            model = DQN(
                "CnnPolicy",
                env,
                learning_rate=1e-4,
                buffer_size=300_000,
                learning_starts=10_000,
                batch_size=64,
                gamma=0.99,
                train_freq=4,
                gradient_steps=1,
                target_update_interval=10_000,
                exploration_fraction=0.25,
                exploration_initial_eps=1.0,
                exploration_final_eps=0.01,
                optimize_memory_usage=False,
                tensorboard_log=str(log_dir),
                verbose=1,
                device=device,
            )

        reset_num_timesteps = True

    # ====== Train ======
    model.learn(
        total_timesteps=total_timesteps,
        callback=callbacks,
        reset_num_timesteps=reset_num_timesteps
    )

    # Guardado final
    model.save(str(ckpt_dir / "dqn_si_last"))
    if hasattr(model, "save_replay_buffer"):
        model.save_replay_buffer(str(ckpt_dir / "dqn_si_replay_buffer.pkl"))
    env.close()
    print("✅ Entrenamiento terminado. Modelo guardado.", flush=True)


if __name__ == "__main__":
    main()


Gym: 0.21.0
Torch: 1.13.1+cu117 CUDA: True
✅ Paths:
  ckpt_dir=C:\08MIAR\LOCAL\checkpoints
  log_dir=C:\08MIAR\LOCAL\logs_tensorboard
  device=cuda
✅ Reanudando desde: C:\08MIAR\LOCAL\checkpoints\dqn_si_800000_steps.zip (steps=800000)




✅ Replay buffer cargado: C:\08MIAR\LOCAL\checkpoints\dqn_si_replay_buffer.pkl
Logging to C:\08MIAR\LOCAL\logs_tensorboard\DQN_1
[live] training started
[episode] r=0.0 len=20 | timesteps=800080
[episode] r=0.0 len=20 | timesteps=800080
[racha] ep_reward=0.0 | consecutive>20.0=0 (best=0)
[racha] ep_reward=0.0 | consecutive>20.0=0 (best=0)
[episode] r=2.0 len=35 | timesteps=800140
[racha] ep_reward=2.0 | consecutive>20.0=0 (best=0)
------------------------------------------
| rollout/                    |          |
|    best_consecutive_over_20 | 0        |
|    consecutive_over_20      | 0        |
|    ep_len_mean              | 48.4     |
|    ep_rew_mean              | 6.56     |
|    episode_reward           | 2        |
|    exploration_rate         | 0.454    |
| time/                       |          |
|    episodes                 | 17540    |
|    fps                      | 16       |
|    time_elapsed             | 8        |
|    total_timesteps          | 800140   |
| train

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



------------------------------------------
| rollout/                    |          |
|    best_consecutive_over_20 | 5        |
|    consecutive_over_20      | 0        |
|    ep_len_mean              | 72.7     |
|    ep_rew_mean              | 13.63    |
|    episode_reward           | 0        |
|    exploration_rate         | 0.01     |
| time/                       |          |
|    episodes                 | 55684    |
|    fps                      | 182      |
|    time_elapsed             | 12834    |
|    total_timesteps          | 3137136  |
| train/                      |          |
|    learning_rate            | 0.0001   |
|    loss                     | 0.0954   |
|    n_updates                | 195444   |
------------------------------------------
[episode] r=21.0 len=114 | timesteps=3137284
[racha] ep_reward=21.0 | consecutive>20.0=1 (best=5)
[episode] r=8.0 len=99 | timesteps=3137428
[racha] ep_reward=8.0 | consecutive>20.0=0 (best=5)
[episode] r=42.0 len=234 | timest

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[racha] ep_reward=30.0 | consecutive>20.0=1 (best=6)
[episode] r=1.0 len=39 | timesteps=3252352
[racha] ep_reward=1.0 | consecutive>20.0=0 (best=6)
[episode] r=1.0 len=27 | timesteps=3252412
[racha] ep_reward=1.0 | consecutive>20.0=0 (best=6)
[episode] r=3.0 len=29 | timesteps=3252428
[racha] ep_reward=3.0 | consecutive>20.0=0 (best=6)
------------------------------------------
| rollout/                    |          |
|    best_consecutive_over_20 | 6        |
|    consecutive_over_20      | 0        |
|    ep_len_mean              | 67.9     |
|    ep_rew_mean              | 12.54    |
|    episode_reward           | 3        |
|    exploration_rate         | 0.01     |
| time/                       |          |
|    episodes                 | 57224    |
|    fps                      | 182      |
|    time_elapsed             | 13414    |
|    total_timesteps          | 3252428  |
| train/                      |          |
|    learning_rate            | 0.0001   |
|    loss        

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[racha] ep_reward=37.0 | consecutive>20.0=1 (best=7)
[episode] r=0.0 len=11 | timesteps=5340908
[racha] ep_reward=0.0 | consecutive>20.0=0 (best=7)
------------------------------------------
| rollout/                    |          |
|    best_consecutive_over_20 | 7        |
|    consecutive_over_20      | 0        |
|    ep_len_mean              | 87.6     |
|    ep_rew_mean              | 17.73    |
|    episode_reward           | 0        |
|    exploration_rate         | 0.01     |
| time/                       |          |
|    episodes                 | 83632    |
|    fps                      | 191      |
|    time_elapsed             | 23760    |
|    total_timesteps          | 5340908  |
| train/                      |          |
|    learning_rate            | 0.0001   |
|    loss                     | 0.191    |
|    n_updates                | 333180   |
------------------------------------------
[episode] r=62.0 len=272 | timesteps=5341000
[racha] ep_reward=62.0 | consecut

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[episode] r=35.0 len=138 | timesteps=5493396
[racha] ep_reward=35.0 | consecutive>20.0=1 (best=7)
------------------------------------------
| rollout/                    |          |
|    best_consecutive_over_20 | 7        |
|    consecutive_over_20      | 1        |
|    ep_len_mean              | 88.5     |
|    ep_rew_mean              | 18.31    |
|    episode_reward           | 35       |
|    exploration_rate         | 0.01     |
| time/                       |          |
|    episodes                 | 85368    |
|    fps                      | 191      |
|    time_elapsed             | 24494    |
|    total_timesteps          | 5493396  |
| train/                      |          |
|    learning_rate            | 0.0001   |
|    loss                     | 0.25     |
|    n_updates                | 342711   |
------------------------------------------
[episode] r=6.0 len=28 | timesteps=5493436
[racha] ep_reward=6.0 | consecutive>20.0=0 (best=7)
[episode] r=66.0 len=227 | timest

In [2]:
import os
import time
from pathlib import Path

os.environ["MPLBACKEND"] = "Agg"

import gym
import torch
import ale_py  # noqa: F401  (registra ALE envs)

from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, VecMonitor


def _reset_compat(env):
    """
    Compatibilidad: algunas APIs devuelven obs o (obs, info).
    VecEnv normalmente devuelve obs directo, pero lo dejamos robusto.
    """
    out = env.reset()
    if isinstance(out, tuple) and len(out) == 2:
        return out[0]
    return out


def _step_compat(env, action):
    """
    Compatibilidad:
    - Gym clásico: obs, reward, done, info
    - Gymnasium: obs, reward, terminated, truncated, info
    - VecEnv: reward/done vienen como arrays de tamaño n_envs
    """
    out = env.step(action)
    if len(out) == 4:
        obs, reward, done, info = out
        terminated = done
        truncated = [False] if hasattr(done, "__len__") else False
        return obs, reward, terminated, truncated, info
    elif len(out) == 5:
        obs, reward, terminated, truncated, info = out
        return obs, reward, terminated, truncated, info
    else:
        raise RuntimeError(f"step() devolvió {len(out)} elementos, no esperado.")


def main():
    # ====== Ajustes ======
    base_dir = Path(os.getcwd())

    # Cambia esto si tu zip está en otra ruta:
    model_path = base_dir / "checkpoints" / "dqn_si_last.zip"

    env_id = "ALE/SpaceInvaders-v5"
    seed = 123

    # Criterio del profe:
    threshold = 20.0
    target_consecutive = 100

    # Límite de seguridad (para no correr infinito):
    max_episodes = 50_000

    # Logs:
    print_every_episodes = 10   # imprime cada N episodios
    heartbeat_seconds = 30      # imprime “sigo vivo” cada N segundos aunque no termine episodio

    # ====== Verificaciones ======
    print("Gym:", gym.__version__)
    print("Torch:", torch.__version__, "CUDA:", torch.cuda.is_available(), flush=True)

    if not model_path.exists():
        raise FileNotFoundError(f"No encontré el modelo: {model_path}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"✅ Cargando modelo: {model_path} (device={device})", flush=True)

    # ====== Env (mismo pipeline que entrenamiento) ======
    env = make_atari_env(
        env_id,
        n_envs=1,
        seed=seed,
        wrapper_kwargs=dict(
            clip_reward=True,  # ✅ reward clipping activo (como tu entrenamiento)
            noop_max=0,        # ✅ evita bug randint vs Generator
        ),
    )
    env = VecMonitor(env)             # ✅ para tener info["episode"] consistente
    env = VecFrameStack(env, n_stack=4)
    env = VecTransposeImage(env)

    # ====== Modelo ======
    model = DQN.load(str(model_path), env=env, device=device)

    # ====== Evaluación determinística ======
    consecutive = 0
    best_consecutive = 0
    episodes = 0

    last_hb = time.time()

    obs = _reset_compat(env)

    # Acumulador por episodio (por si no llega info["episode"])
    ep_return = 0.0
    ep_len = 0

    print("[eval] start (deterministic=True)", flush=True)

    while episodes < max_episodes and consecutive < target_consecutive:
        # Acción greedy
        action, _ = model.predict(obs, deterministic=True)

        obs, reward, terminated, truncated, infos = _step_compat(env, action)

        # VecEnv: reward es array([r])
        r = float(reward[0]) if hasattr(reward, "__len__") else float(reward)
        ep_return += r
        ep_len += 1

        done = bool(terminated[0]) if hasattr(terminated, "__len__") else bool(terminated)
        trunc = bool(truncated[0]) if hasattr(truncated, "__len__") else bool(truncated)

        # Heartbeat temporal (aunque no termine episodio)
        now = time.time()
        if now - last_hb >= heartbeat_seconds:
            print(f"[heartbeat] episodes={episodes} best={best_consecutive} current={consecutive}", flush=True)
            last_hb = now

        if done or trunc:
            episodes += 1

            # Si VecMonitor puso info["episode"], úsalo (más “oficial”)
            info0 = infos[0] if isinstance(infos, (list, tuple)) else infos
            ep = None
            if isinstance(info0, dict):
                ep = info0.get("episode")

            if ep is not None and "r" in ep and "l" in ep:
                ep_r = float(ep["r"])
                ep_l = int(ep["l"])
            else:
                ep_r = float(ep_return)
                ep_l = int(ep_len)

            # Criterio
            if ep_r > threshold:
                consecutive += 1
            else:
                consecutive = 0

            best_consecutive = max(best_consecutive, consecutive)

            if episodes % print_every_episodes == 0 or ep_r > threshold:
                print(
                    f"[episode {episodes}] r={ep_r:.1f} len={ep_l} | "
                    f"consecutive>{threshold}={consecutive} (best={best_consecutive})",
                    flush=True
                )

            # Reset acumuladores
            ep_return = 0.0
            ep_len = 0

            # Reset env
            obs = _reset_compat(env)

    env.close()

    if consecutive >= target_consecutive:
        print(
            f"\n✅ OBJETIVO CUMPLIDO EN EVALUACIÓN: {consecutive} episodios seguidos con reward(clipped) > {threshold}",
            flush=True
        )
        print(f"   Episodios totales usados: {episodes}", flush=True)
    else:
        print(
            f"\n❌ No se logró la racha. best_consecutive={best_consecutive} "
            f"en {episodes} episodios (límite={max_episodes}).",
            flush=True
        )


if __name__ == "__main__":
    main()


Gym: 0.21.0
Torch: 1.13.1+cu117 CUDA: True
✅ Cargando modelo: C:\08MIAR\LOCAL\checkpoints\dqn_si_last.zip (device=cuda)
[eval] start (deterministic=True)
[episode 1] r=34.0 len=142 | consecutive>20.0=1 (best=1)
[episode 6] r=51.0 len=332 | consecutive>20.0=1 (best=1)
[episode 7] r=59.0 len=237 | consecutive>20.0=2 (best=2)
[episode 10] r=30.0 len=93 | consecutive>20.0=1 (best=2)
[episode 11] r=36.0 len=186 | consecutive>20.0=2 (best=2)
[episode 13] r=51.0 len=194 | consecutive>20.0=1 (best=2)
[episode 16] r=46.0 len=185 | consecutive>20.0=1 (best=2)
[episode 19] r=63.0 len=299 | consecutive>20.0=1 (best=2)
[episode 20] r=5.0 len=49 | consecutive>20.0=0 (best=2)
[episode 25] r=46.0 len=187 | consecutive>20.0=1 (best=2)
[episode 28] r=23.0 len=74 | consecutive>20.0=1 (best=2)
[episode 30] r=1.0 len=23 | consecutive>20.0=0 (best=2)
[episode 34] r=34.0 len=139 | consecutive>20.0=1 (best=2)
[episode 39] r=25.0 len=110 | consecutive>20.0=1 (best=2)
[episode 40] r=24.0 len=96 | consecutive>20