# SpaceInvaders con Actor–Critic (PG/AC) **desde cero** — estilo `08MIAR_pg_ac`

Este cuaderno implementa un **Actor–Critic** con **red compartida (tronco) + dos cabezas** (actor y critic), siguiendo la idea del notebook `08MIAR_pg_ac`, adaptado a SpaceInvaders con el **pipeline Atari** y **restricciones**:

**Restricciones / pipeline Atari**
- **Noop reset** (NOOP aleatorios al reset)
- **MaxAndSkip = 4**
- **FireReset** (para arrancar el juego cuando hace falta)
- **WarpFrame** a **84×84** en **grayscale**
- **FrameStack = 4**
- **Reward clipping** (opcional, activado por defecto)

**Notas**
- Compatible con **Gym** (API antigua) y **Gymnasium** (API nueva).
- Incluye **entropy bonus** + **normalización de ventajas** para evitar colapso de política (acción única).


**Criterio de éxito (estricto)**
- **Alcanzar > 20 puntos (con reward clipping) durante > 100 episodios consecutivos**.


**v3 añade**: *entropy schedule*, *temperature schedule* y log de histograma de acciones por update (para detectar colapso a una sola acción como LEFTFIRE).

---
### v4
Añade **GAE(λ)** y **ε-greedy** en el rollout (además de schedules de entropía/temperatura) para reducir el colapso a una sola acción y mejorar el aprendizaje con reward clipping.


## 1) Instalación (solo si hace falta)

Si estás en Colab y no tienes Atari instalado, descomenta y ejecuta la celda siguiente.

In [None]:
# En Colab (solo si hace falta):
# !pip -q install "gymnasium[atari,accept-rom-license]" "ale-py" torch numpy pillow
# Si usas Gym clásico:
# !pip -q install "gym[atari]" "ale-py" torch numpy pillow


## 2) Imports, semilla y compatibilidad Gym/Gymnasium

In [1]:
import os
import sys
import time
import random
from collections import deque, Counter
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F

# --- Gym vs Gymnasium (compatibilidad) ---
try:
    import gymnasium as gym
    _USING_GYMNASIUM = True
except Exception:
    import gym
    _USING_GYMNASIUM = False

print("Using gymnasium:", _USING_GYMNASIUM)
print("Torch:", torch.__version__)

SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Using gymnasium: False
Torch: 2.0.1+cpu
Device: cpu


## 3) Wrappers Atari

In [2]:
# Helpers para soportar step API de 4 y 5 valores
def _is_new_step_api(step_result):
    return isinstance(step_result, tuple) and len(step_result) == 5

def _unpack_step(step_result):
    # Returns: obs, reward, done, info, terminated, truncated
    if _is_new_step_api(step_result):
        obs, reward, terminated, truncated, info = step_result
        done = bool(terminated) or bool(truncated)
        return obs, float(reward), done, info, bool(terminated), bool(truncated)
    else:
        obs, reward, done, info = step_result
        return obs, float(reward), bool(done), info, None, None

def _pack_step(obs, reward, done, info, terminated, truncated, use_new_api):
    if use_new_api:
        if terminated is None or truncated is None:
            terminated = bool(done)
            truncated = False
        return obs, float(reward), bool(terminated), bool(truncated), info
    else:
        return obs, float(reward), bool(done), info

def _unpack_reset(reset_result):
    # Returns: obs, info, use_new_api
    if isinstance(reset_result, tuple) and len(reset_result) == 2 and isinstance(reset_result[1], dict):
        obs, info = reset_result
        return obs, info, True
    else:
        return reset_result, {}, False

class NoopResetEnv(gym.Wrapper):
    # Realiza NOOP aleatorios al reset para aleatorizar el estado inicial.
    def __init__(self, env, noop_max=30):
        super().__init__(env)
        self.noop_max = noop_max
        self.noop_action = 0  # Atari: 0 suele ser NOOP

    def reset(self, **kwargs):
        obs, info, use_new_api = _unpack_reset(self.env.reset(**kwargs))
        noops = np.random.randint(1, self.noop_max + 1)
        for _ in range(noops):
            step_res = self.env.step(self.noop_action)
            o, _, done, inf, term, trunc = _unpack_step(step_res)
            obs, info = o, inf
            if done:
                obs, info, use_new_api = _unpack_reset(self.env.reset(**kwargs))
        return (obs, info) if use_new_api else obs

class MaxAndSkipEnv(gym.Wrapper):
    # Salta frames y hace max over last 2 frames (reduce flickering).
    def __init__(self, env, skip=4):
        super().__init__(env)
        self._skip = skip
        self._obs_buffer = deque(maxlen=2)

    def step(self, action):
        total_reward = 0.0
        done = False
        info = {}
        terminated = truncated = None

        use_new_api = None

        for _ in range(self._skip):
            step_res = self.env.step(action)
            if use_new_api is None:
                use_new_api = _is_new_step_api(step_res)

            obs, reward, d, info, term, trunc = _unpack_step(step_res)
            self._obs_buffer.append(obs)
            total_reward += reward
            done = done or d
            terminated = term if term is not None else terminated
            truncated = trunc if trunc is not None else truncated
            if done:
                break

        max_frame = np.maximum(self._obs_buffer[0], self._obs_buffer[-1])
        return _pack_step(max_frame, total_reward, done, info, terminated, truncated, use_new_api)

class FireResetEnv(gym.Wrapper):
    # Hace FIRE al reset si el juego lo requiere.
    def reset(self, **kwargs):
        obs, info, use_new_api = _unpack_reset(self.env.reset(**kwargs))
        # Intentamos FIRE=1 (típico)
        try:
            step_res = self.env.step(1)
            obs, _, done, info, term, trunc = _unpack_step(step_res)
            if done:
                obs, info, use_new_api = _unpack_reset(self.env.reset(**kwargs))
        except Exception:
            pass
        return (obs, info) if use_new_api else obs

class WarpFrame(gym.ObservationWrapper):
    # Resize a 84x84 + grayscale.
    def __init__(self, env, width=84, height=84, grayscale=True):
        super().__init__(env)
        self.width = width
        self.height = height
        self.grayscale = grayscale
        shape = (height, width, 1 if grayscale else 3)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8)

    def observation(self, obs):
        if self.grayscale:
            obs = np.dot(obs[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
            img = Image.fromarray(obs)
            img = img.resize((self.width, self.height))
            obs = np.array(img, dtype=np.uint8)[..., None]  # (H,W,1)
        else:
            img = Image.fromarray(obs)
            img = img.resize((self.width, self.height))
            obs = np.array(img, dtype=np.uint8)
        return obs

class ClipRewardEnv(gym.RewardWrapper):
    # Clipping de recompensa a -1/0/1.
    def reward(self, reward):
        return np.sign(reward).astype(np.float32)

class LazyFrames:
    # Evita copias al apilar frames.
    def __init__(self, frames):
        self._frames = frames

    def __array__(self, dtype=None):
        out = np.concatenate(self._frames, axis=2)
        if dtype is not None:
            out = out.astype(dtype)
        return out

class FrameStack(gym.Wrapper):
    # Apila k frames en el canal: (84,84,1) x k -> (84,84,k).
    def __init__(self, env, k=4):
        super().__init__(env)
        self.k = k
        self.frames = deque(maxlen=k)

        shp = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(shp[0], shp[1], shp[2]*k), dtype=np.uint8
        )

    def reset(self, **kwargs):
        obs, info, use_new_api = _unpack_reset(self.env.reset(**kwargs))
        for _ in range(self.k):
            self.frames.append(obs)
        stacked = self._get_obs()
        return (stacked, info) if use_new_api else stacked

    def step(self, action):
        step_res = self.env.step(action)
        use_new_api = _is_new_step_api(step_res)
        obs, reward, done, info, term, trunc = _unpack_step(step_res)
        self.frames.append(obs)
        stacked = self._get_obs()
        return _pack_step(stacked, reward, done, info, term, trunc, use_new_api)

    def _get_obs(self):
        return LazyFrames(list(self.frames))

class RecordEpisodeStats(gym.Wrapper):
    # Guarda retorno y longitud por episodio en info['episode'].
    def __init__(self, env):
        super().__init__(env)
        self.episode_return = 0.0
        self.episode_length = 0

    def reset(self, **kwargs):
        obs, info, use_new_api = _unpack_reset(self.env.reset(**kwargs))
        self.episode_return = 0.0
        self.episode_length = 0
        return (obs, info) if use_new_api else obs

    def step(self, action):
        step_res = self.env.step(action)
        use_new_api = _is_new_step_api(step_res)
        obs, reward, done, info, term, trunc = _unpack_step(step_res)
        self.episode_return += reward
        self.episode_length += 1
        if done:
            info = dict(info)
            info["episode"] = {"r": float(self.episode_return), "l": int(self.episode_length)}
        return _pack_step(obs, reward, done, info, term, trunc, use_new_api)

def make_env(env_id="SpaceInvaders-v0", seed=0, reward_clip=True):
    # Fallbacks por si tu instalación usa otros IDs
    tried = []
    last_err = None
    env = None
    for candidate in [env_id, "ALE/SpaceInvaders-v5", "SpaceInvadersNoFrameskip-v4"]:
        try:
            env = gym.make(candidate)
            env_id = candidate
            break
        except Exception as e:
            tried.append(candidate)
            last_err = e

    if env is None:
        raise RuntimeError(f"No pude crear el entorno. Probé: {tried}. Último error: {last_err}")

    # Seed (compat)
    try:
        env.reset(seed=seed)
    except Exception:
        try:
            env.seed(seed)
        except Exception:
            pass

    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = FireResetEnv(env)
    env = WarpFrame(env, width=84, height=84, grayscale=True)
    if reward_clip:
        env = ClipRewardEnv(env)
    env = FrameStack(env, k=4)
    env = RecordEpisodeStats(env)
    return env, env_id


## 4) Crear entorno y comprobar acciones

In [3]:
ENV_ID = "SpaceInvaders-v0"
env, resolved_id = make_env(ENV_ID, seed=SEED, reward_clip=True)
print("Resolved env id:", resolved_id)
try:
    print("Action meanings:", env.unwrapped.get_action_meanings())
except Exception:
    print("No action meanings available.")

reset_out = env.reset()
obs, info, use_new = _unpack_reset(reset_out)
print("Obs shape:", np.array(obs).shape, "| New reset API:", use_new)

step_out = env.step(0)
obs2, r2, done2, info2, term2, trunc2 = _unpack_step(step_out)
print("Step OK | r:", r2, "| done:", done2, "| keys:", list(info2.keys())[:8])


Resolved env id: SpaceInvaders-v0
Action meanings: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
Obs shape: (84, 84, 4) | New reset API: False
Step OK | r: 0.0 | done: False | keys: ['ale.lives']


## 5) Red Actor–Critic (compartida) — CNN tipo Atari

In [4]:
def obs_to_tensor(obs) -> torch.Tensor:
    arr = np.array(obs, dtype=np.uint8)            # (84,84,4)
    arr = np.transpose(arr, (2, 0, 1))             # (4,84,84)
    t = torch.from_numpy(arr).float() / 255.0      # (4,84,84) en [0,1]
    return t.unsqueeze(0)                          # (1,4,84,84)

def get_lives(env, info):
    if isinstance(info, dict):
        if "ale.lives" in info:
            return info["ale.lives"]
        if "lives" in info:
            return info["lives"]
    try:
        return env.unwrapped.ale.lives()
    except Exception:
        return None

class ActorCritic(nn.Module):
    def __init__(self, n_actions: int):
        super().__init__()
        self.conv1 = nn.Conv2d(4, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, 3, stride=1)
        self.fc1 = nn.Linear(7*7*64, 512)
        self.actor = nn.Linear(512, n_actions)
        self.critic = nn.Linear(512, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        logits = self.actor(x)
        value = self.critic(x).squeeze(-1)
        return logits, value


## 6) Entrenamiento Actor–Critic (n-step) + Entropía + Ventaja normalizada

In [15]:
def train_pg_ac_spaceinvaders(
    env_id="SpaceInvaders-v0",
    total_updates=100000,         # Antes: 30000 (×3.3 más entrenamiento)
    n_steps=256,                  # Antes: 128 (rollouts más largos)
    gamma=0.99,
    lr=1e-4,                      # Antes: 2.5e-4 (más conservador), 1.5e-4
    batch_size=128,               # Antes: 64 (updates más estables)
    ent_coef=0.01,
    ent_coef_start=0.05,          # Antes: 0.03 (menos agresivo) 0.02, 
    ent_coef_end=0.01,            # Antes: 0.01 (decae más) 0.005,
    ent_decay_updates=50000,      # Antes: 5000 (×4 más lento) 20000,
    temp_start=2.0,               # Antes: 1.5 (menos agresivo) 1.3,
    temp_end=1.0,
    temp_decay_updates=30000,     # Antes: 5000 (×2 más lento) 10000,
    # v4: GAE(λ) + ε-greedy exploration
    gae_lambda=0.95,
    eps_start=0.15,               # Antes: 0.10 (menos aleatorio) 0.08, 
    eps_end=0.05,                 # Antes: 0.02 (converge más) 0.01,
    eps_decay_updates=60000,      # Antes: 8000 (×3.75 más lento) 30000,
    vf_coef=0.5,                  # Antes: 0.5 (más peso al value) 1.0,
    max_grad_norm=0.5,
    reward_clip=True,
    model_path="pg_ac_spaceinvaders_V7.pt",
    log_every=50,
    # Compat: criterios estrictos del proyecto
    early_stop=True,
    success_threshold=20.0,
    success_consecutive=100,
    **_unused_kwargs,
):
    """A2C/PG-AC estilo '08MIAR_pg_ac':
      - Rollout on-policy de n_steps
      - Actor-Critic con pérdida: policy + vf + entropy
      - Schedules: entropía/temperatura (si ent_decay_updates>0 / temp_decay_updates>0)
      - v4: GAE(λ) + ε-greedy en rollout
      - Reward clipping opcional (requisito típico del proyecto)
      - Early stop si logra >success_threshold (clipped) durante success_consecutive episodios consecutivos en TRAIN
    """
    import numpy as np
    import torch
    import torch.nn.functional as F
    from collections import Counter

    env, resolved_id = make_env(env_id, reward_clip=reward_clip)
    reset_out = env.reset()
    obs, _, _ = _unpack_reset(reset_out)
    n_actions = env.action_space.n

    model = ActorCritic(n_actions=n_actions).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-5)

    def linear_schedule(start, end, step, total):
        if total is None or total <= 0:
            return float(end)
        frac = min(max(step / float(total), 0.0), 1.0)
        return float(start + frac * (end - start))

    def get_lr_schedule(update, total_updates, initial_lr):
        """Linear decay del learning rate"""
        frac = min(update / float(total_updates), 1.0)
        return initial_lr * (1.0 - frac)

    ep_return = 0.0
    consec_success = 0
    returns_hist = []

    initial_lr = lr  # Guardar el LR original

    for upd in range(1, total_updates + 1):

        # Actualizar learning rate
        current_lr = get_lr_schedule(upd, total_updates, initial_lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_lr
            
        # Schedules (si el usuario fija ent_decay_updates=0, se usa ent_coef)
        ent_coef_cur = (ent_coef if (ent_decay_updates is None or ent_decay_updates <= 0)
                        else linear_schedule(ent_coef_start, ent_coef_end, upd - 1, ent_decay_updates))
        temp = (1.0 if (temp_decay_updates is None or temp_decay_updates <= 0)
                else linear_schedule(temp_start, temp_end, upd - 1, temp_decay_updates))
        eps = linear_schedule(eps_start, eps_end, upd - 1, eps_decay_updates)

        states, actions, rewards, dones, values = [], [], [], [], []
        action_counts = Counter()

        for t in range(n_steps):
            s = np.array(obs, dtype=np.uint8)
            states.append(s)

            s_t = obs_to_tensor(s).to(device)
            logits, v = model(s,t)
            values.append(float(v.item()))

            # Temperatura solo para muestreo (exploración)
            logits_scaled = logits / max(temp, 1e-6)
            dist = torch.distributions.Categorical(logits=logits_scaled)
            a = dist.sample()

            # ε-greedy: con prob eps elige acción uniforme
            if np.random.rand() < eps:
                a = torch.tensor([np.random.randint(n_actions)], device=device)

            a_int = int(a.item())
            action_counts[a_int] += 1

            step_out = env.step(a_int)
            next_obs, r, done, info, terminated, truncated = _unpack_step(step_out)
            done = bool(done)
            ep_return += float(r)

            actions.append(a_int)
            rewards.append(float(r))
            dones.append(1.0 if done else 0.0)

            obs = next_obs

            if done:
                returns_hist.append(ep_return)
                if ep_return > float(success_threshold):
                    consec_success += 1
                else:
                    consec_success = 0

                ep_return = 0.0
                reset_out = env.reset()
                obs, _, _ = _unpack_reset(reset_out)
                if early_stop and consec_success >= int(success_consecutive):
                # Verificar que llevamos suficiente entrenamiento (mínimo 50k updates)
                    if upd >= 50000:
                        torch.save(model.state_dict(), model_path)
                        avg100 = float(np.mean(returns_hist[-100:])) if len(returns_hist) else 0.0
                        print(f"[EARLY STOP] update={upd} consec>{success_threshold}={consec_success} avg100={avg100:.2f}")
                        env.close()
                        return model, resolved_id
                    else:
                        print(f"[INFO] {consec_success} consecutivos alcanzados pero esperando update 50k (actual: {upd})")

        # Bootstrap para GAE
        with torch.no_grad():
            last_s_t = obs_to_tensor(obs).to(device)
            _, last_v = model(last_s_t)
            last_v = float(last_v.item())

        rewards_np = np.array(rewards, dtype=np.float32)
        values_np = np.array(values, dtype=np.float32)
        dones_np = np.array(dones, dtype=np.float32)

        T = len(rewards_np)
        adv = np.zeros(T, dtype=np.float32)
        gae = 0.0

        for t in reversed(range(T)):
            if t == T - 1:
                next_nonterminal = 1.0 - dones_np[t]
                next_value = last_v
            else:
                next_nonterminal = 1.0 - dones_np[t + 1]
                next_value = values_np[t + 1]
            delta = rewards_np[t] + gamma * next_value * next_nonterminal - values_np[t]
            gae = delta + gamma * gae_lambda * next_nonterminal * gae
            adv[t] = gae

        returns = adv + values_np

        # Normalizar ventajas
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        states_arr = np.array(states, dtype=np.uint8)  # (T,H,W,C)
        states_arr = np.transpose(states_arr, (0, 3, 1, 2))    # (T,C,H,W)
        states_t = torch.from_numpy(states_arr).float().to(device).div_(255.0).contiguous()
        actions_t = torch.tensor(np.array(actions), dtype=torch.int64, device=device)
        returns_t = torch.tensor(returns, dtype=torch.float32, device=device)
        adv_t = torch.tensor(adv, dtype=torch.float32, device=device)

        idxs = np.arange(T)
        np.random.shuffle(idxs)

        n_mb = max(1, (T + batch_size - 1) // batch_size)
        pol_loss_acc, vf_loss_acc = 0.0, 0.0

        for start in range(0, T, batch_size):
            b = idxs[start:start + batch_size]

            logits, v_pred = model(states_t[b])
            dist = torch.distributions.Categorical(logits=logits)  # pérdida con logits "sin temperatura"
            logp = dist.log_prob(actions_t[b])
            entropy = dist.entropy().mean()

            policy_loss = -(logp * adv_t[b]).mean()
            v_pred = v_pred.squeeze(-1)
            value_loss = F.mse_loss(v_pred, returns_t[b])

            loss = policy_loss + vf_coef * value_loss - ent_coef_cur * entropy

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            pol_loss_acc += float(policy_loss.item())
            vf_loss_acc += float(value_loss.item())

        if upd % log_every == 0:
            avg100 = float(np.mean(returns_hist[-100:])) if len(returns_hist) else 0.0
            avg20 = float(np.mean(returns_hist[-20:])) if len(returns_hist) >= 20 else 0.0
            last_ep = float(returns_hist[-1]) if len(returns_hist) else 0.0
            top_actions = action_counts.most_common(3)
    
            # Calcular % de distribución de top acción
            total_actions = sum(action_counts.values())
            top_pct = (action_counts.most_common(1)[0][1] / total_actions * 100) if total_actions > 0 else 0
    
            print(
                f"[upd {upd:6d}/{total_updates}] "
                f"pol_loss={pol_loss_acc/n_mb:+.4f} val_loss={vf_loss_acc/n_mb:.4f} "
                f"last={last_ep:4.1f} avg20={avg20:5.2f} avg100={avg100:5.2f} "
                f"consec>{success_threshold}={consec_success:3d} "
                f"ent={ent_coef_cur:.4f} temp={temp:.2f} eps={eps:.3f} "
                f"lr={current_lr:.6f} "
                f"top_act={top_actions[0][0]}({top_pct:.0f}%) {top_actions}"
            )

        if upd % 5000 == 0:  # Cada 5k updates
            checkpoint_path = model_path.replace('.pt', f'_upd{upd}.pt')
            torch.save({
                'update': upd,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'avg100': avg100,
                'consec_success': consec_success,
                'returns_hist': returns_hist[-100:],  # Guardar últimos 100
            }, checkpoint_path)
            print(f"[CHECKPOINT] Guardado en {checkpoint_path}")

    torch.save(model.state_dict(), model_path)
    env.close()
    return model, resolved_id

## 7) Evaluación (100 episodios, umbral 20.0) + conteo de acciones

In [16]:
@torch.no_grad()
def evaluate_agent(
    model,
    env_id="SpaceInvaders-v0",
    n_eval_episodes=500,
    threshold=20.0,
    print_every=10,
    mode="greedy",          # greedy | sample | epsilon_greedy
    epsilon=0.05,
    reward_clip=True,
):
    eval_env, resolved = make_env(env_id, seed=SEED+999, reward_clip=reward_clip)
    model.eval()

    returns = []
    action_counts = Counter()

    for ep in range(1, n_eval_episodes + 1):
        reset_out = eval_env.reset()
        obs, _, _ = _unpack_reset(reset_out)
        done = False
        ep_r = 0.0

        while not done:
            s = obs_to_tensor(obs).to(device)
            logits, _ = model(s,t)

            if mode == "greedy":
                a = int(torch.argmax(logits, dim=-1).item())
            elif mode == "sample":
                dist = torch.distributions.Categorical(logits=logits)
                a = int(dist.sample().item())
            elif mode == "epsilon_greedy":
                if random.random() < epsilon:
                    a = int(eval_env.action_space.sample())
                else:
                    a = int(torch.argmax(logits, dim=-1).item())
            else:
                raise ValueError("mode inválido")

            action_counts[a] += 1
            step_out = eval_env.step(a)
            obs, r, done, info2, _, _ = _unpack_step(step_out)
            ep_r += float(r)

            if done:
                ep_r = info2.get("episode", {}).get("r", ep_r)

        returns.append(float(ep_r))
        if ep % print_every == 0:
            print(f"[eval] ep {ep}/{n_eval_episodes} | return={ep_r:.2f} | mean={np.mean(returns):.2f}")

    mean_r = float(np.mean(returns))
    ok = mean_r >= threshold

    print("\n=== Resultado evaluación ===")
    print("Resolved env id:", resolved)
    print(f"Media return (n={n_eval_episodes}): {mean_r:.2f} | Umbral: {threshold:.2f} | OK: {ok}")
    print("Distribución de acciones (conteo):", dict(action_counts))
    return mean_r, returns, dict(action_counts)


def max_run_above_threshold(returns, threshold):
    best = 0
    cur = 0
    for r in returns:
        if r > threshold:
            cur += 1
            best = max(best, cur)
        else:
            cur = 0
    return best

@torch.no_grad()
def evaluate_consecutive_success(
    model,
    env_id="SpaceInvaders-v0",
    threshold=20.0,
    need_consecutive=101,
    max_episodes=500,
    mode="greedy",
    reward_clip=True,
):
    # Devuelve True si se logra `need_consecutive` episodios consecutivos con return > threshold.
    eval_env, resolved = make_env(env_id, seed=SEED+2026, reward_clip=reward_clip)
    model.eval()

    consec = 0
    returns = []

    for ep in range(1, max_episodes + 1):
        reset_out = eval_env.reset()
        obs, _, _ = _unpack_reset(reset_out)
        done = False
        ep_r = 0.0

        while not done:
            s = obs_to_tensor(obs).to(device)
            logits, _ = model(s,t)

            if mode == "greedy":
                a = int(torch.argmax(logits, dim=-1).item())
            elif mode == "sample":
                dist = torch.distributions.Categorical(logits=logits)
                a = int(dist.sample().item())
            else:
                raise ValueError("mode inválido (usa greedy o sample)")

            step_out = eval_env.step(a)
            obs, r, done, info2, _, _ = _unpack_step(step_out)
            ep_r += float(r)

            if done:
                ep_r = info2.get("episode", {}).get("r", ep_r)

        returns.append(float(ep_r))

        if ep_r > threshold:
            consec += 1
        else:
            consec = 0

        if ep % 10 == 0 or consec in (1, need_consecutive):
            print(f"[eval-consec] ep {ep}/{max_episodes} | return={ep_r:.2f} | consec>{threshold}={consec}")

        if consec >= need_consecutive:
            print(f"\n[OK] Logrado: {consec} episodios consecutivos con return>{threshold}. (env={resolved})")
            return True, returns

    best = max_run_above_threshold(returns, threshold)
    print(f"\n[NO] No se logró {need_consecutive} consecutivos en {max_episodes} episodios. Mejor racha={best}")
    return False, returns


## 8) Ejecutar entrenamiento y evaluación

In [17]:
ENV_ID = "SpaceInvaders-v0"

model, env_id_used = train_pg_ac_spaceinvaders(
    env_id="SpaceInvaders-v0",  
    total_updates=100000,
    n_steps=256,
    gamma=0.99,
    lr=1e-4,
    batch_size=128,
    ent_coef_start=0.05,
    ent_coef_end=0.01,
    ent_decay_updates=50000,
    temp_start=2.0,
    temp_end=1.0,
    temp_decay_updates=30000,
    gae_lambda=0.95,
    eps_start=0.15,
    eps_end=0.05,
    eps_decay_updates=60000,
    vf_coef=0.5,
    max_grad_norm=0.5,
    reward_clip=True,
    model_path="pg_ac_spaceinvaders_v6.pt",
    log_every=50,  # Log más frecuente para monitorear
    early_stop=True,
    success_threshold=20.0,
    success_consecutive=100,
)

# Evaluación (restricción típica del proyecto)
mean_r, returns, action_hist = evaluate_agent(
    model,
    env_id=ENV_ID,
    n_eval_episodes=100,
    threshold=20.0,
    print_every=20,
    mode="greedy",
    reward_clip=True,
)


# Evaluación estricta: >20 (clipped) durante >100 episodios consecutivos
ok_consec, eval_returns = evaluate_consecutive_success(
    model,
    env_id=ENV_ID,
    threshold=20.0,
    need_consecutive=100,
    max_episodes=500,
    mode="greedy",
    reward_clip=True,
)
print("Criterio estricto cumplido:", ok_consec)


[upd     50/100000] pol_loss=-0.0011 val_loss=0.7764 last=18.0 avg20= 9.65 avg100= 9.10 consec>20.0=  0 ent=0.0500 temp=2.00 eps=0.150 lr=0.000100 top_act=4(21%) [(4, 55), (3, 50), (5, 45)]
[upd    100/100000] pol_loss=-0.0002 val_loss=0.9824 last= 3.0 avg20= 8.35 avg100= 9.04 consec>20.0=  0 ent=0.0499 temp=2.00 eps=0.150 lr=0.000100 top_act=1(20%) [(1, 52), (4, 44), (3, 44)]
[upd    150/100000] pol_loss=-0.0250 val_loss=0.7916 last=11.0 avg20= 9.55 avg100= 9.53 consec>20.0=  0 ent=0.0499 temp=2.00 eps=0.150 lr=0.000100 top_act=4(21%) [(4, 55), (5, 47), (1, 44)]
[upd    200/100000] pol_loss=-0.0357 val_loss=0.5682 last= 8.0 avg20= 7.90 avg100= 9.09 consec>20.0=  0 ent=0.0498 temp=1.99 eps=0.150 lr=0.000100 top_act=3(19%) [(3, 48), (1, 48), (4, 44)]
[upd    250/100000] pol_loss=-0.0075 val_loss=0.5475 last= 8.0 avg20= 9.50 avg100= 9.32 consec>20.0=  0 ent=0.0498 temp=1.99 eps=0.150 lr=0.000100 top_act=4(25%) [(4, 64), (2, 48), (1, 43)]
[upd    300/100000] pol_loss=+0.0205 val_loss=0.62

NameError: name 's_t' is not defined

## 9) (Opcional) Diagnóstico rápido: evaluación en modo `sample`

In [19]:
# Evaluación (restricción típica del proyecto)
mean_r, returns, action_hist = evaluate_agent(
    model,
    env_id=ENV_ID,
    n_eval_episodes=100,
    threshold=20.0,
    print_every=20,
    mode="greedy",
    reward_clip=True,
)


# Evaluación estricta: >20 (clipped) durante >100 episodios consecutivos
ok_consec, eval_returns = evaluate_consecutive_success(
    model,
    env_id=ENV_ID,
    threshold=20.0,
    need_consecutive=100,
    max_episodes=500,
    mode="greedy",
    reward_clip=True,
)
print("Criterio estricto cumplido:", ok_consec)


[eval] ep 20/100 | return=18.00 | mean=18.00
[eval] ep 40/100 | return=18.00 | mean=18.00
[eval] ep 60/100 | return=18.00 | mean=18.00
[eval] ep 80/100 | return=18.00 | mean=18.00
[eval] ep 100/100 | return=18.00 | mean=18.00

=== Resultado evaluación ===
Resolved env id: SpaceInvaders-v0
Media return (n=100): 18.00 | Umbral: 20.00 | OK: False
Distribución de acciones (conteo): {4: 17471}


NameError: name 's_t' is not defined

In [None]:
_ = evaluate_agent(
    model,
    env_id=ENV_ID,
    n_eval_episodes=500,
    threshold=0.0,
    print_every=1,
    mode="sample",
    reward_clip=True,
)


In [None]:
# -----------------------------------------------------------------------------
# Función para continuar entrenamiento desde checkpoint
# -----------------------------------------------------------------------------

def continue_training_from_checkpoint(checkpoint_path, additional_updates=50000):
    """
    Continúa entrenamiento desde un checkpoint guardado
    """
    import torch
    
    # Cargar checkpoint
    checkpoint = torch.load(checkpoint_path, map_location=device)
    
    # Recrear modelo y optimizador
    env, _ = make_env("SpaceInvadersNoFrameskip-v4", reward_clip=True)
    n_actions = env.action_space.n
    model = ActorCritic(n_actions=n_actions).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1.5e-4, eps=1e-5)
    
    # Restaurar estados
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_update = checkpoint['update']
    returns_hist = checkpoint.get('returns_hist', [])
    
    print(f"Continuando desde update {start_update}")
    print(f"avg100 anterior: {checkpoint.get('avg100', 0):.2f}")
    
    # Continuar entrenamiento (necesitarás adaptar la función train)
    # ...
    
    return model