In [5]:
!python -m pip install -U pip setuptools wheel




In [57]:
!python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128


Looking in indexes: https://download.pytorch.org/whl/cu128


In [58]:
!python -m pip install -U "gymnasium[atari]"




In [1]:
import torch

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))
    print("torch cuda version:", torch.version.cuda)

torch: 2.9.1+cu128
cuda available: True
gpu: NVIDIA GeForce RTX 3060
torch cuda version: 12.8


In [2]:
import gymnasium as gym
import ale_py  # registers ALE envs

env = gym.make("ALE/Pong-v5")  # no rendering needed for a smoke test
obs, info = env.reset()
print("obs shape:", obs.shape)

for _ in range(200):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    if terminated or truncated:
        obs, info = env.reset()

env.close()
print("ok")

obs shape: (210, 160, 3)
ok


In [3]:
# Install opencv-python if not already installed (required for PreprocessAtari wrapper)
try:
    import cv2
    print("‚úÖ opencv-python is already installed")
except ImportError:
    print("Installing opencv-python...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "opencv-python"])
    print("‚úÖ opencv-python installed successfully!")
    import cv2

‚úÖ opencv-python is already installed


In [4]:
# Ensure torch is imported (if not already imported in earlier cells)
import torch
import torch.nn as nn
import torch.nn.functional as F


In [5]:
# FIXED: PreprocessAtari wrapper using ObservationWrapper for proper observation transformation
import gymnasium as gym
import numpy as np
from gymnasium import ObservationWrapper, Wrapper
import cv2
from collections import deque

class PreprocessAtari(ObservationWrapper):
    """
    Preprocesses Atari frames: resize to 84x84, convert to grayscale.
    Uses ObservationWrapper to ensure observations are automatically transformed.
    """
    def __init__(self, env):
        super().__init__(env)
        # Update observation space after transformation
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84), dtype=np.uint8
        )
    
    def observation(self, obs):
        """
        Transform observation: RGB -> Grayscale -> Resize to 84x84
        This method is automatically called by ObservationWrapper for all observations.
        """
        # Ensure we have the right input shape
        if len(obs.shape) == 3 and obs.shape[2] == 3:
            # Convert RGB to grayscale
            obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        elif len(obs.shape) == 3 and obs.shape[2] == 1:
            # Already grayscale, just remove channel dimension
            obs = obs.squeeze(2)
        elif len(obs.shape) == 2:
            # Already 2D grayscale
            pass
        else:
            raise ValueError(f"Unexpected observation shape: {obs.shape}")
        
        # Resize to 84x84 (handles both grayscale and color inputs)
        if obs.shape != (84, 84):
            obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
        
        # Ensure output is uint8 and 2D
        obs = obs.astype(np.uint8)
        if len(obs.shape) == 3:
            obs = obs.squeeze()
        
        return obs

class FrameStack(Wrapper):
    """
    Stacks the last n frames together.
    """
    def __init__(self, env, n_frames=4):
        super().__init__(env)
        self.n_frames = n_frames
        self.frames = deque(maxlen=n_frames)
        
        # Update observation space
        obs_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, 
            shape=(n_frames, obs_shape[0], obs_shape[1]), 
            dtype=np.uint8
        )
    
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        # Ensure obs is 2D (height, width)
        if len(obs.shape) > 2:
            obs = obs.squeeze()
        # Fill the frame stack with the first frame
        for _ in range(self.n_frames):
            self.frames.append(obs.copy())
        return self._get_obs(), info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        # Ensure obs is 2D (height, width)
        if len(obs.shape) > 2:
            obs = obs.squeeze()
        self.frames.append(obs.copy())
        return self._get_obs(), reward, terminated, truncated, info
    
    def _get_obs(self):
        """
        Stack frames: (n_frames, height, width)
        Returns numpy array of shape (n_frames, H, W)
        """
        # Ensure all frames are 2D
        frames_2d = []
        for frame in self.frames:
            if len(frame.shape) > 2:
                frame = frame.squeeze()
            frames_2d.append(frame)
        
        # Stack along first dimension: (n_frames, height, width)
        stacked = np.stack(frames_2d, axis=0)
        
        # Ensure dtype is uint8
        stacked = stacked.astype(np.uint8)
        
        return stacked

class FireReset(gym.Wrapper):
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        obs, _, terminated, truncated, info = self.env.step(1)  # FIRE
        if terminated or truncated:
            obs, info = self.env.reset(**kwargs)
        return obs, info

class ReducedActionSet(gym.ActionWrapper):
    def __init__(self, env, allowed_actions):
        super().__init__(env)
        self.allowed_actions = allowed_actions
        self.action_space = gym.spaces.Discrete(len(allowed_actions))

    def action(self, a):
        return self.allowed_actions[a]



print("‚úÖ Preprocessing wrappers fixed using ObservationWrapper!")
print("   - PreprocessAtari now uses ObservationWrapper for automatic transformation")
print("   - FrameStack handles shape validation and dtype conversion")


‚úÖ Preprocessing wrappers fixed using ObservationWrapper!
   - PreprocessAtari now uses ObservationWrapper for automatic transformation
   - FrameStack handles shape validation and dtype conversion


In [6]:
import random
import numpy as np
import torch
from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    def __init__(self, capacity: int):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Store raw uint8 frames on CPU (compact, fast)
        self.buffer.append(Experience(
            state.astype(np.uint8),
            int(action),
            float(reward),
            next_state.astype(np.uint8),
            bool(done),
        ))

    def sample(self, batch_size: int, device: str):
        batch = random.sample(self.buffer, batch_size)

        states = np.stack([e.state for e in batch])          # (B, 4, 84, 84) uint8
        next_states = np.stack([e.next_state for e in batch])# (B, 4, 84, 84) uint8

        # Convert to float + normalize ONCE here
        states = torch.as_tensor(states, device=device, dtype=torch.float32) / 255.0
        next_states = torch.as_tensor(next_states, device=device, dtype=torch.float32) / 255.0

        actions = torch.as_tensor([e.action for e in batch], device=device, dtype=torch.long)
        rewards = torch.as_tensor([e.reward for e in batch], device=device, dtype=torch.float32)
        dones = torch.as_tensor([e.done for e in batch], device=device, dtype=torch.float32)

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)


In [8]:
# üîç DIAGNOSTIC: Check why agent isn't learning at 1.3M steps
import torch
import numpy as np
import os

print("=== DIAGNOSTIC CHECK ===")

# Check 1: Verify wrapper is working
print("\n1. Checking observation preprocessing...")
try:
    import gymnasium as gym
    import ale_py
    test_env = gym.make("ALE/Pong-v5")
    test_env = PreprocessAtari(test_env)
    test_env = FrameStack(test_env, n_frames=4)
    test_obs, _ = test_env.reset()
    print(f"   ‚úÖ Observation shape: {test_obs.shape} (should be (4, 84, 84))")
    print(f"   ‚úÖ Observation dtype: {test_obs.dtype} (should be uint8)")
    if test_obs.shape != (4, 84, 84):
        print(f"   ‚ùå ERROR: Shape mismatch! This is why it's not learning!")
        raise ValueError("Wrapper not working!")
except Exception as e:
    print(f"   ‚ùå ERROR in wrapper: {e}")

# Check 2: Check current model metrics
print("\n2. Checking saved model...")
model_path = "models/dqn_pong_improved.pth"
if os.path.exists(model_path):
    checkpoint = torch.load(model_path, map_location='cpu')
    print(f"   ‚úÖ Model exists, trained for {checkpoint.get('steps', 'unknown')} steps")
    
    # Check Q-values to see if they're reasonable
    print("\n3. Checking Q-network outputs...")
    try:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        state_shape = (4, 84, 84)
        n_actions = 6
        n_frames = state_shape[0]
        
        # Create a dummy model to check structure
        # Model should be defined in a previous cell
        if 'Model' not in globals():
            print(f"   ‚ùå ERROR: Model class not found! Make sure you've run the Model cell.")
            raise NameError("Model class not found in global namespace")
        test_model = Model(n_actions, n_frames).to(device)
        test_model.load_state_dict(checkpoint['q_network'])
        test_model.eval()
        
        # Test with dummy state
        dummy_state = torch.zeros(1, 4, 84, 84).to(device) / 255.0
        with torch.no_grad():
            q_values = test_model(dummy_state)
        
        print(f"   ‚úÖ Q-values shape: {q_values.shape}")
        print(f"   ‚úÖ Q-values range: [{q_values.min().item():.2f}, {q_values.max().item():.2f}]")
        print(f"   ‚úÖ Q-values mean: {q_values.mean().item():.2f}")
        
        if q_values.abs().max() < 0.1:
            print(f"   ‚ö†Ô∏è  WARNING: Q-values are very small! Network might not be learning.")
        if q_values.max() - q_values.min() < 0.01:
            print(f"   ‚ö†Ô∏è  WARNING: Q-values are almost identical! No action differentiation.")
        
    except Exception as e:
        print(f"   ‚ùå Error checking model: {e}")
else:
    print(f"   ‚ö†Ô∏è  Model file not found at {model_path}")

# Check 4: Common issues
print("\n4. Common issues checklist:")
print("   ‚ñ° Wrapper preprocessing observations correctly")
print("   ‚ñ° Learning rate not too small (should be ~1e-4)")
print("   ‚ñ° Batch size reasonable (32 is good)")
print("   ‚ñ° Replay buffer filling up (needs >10k samples)")
print("   ‚ñ° Training actually happening (check loss values)")
print("   ‚ñ° Epsilon decay schedule appropriate")

print("\n=== RECOMMENDATIONS ===")
print("If rewards are stuck at -21 (random play):")
print("1. Verify the wrapper cell has been re-run after fixes")
print("2. Check if loss is actually changing during training")
print("3. Try increasing learning rate to 2e-4 or 5e-4")
print("4. Verify Double DQN is working (check train_step method)")
print("5. Consider resetting and starting fresh with improved agent")

=== DIAGNOSTIC CHECK ===

1. Checking observation preprocessing...
   ‚úÖ Observation shape: (4, 84, 84) (should be (4, 84, 84))
   ‚úÖ Observation dtype: uint8 (should be uint8)

2. Checking saved model...
   ‚úÖ Model exists, trained for 1200001 steps

3. Checking Q-network outputs...
   ‚ùå ERROR: Model class not found! Make sure you've run the Model cell.
   ‚ùå Error checking model: Model class not found in global namespace

4. Common issues checklist:
   ‚ñ° Wrapper preprocessing observations correctly
   ‚ñ° Learning rate not too small (should be ~1e-4)
   ‚ñ° Batch size reasonable (32 is good)
   ‚ñ° Replay buffer filling up (needs >10k samples)
   ‚ñ° Training actually happening (check loss values)
   ‚ñ° Epsilon decay schedule appropriate

=== RECOMMENDATIONS ===
If rewards are stuck at -21 (random play):
1. Verify the wrapper cell has been re-run after fixes
2. Check if loss is actually changing during training
3. Try increasing learning rate to 2e-4 or 5e-4
4. Verify Double

In [7]:
# üõ†Ô∏è FIXED VERSION: Enhanced training with better diagnostics
# Run this if your agent isn't learning after 1M+ steps

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque, namedtuple

# Enhanced ImprovedDQNAgent with better diagnostics
class EnhancedDQNAgent:
    """
    Enhanced DQN Agent with:
    - Better learning rate schedule
    - Gradient norm monitoring
    - Q-value diagnostics
    - More robust training
    """
    def __init__(
        self,
        state_shape,
        n_actions,
        device='cuda',
        lr=1e-4,  # Slightly higher learning rate
        gamma=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=300000,  
        target_update_freq=10000,
        buffer_size=100000,
        batch_size=32,
        optimistic_init=1.0  # More conservative initialization
    ):
        self.device = device
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.target_update_freq = target_update_freq
        self.batch_size = batch_size
        self.steps = 0
        self.optimistic_init = optimistic_init
        
        # Create Q-network and target network
        n_frames = state_shape[0]
        self.q_network = Model(n_actions, n_frames).to(device)
        self.target_network = Model(n_actions, n_frames).to(device)
        
        self.target_network.load_state_dict(self.q_network.state_dict())
        
        self.target_network.eval()
        
        # Optimizer with better learning rate
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr, eps=1e-8)
        
        # Replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size)
        
        # Diagnostics
        self.loss_history = []
        self.q_value_history = []
    
    def get_epsilon(self):
        """Calculate current epsilon with linear decay."""
        if self.steps < self.epsilon_decay:
            return self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                   (1 - self.steps / self.epsilon_decay)
        else:
            return self.epsilon_end
    
    def select_action(self, state, training=True):
        """Select an action using epsilon-greedy policy."""
        if training and random.random() < self.get_epsilon():
            return random.randrange(self.n_actions)
        
        # Ensure state is correct shape
        if isinstance(state, np.ndarray):
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) / 255.0
        else:
            state_tensor = state.unsqueeze(0) if len(state.shape) == 3 else state
            state_tensor = state_tensor.to(self.device) / 255.0
        
        with torch.no_grad():
            q_values_t = self.q_network(state_tensor)        # shape (1, n_actions)
            q_values = q_values_t[0].detach().cpu().numpy()  # shape (n_actions,)

            max_q = q_values.max()
            best = np.flatnonzero(q_values == max_q)         # all max actions
            action = int(np.random.choice(best))             # break ties randomly

            
            # Store Q-values for diagnostics
            if self.steps % 1000 == 0:
                self.q_value_history.append(q_values.mean().item())
        
        return action
    
    def store_transition(self, state, action, reward, next_state, done):
        # reward clipping is fine
        reward = np.clip(reward, -1.0, 1.0)

        # IMPORTANT: store raw uint8 arrays (no torch, no /255 here)
        self.replay_buffer.push(state, action, reward, next_state, done)
        self.steps += 1
    
    def train_step(self):
        """Perform one training step with Double DQN."""
        if len(self.replay_buffer) < self.batch_size:
            return None
        
        # Sample batch
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
        self.batch_size, self.device
        )
        # states/next_states are already float32 on device in [0,1]
        
        # Compute Q(s, a)
        q_values = self.q_network(states)
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        
        # Double DQN
        with torch.no_grad():
            next_q_values_main = self.q_network(next_states)
            next_actions = next_q_values_main.argmax(1)
            next_q_values_target = self.target_network(next_states)
            next_q_value = next_q_values_target.gather(1, next_actions.unsqueeze(1)).squeeze(1)
            target_q_value = rewards + (1 - dones) * self.gamma * next_q_value
        
        # Compute loss
        loss = F.smooth_l1_loss(q_value, target_q_value)

        
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping
        grad_norm = torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 10)
        self.optimizer.step()
        
        # Update target network
        if self.steps % self.target_update_freq == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())
        
        # Store diagnostics
        loss_item = loss.item()
        self.loss_history.append(loss_item)
        
        # Print diagnostics every 10k steps
        if self.steps % 10000 == 0 and len(self.loss_history) > 0:
            avg_loss = np.mean(self.loss_history[-100:])
            print(f"   [Step {self.steps}] Loss: {avg_loss:.4f}, Epsilon: {self.get_epsilon():.3f}, "
                  f"Q-mean: {np.mean(self.q_value_history[-10:]) if len(self.q_value_history) > 0 else 'N/A':.2f}")
        
        return loss_item
    
    def save(self, filepath):
        """Save the model."""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'steps': self.steps,
            'loss_history': self.loss_history[-1000:],  # Save recent loss history
        }, filepath)
    
    def load(self, filepath):
        """Load the model."""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.steps = checkpoint['steps']
        self.loss_history = checkpoint.get('loss_history', [])

print("‚úÖ Enhanced DQN Agent ready with better diagnostics!")

‚úÖ Enhanced DQN Agent ready with better diagnostics!


In [10]:
# üöÄ TRAINING with Enhanced DQN Agent
import gymnasium as gym
import ale_py
import os
from tqdm import tqdm
import numpy as np
from gymnasium.wrappers import AtariPreprocessing, FrameStackObservation
from collections import Counter
import torch

def evaluate_greedy(agent, env, n_episodes=10):
    rewards = []
    for _ in range(n_episodes):
        s, _ = env.reset()
        done = False
        ep_r = 0.0
        while not done:
            a = agent.select_action(s, training=False)
            s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            ep_r += r
        rewards.append(ep_r)
    return float(np.mean(rewards)), float(np.std(rewards)), rewards

def greedy_eval_diagnostics(agent, env, n_episodes=5):
    action_counts = Counter()
    scores = []
    q_means = []
    q_stds = []

    for _ in range(n_episodes):
        s, _ = env.reset()
        done = False
        ep_r = 0.0

        while not done:
            a = agent.select_action(s, training=False)
            action_counts[a] += 1

            with torch.no_grad():
                st = torch.as_tensor(s, dtype=torch.float32, device=agent.device).unsqueeze(0) / 255.0
                q = agent.q_network(st)[0]
                q_means.append(float(q.mean().item()))
                q_stds.append(float(q.std().item()))

            s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            ep_r += r

        scores.append(ep_r)

    return scores, action_counts, float(np.mean(q_means)), float(np.mean(q_stds))

gym.register_envs(ale_py)

def make_env():
    env = gym.make(
        "ALE/Pong-v5",
        frameskip=1,
        repeat_action_probability=0.0,
        full_action_space=False
    )

    env = AtariPreprocessing(
        env,
        noop_max=30,
        frame_skip=4,
        terminal_on_life_loss=False,
        screen_size=84,
        grayscale_obs=True,
        grayscale_newaxis=False,
        scale_obs=False
    )

    env = FireReset(env)
    env = FrameStackObservation(env, stack_size=4)

    # Reduced actions: NOOP, RIGHT, LEFT
    env = ReducedActionSet(env, allowed_actions=[0, 2, 3])

    return env


# Create environment with preprocessing
env = gym.make(
    "ALE/Pong-v5",
    frameskip=1,
    repeat_action_probability=0.0,
    full_action_space=False
)

env = AtariPreprocessing(
    env,
    noop_max=30,
    frame_skip=4,
    terminal_on_life_loss=False,
    screen_size=84,
    grayscale_obs=True,
    grayscale_newaxis=False,
    scale_obs=False
)

env = make_env()
eval_env = make_env()

print("Action meanings:", env.unwrapped.get_action_meanings())

# Get environment info
state_shape = env.observation_space.shape
n_actions = env.action_space.n

print(f"State shape: {state_shape}")
print(f"Number of actions: {n_actions}")

# Create ENHANCED agent with better hyperparameters
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

agent = EnhancedDQNAgent(
    state_shape=state_shape,
    n_actions=n_actions,
    device=device,
    lr=1e-4,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=500_000,
    target_update_freq=10_000,
    buffer_size=1_000_000,
    batch_size=32,
    optimistic_init=1.0
)

# Training parameters
total_steps = 6_000_000  # Train for 6M steps
learning_starts = 50_000  # Start training after 50k steps
train_freq = 4  # Train every 4 steps
save_freq = 200_000  # Save every 200k steps
eval_freq = 50_000  # Print stats every 50k steps

# Statistics
episode_rewards = []
episode_lengths = []
current_episode_reward = 0
current_episode_length = 0

# Start training from scratch (always)
model_path = "models/dqn_pong_enhanced.pth"
os.makedirs("models", exist_ok=True)

# Optionally delete previous model if you want to start completely fresh
# Uncomment the next line if you want to delete any existing model:
# if os.path.exists(model_path):
#     os.remove(model_path)
#     print(f"üóëÔ∏è  Deleted previous model at {model_path}")

# Verify state shape is correct
state, info = env.reset()
print(f"\n‚úÖ Initial state shape: {state.shape}, dtype: {state.dtype}")
if state.shape != state_shape:
    raise ValueError(f"State shape mismatch! Expected {state_shape}, got {state.shape}")

# Training loop
pbar = tqdm(range(total_steps), desc="Training")

for step in range(total_steps):
    # Select action
    action = agent.select_action(state, training=True)
    
    # Take step
    next_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    
    # Store transition
    agent.store_transition(state, action, reward, next_state, done)
    
    # Update statistics
    current_episode_reward += reward
    current_episode_length += 1
    
    # Train
    if step >= learning_starts and step % train_freq == 0:
        loss = agent.train_step()
        if loss is not None:
            current_lr = agent.optimizer.param_groups[0]['lr']
            pbar.set_postfix({
                'epsilon': f'{agent.get_epsilon():.3f}',
                'loss': f'{loss:.4f}',
                'lr': f'{current_lr:.2e}',
                'avg_reward': f'{np.mean(episode_rewards[-10:]):.1f}' if len(episode_rewards) >= 10 else 'N/A'
            })
    
    # Handle episode end
    if done:
        episode_rewards.append(current_episode_reward)
        episode_lengths.append(current_episode_length)
        current_episode_reward = 0
        current_episode_length = 0
        state, info = env.reset()
    else:
        state = next_state
    
    # Save model periodically
    if step > 0 and step % save_freq == 0:
        agent.save(model_path)
        print(f"\nüíæ Model saved at step {step:,}")
        if len(episode_rewards) >= 10:
            recent_avg = np.mean(episode_rewards[-10:])
            print(f"   Recent avg reward: {recent_avg:.2f}")
    
    # Evaluate and show progress
    if step > 0 and step % eval_freq == 0 and len(episode_rewards) >= 10:
        avg_reward = np.mean(episode_rewards[-10:])
        avg_length = np.mean(episode_lengths[-10:])
        print(f"\nüìä Step {step:,}:")
        print(f"   Avg reward (last 10): {avg_reward:.2f}")
        print(f"   Avg episode length: {avg_length:.1f}")
        print(f"   Epsilon: {agent.get_epsilon():.3f}")

        mean_r, std_r, all_r = evaluate_greedy(agent, eval_env, n_episodes=10)
        print(f"\nüß™ Greedy eval @ step {step:,}: mean={mean_r:.2f} ¬± {std_r:.2f}  (scores={all_r})")

        scores, counts, qmean, qstd = greedy_eval_diagnostics(agent, eval_env, n_episodes=5)
        print("Greedy action counts:", counts)
        print(f"Avg Q mean/std: {qmean:.3f} / {qstd:.3f}")
        
        # Show learning progress
        if avg_reward > 10:
            print("   üéâüéâüéâ EXCELLENT! Agent is winning consistently!")
        elif avg_reward > 0:
            print("   üéâüéâ BREAKTHROUGH! Agent is winning!")
        elif avg_reward > -10:
            print("   üéØ Great progress! Agent is learning!")
        elif avg_reward > -15:
            print("   üìà Starting to improve!")
        elif avg_reward > -19:
            print("   üìä Better than random, keep going!")
        else:
            print("   ‚è≥ Still exploring...")
    
    pbar.update(1)

# Final save
agent.save(model_path)
env.close()
pbar.close()

print(f"\n‚úÖ Training complete! Model saved to {model_path}")
print(f"Total episodes: {len(episode_rewards)}")
if len(episode_rewards) > 0:
    print(f"Final average reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
    print(f"Best average reward (last 10): {np.mean(episode_rewards[-10:]):.2f}")
    print(f"Best single episode: {max(episode_rewards):.2f}")
    
    # Final assessment
    final_avg = np.mean(episode_rewards[-10:])
    if final_avg > 10:
        print("üåüüåüüåü EXCELLENT! Agent mastered the game!")
    elif final_avg > 0:
        print("üéØ SUCCESS! Agent is winning more than losing!")
    elif final_avg > -10:
        print("üìà Good progress! Agent is learning!")
    elif final_avg > -15:
        print("üìä Some improvement, but needs more training")
    else:
        print("‚è≥ Still needs work - may need hyperparameter tuning")

Action meanings: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
State shape: (4, 84, 84)
Number of actions: 3
Using device: cuda

‚úÖ Initial state shape: (4, 84, 84), dtype: uint8


Training:   1%|          | 50000/6000000 [00:20<41:16, 2403.04it/s, epsilon=0.901, loss=0.0002, lr=1.00e-04, avg_reward=-20.2]


üìä Step 50,000:
   Avg reward (last 10): -20.20
   Avg episode length: 1049.0
   Epsilon: 0.901

üß™ Greedy eval @ step 50,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   1%|          | 50124/6000000 [00:30<22:45:15, 72.63it/s, epsilon=0.901, loss=0.0297, lr=1.00e-04, avg_reward=-20.2]

Greedy action counts: Counter({2: 3793})
Avg Q mean/std: 0.009 / 0.007
   ‚è≥ Still exploring...


Training:   2%|‚ñè         | 100000/6000000 [01:57<3:17:59, 496.64it/s, epsilon=0.802, loss=0.0161, lr=1.00e-04, avg_reward=-20.8]


üìä Step 100,000:
   Avg reward (last 10): -20.80
   Avg episode length: 911.5
   Epsilon: 0.802

üß™ Greedy eval @ step 100,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   2%|‚ñè         | 100096/6000000 [02:10<85:07:27, 19.25it/s, epsilon=0.802, loss=0.0001, lr=1.00e-04, avg_reward=-20.8] 

Greedy action counts: Counter({1: 3799})
Avg Q mean/std: -0.031 / 0.003
   ‚è≥ Still exploring...


Training:   2%|‚ñé         | 150000/6000000 [04:01<4:12:10, 386.64it/s, epsilon=0.703, loss=0.0003, lr=1.00e-04, avg_reward=-20.6] 


üìä Step 150,000:
   Avg reward (last 10): -20.60
   Avg episode length: 894.4
   Epsilon: 0.703

üß™ Greedy eval @ step 150,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   3%|‚ñé         | 150072/6000000 [04:21<191:56:24,  8.47it/s, epsilon=0.703, loss=0.0003, lr=1.00e-04, avg_reward=-20.5]

Greedy action counts: Counter({2: 3797})
Avg Q mean/std: -0.011 / 0.002
   ‚è≥ Still exploring...


Training:   3%|‚ñé         | 200000/6000000 [06:52<5:28:30, 294.26it/s, epsilon=0.604, loss=0.0151, lr=1.00e-04, avg_reward=-20.8] 


üíæ Model saved at step 200,000
   Recent avg reward: -20.80

üìä Step 200,000:
   Avg reward (last 10): -20.80
   Avg episode length: 897.9
   Epsilon: 0.604

üß™ Greedy eval @ step 200,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   3%|‚ñé         | 200056/6000000 [07:17<290:04:46,  5.55it/s, epsilon=0.604, loss=0.0302, lr=1.00e-04, avg_reward=-20.8]

Greedy action counts: Counter({2: 3802})
Avg Q mean/std: -0.008 / 0.003
   ‚è≥ Still exploring...


Training:   4%|‚ñç         | 250000/6000000 [10:27<6:37:10, 241.29it/s, epsilon=0.505, loss=0.0302, lr=1.00e-04, avg_reward=-20.8] 


üìä Step 250,000:
   Avg reward (last 10): -20.80
   Avg episode length: 851.6
   Epsilon: 0.505


Training:   4%|‚ñç         | 250000/6000000 [10:40<6:37:10, 241.29it/s, epsilon=0.505, loss=0.0302, lr=1.00e-04, avg_reward=-20.8]


üß™ Greedy eval @ step 250,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   4%|‚ñç         | 250048/6000000 [10:55<435:06:43,  3.67it/s, epsilon=0.505, loss=0.0300, lr=1.00e-04, avg_reward=-20.8]

Greedy action counts: Counter({0: 3801})
Avg Q mean/std: -0.022 / 0.001
   ‚è≥ Still exploring...


Training:   5%|‚ñå         | 300000/6000000 [14:39<7:39:20, 206.81it/s, epsilon=0.406, loss=0.0151, lr=1.00e-04, avg_reward=-20.7] 


üìä Step 300,000:
   Avg reward (last 10): -20.70
   Avg episode length: 975.7
   Epsilon: 0.406


Training:   5%|‚ñå         | 300000/6000000 [14:50<7:39:20, 206.81it/s, epsilon=0.406, loss=0.0151, lr=1.00e-04, avg_reward=-20.7]


üß™ Greedy eval @ step 300,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   5%|‚ñå         | 300040/6000000 [15:15<622:34:14,  2.54it/s, epsilon=0.406, loss=0.0003, lr=1.00e-04, avg_reward=-20.7]

Greedy action counts: Counter({0: 3806})
Avg Q mean/std: -0.023 / 0.003
   ‚è≥ Still exploring...


Training:   6%|‚ñå         | 350000/6000000 [19:32<9:12:23, 170.47it/s, epsilon=0.307, loss=0.0004, lr=1.00e-04, avg_reward=-21.0] 


üìä Step 350,000:
   Avg reward (last 10): -21.00
   Avg episode length: 831.9
   Epsilon: 0.307


Training:   6%|‚ñå         | 350000/6000000 [19:51<9:12:23, 170.47it/s, epsilon=0.307, loss=0.0004, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 350,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   6%|‚ñå         | 350036/6000000 [20:12<789:30:09,  1.99it/s, epsilon=0.307, loss=0.0151, lr=1.00e-04, avg_reward=-21.0] 

Greedy action counts: Counter({2: 3799})
Avg Q mean/std: -0.013 / 0.003
   ‚è≥ Still exploring...


Training:   7%|‚ñã         | 400000/6000000 [25:04<9:46:32, 159.12it/s, epsilon=0.208, loss=0.0005, lr=1.00e-04, avg_reward=-21.0] 


üíæ Model saved at step 400,000
   Recent avg reward: -21.00

üìä Step 400,000:
   Avg reward (last 10): -21.00
   Avg episode length: 915.2
   Epsilon: 0.208


Training:   7%|‚ñã         | 400000/6000000 [25:21<9:46:32, 159.12it/s, epsilon=0.208, loss=0.0005, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 400,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   7%|‚ñã         | 400032/6000000 [25:48<1024:02:54,  1.52it/s, epsilon=0.208, loss=0.0151, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({2: 3798})
Avg Q mean/std: -0.017 / 0.001
   ‚è≥ Still exploring...


Training:   8%|‚ñä         | 450000/6000000 [31:15<10:43:07, 143.83it/s, epsilon=0.109, loss=0.0153, lr=1.00e-04, avg_reward=-21.0] 


üìä Step 450,000:
   Avg reward (last 10): -21.00
   Avg episode length: 838.7
   Epsilon: 0.109


Training:   8%|‚ñä         | 450000/6000000 [31:31<10:43:07, 143.83it/s, epsilon=0.109, loss=0.0153, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 450,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   8%|‚ñä         | 450028/6000000 [32:03<1190:46:29,  1.29it/s, epsilon=0.109, loss=0.0153, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({0: 3798})
Avg Q mean/std: -0.016 / 0.002
   ‚è≥ Still exploring...


Training:   8%|‚ñä         | 500000/6000000 [38:04<11:20:01, 134.80it/s, epsilon=0.010, loss=0.0004, lr=1.00e-04, avg_reward=-21.0] 


üìä Step 500,000:
   Avg reward (last 10): -21.00
   Avg episode length: 790.3
   Epsilon: 0.010


Training:   8%|‚ñä         | 500000/6000000 [38:21<11:20:01, 134.80it/s, epsilon=0.010, loss=0.0004, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 500,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   8%|‚ñä         | 500024/6000000 [38:57<1226:18:41,  1.25it/s, epsilon=0.010, loss=0.0004, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({2: 3799})
Avg Q mean/std: -0.014 / 0.002
   ‚è≥ Still exploring...


Training:   9%|‚ñâ         | 550000/6000000 [45:31<12:22:04, 122.40it/s, epsilon=0.010, loss=0.0003, lr=1.00e-04, avg_reward=-21.0] 


üìä Step 550,000:
   Avg reward (last 10): -21.00
   Avg episode length: 759.9
   Epsilon: 0.010


Training:   9%|‚ñâ         | 550000/6000000 [45:42<12:22:04, 122.40it/s, epsilon=0.010, loss=0.0003, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 550,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:   9%|‚ñâ         | 550020/6000000 [46:28<1451:13:28,  1.04it/s, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({0: 3788})
Avg Q mean/std: -0.022 / 0.002
   ‚è≥ Still exploring...


Training:  10%|‚ñà         | 600000/6000000 [53:38<13:09:37, 113.98it/s, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0] 


üíæ Model saved at step 600,000
   Recent avg reward: -21.00

üìä Step 600,000:
   Avg reward (last 10): -21.00
   Avg episode length: 813.8
   Epsilon: 0.010


Training:  10%|‚ñà         | 600000/6000000 [53:52<13:09:37, 113.98it/s, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 600,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  10%|‚ñà         | 600020/6000000 [54:36<1799:24:43,  1.20s/it, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({1: 3801})
Avg Q mean/std: -0.039 / 0.002
   ‚è≥ Still exploring...


Training:  11%|‚ñà         | 650000/6000000 [1:02:18<13:51:56, 107.18it/s, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]


üìä Step 650,000:
   Avg reward (last 10): -21.00
   Avg episode length: 783.5
   Epsilon: 0.010


Training:  11%|‚ñà         | 650000/6000000 [1:02:33<13:51:56, 107.18it/s, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 650,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  11%|‚ñà         | 650020/6000000 [1:03:24<2239:02:45,  1.51s/it, epsilon=0.010, loss=0.0299, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({2: 3804})
Avg Q mean/std: -0.013 / 0.001
   ‚è≥ Still exploring...


Training:  12%|‚ñà‚ñè        | 700000/6000000 [1:11:42<15:06:02, 97.49it/s, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]  


üìä Step 700,000:
   Avg reward (last 10): -21.00
   Avg episode length: 819.6
   Epsilon: 0.010


Training:  12%|‚ñà‚ñè        | 700000/6000000 [1:11:53<15:06:02, 97.49it/s, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 700,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  12%|‚ñà‚ñè        | 700016/6000000 [1:12:52<2335:47:17,  1.59s/it, epsilon=0.010, loss=0.0005, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({2: 3796})
Avg Q mean/std: -0.017 / 0.002
   ‚è≥ Still exploring...


Training:  12%|‚ñà‚ñé        | 750000/6000000 [1:21:43<17:19:45, 84.15it/s, epsilon=0.010, loss=0.0001, lr=1.00e-04, avg_reward=-21.0]  


üìä Step 750,000:
   Avg reward (last 10): -21.00
   Avg episode length: 789.8
   Epsilon: 0.010


Training:  12%|‚ñà‚ñé        | 750000/6000000 [1:21:54<17:19:45, 84.15it/s, epsilon=0.010, loss=0.0001, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 750,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  13%|‚ñà‚ñé        | 750016/6000000 [1:23:05<3059:11:59,  2.10s/it, epsilon=0.010, loss=0.0001, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({0: 3797})
Avg Q mean/std: -0.013 / 0.001
   ‚è≥ Still exploring...


Training:  13%|‚ñà‚ñé        | 800000/6000000 [1:32:33<17:10:51, 84.07it/s, epsilon=0.010, loss=0.0000, lr=1.00e-04, avg_reward=-21.0]  


üíæ Model saved at step 800,000
   Recent avg reward: -21.00

üìä Step 800,000:
   Avg reward (last 10): -21.00
   Avg episode length: 784.3
   Epsilon: 0.010


Training:  13%|‚ñà‚ñé        | 800000/6000000 [1:32:44<17:10:51, 84.07it/s, epsilon=0.010, loss=0.0000, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 800,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  13%|‚ñà‚ñé        | 800016/6000000 [1:33:49<2975:00:53,  2.06s/it, epsilon=0.010, loss=0.0155, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({1: 3798})
Avg Q mean/std: -0.024 / 0.000
   ‚è≥ Still exploring...


Training:  14%|‚ñà‚ñç        | 850000/6000000 [1:44:00<18:02:16, 79.31it/s, epsilon=0.010, loss=0.0449, lr=1.00e-04, avg_reward=-21.0]  


üìä Step 850,000:
   Avg reward (last 10): -21.00
   Avg episode length: 789.9
   Epsilon: 0.010


Training:  14%|‚ñà‚ñç        | 850000/6000000 [1:44:15<18:02:16, 79.31it/s, epsilon=0.010, loss=0.0449, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 850,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  14%|‚ñà‚ñç        | 850016/6000000 [1:45:26<3296:55:11,  2.30s/it, epsilon=0.010, loss=0.0156, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({2: 3796})
Avg Q mean/std: -0.009 / 0.002
   ‚è≥ Still exploring...


Training:  15%|‚ñà‚ñå        | 900000/6000000 [1:56:19<18:05:14, 78.32it/s, epsilon=0.010, loss=0.0297, lr=1.00e-04, avg_reward=-21.0]  


üìä Step 900,000:
   Avg reward (last 10): -21.00
   Avg episode length: 759.6
   Epsilon: 0.010


Training:  15%|‚ñà‚ñå        | 900000/6000000 [1:56:36<18:05:14, 78.32it/s, epsilon=0.010, loss=0.0297, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 900,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  15%|‚ñà‚ñå        | 900012/6000000 [1:57:57<4225:04:52,  2.98s/it, epsilon=0.010, loss=0.0298, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({0: 3796})
Avg Q mean/std: -0.033 / 0.002
   ‚è≥ Still exploring...


Training:  16%|‚ñà‚ñå        | 950000/6000000 [2:09:31<20:22:25, 68.85it/s, epsilon=0.010, loss=0.0003, lr=1.00e-04, avg_reward=-21.0]  


üìä Step 950,000:
   Avg reward (last 10): -21.00
   Avg episode length: 818.3
   Epsilon: 0.010


Training:  16%|‚ñà‚ñå        | 950000/6000000 [2:09:46<20:22:25, 68.85it/s, epsilon=0.010, loss=0.0003, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 950,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  16%|‚ñà‚ñå        | 950012/6000000 [2:11:08<3569:34:05,  2.54s/it, epsilon=0.010, loss=0.0152, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({0: 3800})
Avg Q mean/std: -0.024 / 0.001
   ‚è≥ Still exploring...


Training:  17%|‚ñà‚ñã        | 1000000/6000000 [2:24:07<19:47:12, 70.19it/s, epsilon=0.010, loss=0.0154, lr=1.00e-04, avg_reward=-21.0] 


üíæ Model saved at step 1,000,000
   Recent avg reward: -21.00

üìä Step 1,000,000:
   Avg reward (last 10): -21.00
   Avg episode length: 796.0
   Epsilon: 0.010


Training:  17%|‚ñà‚ñã        | 1000000/6000000 [2:24:17<19:47:12, 70.19it/s, epsilon=0.010, loss=0.0154, lr=1.00e-04, avg_reward=-21.0]


üß™ Greedy eval @ step 1,000,000: mean=-21.00 ¬± 0.00  (scores=[-21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0, -21.0])


Training:  17%|‚ñà‚ñã        | 1000012/6000000 [2:25:42<3480:11:46,  2.51s/it, epsilon=0.010, loss=0.0000, lr=1.00e-04, avg_reward=-21.0]

Greedy action counts: Counter({1: 3790})
Avg Q mean/std: -0.026 / 0.001
   ‚è≥ Still exploring...


Training:  18%|‚ñà‚ñä        | 1050000/6000000 [2:38:52<27:24:51, 50.16it/s, epsilon=0.010, loss=0.0301, lr=1.00e-04, avg_reward=-21.0]  


üìä Step 1,050,000:
   Avg reward (last 10): -21.00
   Avg episode length: 813.1
   Epsilon: 0.010


Training:  18%|‚ñà‚ñä        | 1050000/6000000 [2:39:08<27:24:51, 50.16it/s, epsilon=0.010, loss=0.0301, lr=1.00e-04, avg_reward=-21.0]

KeyboardInterrupt: 

In [9]:
# Quick verification - Run this before training to make sure everything is set up
try:
    # Check if all classes are defined
    assert 'Model' in globals(), "Model class not found - run the Model cell first!"
    assert 'ReplayBuffer' in globals(), "ReplayBuffer class not found - run the ReplayBuffer cell first!"
    assert 'PreprocessAtari' in globals(), "PreprocessAtari class not found - run the preprocessing cell first!"
    assert 'FrameStack' in globals(), "FrameStack class not found - run the preprocessing cell first!"
    assert 'EnhancedDQNAgent' in globals(), "DQNAgent class not found - run the DQNAgent cell first!"
    
    # Check imports
    import torch
    import gymnasium as gym
    import numpy as np
    import cv2
    from tqdm import tqdm
    
    print("‚úÖ All classes and imports are ready!")
    print("‚úÖ You can now run the training cell to start training!")
except AssertionError as e:
    print(f"‚ùå {e}")
except ImportError as e:
    print(f"‚ùå Missing import: {e}")
    print("Make sure you've run all the setup cells in order.")


‚úÖ All classes and imports are ready!
‚úÖ You can now run the training cell to start training!


In [None]:
# üîÑ RESUME TRAINING: Continue from saved checkpoint
# This cell loads an existing model and continues training from where it left off

import ale_py
import os
from tqdm import tqdm
import numpy as np
import gc  # For garbage collection to prevent memory issues
import torch

# Create environment with preprocessing
env = gym.make("ALE/Pong-v5")
env = PreprocessAtari(env)
env = FrameStack(env, n_frames=4)

# Get environment info
state_shape = env.observation_space.shape
n_actions = env.action_space.n

print(f"State shape: {state_shape}")
print(f"Number of actions: {n_actions}")

# Create agent
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

agent = EnhancedDQNAgent(
    state_shape=state_shape,
    n_actions=n_actions,
    device=device,
    lr=2e-4,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=1000000,
    target_update_freq=1000,
    buffer_size=100000,
    batch_size=32,
    optimistic_init=1.0
)

# Load the saved model checkpoint
model_path = "models/dqn_pong_enhanced_800k.pth"
if os.path.exists(model_path):
    agent.load(model_path)
    print(f"\n‚úÖ Loaded model from {model_path}")
    print(f"   Resuming from step {agent.steps:,}")
    if len(agent.loss_history) > 0:
        recent_loss = np.mean(agent.loss_history[-100:]) if len(agent.loss_history) >= 100 else np.mean(agent.loss_history)
        print(f"   Recent average loss: {recent_loss:.4f}")
else:
    print(f"\n‚ùå Model file not found at {model_path}")
    print("   Please check the file path or use the fresh training cell instead.")
    raise FileNotFoundError(f"Model file not found: {model_path}")

# Training parameters
total_steps = 6_000_000  # Continue training up to 6M steps total
learning_starts = 10_000  # Already passed this
train_freq = 4
save_freq = 200_000  # Save every 200k steps
eval_freq = 50_000  # Print stats every 50k steps

# Statistics - will continue tracking from here
episode_rewards = []
episode_lengths = []
current_episode_reward = 0
current_episode_length = 0

# Verify state shape
state, info = env.reset()
print(f"\n‚úÖ Initial state shape: {state.shape}, dtype: {state.dtype}")
if state.shape != state_shape:
    raise ValueError(f"State shape mismatch! Expected {state_shape}, got {state.shape}")

# Training loop - resume from agent.steps
print(f"\nüöÄ Resuming training from step {agent.steps:,} to {total_steps:,}")
print(f"   Total steps remaining: {total_steps - agent.steps:,}")

pbar = tqdm(range(agent.steps, total_steps), desc="Training (Resumed)", initial=agent.steps)

try:
    for step in range(agent.steps, total_steps):
        # Select action
        action = agent.select_action(state, training=True)
        
        # Take step
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        # Store transition
        agent.store_transition(state, action, reward, next_state, done)
        
        # Update statistics
        current_episode_reward += reward
        current_episode_length += 1
        
        # Train
        if step >= learning_starts and step % train_freq == 0:
            loss = agent.train_step()
            if loss is not None:
                current_lr = agent.optimizer.param_groups[0]['lr']
                pbar.set_postfix({
                    'epsilon': f'{agent.get_epsilon():.3f}',
                    'loss': f'{loss:.4f}',
                    'lr': f'{current_lr:.2e}',
                    'avg_reward': f'{np.mean(episode_rewards[-10:]):.1f}' if len(episode_rewards) >= 10 else 'N/A'
                })
        
        # Handle episode end
        if done:
            episode_rewards.append(current_episode_reward)
            episode_lengths.append(current_episode_length)
            current_episode_reward = 0
            current_episode_length = 0
            state, info = env.reset()
        else:
            state = next_state
        
        # Save model periodically
        if step > 0 and step % save_freq == 0:
            # Save with step number in filename to keep multiple checkpoints
            checkpoint_path = f"models/dqn_pong_enhanced_{step//1000}k.pth"
            agent.save(checkpoint_path)
            print(f"\nüíæ Model saved at step {step:,} to {checkpoint_path}")
            if len(episode_rewards) >= 10:
                recent_avg = np.mean(episode_rewards[-10:])
                print(f"   Recent avg reward: {recent_avg:.2f}")
            
            # Force garbage collection after saving to free memory
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        
        # Evaluate and show progress
        if step > 0 and step % eval_freq == 0 and len(episode_rewards) >= 10:
            avg_reward = np.mean(episode_rewards[-10:])
            avg_length = np.mean(episode_lengths[-10:])
            print(f"\nüìä Step {step:,}:")
            print(f"   Avg reward (last 10): {avg_reward:.2f}")
            print(f"   Avg episode length: {avg_length:.1f}")
            print(f"   Epsilon: {agent.get_epsilon():.3f}")
            
            # Show learning progress
            if avg_reward > 10:
                print("   üéâüéâüéâ EXCELLENT! Agent is winning consistently!")
            elif avg_reward > 0:
                print("   üéâüéâ BREAKTHROUGH! Agent is winning!")
            elif avg_reward > -10:
                print("   üéØ Great progress! Agent is learning!")
            elif avg_reward > -15:
                print("   üìà Starting to improve!")
            elif avg_reward > -19:
                print("   üìä Better than random, keep going!")
            else:
                print("   ‚è≥ Still exploring...")
        
        pbar.update(1)
        
        # Periodic memory cleanup to prevent MemoryError
        if step % 10000 == 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

except MemoryError as e:
    print(f"\n‚ö†Ô∏è  MemoryError occurred at step {step:,}")
    print(f"   Saving model before exit...")
    emergency_path = f"models/dqn_pong_enhanced_emergency_{step//1000}k.pth"
    agent.save(emergency_path)
    print(f"   ‚úÖ Emergency save completed: {emergency_path}")
    raise

finally:
    # Final save
    final_path = f"models/dqn_pong_enhanced_final_{agent.steps//1000}k.pth"
    agent.save(final_path)
    env.close()
    pbar.close()
    
    print(f"\n‚úÖ Training session ended. Model saved to {final_path}")
    print(f"Final step: {agent.steps:,}")
    print(f"Total episodes completed: {len(episode_rewards)}")
    if len(episode_rewards) > 0:
        print(f"Final average reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
        print(f"Best average reward (last 10): {np.mean(episode_rewards[-10:]):.2f}")
        print(f"Best single episode: {max(episode_rewards):.2f}")

In [15]:
# Evaluate the trained agent
import ale_py
import numpy as np

# Create evaluation environment
eval_env = gym.make("ALE/Pong-v5")
eval_env = PreprocessAtari(eval_env)
eval_env = FrameStack(eval_env, n_frames=4)

# Load the trained model
model_path = "models/dqn_pong.pth"
if os.path.exists(model_path):
    # Recreate agent with same parameters
    state_shape = eval_env.observation_space.shape
    n_actions = eval_env.action_space.n
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    eval_agent = DQNAgent(
        state_shape=state_shape,
        n_actions=n_actions,
        device=device,
        lr=1e-4,
        gamma=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=10000,
        target_update_freq=1000,
        buffer_size=100000,
        batch_size=32
    )
    eval_agent.load(model_path)
    print(f"‚úÖ Loaded model from {model_path}")
    print(f"   Model was trained for {eval_agent.steps} steps")
else:
    print("‚ùå No model found to evaluate")
    eval_env.close()

# Run evaluation episodes
n_eval_episodes = 10
eval_rewards = []
eval_lengths = []

print(f"\nRunning {n_eval_episodes} evaluation episodes (no exploration, greedy policy)...")
for episode in range(n_eval_episodes):
    state, info = eval_env.reset()
    episode_reward = 0
    episode_length = 0
    
    while True:
        # Use greedy policy (no exploration)
        action = eval_agent.select_action(state, training=False)
        state, reward, terminated, truncated, info = eval_env.step(action)
        done = terminated or truncated
        
        episode_reward += reward
        episode_length += 1
        
        if done:
            break
    
    eval_rewards.append(episode_reward)
    eval_lengths.append(episode_length)
    print(f"Episode {episode+1}: Reward = {episode_reward:+.1f}, Length = {episode_length}")

eval_env.close()

# Print summary
print(f"\n{'='*50}")
print(f"Evaluation Summary ({n_eval_episodes} episodes):")
print(f"  Average Reward: {np.mean(eval_rewards):.2f}")
print(f"  Best Reward: {np.max(eval_rewards):.2f}")
print(f"  Worst Reward: {np.min(eval_rewards):.2f}")
print(f"  Average Length: {np.mean(eval_lengths):.1f} steps")
print(f"{'='*50}")

# Interpretation
avg_reward = np.mean(eval_rewards)
if avg_reward < -19:
    print("üìâ Status: Still playing randomly (needs more training)")
    print("   ‚Üí Increase training to 1,000,000+ steps")
elif avg_reward < -10:
    print("üìà Status: Starting to learn (showing some improvement)")
    print("   ‚Üí Continue training to see more improvement")
elif avg_reward < 0:
    print("üéØ Status: Learning! (better than random)")
    print("   ‚Üí Keep training to reach positive rewards")
elif avg_reward < 10:
    print("üèÜ Status: Playing well! (winning some games)")
    print("   ‚Üí Excellent progress!")
else:
    print("üåü Status: Master level! (consistently winning)")


‚úÖ Loaded model from models/dqn_pong.pth
   Model was trained for 100000 steps

Running 10 evaluation episodes (no exploration, greedy policy)...
Episode 1: Reward = -21.0, Length = 764
Episode 2: Reward = -21.0, Length = 764
Episode 3: Reward = -21.0, Length = 764
Episode 4: Reward = -21.0, Length = 764
Episode 5: Reward = -21.0, Length = 764
Episode 6: Reward = -21.0, Length = 764
Episode 7: Reward = -21.0, Length = 764
Episode 8: Reward = -21.0, Length = 764
Episode 9: Reward = -21.0, Length = 764
Episode 10: Reward = -21.0, Length = 764

Evaluation Summary (10 episodes):
  Average Reward: -21.00
  Best Reward: -21.00
  Worst Reward: -21.00
  Average Length: 764.0 steps
üìâ Status: Still playing randomly (needs more training)
   ‚Üí Increase training to 1,000,000+ steps


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    DQN Model for Atari games.
    Takes stacked frames as input and outputs Q-values for each action.
    """
    def __init__(self, n_actions, n_frames=4):
        """
        Args:
            n_actions: Number of possible actions (e.g., 6 for Pong)
            n_frames: Number of stacked frames (default: 4)
        """
        super(Model, self).__init__()
        
        # Convolutional layers to process the image frames
        self.conv1 = nn.Conv2d(n_frames, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # Calculate the size of the flattened feature map
        # Input shape: (n_frames, 84, 84) after preprocessing (or 210x160x3 raw)
        # After conv layers, we need to calculate the output size
        # For standard Atari preprocessing (84x84), the output is 7x7x64
        self.fc1 = nn.Linear(7 * 7 * 64, 512)
        self.fc2 = nn.Linear(512, n_actions)
        
    def forward(self, x):
        """
        Forward pass through the network.
        
        Args:
            x: Input tensor of shape (batch_size, n_frames, height, width)
        
        Returns:
            Q-values for each action, shape (batch_size, n_actions)
        """
        # Apply convolutional layers with ReLU activation
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        
        # Flatten the feature map
        x = x.view(x.size(0), -1)
        
        # Apply fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x