In [1]:
!python -m pip install -U pip setuptools wheel





In [2]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128


Looking in indexes: https://download.pytorch.org/whl/cu128
Collecting torch
  Downloading https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp310-cp310-win_amd64.whl.metadata (29 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp310-cp310-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp310-cp310-win_amd64.whl.metadata (7.0 kB)
Collecting filelock (from torch)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec>=0.8.5 (from torch)
  Using cached fsspec-2025.12.0-py3-none-any.whl.metadata (10 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadat

In [3]:
!pip install -U "gymnasium[atari]"


Collecting gymnasium[atari]
  Using cached gymnasium-1.2.2-py3-none-any.whl.metadata (10 kB)
Collecting cloudpickle>=1.2.0 (from gymnasium[atari])
  Using cached cloudpickle-3.1.2-py3-none-any.whl.metadata (7.1 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium[atari])
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting ale_py>=0.9 (from gymnasium[atari])
  Downloading ale_py-0.11.2-cp310-cp310-win_amd64.whl.metadata (9.2 kB)
Using cached gymnasium-1.2.2-py3-none-any.whl (952 kB)
Downloading ale_py-0.11.2-cp310-cp310-win_amd64.whl (3.5 MB)
   ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
   ------------------------------------ --- 3.1/3.5 MB 20.5 MB/s eta 0:00:01
   ---------------------------------------- 3.5/3.5 MB 18.7 MB/s  0:00:00
Using cached cloudpickle-3.1.2-py3-none-any.whl (22 kB)
Using cached Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, cloudpickle,

In [4]:
!pip install "stable-baselines3[extra]"

Collecting stable-baselines3[extra]
  Using cached stable_baselines3-2.7.1-py3-none-any.whl.metadata (4.8 kB)
Collecting opencv-python (from stable-baselines3[extra])
  Using cached opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (19 kB)
Collecting pygame (from stable-baselines3[extra])
  Downloading pygame-2.6.1-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting tqdm (from stable-baselines3[extra])
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting rich (from stable-baselines3[extra])
  Using cached rich-14.2.0-py3-none-any.whl.metadata (18 kB)
Collecting numpy<3.0,>=1.20 (from stable-baselines3[extra])
  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting markdown-it-py>=2.2.0 (from rich->stable-baselines3[extra])
  Using cached markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich->stable-baselines3[extra])
  Using cached mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB

In [1]:
import torch

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))
    print("torch cuda version:", torch.version.cuda)

torch: 2.9.1+cu128
cuda available: True
gpu: NVIDIA GeForce RTX 4070
torch cuda version: 12.8


In [2]:
import gymnasium as gym
import ale_py  # registers ALE envs

env = gym.make("ALE/Pong-v5")  # no rendering needed for a smoke test
obs, info = env.reset()
print("obs shape:", obs.shape)

for _ in range(200):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    if terminated or truncated:
        obs, info = env.reset()

env.close()
print("ok")

obs shape: (210, 160, 3)
ok


# DQN Training Setup

**To start training, run these cells in order:**
1. Cell with Model class (torch imports + Model definition)
2. Cell with ReplayBuffer class
3. Cell with Environment Preprocessing (PreprocessAtari, FrameStack)
4. Cell with DQNAgent class
5. **Cell with Training Loop** - This is the one that actually starts training!

**Training will:**
- Run for 1,000,000 steps
- Start learning after 10,000 steps of exploration
- Save model every 100,000 steps to `models/dqn_pong.pth`
- Show progress with epsilon, loss, and episode rewards


In [2]:
# Ensure torch is imported (if not already imported in earlier cells)
import torch
import torch.nn as nn
import torch.nn.functional as F


In [17]:
# FIXED: PreprocessAtari wrapper with proper reset and step methods
import gymnasium as gym
import numpy as np
from gymnasium import Wrapper
import cv2
from collections import deque

class PreprocessAtari(Wrapper):
    """
    Preprocesses Atari frames: resize to 84x84, convert to grayscale.
    """
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84), dtype=np.uint8
        )
    
    def observation(self, obs):
        # Convert to grayscale if needed
        if len(obs.shape) == 3:
            obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        
        # Resize to 84x84
        obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
        
        return obs
    
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        return self.observation(obs), info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        return self.observation(obs), reward, terminated, truncated, info

class FrameStack(Wrapper):
    """
    Stacks the last n frames together.
    """
    def __init__(self, env, n_frames=4):
        super().__init__(env)
        self.n_frames = n_frames
        self.frames = deque(maxlen=n_frames)
        
        # Update observation space
        obs_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, 
            shape=(n_frames, obs_shape[0], obs_shape[1]), 
            dtype=np.uint8
        )
    
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        # Fill the frame stack with the first frame
        for _ in range(self.n_frames):
            self.frames.append(obs)
        return self._get_obs(), info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.frames.append(obs)
        return self._get_obs(), reward, terminated, truncated, info
    
    def _get_obs(self):
        # Stack frames: (n_frames, height, width)
        return np.stack(self.frames, axis=0)

print("‚úÖ Preprocessing wrappers updated with proper reset/step methods!")


‚úÖ Preprocessing wrappers updated with proper reset/step methods!


In [18]:
import random
from collections import deque, namedtuple

# Named tuple for storing experiences
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    """
    Experience Replay Buffer for DQN.
    Stores and samples batches of experiences for training.
    """
    def __init__(self, capacity):
        """
        Args:
            capacity: Maximum number of experiences to store
        """
        self.buffer = deque(maxlen=capacity)
        self.capacity = capacity
    
    def push(self, state, action, reward, next_state, done):
        """Add an experience to the buffer."""
        experience = Experience(state, action, reward, next_state, done)
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        """Sample a batch of experiences from the buffer."""
        batch = random.sample(self.buffer, batch_size)
        
        # Unpack the batch
        states = torch.stack([e.state for e in batch])
        actions = torch.tensor([e.action for e in batch], dtype=torch.long)
        rewards = torch.tensor([e.reward for e in batch], dtype=torch.float32)
        next_states = torch.stack([e.next_state for e in batch])
        dones = torch.tensor([e.done for e in batch], dtype=torch.float32)
        
        return states, actions, rewards, next_states, dones
    
    def __len__(self):
        """Return the current size of the buffer."""
        return len(self.buffer)


In [None]:
import gymnasium as gym
import numpy as np
from gymnasium import Wrapper
import cv2

class PreprocessAtari(Wrapper):
    """
    Preprocesses Atari frames: resize to 84x84, convert to grayscale, normalize.
    """
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84), dtype=np.uint8
        )
    
    def observation(self, obs):
        # Convert to grayscale if needed
        if len(obs.shape) == 3:
            obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        
        # Resize to 84x84
        obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
        
        return obs
    
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        return self.observation(obs), info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        return self.observation(obs), reward, terminated, truncated, info

class FrameStack(Wrapper):
    """
    Stacks the last n frames together.
    """
    def __init__(self, env, n_frames=4):
        super().__init__(env)
        self.n_frames = n_frames
        self.frames = deque(maxlen=n_frames)
        
        # Update observation space
        obs_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, 
            shape=(n_frames, obs_shape[0], obs_shape[1]), 
            dtype=np.uint8
        )
    
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        # Fill the frame stack with the first frame
        for _ in range(self.n_frames):
            self.frames.append(obs)
        return self._get_obs(), info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.frames.append(obs)
        return self._get_obs(), reward, terminated, truncated, info
    
    def _get_obs(self):
        # Stack frames: (n_frames, height, width)
        return np.stack(self.frames, axis=0)


## üö® Problem Diagnosis

**Current Status:** Agent stuck at -21.00 reward (random play) after 1M steps - **NOT LEARNING**

**Issues Identified:**
1. ‚ùå Epsilon decays too fast (reaches 0.01 by step 10k, then stays there)
2. ‚ùå No optimistic initialization - agent doesn't explore enough initially
3. ‚ùå Q-values might be converging to bad local minimum
4. ‚ùå Loss is very small but rewards don't improve (overfitting to bad policy)
5. ‚ùå No Double DQN (helps with overestimation bias)

**Solutions:**
- ‚úÖ Optimistic initialization (initialize Q-values high to encourage exploration)
- ‚úÖ Slower epsilon decay (explore longer)
- ‚úÖ Double DQN (reduce overestimation)
- ‚úÖ Better learning rate schedule
- ‚úÖ Reward clipping/normalization


In [22]:
# IMPROVED DQN Agent with Optimistic Initialization and Double DQN
import torch.optim as optim
import math
import random

class ImprovedDQNAgent:
    """
    Improved DQN Agent with:
    - Optimistic initialization (early exploration, late exploitation)
    - Double DQN (reduces overestimation bias)
    - Better epsilon decay schedule
    - Learning rate scheduling
    """
    def __init__(
        self,
        state_shape,
        n_actions,
        device='cuda',
        lr=1e-4,
        gamma=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=500000,  # MUCH slower decay - explore for longer
        target_update_freq=1000,
        buffer_size=100000,
        batch_size=32,
        optimistic_init=10.0  # Initialize Q-values optimistically
    ):
        """
        Args:
            optimistic_init: Initial Q-value (high = more exploration early on)
        """
        self.device = device
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.target_update_freq = target_update_freq
        self.batch_size = batch_size
        self.steps = 0
        self.optimistic_init = optimistic_init
        
        # Create Q-network and target network
        n_frames = state_shape[0]
        self.q_network = Model(n_actions, n_frames).to(device)
        self.target_network = Model(n_actions, n_frames).to(device)
        
        # OPTIMISTIC INITIALIZATION: Initialize final layer to output high Q-values
        # This encourages exploration early on
        with torch.no_grad():
            self.q_network.fc2.weight.data.fill_(0.0)
            self.q_network.fc2.bias.data.fill_(optimistic_init)
            self.target_network.fc2.weight.data.fill_(0.0)
            self.target_network.fc2.bias.data.fill_(optimistic_init)
        
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()
        
        # Optimizer with learning rate scheduling
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=200000, gamma=0.5)
        
        # Replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size)
    
    def get_epsilon(self):
        """Calculate current epsilon - slower decay."""
        if self.steps < self.epsilon_decay:
            return self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                   (1 - self.steps / self.epsilon_decay)
        else:
            return self.epsilon_end
    
    def select_action(self, state, training=True):
        """Select an action using epsilon-greedy policy."""
        if training and random.random() < self.get_epsilon():
            return random.randrange(self.n_actions)
        
        # Convert state to tensor and add batch dimension
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) / 255.0
        
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
            action = q_values.argmax(1).item()
        
        return action
    
    def store_transition(self, state, action, reward, next_state, done):
        """Store a transition in the replay buffer."""
        # Clip rewards to [-1, 1] for stability
        reward = np.clip(reward, -1.0, 1.0)
        
        # Convert to tensors
        state_tensor = torch.FloatTensor(state).to(self.device) / 255.0
        next_state_tensor = torch.FloatTensor(next_state).to(self.device) / 255.0
        
        self.replay_buffer.push(state_tensor, action, reward, next_state_tensor, done)
        self.steps += 1
    
    def train_step(self):
        """Perform one training step with Double DQN."""
        if len(self.replay_buffer) < self.batch_size:
            return None
        
        # Sample batch from replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)
        
        # Move to device
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)
        
        # Normalize states
        states = states / 255.0
        next_states = next_states / 255.0
        
        # Compute Q(s, a)
        q_values = self.q_network(states)
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        
        # DOUBLE DQN: Use main network to select action, target network to evaluate
        with torch.no_grad():
            # Select best action using main network
            next_q_values_main = self.q_network(next_states)
            next_actions = next_q_values_main.argmax(1)
            
            # Evaluate using target network
            next_q_values_target = self.target_network(next_states)
            next_q_value = next_q_values_target.gather(1, next_actions.unsqueeze(1)).squeeze(1)
            
            target_q_value = rewards + (1 - dones) * self.gamma * next_q_value
        
        # Compute loss
        loss = F.mse_loss(q_value, target_q_value)
        
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 10)
        self.optimizer.step()
        
        # Update learning rate
        self.scheduler.step()
        
        # Update target network periodically
        if self.steps % self.target_update_freq == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())
        
        return loss.item()
    
    def save(self, filepath):
        """Save the model."""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'steps': self.steps,
        }, filepath)
    
    def load(self, filepath):
        """Load the model."""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        if 'scheduler' in checkpoint:
            self.scheduler.load_state_dict(checkpoint['scheduler'])
        self.steps = checkpoint['steps']

print("‚úÖ Improved DQN Agent with optimistic initialization and Double DQN ready!")


‚úÖ Improved DQN Agent with optimistic initialization and Double DQN ready!


In [23]:
# Training with Improved Agent
import ale_py
import os
from tqdm import tqdm
import numpy as np

# Create environment with preprocessing
env = gym.make("ALE/Pong-v5")
env = PreprocessAtari(env)
env = FrameStack(env, n_frames=4)

# Get environment info
state_shape = env.observation_space.shape
n_actions = env.action_space.n

print(f"State shape: {state_shape}")
print(f"Number of actions: {n_actions}")

# Create IMPROVED agent
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

agent = ImprovedDQNAgent(
    state_shape=state_shape,
    n_actions=n_actions,
    device=device,
    lr=1e-4,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=500000,  # Explore for 500k steps (was 10k!)
    target_update_freq=1000,
    buffer_size=100000,
    batch_size=32,
    optimistic_init=10.0  # High initial Q-values = more exploration
)

# Training parameters
total_steps = 2_000_000  # Train longer
learning_starts = 10_000
train_freq = 4
save_freq = 200_000
eval_freq = 50_000

# Statistics
episode_rewards = []
episode_lengths = []
current_episode_reward = 0
current_episode_length = 0

# Load existing model if available
model_path = "models/dqn_pong_improved.pth"
if os.path.exists(model_path):
    agent.load(model_path)
    print(f"‚úÖ Loaded existing model from {model_path}")
    print(f"   Continuing from step {agent.steps}")
else:
    os.makedirs("models", exist_ok=True)
    print("üöÄ Starting fresh training with improved agent")
    print("   - Optimistic initialization (explores more early)")
    print("   - Double DQN (reduces overestimation)")
    print("   - Slower epsilon decay (explores for 500k steps)")

# Training loop
state, info = env.reset()
pbar = tqdm(range(total_steps), desc="Training", initial=agent.steps if agent.steps < total_steps else 0)

for step in range(agent.steps, total_steps):
    # Select action
    action = agent.select_action(state, training=True)
    
    # Take step
    next_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    
    # Store transition
    agent.store_transition(state, action, reward, next_state, done)
    
    # Update statistics
    current_episode_reward += reward
    current_episode_length += 1
    
    # Train
    if step >= learning_starts and step % train_freq == 0:
        loss = agent.train_step()
        if loss is not None:
            current_lr = agent.optimizer.param_groups[0]['lr']
            pbar.set_postfix({
                'epsilon': f'{agent.get_epsilon():.3f}',
                'loss': f'{loss:.4f}',
                'lr': f'{current_lr:.2e}',
                'avg_reward': f'{np.mean(episode_rewards[-10:]):.1f}' if len(episode_rewards) >= 10 else 'N/A'
            })
    
    # Handle episode end
    if done:
        episode_rewards.append(current_episode_reward)
        episode_lengths.append(current_episode_length)
        current_episode_reward = 0
        current_episode_length = 0
        state, info = env.reset()
    else:
        state = next_state
    
    # Save model periodically
    if step > 0 and step % save_freq == 0:
        agent.save(model_path)
        print(f"\nüíæ Model saved at step {step:,}")
    
    # Evaluate and show progress
    if step > 0 and step % eval_freq == 0 and len(episode_rewards) >= 10:
        avg_reward = np.mean(episode_rewards[-10:])
        avg_length = np.mean(episode_lengths[-10:])
        print(f"\nüìä Step {step:,}: Avg reward (last 10): {avg_reward:.2f}, Avg length: {avg_length:.1f}, Epsilon: {agent.get_epsilon():.3f}")
        
        # Show learning progress
        if avg_reward > 0:
            print("   üéâüéâüéâ BREAKTHROUGH! Agent is winning!")
        elif avg_reward > -10:
            print("   üéØ Great progress! Agent is learning!")
        elif avg_reward > -15:
            print("   üìà Starting to improve!")
        elif avg_reward > -19:
            print("   üìä Better than random, keep going!")
        else:
            print("   ‚è≥ Still exploring...")
    
    pbar.update(1)

# Final save
agent.save(model_path)
env.close()
pbar.close()

print(f"\n‚úÖ Training complete! Model saved to {model_path}")
print(f"Total episodes: {len(episode_rewards)}")
if len(episode_rewards) > 0:
    print(f"Final average reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
    print(f"Best average reward (last 10): {np.mean(episode_rewards[-10:]):.2f}")
    
    # Final assessment
    final_avg = np.mean(episode_rewards[-10:])
    if final_avg > 10:
        print("üåüüåü EXCELLENT! Agent mastered the game!")
    elif final_avg > 0:
        print("üéØ SUCCESS! Agent is winning more than losing!")
    elif final_avg > -10:
        print("üìà Good progress! Agent is learning!")
    elif final_avg > -15:
        print("üìä Some improvement, but needs more training")
    else:
        print("‚è≥ Still needs work - may need hyperparameter tuning")


State shape: (4, 84, 84)
Number of actions: 6
Using device: cuda
üöÄ Starting fresh training with improved agent
   - Optimistic initialization (explores more early)
   - Double DQN (reduces overestimation)
   - Slower epsilon decay (explores for 500k steps)


Training:   0%|          | 1399/2000000 [00:05<5:11:59, 106.76it/s]

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 4, 210, 160, 3]

Training:   0%|          | 1436/2000000 [00:21<5:11:59, 106.76it/s]

In [21]:
import torch.optim as optim
import math

class DQNAgent:
    """
    DQN Agent with experience replay and target network.
    """
    def __init__(
        self,
        state_shape,
        n_actions,
        device='cuda',
        lr=1e-4,
        gamma=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=10000,
        target_update_freq=1000,
        buffer_size=100000,
        batch_size=32
    ):
        """
        Args:
            state_shape: Shape of state (n_frames, height, width)
            n_actions: Number of possible actions
            device: Device to run on ('cuda' or 'cpu')
            lr: Learning rate
            gamma: Discount factor
            epsilon_start: Starting epsilon for epsilon-greedy
            epsilon_end: Final epsilon
            epsilon_decay: Steps to decay epsilon
            target_update_freq: Frequency to update target network
            buffer_size: Size of replay buffer
            batch_size: Batch size for training
        """
        self.device = device
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.target_update_freq = target_update_freq
        self.batch_size = batch_size
        self.steps = 0
        
        # Create Q-network and target network
        n_frames = state_shape[0]
        self.q_network = Model(n_actions, n_frames).to(device)
        self.target_network = Model(n_actions, n_frames).to(device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()  # Target network is always in eval mode
        
        # Optimizer
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        
        # Replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size)
    
    def get_epsilon(self):
        """Calculate current epsilon for epsilon-greedy policy."""
        return self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
               math.exp(-1. * self.steps / self.epsilon_decay)
    
    def select_action(self, state, training=True):
        """
        Select an action using epsilon-greedy policy.
        
        Args:
            state: Current state (n_frames, height, width)
            training: If True, use epsilon-greedy; if False, use greedy
        
        Returns:
            Selected action
        """
        if training and random.random() < self.get_epsilon():
            return random.randrange(self.n_actions)
        
        # Convert state to tensor and add batch dimension
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) / 255.0
        
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
            action = q_values.argmax(1).item()
        
        return action
    
    def store_transition(self, state, action, reward, next_state, done):
        """Store a transition in the replay buffer."""
        # Convert to tensors
        state_tensor = torch.FloatTensor(state).to(self.device) / 255.0
        next_state_tensor = torch.FloatTensor(next_state).to(self.device) / 255.0
        
        self.replay_buffer.push(state_tensor, action, reward, next_state_tensor, done)
        self.steps += 1
    
    def train_step(self):
        """Perform one training step."""
        if len(self.replay_buffer) < self.batch_size:
            return None
        
        # Sample batch from replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)
        
        # Move to device
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)
        
        # Normalize states
        states = states / 255.0
        next_states = next_states / 255.0
        
        # Compute Q(s, a)
        q_values = self.q_network(states)
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        
        # Compute target Q-values
        with torch.no_grad():
            next_q_values = self.target_network(next_states)
            next_q_value = next_q_values.max(1)[0]
            target_q_value = rewards + (1 - dones) * self.gamma * next_q_value
        
        # Compute loss
        loss = F.mse_loss(q_value, target_q_value)
        
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 10)
        self.optimizer.step()
        
        # Update target network periodically
        if self.steps % self.target_update_freq == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())
        
        return loss.item()
    
    def save(self, filepath):
        """Save the model."""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'steps': self.steps,
        }, filepath)
    
    def load(self, filepath):
        """Load the model."""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.steps = checkpoint['steps']


In [13]:
# Quick verification - Run this before training to make sure everything is set up
try:
    # Check if all classes are defined
    assert 'Model' in globals(), "Model class not found - run the Model cell first!"
    assert 'ReplayBuffer' in globals(), "ReplayBuffer class not found - run the ReplayBuffer cell first!"
    assert 'PreprocessAtari' in globals(), "PreprocessAtari class not found - run the preprocessing cell first!"
    assert 'FrameStack' in globals(), "FrameStack class not found - run the preprocessing cell first!"
    assert 'DQNAgent' in globals(), "DQNAgent class not found - run the DQNAgent cell first!"
    
    # Check imports
    import torch
    import gymnasium as gym
    import numpy as np
    import cv2
    from tqdm import tqdm
    
    print("‚úÖ All classes and imports are ready!")
    print("‚úÖ You can now run the training cell to start training!")
except AssertionError as e:
    print(f"‚ùå {e}")
except ImportError as e:
    print(f"‚ùå Missing import: {e}")
    print("Make sure you've run all the setup cells in order.")


‚úÖ All classes and imports are ready!
‚úÖ You can now run the training cell to start training!


In [16]:
import ale_py
import os
from tqdm import tqdm

# Create environment with preprocessing
env = gym.make("ALE/Pong-v5")
env = PreprocessAtari(env)
env = FrameStack(env, n_frames=4)

# Get environment info
state_shape = env.observation_space.shape
n_actions = env.action_space.n

print(f"State shape: {state_shape}")
print(f"Number of actions: {n_actions}")

# Create agent
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

agent = DQNAgent(
    state_shape=state_shape,
    n_actions=n_actions,
    device=device,
    lr=1e-4,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=10000,
    target_update_freq=1000,
    buffer_size=100000,
    batch_size=32
)

# Training parameters
total_steps = 1_000_000
learning_starts = 10_000  # Start training after this many steps
train_freq = 4  # Train every N steps
save_freq = 100_000  # Save model every N steps
eval_freq = 10_000  # Evaluate every N steps

# Statistics
episode_rewards = []
episode_lengths = []
current_episode_reward = 0
current_episode_length = 0

# Load existing model if available
model_path = "models/dqn_pong.pth"
if os.path.exists(model_path):
    agent.load(model_path)
    print(f"Loaded model from {model_path}")
else:
    os.makedirs("models", exist_ok=True)
    print("Starting fresh training")

# Training loop
state, info = env.reset()
pbar = tqdm(range(total_steps), desc="Training")

for step in pbar:
    # Select action
    action = agent.select_action(state, training=True)
    
    # Take step
    next_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    
    # Store transition
    agent.store_transition(state, action, reward, next_state, done)
    
    # Update statistics
    current_episode_reward += reward
    current_episode_length += 1
    
    # Train
    if step >= learning_starts and step % train_freq == 0:
        loss = agent.train_step()
        if loss is not None:
            pbar.set_postfix({
                'epsilon': f'{agent.get_epsilon():.3f}',
                'loss': f'{loss:.4f}',
                'ep_reward': f'{current_episode_reward:.1f}' if done else 'N/A'
            })
    
    # Handle episode end
    if done:
        episode_rewards.append(current_episode_reward)
        episode_lengths.append(current_episode_length)
        current_episode_reward = 0
        current_episode_length = 0
        state, info = env.reset()
    else:
        state = next_state
    
    # Save model
    if step > 0 and step % save_freq == 0:
        agent.save(model_path)
        print(f"\nModel saved at step {step}")
    
    # Evaluate
    if step > 0 and step % eval_freq == 0 and len(episode_rewards) > 0:
        avg_reward = np.mean(episode_rewards[-10:]) if len(episode_rewards) >= 10 else np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths[-10:]) if len(episode_lengths) >= 10 else np.mean(episode_lengths)
        print(f"\nStep {step}: Avg reward (last 10): {avg_reward:.2f}, Avg length: {avg_length:.1f}")

# Final save
agent.save(model_path)
env.close()
print(f"\nTraining complete! Model saved to {model_path}")
print(f"Total episodes: {len(episode_rewards)}")
if len(episode_rewards) > 0:
    print(f"Final average reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")


State shape: (4, 84, 84)
Number of actions: 6
Using device: cuda
Loaded model from models/dqn_pong.pth


Training:   1%|          | 10045/1000000 [00:11<20:35, 801.30it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]


Step 10000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:   2%|‚ñè         | 20053/1000000 [00:34<40:23, 404.42it/s, epsilon=0.010, loss=0.0601, ep_reward=N/A]


Step 20000: Avg reward (last 10): -20.80, Avg length: 791.2


Training:   3%|‚ñé         | 30050/1000000 [00:58<38:42, 417.72it/s, epsilon=0.010, loss=0.0595, ep_reward=N/A]


Step 30000: Avg reward (last 10): -21.00, Avg length: 819.6


Training:   4%|‚ñç         | 40102/1000000 [01:17<28:33, 560.05it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 40000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:   5%|‚ñå         | 50056/1000000 [01:34<27:46, 570.10it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 50000: Avg reward (last 10): -20.90, Avg length: 826.0


Training:   6%|‚ñå         | 60097/1000000 [01:52<27:36, 567.27it/s, epsilon=0.010, loss=0.0301, ep_reward=N/A]


Step 60000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:   7%|‚ñã         | 70043/1000000 [02:10<27:05, 572.16it/s, epsilon=0.010, loss=0.0597, ep_reward=N/A]  


Step 70000: Avg reward (last 10): -20.90, Avg length: 785.5


Training:   8%|‚ñä         | 80037/1000000 [02:29<35:57, 426.49it/s, epsilon=0.010, loss=0.0005, ep_reward=N/A]


Step 80000: Avg reward (last 10): -20.90, Avg length: 783.4


Training:   9%|‚ñâ         | 90037/1000000 [02:53<35:46, 423.91it/s, epsilon=0.010, loss=0.0596, ep_reward=N/A]


Step 90000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  10%|‚ñà         | 100064/1000000 [03:16<34:40, 432.54it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]


Model saved at step 100000

Step 100000: Avg reward (last 10): -21.00, Avg length: 794.4


Training:  11%|‚ñà         | 110078/1000000 [03:37<30:44, 482.52it/s, epsilon=0.010, loss=0.0598, ep_reward=N/A]


Step 110000: Avg reward (last 10): -21.00, Avg length: 776.0


Training:  12%|‚ñà‚ñè        | 120077/1000000 [03:58<30:01, 488.39it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]


Step 120000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  13%|‚ñà‚ñé        | 130070/1000000 [04:19<30:10, 480.56it/s, epsilon=0.010, loss=0.0005, ep_reward=N/A]


Step 130000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  14%|‚ñà‚ñç        | 140073/1000000 [04:40<29:58, 478.08it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 140000: Avg reward (last 10): -21.00, Avg length: 788.0


Training:  15%|‚ñà‚ñå        | 150039/1000000 [05:01<30:23, 466.17it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 150000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  16%|‚ñà‚ñå        | 160087/1000000 [05:22<29:07, 480.61it/s, epsilon=0.010, loss=0.0009, ep_reward=N/A]


Step 160000: Avg reward (last 10): -21.00, Avg length: 772.8


Training:  17%|‚ñà‚ñã        | 170080/1000000 [05:43<28:35, 483.75it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]  


Step 170000: Avg reward (last 10): -21.00, Avg length: 788.0


Training:  18%|‚ñà‚ñä        | 180055/1000000 [06:04<28:33, 478.44it/s, epsilon=0.010, loss=0.0006, ep_reward=N/A]  


Step 180000: Avg reward (last 10): -21.00, Avg length: 783.8


Training:  19%|‚ñà‚ñâ        | 190062/1000000 [06:25<28:13, 478.13it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]  


Step 190000: Avg reward (last 10): -21.00, Avg length: 781.8


Training:  20%|‚ñà‚ñà        | 200088/1000000 [06:46<29:04, 458.57it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]  


Model saved at step 200000

Step 200000: Avg reward (last 10): -21.00, Avg length: 788.4


Training:  21%|‚ñà‚ñà        | 210037/1000000 [07:08<31:02, 424.25it/s, epsilon=0.010, loss=0.0597, ep_reward=N/A]  


Step 210000: Avg reward (last 10): -21.00, Avg length: 788.1


Training:  22%|‚ñà‚ñà‚ñè       | 220061/1000000 [07:31<29:45, 436.87it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]


Step 220000: Avg reward (last 10): -21.00, Avg length: 776.2


Training:  23%|‚ñà‚ñà‚ñé       | 230037/1000000 [07:54<29:28, 435.37it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 230000: Avg reward (last 10): -20.90, Avg length: 801.8


Training:  24%|‚ñà‚ñà‚ñç       | 240061/1000000 [08:17<29:21, 431.31it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]


Step 240000: Avg reward (last 10): -21.00, Avg length: 791.0


Training:  25%|‚ñà‚ñà‚ñå       | 250065/1000000 [08:40<28:32, 437.99it/s, epsilon=0.010, loss=0.0595, ep_reward=N/A]


Step 250000: Avg reward (last 10): -20.90, Avg length: 786.6


Training:  26%|‚ñà‚ñà‚ñå       | 260058/1000000 [09:03<27:45, 444.34it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]


Step 260000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  27%|‚ñà‚ñà‚ñã       | 270065/1000000 [09:26<27:59, 434.56it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 270000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  28%|‚ñà‚ñà‚ñä       | 280053/1000000 [09:49<27:34, 435.20it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]


Step 280000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  29%|‚ñà‚ñà‚ñâ       | 290037/1000000 [10:12<27:15, 434.08it/s, epsilon=0.010, loss=0.0598, ep_reward=N/A]  


Step 290000: Avg reward (last 10): -21.00, Avg length: 776.0


Training:  30%|‚ñà‚ñà‚ñà       | 300053/1000000 [10:35<27:30, 424.10it/s, epsilon=0.010, loss=0.0301, ep_reward=N/A]  


Model saved at step 300000

Step 300000: Avg reward (last 10): -21.00, Avg length: 774.8


Training:  31%|‚ñà‚ñà‚ñà       | 310040/1000000 [10:57<26:25, 435.23it/s, epsilon=0.010, loss=0.0006, ep_reward=N/A]  


Step 310000: Avg reward (last 10): -20.90, Avg length: 785.3


Training:  32%|‚ñà‚ñà‚ñà‚ñè      | 320059/1000000 [11:20<26:10, 433.06it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 320000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  33%|‚ñà‚ñà‚ñà‚ñé      | 330051/1000000 [11:43<25:13, 442.64it/s, epsilon=0.010, loss=0.0006, ep_reward=N/A]


Step 330000: Avg reward (last 10): -21.00, Avg length: 770.2


Training:  34%|‚ñà‚ñà‚ñà‚ñç      | 340059/1000000 [12:06<25:08, 437.36it/s, epsilon=0.010, loss=0.0305, ep_reward=N/A]


Step 340000: Avg reward (last 10): -20.90, Avg length: 789.8


Training:  35%|‚ñà‚ñà‚ñà‚ñå      | 350044/1000000 [12:29<24:27, 442.93it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 350000: Avg reward (last 10): -20.90, Avg length: 777.4


Training:  36%|‚ñà‚ñà‚ñà‚ñå      | 360066/1000000 [12:52<24:15, 439.81it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 360000: Avg reward (last 10): -21.00, Avg length: 770.2


Training:  37%|‚ñà‚ñà‚ñà‚ñã      | 370037/1000000 [13:14<23:53, 439.34it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 370000: Avg reward (last 10): -21.00, Avg length: 772.8


Training:  38%|‚ñà‚ñà‚ñà‚ñä      | 380070/1000000 [13:37<23:30, 439.56it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 380000: Avg reward (last 10): -21.00, Avg length: 778.8


Training:  39%|‚ñà‚ñà‚ñà‚ñâ      | 390053/1000000 [14:00<23:21, 435.25it/s, epsilon=0.010, loss=0.0333, ep_reward=N/A]


Step 390000: Avg reward (last 10): -21.00, Avg length: 784.8


Training:  40%|‚ñà‚ñà‚ñà‚ñà      | 400036/1000000 [14:23<29:12, 342.43it/s, epsilon=0.010, loss=0.0305, ep_reward=N/A]


Model saved at step 400000

Step 400000: Avg reward (last 10): -21.00, Avg length: 776.0


Training:  41%|‚ñà‚ñà‚ñà‚ñà      | 410056/1000000 [14:46<22:21, 439.81it/s, epsilon=0.010, loss=0.0602, ep_reward=N/A]


Step 410000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 420054/1000000 [15:09<22:03, 438.24it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 420000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 430053/1000000 [15:32<21:51, 434.53it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]  


Step 430000: Avg reward (last 10): -21.00, Avg length: 782.1


Training:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 440038/1000000 [15:55<21:31, 433.64it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]  


Step 440000: Avg reward (last 10): -21.00, Avg length: 798.6


Training:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 450078/1000000 [16:18<20:55, 438.15it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]  


Step 450000: Avg reward (last 10): -20.90, Avg length: 794.6


Training:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 460041/1000000 [16:41<20:53, 430.92it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]  


Step 460000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 470053/1000000 [17:04<19:02, 463.91it/s, epsilon=0.010, loss=0.0598, ep_reward=N/A]  


Step 470000: Avg reward (last 10): -21.00, Avg length: 788.0


Training:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 480075/1000000 [17:26<19:23, 446.78it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]  


Step 480000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 490043/1000000 [17:48<18:38, 456.08it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]  


Step 490000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 500051/1000000 [18:10<19:07, 435.56it/s, epsilon=0.010, loss=0.0306, ep_reward=N/A]  


Model saved at step 500000

Step 500000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 510056/1000000 [18:32<17:46, 459.31it/s, epsilon=0.010, loss=0.0598, ep_reward=N/A]  


Step 510000: Avg reward (last 10): -21.00, Avg length: 769.6


Training:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 520069/1000000 [18:54<17:46, 449.82it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]  


Step 520000: Avg reward (last 10): -21.00, Avg length: 788.0


Training:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 530073/1000000 [19:16<17:16, 453.26it/s, epsilon=0.010, loss=0.0006, ep_reward=N/A]  


Step 530000: Avg reward (last 10): -21.00, Avg length: 777.6


Training:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 540041/1000000 [19:38<17:24, 440.45it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]


Step 540000: Avg reward (last 10): -21.00, Avg length: 842.0


Training:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 550081/1000000 [20:01<16:42, 448.92it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 550000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 560084/1000000 [20:23<16:00, 457.99it/s, epsilon=0.010, loss=0.0600, ep_reward=N/A]  


Step 560000: Avg reward (last 10): -21.00, Avg length: 812.2


Training:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 570059/1000000 [20:45<15:49, 452.99it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 570000: Avg reward (last 10): -21.00, Avg length: 776.2


Training:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 580061/1000000 [21:07<15:23, 454.81it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]


Step 580000: Avg reward (last 10): -21.00, Avg length: 776.0


Training:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 590069/1000000 [21:29<15:07, 451.52it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 590000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 600038/1000000 [21:51<15:33, 428.38it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]  


Model saved at step 600000

Step 600000: Avg reward (last 10): -21.00, Avg length: 766.8


Training:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 610083/1000000 [22:13<14:18, 454.36it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]  


Step 610000: Avg reward (last 10): -21.00, Avg length: 776.0


Training:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 620061/1000000 [22:35<14:07, 448.45it/s, epsilon=0.010, loss=0.0626, ep_reward=N/A]  


Step 620000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 630048/1000000 [22:57<13:38, 451.74it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 630000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 640049/1000000 [23:19<13:08, 456.69it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]  


Step 640000: Avg reward (last 10): -20.90, Avg length: 791.6


Training:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 650059/1000000 [23:41<12:49, 454.74it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 650000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 660065/1000000 [24:04<13:31, 418.95it/s, epsilon=0.010, loss=0.0305, ep_reward=N/A]  


Step 660000: Avg reward (last 10): -20.90, Avg length: 772.0


Training:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 670051/1000000 [24:27<12:36, 435.92it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]  


Step 670000: Avg reward (last 10): -21.00, Avg length: 793.8


Training:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 680043/1000000 [24:50<12:08, 439.11it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]  


Step 680000: Avg reward (last 10): -21.00, Avg length: 812.2


Training:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 690060/1000000 [25:13<11:47, 438.27it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 690000: Avg reward (last 10): -21.00, Avg length: 794.4


Training:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 700049/1000000 [25:36<11:53, 420.52it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Model saved at step 700000

Step 700000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 710078/1000000 [25:59<11:05, 435.40it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]


Step 710000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 720057/1000000 [26:22<10:39, 437.89it/s, epsilon=0.010, loss=0.0627, ep_reward=N/A]


Step 720000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 730044/1000000 [26:45<10:36, 424.36it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]


Step 730000: Avg reward (last 10): -21.00, Avg length: 788.0


Training:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 740038/1000000 [27:08<09:48, 441.53it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 740000: Avg reward (last 10): -21.00, Avg length: 775.2


Training:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 750036/1000000 [27:31<09:50, 423.28it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 750000: Avg reward (last 10): -21.00, Avg length: 776.0


Training:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 760078/1000000 [27:54<09:07, 438.39it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 760000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 770063/1000000 [28:17<08:50, 433.16it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 770000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 780065/1000000 [28:40<08:24, 436.33it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 780000: Avg reward (last 10): -21.00, Avg length: 768.6


Training:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 790048/1000000 [29:03<07:52, 444.06it/s, epsilon=0.010, loss=0.0600, ep_reward=N/A]  


Step 790000: Avg reward (last 10): -21.00, Avg length: 768.8


Training:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 800039/1000000 [29:26<08:02, 414.77it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]  


Model saved at step 800000

Step 800000: Avg reward (last 10): -20.90, Avg length: 791.9


Training:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 810068/1000000 [29:49<07:17, 434.12it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]


Step 810000: Avg reward (last 10): -21.00, Avg length: 795.6


Training:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 820062/1000000 [30:12<06:54, 434.02it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 820000: Avg reward (last 10): -21.00, Avg length: 772.8


Training:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 830054/1000000 [30:35<06:37, 427.33it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 830000: Avg reward (last 10): -21.00, Avg length: 788.0


Training:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 840060/1000000 [30:58<06:17, 423.19it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]  


Step 840000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 850069/1000000 [31:21<05:42, 438.37it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 850000: Avg reward (last 10): -21.00, Avg length: 791.2


Training:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 860036/1000000 [31:44<05:26, 428.79it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 860000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 870060/1000000 [32:07<04:56, 438.53it/s, epsilon=0.010, loss=0.0006, ep_reward=N/A]


Step 870000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 880079/1000000 [32:30<04:31, 441.62it/s, epsilon=0.010, loss=0.0302, ep_reward=N/A]  


Step 880000: Avg reward (last 10): -21.00, Avg length: 806.0


Training:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 890053/1000000 [32:53<04:17, 427.32it/s, epsilon=0.010, loss=0.0008, ep_reward=N/A]


Step 890000: Avg reward (last 10): -21.00, Avg length: 766.8


Training:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 900060/1000000 [33:16<03:56, 422.11it/s, epsilon=0.010, loss=0.0599, ep_reward=N/A]


Model saved at step 900000

Step 900000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 910057/1000000 [33:39<03:31, 424.30it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 910000: Avg reward (last 10): -20.60, Avg length: 850.2


Training:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 920046/1000000 [34:02<03:03, 436.61it/s, epsilon=0.010, loss=0.0006, ep_reward=N/A]  


Step 920000: Avg reward (last 10): -21.00, Avg length: 854.0


Training:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 930075/1000000 [34:25<02:40, 434.33it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]


Step 930000: Avg reward (last 10): -21.00, Avg length: 782.0


Training:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 940074/1000000 [34:48<02:15, 441.91it/s, epsilon=0.010, loss=0.0006, ep_reward=N/A]


Step 940000: Avg reward (last 10): -20.90, Avg length: 815.7


Training:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 950037/1000000 [35:11<01:54, 436.78it/s, epsilon=0.010, loss=0.0305, ep_reward=N/A]


Step 950000: Avg reward (last 10): -21.00, Avg length: 764.0


Training:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 960067/1000000 [35:34<01:31, 438.01it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]


Step 960000: Avg reward (last 10): -20.90, Avg length: 852.1


Training:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 970067/1000000 [35:57<01:08, 434.09it/s, epsilon=0.010, loss=0.0007, ep_reward=N/A]  


Step 970000: Avg reward (last 10): -21.00, Avg length: 830.1


Training:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 980047/1000000 [36:20<00:46, 432.33it/s, epsilon=0.010, loss=0.0304, ep_reward=N/A]  


Step 980000: Avg reward (last 10): -21.00, Avg length: 770.0


Training:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 990041/1000000 [36:43<00:23, 422.61it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A]  


Step 990000: Avg reward (last 10): -21.00, Avg length: 776.0


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000000/1000000 [37:06<00:00, 449.13it/s, epsilon=0.010, loss=0.0303, ep_reward=N/A] 



Training complete! Model saved to models/dqn_pong.pth
Total episodes: 1276
Final average reward (last 100): -20.98


## Training Analysis & Answers to Your Questions

**Has it learned anything?**
Looking at your training output:
- Average reward: **-20.40 to -21.00** (basically random play - losing every point)
- For Pong, rewards range from -21 (lose all points) to +21 (win all points)
- **The agent hasn't learned much yet** - it's still playing randomly

**Should you increase to 1,000,000 steps?**
**YES!** DQN typically needs:
- **100,000 steps**: Just starting to learn (you're here)
- **500,000 steps**: Beginning to show improvement
- **1,000,000+ steps**: Actually learning to play well
- **10,000,000 steps**: Master-level play

**Why does it "learn so fast"?**
It's not actually learning fast - the **loss is decreasing** (which is good), but the **rewards aren't improving yet**. This is normal! The network is learning the Q-function, but it takes time for that to translate to better gameplay.

**Are we using a premade agent?**
**NO!** We built everything from scratch:
- ‚úÖ Custom Model class (CNN architecture)
- ‚úÖ Custom ReplayBuffer (experience replay)
- ‚úÖ Custom DQNAgent (with target network, epsilon-greedy, etc.)
- ‚úÖ Custom training loop

This is a **fully custom implementation** - not using stable-baselines3 or any pre-made agent!


In [15]:
# Evaluate the trained agent
import ale_py
import numpy as np

# Create evaluation environment
eval_env = gym.make("ALE/Pong-v5")
eval_env = PreprocessAtari(eval_env)
eval_env = FrameStack(eval_env, n_frames=4)

# Load the trained model
model_path = "models/dqn_pong.pth"
if os.path.exists(model_path):
    # Recreate agent with same parameters
    state_shape = eval_env.observation_space.shape
    n_actions = eval_env.action_space.n
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    eval_agent = DQNAgent(
        state_shape=state_shape,
        n_actions=n_actions,
        device=device,
        lr=1e-4,
        gamma=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=10000,
        target_update_freq=1000,
        buffer_size=100000,
        batch_size=32
    )
    eval_agent.load(model_path)
    print(f"‚úÖ Loaded model from {model_path}")
    print(f"   Model was trained for {eval_agent.steps} steps")
else:
    print("‚ùå No model found to evaluate")
    eval_env.close()

# Run evaluation episodes
n_eval_episodes = 10
eval_rewards = []
eval_lengths = []

print(f"\nRunning {n_eval_episodes} evaluation episodes (no exploration, greedy policy)...")
for episode in range(n_eval_episodes):
    state, info = eval_env.reset()
    episode_reward = 0
    episode_length = 0
    
    while True:
        # Use greedy policy (no exploration)
        action = eval_agent.select_action(state, training=False)
        state, reward, terminated, truncated, info = eval_env.step(action)
        done = terminated or truncated
        
        episode_reward += reward
        episode_length += 1
        
        if done:
            break
    
    eval_rewards.append(episode_reward)
    eval_lengths.append(episode_length)
    print(f"Episode {episode+1}: Reward = {episode_reward:+.1f}, Length = {episode_length}")

eval_env.close()

# Print summary
print(f"\n{'='*50}")
print(f"Evaluation Summary ({n_eval_episodes} episodes):")
print(f"  Average Reward: {np.mean(eval_rewards):.2f}")
print(f"  Best Reward: {np.max(eval_rewards):.2f}")
print(f"  Worst Reward: {np.min(eval_rewards):.2f}")
print(f"  Average Length: {np.mean(eval_lengths):.1f} steps")
print(f"{'='*50}")

# Interpretation
avg_reward = np.mean(eval_rewards)
if avg_reward < -19:
    print("üìâ Status: Still playing randomly (needs more training)")
    print("   ‚Üí Increase training to 1,000,000+ steps")
elif avg_reward < -10:
    print("üìà Status: Starting to learn (showing some improvement)")
    print("   ‚Üí Continue training to see more improvement")
elif avg_reward < 0:
    print("üéØ Status: Learning! (better than random)")
    print("   ‚Üí Keep training to reach positive rewards")
elif avg_reward < 10:
    print("üèÜ Status: Playing well! (winning some games)")
    print("   ‚Üí Excellent progress!")
else:
    print("üåü Status: Master level! (consistently winning)")


‚úÖ Loaded model from models/dqn_pong.pth
   Model was trained for 100000 steps

Running 10 evaluation episodes (no exploration, greedy policy)...
Episode 1: Reward = -21.0, Length = 764
Episode 2: Reward = -21.0, Length = 764
Episode 3: Reward = -21.0, Length = 764
Episode 4: Reward = -21.0, Length = 764
Episode 5: Reward = -21.0, Length = 764
Episode 6: Reward = -21.0, Length = 764
Episode 7: Reward = -21.0, Length = 764
Episode 8: Reward = -21.0, Length = 764
Episode 9: Reward = -21.0, Length = 764
Episode 10: Reward = -21.0, Length = 764

Evaluation Summary (10 episodes):
  Average Reward: -21.00
  Best Reward: -21.00
  Worst Reward: -21.00
  Average Length: 764.0 steps
üìâ Status: Still playing randomly (needs more training)
   ‚Üí Increase training to 1,000,000+ steps


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    DQN Model for Atari games.
    Takes stacked frames as input and outputs Q-values for each action.
    """
    def __init__(self, n_actions, n_frames=4):
        """
        Args:
            n_actions: Number of possible actions (e.g., 6 for Pong)
            n_frames: Number of stacked frames (default: 4)
        """
        super(Model, self).__init__()
        
        # Convolutional layers to process the image frames
        self.conv1 = nn.Conv2d(n_frames, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # Calculate the size of the flattened feature map
        # Input shape: (n_frames, 84, 84) after preprocessing (or 210x160x3 raw)
        # After conv layers, we need to calculate the output size
        # For standard Atari preprocessing (84x84), the output is 7x7x64
        self.fc1 = nn.Linear(7 * 7 * 64, 512)
        self.fc2 = nn.Linear(512, n_actions)
        
    def forward(self, x):
        """
        Forward pass through the network.
        
        Args:
            x: Input tensor of shape (batch_size, n_frames, height, width)
        
        Returns:
            Q-values for each action, shape (batch_size, n_actions)
        """
        # Apply convolutional layers with ReLU activation
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        
        # Flatten the feature map
        x = x.view(x.size(0), -1)
        
        # Apply fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x