
# LunarLander RL Template: Q-Learning and DQN introduction

This notebook provides a reinforcement learning scaffold for **LunarLander-v3** (discrete) in **Gymnasium**.

You will be introduced to 2 different approaches:
- **A) Discretized Tabular Q-Learning**: bins the continuous state into discrete buckets, then learns a dictionary-based Q-table.
- **B) DQN (Deep Q-Network)**: uses your **MLP** to approximate the Q-function, with experience replay and a target network.

You will notice one will perform much better than the other.

In [1]:
#@title Environment setup for Colab
# This cell sets up all dependencies for the LunarLander notebook on Google Colab.
import sys, subprocess, os

def pipi(*args):
    subprocess.check_call([sys.executable, "-m", "pip", "install", *args])

# Upgrade base tools
pipi("--upgrade", "pip", "setuptools", "wheel")

# Gymnasium + Box2D
try:
    pipi("gymnasium[box2d]")
    try:
        pipi("box2d-py")
    except subprocess.CalledProcessError:
        print("[install] box2d-py failed; using Box2D fallback.")
        pipi("Box2D")
except subprocess.CalledProcessError:
    print("[install] gymnasium[box2d] failed; trying gymnasium and Box2D separately.")
    pipi("gymnasium")
    pipi("Box2D")

# Rendering / video / DL
pipi("pygame", "imageio", "imageio-ffmpeg", "matplotlib", "torch")

print("Setup complete. Continue below.")


[install] box2d-py failed; using Box2D fallback.
Setup complete. Continue below.


In [2]:

import os, glob, random, math, time
from dataclasses import dataclass
from typing import Callable, Optional, Tuple, Dict

import numpy as np
import gymnasium as gym
from gymnasium.wrappers import RecordVideo

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm

from IPython.display import Video, display

# ==== Global Configuration ====
ENV_ID = "LunarLander-v3"       # Discrete environment
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# On Colab, store videos in /content for easy preview
VIDEO_DIR = "/content/videos"
os.makedirs(VIDEO_DIR, exist_ok=True)

MAX_STEPS = 2500
RNG = np.random.default_rng(SEED)

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)

print(f"Using device: {DEVICE}")


Using device: cuda


### Some helper functions

In [3]:

def make_env(env_id: str = ENV_ID, seed: int = SEED, render_mode: Optional[str] = None):
    """Create and seed a Gymnasium environment with a safe fallback."""
    try:
        env = gym.make(env_id, render_mode=render_mode)
    except Exception as e:
        print(f"[make_env] Could not create '{env_id}' ({e}). Falling back to 'LunarLander-v3'.")
        env = gym.make("LunarLander-v3", render_mode=render_mode)

    try:
        env.reset(seed=seed)
    except TypeError:
        env.reset()
    try:
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
    except Exception:
        pass
    return env


def rollout_episode(env, policy: Callable, max_steps: int = MAX_STEPS, capture_frames: bool = False):
    """Run a single episode with the provided policy callable: action = policy(obs, action_space)."""
    frames = []
    obs, info = env.reset(seed=SEED)
    total_reward = 0.0
    for t in range(max_steps):
        if capture_frames and hasattr(env, "render"):
            frame = env.render()
            if frame is not None:
                frames.append(frame)
        action = policy(obs, env.action_space)
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += float(reward)
        if terminated or truncated:
            break
    return total_reward, frames


In [4]:

from collections import deque

@dataclass
class RewardTracker:
    window: int = 100
    def __post_init__(self):
        self.history = []
        self._dq = deque(maxlen=self.window)
    def update(self, ret: float):
        self.history.append(ret)
        self._dq.append(ret)
    @property
    def moving_avg(self) -> float:
        return float(np.mean(self._dq)) if self._dq else 0.0


In [5]:

def random_policy(obs, action_space):
    return action_space.sample()

def record_and_display(policy_fn: Callable, env_id: str = ENV_ID, seed: int = SEED,
                       video_dir: str = VIDEO_DIR, max_steps: int = MAX_STEPS):
    os.makedirs(video_dir, exist_ok=True)
    env = make_env(env_id, seed=seed, render_mode="rgb_array")
    env = RecordVideo(env, video_dir, episode_trigger=lambda e: True, name_prefix="demo")
    total, _ = rollout_episode(env, policy_fn, max_steps=max_steps, capture_frames=False)
    env.close()

    mp4s = sorted(glob.glob(os.path.join(video_dir, "*.mp4")))
    latest = mp4s[-1] if mp4s else None
    print(f"Episode return: {total:.2f}")
    if latest:
        display(Video(latest, embed=True, html_attributes="controls loop autoplay"))
    else:
        print("No video found. If on Colab, ensure imageio-ffmpeg is installed.")
    return total, latest

# Example video of a random policy
_ = record_and_display(random_policy)


  from pkg_resources import resource_stream, resource_exists
  logger.warn(
  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"


Episode return: -351.16


In [6]:
def select_action(agent, obs, action_space, epsilon: float = 0.0):
  return agent.select_action(obs, action_space, epsilon)

def to_tensor(obs: np.ndarray) -> torch.Tensor:
    return torch.as_tensor(obs, dtype=torch.float32, device=DEVICE).unsqueeze(0)

In [7]:
def record_agent_video(agent: nn.Module, env_id: str = ENV_ID, seed: int = SEED,
                       video_dir: str = VIDEO_DIR, max_steps: int = MAX_STEPS):
    os.makedirs(video_dir, exist_ok=True)
    env = make_env(env_id, seed=seed, render_mode="rgb_array")
    env = RecordVideo(env, video_dir, episode_trigger=lambda e: True, name_prefix="dqn")
    total = 0.0
    obs, _ = env.reset(seed=seed)
    for _ in range(max_steps):
        obs = to_tensor(obs)
        a = select_action(agent, obs, env.action_space, epsilon=0.0)
        obs, r, term, trunc, _ = env.step(a)
        total += float(r)
        if term or trunc:
            break
    env.close()
    mp4s = sorted(glob.glob(os.path.join(video_dir, "*.mp4")))
    latest = mp4s[-1] if mp4s else None
    print(f"Agent video return: {total:.2f}")
    if latest:
        display(Video(latest, embed=True, html_attributes="controls loop autoplay"))
    return total, latest



## A) Tabular Q-Learning (with state discretization)

LunarLander observations are continuous. We **bin** each dimension into a small number of buckets to get a discrete state key. Then we apply vanilla Q-learning:
```
Q[s,a] ← Q[s,a] + α (r + γ max_a' Q[s',a'] − Q[s,a])
```
Good for learning dynamics, but **DQN**s are preferred for function approximation and scalability.

See what results you can get with the tabular Q-learning approach.


In [8]:
class Discretizer:
    """Uniformly discretize each observation dimension into `bins` buckets."""
    def __init__(self, low: np.ndarray, high: np.ndarray, bins: int = 8):
        self.bins = bins
        low = np.where(np.isfinite(low), low, -1.0) # ensure bounds are not infinite ranges
        high = np.where(np.isfinite(high), high, 1.0)
        self.low = low
        self.high = high

    # converts an observation to a discretized key that can be used to access q-values in a q-table
    def encode(self, obs: np.ndarray) -> Tuple[int, ...]:
        ratios = (obs - self.low) / (self.high - self.low + 1e-8)
        ratios = np.clip(ratios, 0.0, 1.0)
        idxs = (ratios * self.bins).astype(int)
        idxs = np.clip(idxs, 0, self.bins - 1)
        return tuple(int(i) for i in idxs)

In [11]:
from collections import defaultdict

class LunarLanderAgent:
    def __init__(
        self,
        env: gym.Env,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Initialize a Q-Learning agent.

        Args:
            env: The training environment
            learning_rate: How quickly to update Q-values (0-1)
            initial_epsilon: Starting exploration rate (usually 1.0)
            epsilon_decay: How much to reduce epsilon each episode
            final_epsilon: Minimum exploration rate (usually 0.1)
            discount_factor: How much to value future rewards (0-1)
        """
        self.env = env

        self.disc = Discretizer(env.observation_space.low, env.observation_space.high)

        # Q-table: maps (state, action) to expected reward
        # defaultdict automatically creates entries with zeros for new states
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        # the point of the q table is to ACCURATELY map a certain action to its consequence

        self.lr = learning_rate
        self.discount_factor = discount_factor  # How much we care about future rewards

        # Exploration parameters
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        # Track learning progress
        self.training_error = []

    def select_action(self, obs: tuple[int, int, bool], action_space = None, epsilon = None) -> int:
        """Choose an action using epsilon-greedy strategy.

        Returns:
            action: 0, 1, 2, or 3 (left, right, up, down)
        """
        if (self.epsilon < np.random.random()):
          # select a random action
          return self.env.action_space.sample()

        # choose the most optimal action
        discretized_obs = self.disc.encode(obs) # need to discretize the observation in order to find the corresponding action in our q-table
        optimal_action = np.argmax(self.q_values[discretized_obs]) # q-table key is a discretized observation.
                                                                   # value is a list of the possible rewards indexed by each action state
        return optimal_action

    def update(
        self,
        obs: tuple[int, int, bool], # basically the state of the agent right now in the environment (e.g. speed, x/y position, etc.)
        action: int,
        reward: float, # numerical reward for takingn a certain action
        terminated: bool,
        next_obs: tuple[int, int, bool], # state of the agent after taking "action"
    ):
        """Update Q-value based on experience.

        This is the heart of Q-learning: learn from (state, action, reward, next_state)
        """
        obs = self.disc.encode(obs)
        next_obs = self.disc.encode(next_obs)

        # What should the Q-value be? (Bellman equation, defined as V(s) = max_a(R(s, a) + g*V(s')))
        # a simplified version would be V(s) = R(s, a) + max(g*V(s')), which is basically saying that
        # the expected value is equal the reward of the current state and action + the max/best q-value possible for the next state * some discount factor

        # What's the best we could do from the next state?
        # (Zero if episode terminated - no future rewards possible)
        best_next_q = future_q_value = (not terminated) * np.max(self.q_values[next_obs])  # calculate the best q-value possible for the next state that will be used in the Bellman equation

        target = reward + self.discount_factor * best_next_q # calculate target value using the Bellman equation
        # taerget is what the q-value for obs SHOULD be

        # How wrong was our current estimate?
        temporal_difference = target - self.q_values[obs][action]

        # Update our estimate in the direction of the error
        # Learning rate controls how big steps we take
        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )

        # Track learning progress (useful for debugging)
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Reduce exploration rate after each episode."""
        self.epsilon *= self.epsilon_decay

In [9]:
from tqdm import tqdm

def train_q_agent(agent, n_episodes):
  for episode in tqdm(range(n_episodes)):
      # Start a new landing
      obs, info = env.reset()
      done = False

      while not done:
          # Agent chooses action (initially random, gradually more intelligent)
          action = agent.select_action(obs)

          # Take action and observe result
          next_obs, reward, terminated, truncated, info = env.step(action)

          # Learn from this experience
          agent.update(obs, action, reward, terminated, next_obs)

          # Move to next state
          done = terminated or truncated
          obs = next_obs

      if episode % 1000 == 0:
        print("Reward:", reward)

      # Reduce exploration rate (agent becomes less random over time)
      agent.decay_epsilon()

In [10]:
# Training hyperparameters
learning_rate = 0.01        # @param How fast to learn (higher = faster but less stable)
n_episodes = 50_000        # @param Number of landings to practice (may need to increase this)
start_epsilon = 1.0         # @param Start with 100% random actions
epsilon_decay = 0.997 # @param Reduce exploration over time
min_epsilon = 0.1         # @param Always keep some exploration

# Create environment and agent
env = make_env(ENV_ID, SEED, render_mode=None)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=n_episodes)

q_agent = LunarLanderAgent(
    env=env,
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=min_epsilon,
)

NameError: name 'LunarLanderAgent' is not defined

In [None]:
train_q_agent(q_agent, n_episodes)

  0%|          | 15/50000 [00:00<21:18, 39.10it/s]

Reward: -100


  2%|▏         | 1014/50000 [00:17<12:05, 67.55it/s]

Reward: -100


  4%|▍         | 2020/50000 [00:29<08:25, 94.97it/s]

Reward: -100


  6%|▌         | 3011/50000 [00:41<08:46, 89.31it/s]

Reward: -100


  8%|▊         | 4019/50000 [01:00<09:25, 81.28it/s]

Reward: -100


 10%|█         | 5021/50000 [01:11<07:42, 97.35it/s]

Reward: -100


 12%|█▏        | 6014/50000 [01:23<09:06, 80.51it/s]

Reward: -100


 14%|█▍        | 7017/50000 [01:35<09:24, 76.11it/s]

Reward: -100


 16%|█▌        | 8019/50000 [01:49<07:35, 92.19it/s]

Reward: -100


 18%|█▊        | 9011/50000 [02:04<07:27, 91.63it/s]

Reward: -100


 20%|██        | 10017/50000 [02:19<07:32, 88.45it/s]

Reward: -100


 22%|██▏       | 11013/50000 [02:33<06:56, 93.52it/s]

Reward: -100


 24%|██▍       | 12010/50000 [02:45<06:47, 93.34it/s]

Reward: -100


 26%|██▌       | 13011/50000 [02:57<06:29, 95.05it/s]

Reward: -100


 28%|██▊       | 14012/50000 [03:09<06:26, 93.18it/s]

Reward: -100


 30%|███       | 15013/50000 [03:20<06:38, 87.79it/s]

Reward: -100


 32%|███▏      | 16018/50000 [03:35<06:05, 93.00it/s]

Reward: -100


 34%|███▍      | 17019/50000 [03:46<05:47, 94.92it/s]

Reward: -100


 36%|███▌      | 18020/50000 [03:58<05:33, 96.01it/s]

Reward: -100


 38%|███▊      | 19009/50000 [04:09<06:16, 82.38it/s]

Reward: -100


 40%|████      | 20013/50000 [04:21<05:36, 89.13it/s]

Reward: -100


 42%|████▏     | 21008/50000 [04:34<08:56, 54.04it/s]

Reward: -100


 44%|████▍     | 22012/50000 [04:50<05:21, 87.13it/s]

Reward: -100


 46%|████▌     | 23009/50000 [05:02<05:43, 78.47it/s]

Reward: -100


 48%|████▊     | 24007/50000 [05:17<10:32, 41.10it/s]

Reward: -100


 50%|█████     | 25010/50000 [05:33<04:46, 87.21it/s]

Reward: -100


 52%|█████▏    | 26016/50000 [05:45<04:21, 91.62it/s]

Reward: -100


 54%|█████▍    | 27010/50000 [05:59<04:59, 76.64it/s]

Reward: -100


 56%|█████▌    | 28010/50000 [06:12<04:15, 85.96it/s]

Reward: -100


 58%|█████▊    | 29013/50000 [06:26<03:51, 90.72it/s]

Reward: -100


 60%|██████    | 30011/50000 [06:40<04:36, 72.20it/s]

Reward: -100


 62%|██████▏   | 31009/50000 [06:55<04:34, 69.23it/s]

Reward: -100


 64%|██████▍   | 32018/50000 [07:10<04:54, 61.11it/s]

Reward: -100


 66%|██████▌   | 33017/50000 [07:22<04:09, 68.13it/s]

Reward: -100


 68%|██████▊   | 34013/50000 [07:34<03:41, 72.26it/s]

Reward: -100


 70%|███████   | 35010/50000 [07:46<04:07, 60.45it/s]

Reward: -100


 72%|███████▏  | 36009/50000 [07:58<03:39, 63.82it/s]

Reward: -100


 74%|███████▍  | 37007/50000 [08:09<03:29, 61.90it/s]

Reward: -100


 76%|███████▌  | 38016/50000 [08:28<03:48, 52.56it/s]

Reward: -100


 78%|███████▊  | 39015/50000 [08:42<02:00, 91.45it/s]

Reward: -100


 80%|████████  | 40017/50000 [08:54<01:52, 88.78it/s]

Reward: -100


 82%|████████▏ | 41014/50000 [09:06<01:42, 87.52it/s]

Reward: -100


 84%|████████▍ | 42012/50000 [09:18<01:24, 94.14it/s]

Reward: -100


 86%|████████▌ | 43018/50000 [09:30<01:20, 87.23it/s]

Reward: -100


 88%|████████▊ | 44016/50000 [09:42<01:02, 95.02it/s]

Reward: -100


 90%|█████████ | 45010/50000 [09:54<01:03, 78.26it/s]

Reward: -100


 92%|█████████▏| 46014/50000 [10:06<00:46, 86.11it/s]

Reward: -100


 94%|█████████▍| 47021/50000 [10:18<00:33, 88.17it/s]

Reward: -100


 96%|█████████▌| 48018/50000 [10:30<00:21, 93.75it/s]

Reward: -100


 98%|█████████▊| 49020/50000 [10:42<00:10, 96.44it/s]

Reward: -100


100%|██████████| 50000/50000 [10:54<00:00, 76.41it/s]


### Visualize your tabular Q-learning lunar lander


In [None]:
record_agent_video(q_agent)

  logger.warn(


Agent video return: -207.11


(-207.11096966756867, '/content/videos/dqn-episode-0.mp4')


## B) Double DQN (Deep Q-Network)

We train a "policy" Q-network (`DQNModel`) with:
- **Experience Replay** buffer
- **Target Network** (periodically updated)
  - this separate network is used to calculate target Q-values, which decouples action selection (done by the policy network) and action evaluation
  - typically less prone to overestimating Q-values than standard single network DQN
- **ε-greedy** exploration
- **Huber loss** and **Adam** optimizer

Complete any TODOs.




Why 2 networks is better than 1:

### Bad Teacher (One Network)
```
Teacher: "2 + 2 = 4"
*Student starts learning*
Teacher: "Actually 2 + 2 = 7" (teacher also learning, changes answer)
*Student confused*
Teacher: "No wait, 2 + 2 = 15" (keeps changing)
Student: *Gives up, learns nothing*
```

### Good Teacher (Two Networks)
```
Teacher: "2 + 2 = 5" (wrong, but confident and consistent)
*Student practices for 1000 problems using this rule*
Student learns to consistently predict 5

Teacher updates knowledge: "2 + 2 = 4.5" (closer to truth)
*Student practices another 1000 problems*
Student improves to predict 4.5

Eventually: Both converge to 4 (truth)

In [12]:
class DQNModel(nn.Module):
    """Q-network architecture.

    For LunarLander:
      - Input: observation vector shape [8]
      - Output: Q-values for each action (shape [n_actions], 4)


    Purpose of the Network: given an observation, output the best action to take
      - better than tabular q-learning
          tabular has a limited amount of states that it can accomodate - since it's a fixed table
          in dqn, we replace the table with a network, which is able to accomodate any observation
    """
    def __init__(self, obs_dim: int, n_actions: int, hidden: int = 128):
        super().__init__()

        # simple MLP
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden), # input -> hidden layer 1
            nn.ReLU(), # activation
            nn.Linear(hidden, hidden), # hidden layer 1 -> hidden layer 2
            nn.ReLU(),
            nn.Linear(hidden, n_actions) # hidden layer 2 -> output
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
      return self.net(x)

    def select_action(self, obs: np.ndarray, action_space, epsilon: float = 0.0) -> int:
      """Map model outputs to a valid discrete action.

      - With probability epsilon, choose a random action.
      - Otherwise, choose argmax Q-value from the model.
      """
      if (np.random.random() < epsilon):
        return action_space.sample();

      # choose best action
      output = self.forward(obs)
      return output.argmax().item(); # torchtensor.argmax() retruns the index of the largest item in the torch tensor, item() converts to int
      # hint: similar to how you implemented epsilon-greedy in tabular q-learning, but how do you find the max q-value from the model?
      # think about what the network represents or approximates



In [None]:
from collections import namedtuple

Transition = namedtuple(
    "Transition", ["state", "action", "next_state", "reward", "done"]
)

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return (
            random.sample(self.memory, batch_size)
            if batch_size < len(self.memory)
            else self.memory
        )

    def __len__(self):
        return len(self.memory)

In [None]:
from dataclasses import dataclass

@dataclass
class DQNConfig:
  num_episodes: int = 500 # @param
  gamma: float = 0.99 # @param
  learning_rate: float = 1e-4 # @param
  tau: float = 0.005 # @param
  batch_size: int = 128 # @param
  epsilon: float = 1.0 # @param
  epsilon_decay: float = 0.995 # @param
  epsilon_min: float = 0.01 # @param
  eval_interval: int = 100 # @param
  eval_episodes: int = 5 # @param
  max_grad_norm: float = 10.0 # @param


In [None]:
# def linear_epsilon(step: int, start: float, end: float, decay_steps: int) -> float:
#     if step >= decay_steps:
#         return end
#     return start + (end - start) * (step / float(decay_steps))

@torch.no_grad()
def evaluate_model(model: nn.Module, episodes: int = 5, env_id: str = ENV_ID, seed: int = SEED):
    scores = []
    for ep in range(episodes):
        env = make_env(env_id, seed=seed + ep, render_mode=None)
        total = 0.0
        obs, _ = env.reset(seed=seed + ep)
        for _ in range(MAX_STEPS):
            obs = to_tensor(obs)
            a = select_action(model, obs, env.action_space, epsilon=0.0)
            obs, r, term, trunc, _ = env.step(a)
            total += float(r)
            if term or trunc:
                break
        env.close()
        scores.append(total)
    mean_ret = float(np.mean(scores))
    print(f"Eval over {episodes} episodes — mean return: {mean_ret:.2f}")
    return mean_ret

def dqn_train(cfg: DQNConfig):
    # Initialize the environment
    env = gym.make(ENV_ID, render_mode="human")

    n_observations = env.observation_space.shape[0]
    n_actions = env.action_space.n

    policy_net = DQNModel(n_observations, n_actions).to(DEVICE)
    target_net = DQNModel(n_observations, n_actions).to(DEVICE)
    target_net.load_state_dict(policy_net.state_dict())

    replay_memory = ReplayMemory(10_000)
    """
    what's the point of the replay memory?
      replay memory allows the network to randomly sample a previous experience - basically ensures that as its learning new experiences, it doesn't forget old ones
      without replay memory - the network will overfit to a specific sequence of experiences (state 2 always comes after state 1).
      Also, without replay memory - the network will forget early experience as it learns/fine-tunes parameters to new ones

    """

    optimizer = optim.AdamW(policy_net.parameters(), lr=cfg.learning_rate)
    # smooth l1 loss is implementation of Huber loss (MSE for small errors, L1 for larger errors)
    criterion = nn.SmoothL1Loss()

    epsilon = cfg.epsilon

    best_reward = float("-inf")

    for episode in tqdm(range(cfg.num_episodes)):
      state, _ = env.reset(seed=SEED)
      state = to_tensor(state)
      tracker = RewardTracker()
      total_reward = 0.0

      while True:
        action = select_action(policy_net, state, env.action_space, epsilon)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        reward = torch.tensor([reward], device=DEVICE)
        next_state = to_tensor(next_state)
        replay_memory.push(state, torch.tensor([[action]], device=DEVICE), next_state, reward, done)

        state = next_state
        total_reward += reward

        # check that we have enough memories available to sample one
        if len(replay_memory) >= cfg.batch_size:
            transitions = replay_memory.sample(cfg.batch_size)
            states, actions, next_states, rewards, dones = zip(*transitions)

            states_batch = torch.cat(states)
            next_states_batch = torch.cat(next_states)
            actions_batch = torch.cat(actions)
            rewards = torch.tensor(rewards, device=DEVICE)
            dones = torch.tensor(dones, device=DEVICE)

            # target network calculates target q-values (action evaluation)
            q_target = (
                cfg.gamma * target_net(next_states_batch).detach().max(-1)[0] * ~dones
                + rewards
            )

            # policy network determines action selection - updated more often
            q_policy = policy_net(states_batch).gather(1, actions_batch)

            # Calculate the Huber loss (remember that the Huber loss behaves like MSE for errors < q_target and L1 for errors > q_target)
            loss = criterion(q_policy, q_target.unsqueeze(1))

            optimizer.zero_grad()
            loss.backward()

            # In-place gradient clipping to stabilize training
            torch.nn.utils.clip_grad_norm_(policy_net.parameters(), cfg.max_grad_norm) # get the parameters of the policy net
            # norm is a vector symbolizing the change in the parameters
            # we don't want to drastically affect the parameters, we want to progress gradually, so we clip the gradients

            optimizer.step()

        # Update target network (target network is updated at the same rate as the policy network, but the updates are far smaller
        # updated by copying weights from the policy network and making it smaller/FAR smaller increments (multiplying by tau)
        for target_param, main_param in zip(target_net.parameters(), policy_net.parameters()):
            target_param.data.copy_(cfg.tau * main_param.data + (1 - cfg.tau) * target_param.data)

        if done:
            tracker.update(total_reward)
            if total_reward > best_reward:
                best_reward = total_reward
                torch.save(policy_net.state_dict(), "best_policy.pth")
            if episode % 25 == 0:
                print(f"Episode {episode}, Reward: {total_reward}, Epsilon: {epsilon:.3f}")
            if episode % cfg.eval_interval == 0:
                evaluate_model(policy_net, episodes=cfg.eval_episodes)
            break


      epsilon *= cfg.epsilon_decay # decay epsilon to reduce random decision making over time
      if (epsilon < min_epsilon):
        epsilon = min_epsilon;

    env.close()
    return policy_net, target_net, replay_memory, tracker



In [None]:
# default config should work well, but you are encouraged to try different hyperparams;
# e.g., increasing the number of episodes or experimenting with decay rate
cfg = DQNConfig()

In [None]:
# will raise NotImplementedError until you implement the TODOs in DQNModel
policy_net, target_net, rb, tracker = dqn_train(cfg)

  0%|          | 1/500 [00:02<21:30,  2.59s/it]

Episode 0, Reward: tensor([-202.4721], device='cuda:0', dtype=torch.float64), Epsilon: 1.000
Eval over 5 episodes — mean return: -167.86


  5%|▌         | 26/500 [01:04<20:04,  2.54s/it]

Episode 25, Reward: tensor([-122.7987], device='cuda:0', dtype=torch.float64), Epsilon: 0.882


 10%|█         | 51/500 [02:10<21:00,  2.81s/it]

Episode 50, Reward: tensor([-78.6794], device='cuda:0', dtype=torch.float64), Epsilon: 0.778


 15%|█▌        | 76/500 [03:14<19:06,  2.70s/it]

Episode 75, Reward: tensor([-78.7171], device='cuda:0', dtype=torch.float64), Epsilon: 0.687


 20%|██        | 100/500 [04:16<16:34,  2.49s/it]

Episode 100, Reward: tensor([-27.3364], device='cuda:0', dtype=torch.float64), Epsilon: 0.606


 20%|██        | 101/500 [04:20<18:15,  2.75s/it]

Eval over 5 episodes — mean return: -128.47


 25%|██▌       | 126/500 [05:22<16:57,  2.72s/it]

Episode 125, Reward: tensor([6.6593], device='cuda:0', dtype=torch.float64), Epsilon: 0.534


 30%|███       | 151/500 [06:30<15:44,  2.71s/it]

Episode 150, Reward: tensor([-2.2614], device='cuda:0', dtype=torch.float64), Epsilon: 0.471


 35%|███▌      | 176/500 [07:56<15:10,  2.81s/it]

Episode 175, Reward: tensor([-7.1898], device='cuda:0', dtype=torch.float64), Epsilon: 0.416


 40%|████      | 200/500 [12:03<44:37,  8.93s/it]

Episode 200, Reward: tensor([-5.9410], device='cuda:0', dtype=torch.float64), Epsilon: 0.367


 40%|████      | 201/500 [12:08<38:16,  7.68s/it]

Eval over 5 episodes — mean return: -48.08


 45%|████▌     | 226/500 [16:48<1:06:41, 14.61s/it]

Episode 225, Reward: tensor([79.2949], device='cuda:0', dtype=torch.float64), Epsilon: 0.324


 50%|█████     | 251/500 [22:08<51:24, 12.39s/it]

Episode 250, Reward: tensor([131.5366], device='cuda:0', dtype=torch.float64), Epsilon: 0.286


 55%|█████▌    | 276/500 [25:06<17:49,  4.77s/it]

Episode 275, Reward: tensor([2.2172], device='cuda:0', dtype=torch.float64), Epsilon: 0.252


 60%|██████    | 300/500 [29:17<58:46, 17.63s/it]

Episode 300, Reward: tensor([158.0073], device='cuda:0', dtype=torch.float64), Epsilon: 0.222


 60%|██████    | 301/500 [29:38<1:01:46, 18.63s/it]

Eval over 5 episodes — mean return: 63.35


 65%|██████▌   | 326/500 [34:10<47:59, 16.55s/it]

Episode 325, Reward: tensor([148.2658], device='cuda:0', dtype=torch.float64), Epsilon: 0.196


 70%|███████   | 351/500 [38:19<10:07,  4.08s/it]

Episode 350, Reward: tensor([-12.5994], device='cuda:0', dtype=torch.float64), Epsilon: 0.173


 75%|███████▌  | 376/500 [39:44<05:46,  2.79s/it]

Episode 375, Reward: tensor([37.9656], device='cuda:0', dtype=torch.float64), Epsilon: 0.153


 80%|████████  | 400/500 [42:14<10:29,  6.30s/it]

Episode 400, Reward: tensor([44.4481], device='cuda:0', dtype=torch.float64), Epsilon: 0.135


 80%|████████  | 401/500 [42:17<09:07,  5.53s/it]

Eval over 5 episodes — mean return: 86.28


 85%|████████▌ | 426/500 [44:33<07:28,  6.06s/it]

Episode 425, Reward: tensor([247.5941], device='cuda:0', dtype=torch.float64), Epsilon: 0.119


 90%|█████████ | 451/500 [46:39<02:42,  3.31s/it]

Episode 450, Reward: tensor([33.1427], device='cuda:0', dtype=torch.float64), Epsilon: 0.105


 95%|█████████▌| 476/500 [48:46<01:44,  4.37s/it]

Episode 475, Reward: tensor([16.0930], device='cuda:0', dtype=torch.float64), Epsilon: 0.092


100%|██████████| 500/500 [50:43<00:00,  6.09s/it]


In [None]:
best_policy_net_state = torch.load("best_policy.pth")
best_policy_net = DQNModel(8, 4).to(DEVICE)
best_policy_net.load_state_dict(best_policy_net_state)

<All keys matched successfully>

### Visualize your lunar lander


In [None]:

# print(f"Final moving average: {tr.moving_avg:.2f}")
record_agent_video(best_policy_net)


  logger.warn(


Agent video return: 219.50


(219.5039913449972, '/content/videos/dqn-episode-0.mp4')