In [1]:
%load_ext autoreload
%autoreload 2

import time
import argparse

from agents.dqn_agent import DQNAgent
import env_configs

import os
import time

import gymnasium as gym
from gymnasium import wrappers
import numpy as np
import torch
from infrastructure import pytorch_util as ptu
import tqdm

from infrastructure import utils
from infrastructure.logger import Logger
from infrastructure.replay_buffer import MemoryEfficientReplayBuffer, ReplayBuffer

from scripting_utils import make_logger, make_config

MAX_NVIDEO = 2

## Defining the Arguments

In [2]:
class Args:
  """Run settings for this notebook, exposed as plain attributes.

  Every setting is a keyword argument whose default equals the value that
  used to be hard-coded here, so ``Args()`` behaves exactly as before while
  e.g. ``Args(seed=2, no_gpu=True)`` overrides individual fields.
  """

  def __init__(
    self,
    config_file="experiments/dqn/car_racing.yaml",
    metrics_interval=10000,
    video_interval=50000,
    num_eval_trajectories=10,
    num_render_trajectories=1,
    seed=1,
    no_gpu=False,
    which_gpu=0,
    log_interval=1000,
  ):
    # Path of the experiment config handed to make_config().
    self.config_file = config_file
    # Evaluation rollouts run on steps that are a multiple of this value.
    self.metrics_interval = metrics_interval
    # Video rollouts run on steps that are a multiple of this value.
    self.video_interval = video_interval
    # Number of trajectories sampled per evaluation.
    self.num_eval_trajectories = num_eval_trajectories
    # Number of trajectories rendered per video dump (0 disables videos).
    self.num_render_trajectories = num_render_trajectories
    # Seed passed to NumPy and PyTorch.
    self.seed = seed
    # When True, the GPU is not requested from ptu.init_gpu().
    self.no_gpu = no_gpu
    # GPU index passed to ptu.init_gpu().
    self.which_gpu = which_gpu
    # Training scalars are logged on steps that are a multiple of this value.
    self.log_interval = log_interval

# Instantiate the notebook-level settings with their default values.
args = Args()

# Prefix used when creating the directory for logging.
logdir_prefix = "hw3_dqn_"  # keep for autograder

# Load the experiment configuration from the configured file, then build the
# logger from the prefix and that configuration.
config = make_config(args.config_file)
logger = make_logger(logdir_prefix, config)

########################
logging outputs to  /Users/karl/development/rl-testing/data/hw3_dqn_dqn_CarRacing-v2_d0.99_tu2000_lr0.0001_doubleq_clip10.0_05-05-2024_22-41-26
########################


In [3]:
# Seed NumPy's global RNG and PyTorch with the same configured seed.
# NOTE(review): the environments are not seeded in this cell — confirm
# whether config["make_env"] takes care of that.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# Initialise the compute device; the GPU is requested unless args.no_gpu is set.
ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

Using CPU.


In [4]:
# Build three environments from the same factory: one stepped by the training
# loop, one passed to the evaluation rollouts, and one created with
# render=True that is passed to the video rollouts.
env = config["make_env"]()
eval_env = config["make_env"]()
render_env = config["make_env"](render=True)

# Object queried as exploration_schedule.value(step) to obtain epsilon.
exploration_schedule = config["exploration_schedule"]
discrete = isinstance(env.action_space, gym.spaces.Discrete)

# The agent is constructed with env.action_space.n, which requires a
# Discrete action space.
assert discrete, "DQN only supports discrete action spaces"

CarRacing-v2
False
CarRacing-v2
False
CarRacing-v2
True


In [6]:
# Construct the DQN agent from the observation shape, the number of discrete
# actions, and the agent keyword arguments supplied by the config.
agent = DQNAgent(
  env.observation_space.shape,
  env.action_space.n,
  **config["agent_kwargs"],
)

(4, 96, 96)
Sequential(
  (0): PreprocessCarRacing()
  (1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
  (2): ReLU()
  (3): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (4): ReLU()
  (5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Flatten(start_dim=1, end_dim=-1)
  (8): Linear(in_features=4096, out_features=512, bias=True)
  (9): ReLU()
  (10): Linear(in_features=512, out_features=5, bias=True)
)
(4, 96, 96)
Sequential(
  (0): PreprocessCarRacing()
  (1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
  (2): ReLU()
  (3): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (4): ReLU()
  (5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Flatten(start_dim=1, end_dim=-1)
  (8): Linear(in_features=4096, out_features=512, bias=True)
  (9): ReLU()
  (10): Linear(in_features=512, out_features=5, bias=True)
)


In [7]:
# Frames per second used when saving rollout videos.
# Resolution order: the inverse of the simulation timestep if the env exposes
# a `model` attribute, else the wrapped env's "render_fps" metadata entry,
# else a fallback of 4.
# NOTE(review): `env.model.opt.timestep` looks like a MuJoCo-style model — confirm.
if "model" in dir(env):
  fps = 1 / env.model.opt.timestep
elif "render_fps" in env.env.metadata:
  fps = env.env.metadata["render_fps"]
else:
  fps = 4

# Episode step limit taken from the env spec; passed to the evaluation and
# video rollouts as their maximum length.
ep_len = env.spec.max_episode_steps

# Pick the replay buffer implementation from the rank of the observation space:
# rank 3 -> frame-stacked observations stored in the memory-efficient buffer,
# rank 1 -> flat observations stored in the regular buffer.
_obs_rank = len(env.observation_space.shape)

if _obs_rank not in (1, 3):
  raise ValueError(
    f"Unsupported observation space shape: {env.observation_space.shape}"
  )

if _obs_rank == 3:
  print("using memory-efficient replay buffer")
  stacked_frames = True
  # The leading axis of the observation is treated as the frame-stack depth.
  frame_history_len = env.observation_space.shape[0]
  assert frame_history_len == 4, "only support 4 stacked frames"
  replay_buffer = MemoryEfficientReplayBuffer(frame_history_len=frame_history_len)
else:
  print("using normal replay buffer")
  stacked_frames = False
  replay_buffer = ReplayBuffer()

using memory-efficient replay buffer


In [8]:
def reset_env_training():
  """Reset the training environment and start a new episode.

  Calls ``env.reset()``, converts the returned observation to a NumPy array
  and, when the notebook-level ``replay_buffer`` is a
  ``MemoryEfficientReplayBuffer``, passes the last entry along the
  observation's first axis to the buffer's ``on_reset``.

  Returns:
    A ``(observation, info)`` tuple: the observation as a NumPy array and
    the info object returned by ``env.reset()``.
  """
  first_obs, reset_info = env.reset()
  first_obs = np.asarray(first_obs)

  uses_frame_buffer = isinstance(replay_buffer, MemoryEfficientReplayBuffer)
  if uses_frame_buffer:
    # Only the final slice along the first axis is handed to the buffer
    # (presumably the newest frame of the stack — confirm against the env wrapper).
    replay_buffer.on_reset(observation=first_obs[-1, ...])

  return first_obs, reset_info

In [9]:
# DQN training loop.
#
# Each iteration: pick an epsilon-greedy action, step the training env, store
# the transition in the replay buffer, and (once `learning_starts` steps have
# elapsed) run one agent update on a sampled batch. Evaluation rollouts and
# video rollouts are triggered periodically by step count.

# Start the first training episode.
observation, info = reset_env_training()

for step in tqdm.trange(config["total_steps"], dynamic_ncols=True):
  epsilon = exploration_schedule.value(step)

  # Compute action
  action = agent.get_action(observation, epsilon)

  # Step the environment (the old `done` flag is split into terminated / truncated)
  next_observation, reward, terminated, truncated, info = env.step(action)
  next_observation = np.asarray(next_observation)

  # Add the transition to the replay buffer
  if isinstance(replay_buffer, MemoryEfficientReplayBuffer):
    # We're using the memory-efficient replay buffer,
    # so we only insert next_observation (not observation)
    if not stacked_frames:
      print("WARNING: Stacked frames not enabled, but using memory-efficient replay buffer")

    replay_buffer.insert(
      action=action,
      reward=reward,
      next_observation=next_observation[-1],  # only the last frame of the stack is stored
      terminated=terminated,
    )

  else:
    # We're using the regular replay buffer
    replay_buffer.insert(
      observation=observation,
      action=action,
      reward=reward,
      next_observation=next_observation,
      terminated=terminated,
    )

  # Handle episode termination
  if terminated or truncated:
    # Read the episode statistics from the info returned by env.step() BEFORE
    # resetting: the reset returns its own info, and overwriting `info` with
    # it would drop the "episode" entry of the step that ended the episode.
    # (Presumably that entry is added by an episode-statistics wrapper inside
    # config["make_env"] — confirm.)
    if "episode" in info:
      logger.log_scalar(info["episode"]["r"], "train_return", step)
      logger.log_scalar(info["episode"]["l"], "train_ep_len", step)

    # Only the fresh observation is needed; the reset info is discarded.
    observation = reset_env_training()[0]
  else:
    observation = next_observation

  # Main DQN training loop
  if step >= config["learning_starts"]:
    # Sample config["batch_size"] transitions from the replay buffer
    batch = replay_buffer.sample(config["batch_size"])

    # Convert to PyTorch tensors
    batch = ptu.from_numpy(batch)

    # Train the agent on the converted batch
    update_info = agent.update(
      batch["observations"],
      batch["actions"],
      batch["rewards"],
      batch["next_observations"],
      batch["terminateds"],
      step,
    )

    # Logging code
    update_info["epsilon"] = epsilon
    update_info["lr"] = agent.lr_scheduler.get_last_lr()[0]

    if step % args.log_interval == 0:
      for k, v in update_info.items():
        logger.log_scalar(v, k, step)
      logger.flush()

  if step % args.metrics_interval == 0:
    # Evaluate
    trajectories = utils.sample_n_trajectories(
      eval_env,
      agent,
      args.num_eval_trajectories,
      ep_len,
    )
    returns = [t["episode_statistics"]["r"] for t in trajectories]
    ep_lens = [t["episode_statistics"]["l"] for t in trajectories]

    logger.log_scalar(np.mean(returns), "eval_return", step)
    logger.log_scalar(np.mean(ep_lens), "eval_ep_len", step)

    # Spread statistics are only logged when there is more than one rollout.
    if len(returns) > 1:
      logger.log_scalar(np.std(returns), "eval/return_std", step)
      logger.log_scalar(np.max(returns), "eval/return_max", step)
      logger.log_scalar(np.min(returns), "eval/return_min", step)
      logger.log_scalar(np.std(ep_lens), "eval/ep_len_std", step)
      logger.log_scalar(np.max(ep_lens), "eval/ep_len_max", step)
      logger.log_scalar(np.min(ep_lens), "eval/ep_len_min", step)

  if args.num_render_trajectories > 0 and step % args.video_interval == 0:
    # Roll out the current policy in the rendering env and save the videos.
    video_trajectories = utils.sample_n_trajectories(
      render_env,
      agent,
      args.num_render_trajectories,
      ep_len,
      render=True,
    )

    logger.log_paths_as_videos(
      video_trajectories,
      step,
      fps=fps,
      max_videos_to_save=args.num_render_trajectories,
      video_title="eval_rollouts",
    )

  logger.warn(
  2%|▏         | 21329/1000000 [14:00<59:26:51,  4.57it/s]  