In [1]:
debug = False

In [2]:
### TODO ###

### DISCUSSION POINTS ###
# We need to make n_eval 100 for fair comparison
# For SB3 we use vector environment not the normal one. Does this influence training time etc?

In [3]:
import gymnasium as gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from gymnasium.wrappers import TimeLimit
from stable_baselines3.common import logger
from stable_baselines3.common.logger import configure
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import glob
import io
import base64
import imageio
import gymnasium as gym
from IPython.display import HTML, display
import os

In [4]:
### STORING EVERYTHING ###

# Base path
base_path = "/Users/maartendoekhie/Desktop/lunar_lander"

# Folders
folders = {
    "models": os.path.join(base_path, "models"),
    "videos": os.path.join(base_path, "videos"),
    "results": os.path.join(base_path, "results"),
    "logs": os.path.join(base_path, "logs"),
    "plots": os.path.join(base_path, "plots"),
    "logs_dqn": os.path.join(base_path, "logs", "dqn_tensorboard"),
    "logs_a2c": os.path.join(base_path, "logs", "a2c_tensorboard"),
}

save_dir = folders["results"]

# Make directories
for path in folders.values():
    os.makedirs(path, exist_ok=True)

In [5]:
# Making the lunar lander environment with the parameters advised by gym standards
env = gym.make(
    "LunarLander-v3",
    render_mode="rgb_array",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
)

env = TimeLimit(env, max_episode_steps=2000)  # or whatever you want

max_steps = 2000  # or 1000 if you keep default

env_vec = DummyVecEnv([
    lambda: TimeLimit(gym.make("LunarLander-v3"), max_episode_steps=max_steps)
])
env_vec = VecMonitor(env_vec)


In [6]:
# Action space: What can your agent do?

# 0: do nothing
# 1: fire left orientation engine
# 2: fire main engine
# 3: fire right orientation engine

action = env.action_space # not needed for now
action_size = 4

# Observation space: What can your agent see?

# The state is an 8-dimensional vector:

# the coordinates of the lander in x & y,
# its linear velocities in x & y, its angle,
# its angular velocity,
# and two booleans that represent whether each leg is in contact with the ground or not.

state = env.observation_space.shape # not needed for now
state_size = 8

### Vectorized version of the environment ### 

In [7]:
### HYPER PARAMETERS ###

if debug == True:
  # Replay Memory
  replay_buffer_size = 10000 
  minibatch = 32

  # learning
  learning_rate = 5e-4
  gamma = 0.99 
  interpolation_parameter = 1e-3

  # training
  number_episodes = 5 
  max_time_steps = 200

  # epsilon gready policy
  epsilon_starting_value = 1.0
  epsilon_ending_value = 0.01
  epsilon_decay_value = 0.995

  # evaluation
  n_eval_episodes = 3

  # SB3
  total_timesteps = 10000 
  batch_size = minibatch
  buffer_size = replay_buffer_size

else: 
  # Replay Memory
  replay_buffer_size = 100000
  minibatch = 150

  # learning
  learning_rate = 5e-4
  gamma = 0.99 
  interpolation_parameter = 1e-3 

  # training
  number_episodes = 5000
  max_time_steps = 1000

  # epsilon gready policy
  epsilon_starting_value = 1.0
  epsilon_ending_value = 0.01
  epsilon_decay_value = 0.995

  # evaluation
  n_eval_episodes = 10

  # SB3
  total_timesteps = 50000000
  batch_size = minibatch
  buffer_size = replay_buffer_size

In [8]:
### CUSTOM DQN AGENT SETUP ###

# Consisting of the DQN model, Agent (with implemented learning and training), Replay Memory

### DQN MODEL SETUP ###

# We wan't to start with a simple 2 layer fc NN. Starting from state size to action size
# Shallow problems 1 to 2 hidden layers and complex tasks 3 to 4 hidden layers
# Underfitting: add more layers/neurons, overfitting: add droppout, regularization or reduce size
# Slow learning: reduce depth or learning rate

class DQN_custom(nn.Module):
  def __init__(self, state_size, action_size, seed = 4):
    super(DQN_custom, self).__init__()
    self.seed = torch.manual_seed(seed)
    self.model = nn.Sequential(
        nn.Linear(state_size, 64),
        nn.ReLU(),
        nn.Linear(64,64),
        nn.ReLU(),
        nn.Linear(64, action_size)
    )
  def forward(self, state):
    return self.model(state)

# the memory can store an event: s, a,r, s' and done (with a maximum capacity: i.e. hyper parameter)
# the memory can give a sample: s, a, r, s' and done in a minibatch. i.e. matrices: S, A, R, R_, D

# 1. s:      the current state (before the action was taken)
#            This is the input to the Q-network and represents the agent’s current observation of the environment.

# 2. a:      the action taken by the agent in state s
#            This action is chosen according to an exploration policy (e.g., ε-greedy) and used to interact with the environment.

# 3. r:      the reward received after taking action a in state s
#            This tells the agent how good or bad the outcome of that action was.

# 4. s':     the next state observed after taking action a
#            This is the new observation received from the environment after applying the action.

# 5. done:   a boolean flag indicating if the episode ended after this step
#            If done is True, then s' is the terminal state and no further actions should be taken.


### DQN MEMORY ###

class Memory(object):

  def __init__(self, capacity):
    self.capacity = capacity
    self.memory = deque(maxlen=capacity)

  def store(self, event):
    self.memory.append(event)

  def sample(self, batch_size):
    if len(self.memory) < batch_size:# Not enough samples yet: tell the agent to skip learning

      return None

    else:
      minibatch = random.sample(self.memory, batch_size)
      state_list      = []
      action_list     = []
      reward_list     = []
      next_state_list = []
      done_list       = []

      for experiences in minibatch: # for every experience in the minibatch, we have a sars', done.
        if experiences is not None:
          state, action, reward, next_state, done = experiences
          state_list.append(state)
          action_list.append(action)
          reward_list.append(reward)
          next_state_list.append(next_state)
          done_list.append(done)

          # 1. states:      shape = (batch_size, 8)
          #    Each row is a full state vector from LunarLander (8 floats: x, y, vx, vy, angle, angular_vel, leg1_contact, leg2_contact)

          # 2. actions:     shape = (batch_size, 1)
          #    Each row contains the action taken (integer 0–3), one per experience

          # 3. rewards:     shape = (batch_size, 1)
          #    Each row contains the scalar reward received after taking the action

          # 4. next_states: shape = (batch_size, 8)
          #    Each row is the resulting state after the action was taken (same format as states)

          # 5. dones:       shape = (batch_size, 1)
          #    Each row is 1.0 if the episode ended after this transition, else 0.0

          # We wan't to create matrices of these events. with size explained above

      S = torch.from_numpy(np.vstack(state_list)).float()
      A = torch.from_numpy(np.vstack(action_list)).long()
      R = torch.from_numpy(np.vstack(reward_list)).float()
      R_ = torch.from_numpy(np.vstack(next_state_list)).float()
      D = torch.from_numpy(np.vstack(done_list).astype(np.uint8)).float()

      return S, A, R, R_, D

### DQN AGENT ###

class Agent():

  def __init__(self, state_size, action_size):
      self.action_size = action_size
      self.state_size = state_size

      self.local_qnetwork = DQN_custom(state_size, action_size)
      self.target_qnetwork = DQN_custom(state_size, action_size)

      self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr=learning_rate)
      self.memory = Memory(replay_buffer_size)
      self.t_step = 0

  def step(self, state, action, reward, next_state, done):
      self.memory.store((state, action, reward, next_state, done))
      self.t_step = (self.t_step + 1) % 4
      if self.t_step == 0:
          experiences = self.memory.sample(minibatch)
          if experiences is not None:
              self.learn(experiences, gamma)

  def get_action(self, state, epsilon):
    state = torch.from_numpy(state).float().unsqueeze(0) # pytorch expects a tensor batch
    self.local_qnetwork.eval() # avoid unwanted updates

    with torch.no_grad(): # passing the state through the network without calculating gradients.
        action_values = self.local_qnetwork(state)  # [Q(state, action_0), Q(state, action_1), ...]
    self.local_qnetwork.train() # putting network back into the training network

    if random.random() > epsilon: # exploration, exploitation tradeoff, epsilon-greedy exploration?
        return np.argmax(action_values.cpu().data.numpy()) # get the action with the highest q-value, exploitation part
    else:
        return random.choice(np.arange(self.action_size)) # else the agent will pick a random move. This happens when the agent, decides to explore rather than exploit.

  def learn(self, experiences, gamma): # training an AI agent in reinforcement learning, bellman adaptation for deep q learning is used (maybe here use something from the lecture slides)

      states, actions, rewards, next_states, dones = experiences

      next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1) # Check pictures

      q_targets = self.bellman_optimality(rewards, next_q_targets, dones, gamma) # Bellman equation on the target network,

      q_expected = self.local_qnetwork(states).gather(1, actions) # expected calculaton on the local network

      loss = nn.MSELoss() # (Q_target - Q_expected)^2
      loss = loss(q_expected, q_targets)

      self.optimizer.zero_grad() # classic!
      loss.backward()
      self.optimizer.step()

      self.td_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter) # See temporal difference updating from slides > implement in report.

  def bellman_optimality(self, rewards, next_q_values, dones, gamma):
    return rewards + gamma * next_q_values * (1 - dones)

  def td_update(self, local_qnetwork, target_qnetwork, interpolation_parameter):
    local_parameters = list(local_qnetwork.parameters())
    target_parameters = list(target_qnetwork.parameters())

    # Loop through each parameter index
    for i in range(len(local_parameters)):
        local_param = local_parameters[i]
        target_param = target_parameters[i]

        # Perform the soft update (temporal difference update)
        updated_param = interpolation_parameter * local_param.data + (1.0 - interpolation_parameter) * target_param.data
        target_param.data.copy_(updated_param)

  def train(self, env, number_episodes, max_time_steps,epsilon_starting_value, epsilon_ending_value, epsilon_decay_value):
      episode_scores = [] # stores all episode scores
      moving_avg_scores = [] # stores moving average of bandbreedte van 100
      epsilon_values = []
      episode_lengths = []
      cumulative_timesteps = 0
      timestep_counts = []  # to store total timesteps after each episode

      epsilon = epsilon_starting_value

      for i in range(number_episodes): # episodes
          state, info = env.reset() # resetting the environment, which returns the observation of the initial state and things we don't need. Check lunar lander description
          score = 0 # setting straight the score counter

          for j in range(max_time_steps): # timesteps within an episode
              action = self.get_action(state, epsilon) # epsion is the starting value (in the first loop)
              next_state, reward, done, _, _, = env.step(action) # it gives these values and more, which we don't need. Check lunar lander description
              self.step(state, action, reward, next_state, done) # letting the agent take a step. For which the next state becomes the current state
              state = next_state
              score += reward
              if done:
                  break

          episode_lengths.append(j+1)

          episode_scores.append(score)
          cumulative_timesteps += (j + 1)  # episode length is number of steps taken
          timestep_counts.append(cumulative_timesteps)

          # added average scoring because otherwise it stopped with learning at 200 way earlier (without being trained properly because of noise)
          if len(episode_scores) >= 100:
              avg_score = np.mean(episode_scores[-100:])
          else:
              avg_score = np.mean(episode_scores)

          moving_avg_scores.append(avg_score)

          epsilon = max(epsilon_ending_value, epsilon * epsilon_decay_value) # what does the decay rate mean for instance?

          epsilon_values.append(epsilon)

          if i % 100 == 0:
            print(f"Episode {i} | Average Score (last 100 episodes): {avg_score:.2f}")

            if avg_score >= 200.0: # specified by env rules to be a solution
                break

      return moving_avg_scores, episode_scores, epsilon_values, episode_lengths, timestep_counts # use these to plot the noise (and explain in the report)

In [9]:
#agent = Agent(state_size, action_size)
#moving_avg_scores_DQN, episode_scores_DQN, epsilon_values_DQN, episode_lengths_DQN, timestep_counts_DQN = agent.train(env, number_episodes, max_time_steps, epsilon_starting_value, epsilon_ending_value, epsilon_decay_value)

#np.save(os.path.join(save_dir, "moving_avg_scores_DQN.npy"), moving_avg_scores_DQN)
#np.save(os.path.join(save_dir, "episode_scores_DQN.npy"), episode_scores_DQN)
#np.save(os.path.join(save_dir, "epsilon_values_DQN.npy"), epsilon_values_DQN)
#np.save(os.path.join(save_dir, "episode_lengths_DQN.npy"), episode_lengths_DQN)
#np.save(os.path.join(save_dir, "timestep_counts_DQN.npy"), timestep_counts_DQN)

In [10]:
# Directly saving the trained q network and local network

#torch.save(agent.local_qnetwork.state_dict(), os.path.join(folders["models"], "local_qnetwork_DQN.pt"))
#torch.save(agent.target_qnetwork.state_dict(), os.path.join(folders["models"], "target_qnetwork_DQN.pt"))

In [11]:
# Evaluating the agent's performance for env and n_eval_episodes 
def evaluate_agent(agent, env, n_eval_episodes=10):
    evaluation_rewards = []
    evaluation_lengths = []

    for episode in range(n_eval_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done:
            # Use greedy policy: epsilon = 0
            action = agent.get_action(state, epsilon=0.0)
            next_state, reward, done, _, _ = env.step(action.item())
            state = next_state
            total_reward += reward
            steps += 1

        evaluation_rewards.append(total_reward)
        evaluation_lengths.append(steps)

    mean_reward = np.mean(evaluation_rewards)
    std_reward = np.std(evaluation_rewards)

    return evaluation_rewards, evaluation_lengths, mean_reward, std_reward

In [12]:
#eval_rewards_DQN, eval_lengths_DQN, mean_reward_DQN, std_reward_DQN = evaluate_agent(agent, env, n_eval_episodes=n_eval_episodes)

# Save DQN evaluation results
#np.save(os.path.join(save_dir, "eval_rewards_DQN.npy"), eval_rewards_DQN)
#np.save(os.path.join(save_dir, "eval_lengths_DQN.npy"), eval_lengths_DQN)
#np.save(os.path.join(save_dir, "mean_reward_DQN.npy"), mean_reward_DQN)
#np.save(os.path.join(save_dir, "std_reward_DQN.npy"), std_reward_DQN)

In [13]:
########################
### STABLE BASELINES ###
########################

# Consisting of a comparison DQN model and a A2C model. With a training logger to evaluate parameters during training
class TrainingLogger(BaseCallback):
    def __init__(self, verbose=0, moving_avg_window=100, stop_score=200):
        super().__init__(verbose)
        self.moving_avg_window = moving_avg_window
        self.stop_score = stop_score

        # Tracking all needed info
        self.episode_rewards = []
        self.episode_lengths = []
        self.timesteps = []
        self.epsilon_values = []  # For DQN only
        self.moving_avg_scores = []

    def _on_step(self) -> bool:
        done_array = np.array(self.locals["dones"])

        for i, done in enumerate(done_array):
            if done:
                ep_info = self.locals["infos"][i].get("episode", None)
                if ep_info:
                    self.episode_rewards.append(ep_info["r"])
                    self.episode_lengths.append(ep_info["l"])
                    self.timesteps.append(self.num_timesteps)

                    # Moving average of last N rewards
                    window = self.episode_rewards[-self.moving_avg_window:]
                    avg = np.mean(window)
                    self.moving_avg_scores.append(avg)

                    # Epsilon tracking
                    if hasattr(self.model, "exploration_rate"):
                        self.epsilon_values.append(self.model.exploration_rate)
                    print(f"Step: {self.num_timesteps}, Moving Avg: {avg:.2f}")

                    # Early stopping
                    if avg >= self.stop_score:
                        print(f"Solved! Moving average over last {self.moving_avg_window} episodes: {avg:.2f}")
                        return False

        return True

def evaluate_sb3_agent(agent, env, n_eval_episodes=10):
    eval_rewards = []
    eval_lengths = []

    for _ in range(n_eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done:
            action, _ = agent.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            steps += 1

        eval_rewards.append(total_reward)
        eval_lengths.append(steps)

    mean_reward = np.mean(eval_rewards)
    std_reward = np.std(eval_rewards)

    return eval_rewards, eval_lengths, mean_reward, std_reward

In [14]:
### DQN SB3 ###

# Train DQN
dqn_model = DQN(
    "MlpPolicy",
    env_vec,
    verbose=1,
    tensorboard_log=folders["logs_dqn"]
)

callback_DQN = TrainingLogger(moving_avg_window=100, stop_score=200)
dqn_model.learn(total_timesteps=total_timesteps, callback=callback_DQN)
mean_reward_dqn, _ = evaluate_policy(dqn_model, env_vec, n_eval_episodes=10)

dqn_model.save(os.path.join(folders["models"], "sb3_dqn_model"))

# For DQN SB3
moving_avg_scores_DQN_SB3 = callback_DQN.moving_avg_scores
episode_scores_DQN_SB3 = callback_DQN.episode_rewards
epsilon_values_DQN_SB3 = callback_DQN.epsilon_values  # Only meaningful for DQN
episode_lengths_DQN_SB3 = callback_DQN.episode_lengths
timestep_counts_DQN_SB3 = callback_DQN.timesteps

# Training SB3 DQN
np.save(os.path.join(save_dir, "moving_avg_scores_DQN_SB3.npy"), moving_avg_scores_DQN_SB3)
np.save(os.path.join(save_dir, "episode_scores_DQN_SB3.npy"), episode_scores_DQN_SB3)
np.save(os.path.join(save_dir, "epsilon_values_DQN_SB3.npy"), epsilon_values_DQN_SB3)
np.save(os.path.join(save_dir, "episode_lengths_DQN_SB3.npy"), episode_lengths_DQN_SB3)
np.save(os.path.join(save_dir, "timestep_counts_DQN_SB3.npy"), timestep_counts_DQN_SB3)

eval_rewards_DQN_SB3, eval_lengths_DQN_SB3, mean_reward_DQN_SB3, std_reward_DQN_SB3 = evaluate_sb3_agent(dqn_model, env_vec, n_eval_episodes=n_eval_episodes)

# Save SB3 DQN evaluation results
np.save(os.path.join(save_dir, "eval_rewards_DQN_SB3.npy"), eval_rewards_DQN_SB3)
np.save(os.path.join(save_dir, "eval_lengths_DQN_SB3.npy"), eval_lengths_DQN_SB3)
np.save(os.path.join(save_dir, "mean_reward_DQN_SB3.npy"), mean_reward_DQN_SB3)
np.save(os.path.join(save_dir, "std_reward_DQN_SB3.npy"), std_reward_DQN_SB3)

Using cpu device
Logging to /Users/maartendoekhie/Desktop/lunar_lander/logs/dqn_tensorboard/DQN_8
Step: 109, Moving Avg: -176.56
Step: 248, Moving Avg: -126.52
Step: 339, Moving Avg: -112.92
Step: 397, Moving Avg: -117.74
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 99.2     |
|    ep_rew_mean      | -118     |
|    exploration_rate | 1        |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4288     |
|    time_elapsed     | 0        |
|    total_timesteps  | 397      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.26     |
|    n_updates        | 74       |
----------------------------------
Step: 524, Moving Avg: -122.63
Step: 609, Moving Avg: -120.69
Step: 672, Moving Avg: -124.83
Step: 754, Moving Avg: -137.18
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 94.2     |
|    ep_rew_mean      | 

In [15]:
### A2C SB3 ###
# Train A2C
a2c_model = A2C(
    "MlpPolicy",
    env_vec,
    verbose=1,
     tensorboard_log=folders["logs_a2c"]
)

callback_A2C = TrainingLogger(moving_avg_window=100, stop_score=200)
a2c_model.learn(total_timesteps=total_timesteps, callback=callback_A2C)
mean_reward_a2c, _ = evaluate_policy(a2c_model, env_vec, n_eval_episodes=10)

a2c_model.save(os.path.join(folders["models"], "sb3_a2c_model"))

# For A2C SB3
moving_avg_scores_a2c = callback_A2C.moving_avg_scores
episode_scores_a2c = callback_A2C.episode_rewards
episode_lengths_a2c = callback_A2C.episode_lengths
timestep_counts_a2c = callback_A2C.timesteps

# Training A2C
np.save(os.path.join(save_dir, "moving_avg_scores_A2C.npy"), moving_avg_scores_a2c)
np.save(os.path.join(save_dir, "episode_scores_A2C.npy"), episode_scores_a2c)
np.save(os.path.join(save_dir, "episode_lengths_A2C.npy"), episode_lengths_a2c)
np.save(os.path.join(save_dir, "timestep_counts_A2C.npy"), timestep_counts_a2c)

eval_rewards_A2C, eval_lengths_A2C, mean_reward_A2C, std_reward_A2C = evaluate_sb3_agent(a2c_model, env_vec, n_eval_episodes=n_eval_episodes)

# Save SB3 A2C evaluation results
np.save(os.path.join(save_dir, "eval_rewards_A2C.npy"), eval_rewards_A2C)
np.save(os.path.join(save_dir, "eval_lengths_A2C.npy"), eval_lengths_A2C)
np.save(os.path.join(save_dir, "mean_reward_A2C.npy"), mean_reward_A2C)
np.save(os.path.join(save_dir, "std_reward_A2C.npy"), std_reward_A2C)

Using cpu device
Logging to /Users/maartendoekhie/Desktop/lunar_lander/logs/a2c_tensorboard/A2C_7
Step: 103, Moving Avg: -191.99
Step: 204, Moving Avg: -307.13
Step: 279, Moving Avg: -246.96
Step: 377, Moving Avg: -263.46
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 94.2     |
|    ep_rew_mean        | -263     |
| time/                 |          |
|    fps                | 1600     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.26    |
|    explained_variance | -0.0809  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 2.21     |
|    value_loss         | 8.3      |
------------------------------------
Step: 508, Moving Avg: -226.52
Step: 687, Moving Avg: -264.32
Step: 865, Moving Avg: -290.55
------------------------------------
| rollout/         

KeyboardInterrupt: 

In [None]:
#######################
### DISPLAY VIDEO'S ###
#######################

def record_agent_video(agent, env, episode_idx, output_dir="videos", fps=30):
    os.makedirs(output_dir, exist_ok=True)
    state, _ = env.reset()
    done = False
    frames = []

    while not done:
        frame = env.render()
        if frame.ndim == 3:  # Ensure it's an RGB image
            frames.append(frame)
        action = agent.get_action(state, epsilon=0.0)
        state, _, done, _, _ = env.step(action.item())

    env.close()
    filename = os.path.join(output_dir, f"episode_{episode_idx}.mp4")
    imageio.mimsave(filename, frames, fps=fps)
    return filename

def record_agent_video_SB3(agent, env, episode_idx, output_dir="videos", fps=30):
    os.makedirs(output_dir, exist_ok=True)
    state = env.reset()
    done = False
    frames = []

    while not done:
        frame = env.render()
        if frame.ndim == 3:  # Ensure RGB
            frames.append(frame)

        action, _ = agent.predict(state, deterministic=True)
        obs, reward, done, info = env.step(action)

    filename = os.path.join(output_dir, f"episode_{episode_idx}.mp4")
    imageio.mimsave(filename, frames, fps=fps)
    return filename

def display_videos_grid(video_paths, videos_per_row=5, video_width=200):
    html = "<table>"
    for i in range(0, len(video_paths), videos_per_row):
        html += "<tr>"
        for video_path in video_paths[i:i + videos_per_row]:
            with open(video_path, "rb") as f:
                encoded = base64.b64encode(f.read()).decode("ascii")
            html += f"""
            <td>
                <video width="{video_width}" controls autoplay loop muted>
                    <source src="data:video/mp4;base64,{encoded}" type="video/mp4">
                </video>
            </td>
            """
        html += "</tr>"
    html += "</table>"
    display(HTML(html))

In [None]:
video_paths_DQN = []

for episode_idx in range(n_eval_episodes):
    video_path = record_agent_video(agent, env, episode_idx, output_dir="videos_DQN")
    video_paths_DQN.append(video_path)

### COMMENT IN IF YOU WAN'T TO SEE VIDEO'S ###
# display_videos_grid(video_paths_DQN)

video_paths_DQN_SB3 = []

for episode_idx in range(n_eval_episodes):
    video_path = record_agent_video_SB3(dqn_model, env_vec, episode_idx, output_dir="videos_DQN_SB3")
    video_paths_DQN_SB3.append(video_path)

### COMMENT IN IF YOU WAN'T TO SEE VIDEO'S ###
# display_videos_grid(video_paths_DQN_SB3)

video_paths_A2C_SB3 = []
for episode_idx in range(n_eval_episodes):
    video_path = record_agent_video_SB3(a2c_model, env_vec, episode_idx, output_dir="videos_A2C_SB3")
    video_paths_A2C_SB3.append(video_path)

### COMMENT IN IF YOU WAN'T TO SEE VIDEO'S ###
# display_videos_grid(video_paths_A2C_SB3)



In [None]:
### MAKE SURE IT IS SAVED ###

print("Saved files:")
for category, path in folders.items():
    print(f"\n[{category}]")
    print(os.listdir(path))

Saved files:

[models]
['local_qnetwork_DQN.pt', 'sb3_a2c_model.zip', 'target_qnetwork_DQN.pt', 'sb3_dqn_model.zip']

[videos]
[]

[results]
['epsilon_values_DQN_SB3.npy', 'episode_scores_DQN.npy', 'timestep_counts_DQN.npy', 'eval_lengths_DQN_SB3.npy', 'std_reward_DQN_SB3.npy', 'eval_rewards_A2C.npy', '.DS_Store', 'moving_avg_scores_DQN_SB3.npy', 'mean_reward_DQN.npy', 'eval_rewards_DQN_SB3.npy', 'episode_scores_DQN_SB3.npy', 'moving_avg_scores_A2C.npy', 'sensitivity', 'std_reward_A2C.npy', 'eval_lengths_A2C.npy', 'episode_lengths_DQN.npy', 'mean_reward_A2C.npy', 'epsilon_values_DQN.npy', 'timestep_counts_DQN_SB3.npy', 'mean_reward_DQN_SB3.npy', 'eval_rewards_DQN.npy', 'timestep_counts_A2C.npy', 'sensitivity_results.csv', 'episode_scores_A2C.npy', 'episode_lengths_DQN_SB3.npy', 'episode_lengths_A2C.npy', 'std_reward_DQN.npy', 'eval_lengths_DQN.npy', 'moving_avg_scores_DQN.npy']

[logs]
['.DS_Store', 'a2c_tensorboard', 'dqn_tensorboard']

[plots]
[]

[logs_dqn]
['DQN_4', 'DQN_3', 'DQN_2