# Implementation of Deep Deterministic Policy Gradient (DDPG)

Marina Bermúdez Granados <br />
<b>Access to all results:</b> https://shorturl.at/JORru

## Libraries

In [1]:
# Install gymnasium together with the Box2D and MuJoCo packages
!pip -q install gymnasium
!pip -q install Box2D
!pip -q install mujoco

# System libraries installed through apt-get. This only works where apt-get exists:
# the recorded output of this cell shows the command is not recognised on Windows.
!apt-get -q update
!apt-get -q install -y libgl1-mesa-glx libosmesa6-dev libglfw3 patchelf
# Video encoding (imageio) and the virtual display package imported below
!pip -q install imageio imageio-ffmpeg
!pip -q install pyvirtualdisplay

"apt-get" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
"apt-get" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.


In [2]:
import copy
import random
import time
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

import glob
import io
import base64

import torch
import torch.nn as nn

import gymnasium as gym

import torch.nn.functional as F
from torch.optim import Adam

from gymnasium.wrappers import RecordVideo
from IPython.display import HTML
from IPython import display

from pyvirtualdisplay import Display
import imageio


import warnings
warnings.filterwarnings("ignore")

<br />

## Mount Directory

In [3]:
# Specify mode: "local", "drive" or "kaggle" (the three branches handled below)
mode = "local"

# Adjustments: define `root`, the path prefix every experiment folder is built from
if mode == "local":
    # Empty prefix: experiment folders are created relative to the working directory
    root =  ""

elif mode == "drive":
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    root = '/content/drive/My Drive/Colab Notebooks/ATCI/'

    # Build root folder
    if not os.path.exists(root):
        os.makedirs(root)

elif mode == "kaggle":
    from pathlib import Path
    from zipfile import ZIP_DEFLATED, ZipFile
    from os import PathLike
    from IPython.display import FileLink

    def zip_dir(zip_name, source_dir):
        """
        Compress everything found recursively under source_dir into zip_name.

        Archive names are relative to the parent of source_dir, so every entry
        starts with the folder's own name. resolve(strict=True) raises if
        source_dir does not exist.
        """
        src_path = Path(source_dir).expanduser().resolve(strict=True)
        with ZipFile(zip_name, 'w', ZIP_DEFLATED) as zf:
            for file in src_path.rglob('*'):
                zf.write(file, file.relative_to(src_path.parent))
    root = "/kaggle/working/"

# NOTE(review): any other value of `mode` leaves `root` undefined, and the
# experiment cells that build paths from it will then fail with a NameError.

<br />

## Utility Functions

In [None]:
def show_video(video_folder, wait_time=5):
  """
  Display the first .mp4 file found under a path prefix inline in the notebook.

  :video_folder = Path prefix the videos are searched under. The glob pattern is
        f"{video_folder}*.mp4", so a folder must end with a path separator.
  :wait_time = Maximum number of seconds to wait for a video to appear (5 default)

  Prints "Could not find video" when nothing matches before the deadline.
  """
  pattern = f"{video_folder}*.mp4"
  deadline = time.time() + wait_time

  # Look once right away, then poll every 0.5 seconds until the deadline
  mp4list = glob.glob(pattern)
  while not mp4list and time.time() < deadline:
    time.sleep(0.5)
    mp4list = glob.glob(pattern)

  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode inside a context manager: the handle is always closed
  # and no write permission on the video file is required
  with open(mp4list[0], "rb") as video_file:
    encoded = base64.b64encode(video_file.read())

  display.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def str2list(init_string):
  """
  Parse the text form of a list of numbers back into a list of floats.

  Handles the wrappers that appear when a list column is written to CSV:
  "np.float64(...)", "np.float32(...)" and "tensor(..., device='...')".

  :init_string = String such as "[np.float64(1.5), tensor(-0.2, device='cuda:0')]"
  :return = List of floats (empty list when the string holds no values)
  """
  import re

  # Drop device annotations first (any device, not only cuda:0) so that their
  # digits are never mistaken for values
  cleaned = re.sub(r"device='[^']*'", "", init_string)

  for wrapper in ("np.float64(", "np.float32(", "tensor("):
    cleaned = cleaned.replace(wrapper, "")

  # Turn every delimiter into whitespace instead of deleting it, so that values
  # written without a space after the comma ("[1,2]") stay separate
  for delimiter in (")", ",", "[", "]"):
    cleaned = cleaned.replace(delimiter, " ")

  return [float(token) for token in cleaned.split()]


def plot_episodes(dataset, colors, root, name_experiment, show=True):
  """
  Plot the per-episode metrics (accumulated reward, mean actor loss, mean critic
  loss) side by side, each with a LOWESS-smoothed curve on top, and save the figure
  as f"{root}{name_experiment}_episodic_metrics.jpg".

  :dataset = DataFrame with the columns "Accumulated Reward", "Mean Actor losses"
        and "Mean Critic losses" (one row per episode)
  :colors = Dictionary with the lists "default" (raw curve) and "mean" (smoothed)
  :root = Path prefix of the output file
  :name_experiment = Name used in the output file
  :show = Whether to call plt.show() after saving (True default)
  """
  episodes = range(1, len(dataset)+1)
  smooth = sm.nonparametric.lowess
  frac = 0.05

  # (column to plot, panel title), in left-to-right order
  panels = (
      ("Accumulated Reward", "Accumulated Rewards across Episodes"),
      ("Mean Actor losses", "Mean Actor Loss across Episodes"),
      ("Mean Critic losses", "Mean Critic Loss across Episodes"),
  )

  fig, axes = plt.subplots(1, 3, figsize=(15, 5))
  for idx, (ax, (column, title)) in enumerate(zip(axes, panels)):
    ax.plot(episodes, dataset[column], color=colors["default"][idx])
    ax.plot(episodes, smooth(dataset[column], episodes, frac=frac)[:, 1],
            color=colors["mean"][idx])
    ax.set_title(title)
    ax.grid(True)

  plt.tight_layout()
  plt.savefig(f"{root}{name_experiment}_episodic_metrics.jpg")

  if show:
    plt.show()


def plot_run(row, colors, root, name_experiment, show=True, loaded=False):
  """
  Plot the per-step metrics of one episode (rewards, actor loss, critic loss) side
  by side, each with a LOWESS-smoothed curve on top, and save the figure as
  f"{root}{name_experiment}_metrics.jpg".

  :row = Row holding "Rewards", "Actor losses" and "Critic losses"
  :colors = Dictionary with the lists "default" (raw curve) and "mean" (smoothed)
  :root = Path prefix of the output file
  :name_experiment = Name used in the output file
  :show = Whether to call plt.show() after saving (True default)
  :loaded = True when the row holds the lists as strings that must be parsed
        with str2list first (False default)
  """
  smooth = sm.nonparametric.lowess
  frac = 0.05

  def _series(column):
    # Parse the string form only when the row was flagged as loaded
    return str2list(row[column]) if loaded else row[column]

  rewards = _series("Rewards")
  actor = _series("Actor losses")
  critic = _series("Critic losses")
  steps = range(1, len(rewards)+1)

  # (values to plot, panel title), in left-to-right order
  panels = (
      (rewards, "Rewards across Time Steps"),
      (actor, "Actor Loss across Time Steps"),
      (critic, "Critic Loss across Time Steps"),
  )

  fig, axes = plt.subplots(1, 3, figsize=(15, 5))
  for idx, (ax, (values, title)) in enumerate(zip(axes, panels)):
    ax.plot(steps, values, color=colors["default"][idx])
    ax.plot(steps, smooth(values, steps, frac=frac)[:, 1], color=colors["mean"][idx])
    ax.set_title(title)
    ax.grid(True)

  plt.tight_layout()
  plt.savefig(f"{root}{name_experiment}_metrics.jpg")

  if show:
    plt.show()


def plot_from_dataset(dataset, root, name_experiment,
                      load_dataset=False, colors=None, show=True,
                      max_min=True, verbose=True):
  """
  Plot the episode summary plus the per-step curves of an early episode and of
  the best episode.

  :dataset = DataFrame of results (ignored when load_dataset is True)
  :root = Path prefix for the generated figures; also the folder results.csv is
        read from when load_dataset is True
  :name_experiment = Name used in the output files
  :load_dataset = Read f"{root}/results.csv" instead of using dataset (False default)
  :colors = Dictionary with "default" and "mean" color lists (built-in palette if None)
  :show = Whether the figures are shown after being saved (True default)
  :max_min = True when the best episode is the one with the highest accumulated
        reward, False when it is the lowest (True default)
  :verbose = Print a heading before each figure (True default)
  :raises ValueError = When the dataset holds no episodes
  """
  if load_dataset:
    dataset = pd.read_csv(f"{root}/results.csv")

  if len(dataset) == 0:
    raise ValueError("Cannot plot an empty results dataset")

  if colors is None:
    default_colors = ["#fd7f6f", "#7eb0d5", "#b2e061", "#bd7ebe", "#ffb55a", "#fdcce5", "#8bd3c7"]
    mean_colors = ["#aa372f", "#32688a", "#558500", "#7b407d", "#a06300", "#9c7087", "#458c82"]

    colors = {
        "default": default_colors,
        "mean": mean_colors
    }

  # Rewards + Losses across Episodes
  if verbose:
    print("Episode Summary:")
  plot_episodes(dataset, colors, root, name_experiment, show)

  # Early episode: position 10, or the last one available on shorter runs
  early_ind = min(10, len(dataset) - 1)
  if verbose:
    print(f"\n\nEarly run({early_ind}):")
  plot_run(dataset.iloc[early_ind], colors, root, f"{name_experiment}_early", show, loaded=load_dataset)

  # Best episode. idxmax/idxmin return an index *label*, so the row is fetched
  # with .loc; this stays correct when the index is not 0..n-1
  accumulated = dataset["Accumulated Reward"]
  best_ind = accumulated.idxmax() if max_min else accumulated.idxmin()
  best = dataset.loc[best_ind]

  if verbose:
    print(f"\n\nBest run({best_ind}):")
  plot_run(best, colors, root, f"{name_experiment}_best", show, loaded=load_dataset)

<br >

## Deep Deterministic Policy Gradient

Paper: https://arxiv.org/abs/1509.02971

<br >

### Replay Buffer

In [5]:
class ReplayBuffer:
  def __init__(self, action_space, observation_space, batch_size, capacity=1000000):
    """
    Fixed-capacity circular store of transitions (s, a, r, s', result)
    :action_space = Dimension of action space
    :observation_space = Dimension of observation space
    :batch_size = Number of transitions returned by sample()
    :capacity = Maximum size of the buffer (10^6 default)
    """
    self.capacity = capacity
    self.batch_size = batch_size

    # One pre-allocated numpy array per transition field
    self.s = np.zeros((capacity, observation_space), dtype=np.float32)
    self.a = np.zeros((capacity, action_space), dtype=np.float32)
    self.r = np.zeros((capacity, 1), dtype=np.float32)
    self.s_prime = np.zeros((capacity, observation_space), dtype=np.float32)
    self.result = np.zeros((capacity, 1), dtype=bool)

    # Number of pushes seen so far and position of the next write
    self.buffer_size = 0
    self.buffer_ptr = 0

  def push(self, s, a, r, s_prime, result):
    """Store one transition, overwriting the oldest slot once the buffer is full."""
    # Wrap the write position back into the valid range
    slot = self.buffer_ptr % self.capacity

    for storage, value in ((self.s, s), (self.a, a), (self.r, r),
                           (self.s_prime, s_prime), (self.result, result)):
      storage[slot] = value

    # Count the push and advance the pointer past the slot just written
    self.buffer_size += 1
    self.buffer_ptr = slot + 1

  def sample(self):
    """
    Draw batch_size distinct stored transitions uniformly at random.
    :return = Tuple of arrays (s, a, r, s', result)
    :raises AssertionError = When fewer than batch_size transitions are stored
    """
    # The push counter can exceed the capacity; cap it before using it as a range
    self.buffer_size = min(self.buffer_size, self.capacity)

    assert self.buffer_size >= self.batch_size, "Not enough samples!"

    picked = np.random.choice(self.buffer_size, self.batch_size, replace=False)

    return (self.s[picked], self.a[picked], self.r[picked],
            self.s_prime[picked], self.result[picked])

  def __len__(self):
    """Number of valid transitions currently stored (never above capacity)."""
    return min(self.buffer_size, self.capacity)

<br >

### Noise: Ornstein-Uhlenbeck process

In [6]:
class Noise:
  def __init__(self, action_space, mean=0.0, theta=0.15, sigma=0.2):
    """
    Ornstein-Uhlenbeck process
    :action_space = Dimension of the noise vector
    :mean = Mean value the process reverts to (0.0 default)
    :theta = Theta hyperparameter, speed of mean reversion (0.15 default)
    :sigma = Sigma hyperparameter, scale of the random term (0.2 default)

    The random term is drawn from Python's `random` module, so random.seed()
    makes the sequence reproducible.
    """
    # Action space and x
    self.action_space = action_space
    self.x = mean * np.ones(action_space)

    # Parameters from formula
    self.mean = mean
    self.theta = theta
    self.sigma = sigma

  def sample(self):
    """Advance the process one step and return a copy of the new state."""
    # The random term must be zero-mean Gaussian: a uniform [0, 1) draw is never
    # negative and pushes the noise towards a positive value on every step
    gaussian = np.array([random.gauss(0.0, 1.0) for _ in range(len(self.x))])
    self.x = self.x + self.theta * (self.mean - self.x) + self.sigma * gaussian

    # Return a copy so callers cannot modify the internal state by accident
    return self.x.copy()

  def reset(self):
    """Reset x to its default initialization (the mean)."""
    self.x = self.mean * np.ones(self.action_space)

<br >

### Actor - Critic Architectures

In [7]:
class Actor(nn.Module):
  def __init__(self, input, output, init_weights=0.003):
    """
    Actor network: two hidden layers of 128 units and a tanh output
    :input = Input dimension
    :output = Output dimension
    :init_weights = Bound of the uniform initialization of the output layer
    """
    super().__init__()

    hidden_units = 128
    self.fc1 = nn.Linear(input, hidden_units)
    self.fc2 = nn.Linear(hidden_units, hidden_units)
    self.out = nn.Linear(hidden_units, output)

    self._init_weights(init_weights)

  def _init_weights(self, init_weights):
    """Re-draw the output layer's weight and bias from U(-init_weights, init_weights)."""
    for tensor in (self.out.weight, self.out.bias):
      tensor.data.uniform_(-init_weights, init_weights)

  def forward(self, s):
    """Map a state (or batch of states) to an output in [-1, 1] per dimension."""
    hidden = F.relu(self.fc1(s))
    hidden = F.relu(self.fc2(hidden))
    return torch.tanh(self.out(hidden))



class Critic(nn.Module):
  def __init__(self, input, init_weights=0.003):
    """
    Critic network: two hidden layers of 128 units and a single linear output
    :input = Input dimension (state and action are concatenated in forward)
    :init_weights = Bound of the uniform initialization of the output layer
    """
    super().__init__()

    hidden_units = 128
    self.fc1 = nn.Linear(input, hidden_units)
    self.fc2 = nn.Linear(hidden_units, hidden_units)
    self.out = nn.Linear(hidden_units, 1)

    self._init_weights(init_weights)

  def _init_weights(self, init_weights):
    """Re-draw the output layer's weight and bias from U(-init_weights, init_weights)."""
    for tensor in (self.out.weight, self.out.bias):
      tensor.data.uniform_(-init_weights, init_weights)

  def forward(self, s, a):
    """Return one value per (state, action) pair, joined along the last dimension."""
    hidden = F.relu(self.fc1(torch.cat((s, a), dim=-1)))
    hidden = F.relu(self.fc2(hidden))
    return self.out(hidden)

<br >

### DDPG

In [None]:
class DDPG:
  """
  Deep Deterministic Policy Gradient agent: actor and critic networks with target
  copies, a replay buffer and Ornstein-Uhlenbeck exploration noise.
  """

  def __init__(self, env, M, T, hyperparameters, seed=0):
    """
    Deep Deterministic Policy Gradient algorithm
    :env = Gymnasium Environment (its observation and action spaces must expose
          shape[0], low and high)
    :M = Number of training episodes
    :T = Maximum number of steps per training episode
    :hyperparameters = Dictionary with "gamma", "tau", "batch_size", "actor_lr",
          "critic_lr" and "critic_wd"; "noise_theta" and "noise_alpha" (the sigma
          of the noise) are optional and default to 0.15 and 0.2
    :seed = Random seed (0 default); None leaves the random generators untouched
    """
    # Random seed: seed every generator the agent draws from (network
    # initialization, replay sampling and exploration noise)
    self.seed = seed
    if seed is not None:
      random.seed(seed)
      np.random.seed(seed)
      torch.manual_seed(seed)

    # Hyperparameters
    self.hyperparameters = hyperparameters
    self.gamma = hyperparameters["gamma"]
    self.tau = hyperparameters["tau"]
    self.batch_size = hyperparameters["batch_size"]

    # Environment
    self.env = env
    self.observation_space = env.observation_space.shape[0]

    self.action_space = env.action_space.shape[0]
    # NOTE(review): only the bounds of the first action dimension are kept, which
    # assumes every dimension shares them. The actor ends in tanh, so its raw
    # output lies in [-1, 1] and is only clipped, never rescaled, to these bounds.
    self.action_space_low_bound = env.action_space.low[0]
    self.action_space_high_bound = env.action_space.high[0]

    # Device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Networks and Optimizers. The seed must not be passed to the networks: their
    # last positional argument is the init range of the output layer, so it is
    # left at its default.
    self.U_local = Actor(self.observation_space, self.action_space).to(self.device)
    self.U_target = Actor(self.observation_space, self.action_space).to(self.device)
    self.U_target.load_state_dict(self.U_local.state_dict())

    self.U_optimizer = Adam(self.U_local.parameters(),
                            lr=hyperparameters["actor_lr"])

    self.Q_local = Critic(self.observation_space + self.action_space).to(self.device)
    self.Q_target = Critic(self.observation_space + self.action_space).to(self.device)
    self.Q_target.load_state_dict(self.Q_local.state_dict())

    self.Q_optimizer = Adam(self.Q_local.parameters(),
                            lr=hyperparameters["critic_lr"],
                            weight_decay=hyperparameters["critic_wd"])

    # Replay Buffer
    self.R = ReplayBuffer(self.action_space, self.observation_space, self.batch_size)

    # Noise, configured from the hyperparameters when they are provided
    self.N = Noise(self.action_space,
                   theta=hyperparameters.get("noise_theta", 0.15),
                   sigma=hyperparameters.get("noise_alpha", 0.2))

    # Episodes and Time Steps
    self.M = M
    self.T = T

  def _take_action(self, state, noise):
    """
    Compute the actor's action for a state, clipped to the action bounds.
    :state = Current observation
    :noise = Whether exploration noise is added to the action
    """
    # Format state for network
    state = torch.as_tensor(state, dtype=torch.float32, device=self.device)

    # Generate action
    self.U_local.eval()
    with torch.no_grad():
      action = self.U_local(state).cpu().numpy()
    self.U_local.train()

    # Add noise (in place, so the action keeps its float32 dtype)
    if noise:
      action += self.N.sample()

    # Return action
    return np.clip(action, self.action_space_low_bound, self.action_space_high_bound)

  def _update(self):
    """
    Run one optimization step of the critic and the actor on a random minibatch,
    followed by a soft update of both target networks.
    :return = (actor loss, critic loss) as Python floats, or (-1, -1) while the
          replay buffer holds fewer than batch_size transitions
    """
    # Ensure there are enough samples, else return -1
    if len(self.R) < self.batch_size:
      return -1, -1

    # Sample random minibatch and format
    s_batch, a_batch, r_batch, s_prime_batch, result_batch = (
        torch.from_numpy(field).float().to(self.device) for field in self.R.sample())

    # Bellman target; no gradient flows through the target networks
    with torch.no_grad():
      action_prime = self.U_target(s_prime_batch)
      Q_prime = self.Q_target(s_prime_batch, action_prime)
      Q = r_batch + (self.gamma * Q_prime * (1 - result_batch))

    # Update critic
    Q_pred = self.Q_local(s_batch, a_batch)
    Q_loss = F.mse_loss(Q_pred, Q)

    self.Q_optimizer.zero_grad()
    Q_loss.backward()
    self.Q_optimizer.step()

    # Update actor
    a_pred = self.U_local(s_batch)
    U_loss = -self.Q_local(s_batch, a_pred).mean()

    self.U_optimizer.zero_grad()
    U_loss.backward()
    self.U_optimizer.step()

    # Soft updates
    self._soft_update(self.U_local, self.U_target)
    self._soft_update(self.Q_local, self.Q_target)

    # Return losses as plain floats, so the stored metrics never hold device
    # tensors and are written to CSV as ordinary numbers
    return U_loss.item(), Q_loss.item()

  def _soft_update(self, local_model, target_model):
    """Move each target parameter towards its local counterpart: t = tau*l + (1-tau)*t."""
    with torch.no_grad():
      for t_param, l_param in zip(target_model.parameters(), local_model.parameters()):
        t_param.copy_(self.tau * l_param + (1.0 - self.tau) * t_param)

  def _load_networks(self, root):
    """Load the saved actor and critic from root and copy them into the targets."""
    # map_location lets a checkpoint saved on another device be loaded on this one
    self.U_local.load_state_dict(torch.load(f"{root}/network_U.pth", map_location=self.device))
    self.Q_local.load_state_dict(torch.load(f"{root}/network_Q.pth", map_location=self.device))

    # Keep the target networks consistent with the weights just loaded
    self.U_target.load_state_dict(self.U_local.state_dict())
    self.Q_target.load_state_dict(self.Q_local.state_dict())

  def _save(self, root, results):
    """Write both local networks and the results gathered so far; return the DataFrame."""
    results_dataset = pd.DataFrame.from_dict(results, orient="index",
                                             columns=["Actor losses", "Mean Actor losses",
                                                      "Critic losses", "Mean Critic losses",
                                                      "Rewards", "Mean Rewards", "Accumulated Reward",
                                                      "Steps", "Time", "Terminated", "Truncated"])
    torch.save(self.U_local.state_dict(), f"{root}/network_U.pth")
    torch.save(self.Q_local.state_dict(), f"{root}/network_Q.pth")
    results_dataset.to_csv(f"{root}/results.csv")
    return results_dataset

  def train(self, root, save_every=100, load_networks=False, verbose=True):
    """
    Train for self.M episodes of at most self.T steps each.
    :root = Folder where the networks and results.csv are written (and read from
          when load_networks is True)
    :save_every = Number of episodes between checkpoints (100 default)
    :load_networks = Start from the networks saved in root (False default)
    :verbose = Print progress for every episode (True default)
    :return = (results dictionary keyed by episode, results DataFrame)
    """
    # Load checkpoint
    if load_networks:
      self._load_networks(root)

    results = {}

    # Episode loop
    for m in range(0, self.M):

      # Losses and Rewards
      all_U_losses = []
      all_Q_losses = []
      all_rewards = []
      acc_reward = 0
      steps = 0
      terminated = truncated = False

      # Reset environment and Internal state
      state, _ = self.env.reset()
      self.N.reset()

      # Show progress
      if verbose:
        print(f"Starting episode {m+1}/{self.M}...")

      # Measure time
      t0 = time.time()

      # Step loop
      for _ in range(0, self.T):

        # Pick action + Noise
        action = self._take_action(state, True)

        # Apply action
        next_state, reward, terminated, truncated, _ = self.env.step(action)

        # Save in Replay Buffer. Only a real termination is stored as terminal: a
        # time-limit truncation does not end the task, so the target must still
        # bootstrap from the next state.
        self.R.push(state, action, reward, next_state, terminated)

        # Update state and rewards
        state = next_state
        acc_reward += reward
        steps += 1

        # Update networks
        U_loss, Q_loss = self._update()

        # Store results
        all_U_losses.append(U_loss)
        all_Q_losses.append(Q_loss)
        all_rewards.append(reward)

        # Break Episode when finishing
        if terminated or truncated:
          break

      # Stop timer
      tf = time.time() - t0

      # Save Episode's metrics in json
      mean_U_loss = np.mean(all_U_losses)
      mean_Q_loss = np.mean(all_Q_losses)
      mean_rewards = np.mean(all_rewards)
      results[m] = {
          "Actor losses": all_U_losses,
          "Mean Actor losses": mean_U_loss,
          "Critic losses": all_Q_losses,
          "Mean Critic losses": mean_Q_loss,
          "Rewards": all_rewards,
          "Mean Rewards": mean_rewards,
          "Accumulated Reward": acc_reward,
          "Steps": steps,  # number of steps taken, the same value that is printed
          "Time": tf,
          "Terminated": terminated,
          "Truncated": truncated
      }

      # Show progress
      if verbose:
        print(f"Results:  Actor Loss = {mean_U_loss}, Critic Loss = {mean_Q_loss}, Rewards = {mean_rewards}")
        print(f"          Accumulated Reward = {acc_reward}, Time = {tf}, Time Steps = {steps}")
        print(f"          Terminated = {terminated}, Truncated = {truncated}\n")

      # Checkpoint (the DataFrame is only built when it is actually written)
      if m % save_every == 0:
        if verbose:
          print("Checkpoint!")
        self._save(root, results)

    # Save (final) networks and results
    results_dataset = self._save(root, results)

    return results, results_dataset

  def test(self, M, T, root, load_networks=False, verbose=True, show=True):
    """
    Run M evaluation episodes without exploration noise while recording videos.
    :M = Number of episodes
    :T = Maximum number of steps per episode
    :root = Folder the videos are written to (and the networks are read from
          when load_networks is True)
    :load_networks = Load the networks saved in root first (False default)
    :verbose = Print the metrics of every episode (True default)
    :show = Display a recorded video once all episodes are done (True default)
    :return = Results dictionary keyed by episode

    The environment is closed when the recording ends, as that is what writes
    the last video to disk.
    """
    # Load networks if necessary
    if load_networks:
      self._load_networks(root)

    results = {}

    # Prepare Recording: wrap the environment once for the whole run
    base_env = self.env
    self.env = gym.wrappers.RecordVideo(base_env, video_folder=root,
                                        episode_trigger=lambda episode_id: True)

    try:
      # Episode loop
      for m in range(0, M):

        # Rewards
        all_rewards = []
        acc_reward = 0
        steps = 0
        terminated = truncated = False

        # Reset environment
        state, _ = self.env.reset()

        # Measure time
        t0 = time.time()

        # Step loop
        for _ in range(0, T):

          # Pick action -Noise
          action = self._take_action(state, False)

          # Apply action (the recording wrapper captures the frames itself)
          state, reward, terminated, truncated, _ = self.env.step(action)

          # Save rewards
          all_rewards.append(reward)
          acc_reward += reward
          steps += 1

          # Break Episode when finishing
          if terminated or truncated:
            break

        # Stop timer
        tf = time.time() - t0

        # Save metrics in json
        mean_rewards = np.mean(all_rewards)

        results[m] = {
            "Rewards": all_rewards,
            "Mean Rewards": mean_rewards,
            "Accumulated Reward": acc_reward,
            "Steps": steps,
            "Time": tf,
            "Terminated": terminated,
            "Truncated": truncated
        }

        # Show results
        if verbose:
          print(f"Rewards (mean) = {mean_rewards}, Accumulated Reward = {acc_reward}, Time = {tf}, Time Steps = {steps}")
          print(f"Terminated = {terminated}, Truncated = {truncated}\n")

    finally:
      # Closing the recorder writes any pending video; the agent then goes back
      # to the unwrapped environment even if an episode failed
      self.env.close()
      self.env = base_env

    # Show video
    if show:
      show_video(root)

    return results

<br >
<br >

## Environment 1: Pendulum

In [9]:
# Build the output folder of this experiment
name_experiment = "pendulum_s1"
folder_experiment = f"{root}{name_experiment}/"
# exist_ok keeps the cell safe to re-run without a separate existence check
os.makedirs(folder_experiment, exist_ok=True)

In [10]:
# Prepare hyperparameters
M = 800   # number of training episodes
T = 500   # maximum number of steps per episode
seed = 0

hp = {
    "gamma": 0.99,       # discount factor of the Bellman target
    "tau": 0.001,        # interpolation factor of the soft target update
    "batch_size": 64,    # transitions per replay-buffer sample

    "actor_lr": 0.001,   # learning rate of the actor's Adam optimizer

    "critic_lr": 0.002,  # learning rate of the critic's Adam optimizer
    "critic_wd": 0.01,   # weight decay of the critic's Adam optimizer

    # Intended for the Ornstein-Uhlenbeck exploration noise
    "noise_theta": 0.15,
    "noise_alpha": 0.2,
}

In [11]:
# Prepare environment (rgb_array rendering) and seed it through its first reset
env = gym.make('Pendulum-v1', render_mode="rgb_array", g=9.81)
env.reset(seed=seed)

# Print the action/observation spaces and the observation bounds
print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)
print("Max. in observation space: ", env.observation_space.high)
print("Min. in observation space: ", env.observation_space.low)

Action space:  Box(-2.0, 2.0, (1,), float32)
Observation space:  Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
Max. in observation space:  [1. 1. 8.]
Min. in observation space:  [-1. -1. -8.]


In [None]:
# Train model: networks and results.csv are written to folder_experiment;
# train() returns the per-episode results dictionary and the same data as a DataFrame
ddpg = DDPG(env, M, T, hp, seed=seed)
training_json, training_dataset = ddpg.train(folder_experiment)

In [None]:
# Generate plots from the DataFrame returned by training
plot_from_dataset(training_dataset, folder_experiment, "pendulum", load_dataset=False)
# Alternative: plot from the results.csv saved in folder_experiment instead
#plot_from_dataset(None, folder_experiment, "pendulum", load_dataset=True)

In [None]:
# Download results in kaggle
if mode == "kaggle":
    zip_dir("data.zip", folder_experiment)
    # Only the last top-level expression of a cell is rendered automatically, so
    # the link has to be displayed explicitly from inside the if-block
    display.display(FileLink("data.zip"))

In [None]:
# Test: build a fresh agent, load the networks saved in folder_experiment
# (load_networks=True) and run 3 evaluation episodes of at most T steps each
ddpg = DDPG(env, M, T, hp, seed=seed)
testing_json = ddpg.test(3, T, folder_experiment, load_networks=True)

<br />

Training and testing with a different seed:

In [None]:
seed = 10

# Build the output folder of this experiment
name_experiment = "pendulum_s2"
folder_experiment = f"{root}{name_experiment}/"
# exist_ok keeps the cell safe to re-run without a separate existence check
os.makedirs(folder_experiment, exist_ok=True)

# Prepare environment and seed it through its first reset
env = gym.make('Pendulum-v1', render_mode="rgb_array", g=9.81)
env.reset(seed=seed)

# Training (M, T and hp are reused from the first Pendulum experiment)
ddpg = DDPG(env, M, T, hp, seed=seed)
training_json, training_dataset = ddpg.train(folder_experiment)

In [None]:
# Generate plots from the DataFrame returned by training
plot_from_dataset(training_dataset, folder_experiment, "pendulum", load_dataset=False)
# Alternative: plot from the results.csv saved in folder_experiment instead
#plot_from_dataset(None, folder_experiment, "pendulum", load_dataset=True)

# Download results in kaggle
if mode == "kaggle":
    zip_dir("data.zip", folder_experiment)
    # Only the last top-level expression of a cell is rendered automatically, so
    # the link has to be displayed explicitly from inside the if-block
    display.display(FileLink("data.zip"))

In [None]:
# Test: build a fresh agent, load the networks saved in folder_experiment
# (load_networks=True) and run 3 evaluation episodes of at most T steps each
ddpg = DDPG(env, M, T, hp, seed=seed)
testing_json = ddpg.test(3, T, folder_experiment, load_networks=True)

<br />
<br />

## Environment 2: Bipedal Walking

In [None]:
# Build the output folder of this experiment
name_experiment = "walking_s1"
folder_experiment = f"{root}{name_experiment}/"
# exist_ok keeps the cell safe to re-run without a separate existence check
os.makedirs(folder_experiment, exist_ok=True)

In [None]:
# Prepare hyperparameters
M = 800    # number of training episodes
T = 1000   # maximum number of steps per episode
seed = 0

hp = {
    "gamma": 0.99,       # discount factor of the Bellman target
    "tau": 0.001,        # interpolation factor of the soft target update
    "batch_size": 64,    # transitions per replay-buffer sample

    "actor_lr": 0.001,   # learning rate of the actor's Adam optimizer

    "critic_lr": 0.002,  # learning rate of the critic's Adam optimizer
    "critic_wd": 0.01,   # weight decay of the critic's Adam optimizer

    # Intended for the Ornstein-Uhlenbeck exploration noise
    "noise_theta": 0.15,
    "noise_alpha": 0.2,
}

In [None]:
# Prepare environment (rgb_array rendering) and seed it through its first reset
env = gym.make("BipedalWalker-v3", hardcore=False, render_mode="rgb_array")
env.reset(seed=seed)

# Print the action/observation spaces and the observation bounds
print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)
print("Max. in observation space: ", env.observation_space.high)
print("Min. in observation space: ", env.observation_space.low)

In [None]:
# Train model: networks and results.csv are written to folder_experiment;
# train() returns the per-episode results dictionary and the same data as a DataFrame
ddpg = DDPG(env, M, T, hp, seed=seed)
training_json, training_dataset = ddpg.train(folder_experiment)

In [None]:
# Generate plots from the DataFrame returned by training
plot_from_dataset(training_dataset, folder_experiment, "walking", load_dataset=False)
# Alternative: plot from the results.csv saved in folder_experiment instead
#plot_from_dataset(None, folder_experiment, "walking", load_dataset=True)

In [None]:
# Download results in kaggle
if mode == "kaggle":
    zip_dir("data.zip", folder_experiment)
    # Only the last top-level expression of a cell is rendered automatically, so
    # the link has to be displayed explicitly from inside the if-block
    display.display(FileLink("data.zip"))

In [None]:
# Test: build a fresh agent, load the networks saved in folder_experiment
# (load_networks=True) and run 3 evaluation episodes of at most T steps each
ddpg = DDPG(env, M, T, hp, seed=seed)
testing_json = ddpg.test(3, T, folder_experiment, load_networks=True)

<br />

Training and testing with a different seed:

In [None]:
seed = 10

# Build the output folder of this experiment
name_experiment = "walking_s2"
folder_experiment = f"{root}{name_experiment}/"
# exist_ok keeps the cell safe to re-run without a separate existence check
os.makedirs(folder_experiment, exist_ok=True)

# Prepare environment and seed it through its first reset
env = gym.make("BipedalWalker-v3", hardcore=False, render_mode="rgb_array")
env.reset(seed=seed)

# Training (M, T and hp are reused from the first walking experiment)
ddpg = DDPG(env, M, T, hp, seed=seed)
training_json, training_dataset = ddpg.train(folder_experiment)

In [None]:
# Generate plots from the DataFrame returned by training
plot_from_dataset(training_dataset, folder_experiment, "walking", load_dataset=False)
# Alternative: plot from the results.csv saved in folder_experiment instead
#plot_from_dataset(None, folder_experiment, "walking", load_dataset=True)

# Download results in kaggle
if mode == "kaggle":
    zip_dir("data.zip", folder_experiment)
    # Only the last top-level expression of a cell is rendered automatically, so
    # the link has to be displayed explicitly from inside the if-block
    display.display(FileLink("data.zip"))

In [None]:
# Test: build a fresh agent, load the networks saved in folder_experiment
# (load_networks=True) and run 3 evaluation episodes of at most T steps each
ddpg = DDPG(env, M, T, hp, seed=seed)
testing_json = ddpg.test(3, T, folder_experiment, load_networks=True)

<br />
<br />

## Environment 3: Robot Arm

In [None]:
# Build the output folder of this experiment
name_experiment = "robot_s1"
folder_experiment = f"{root}{name_experiment}/"
# exist_ok keeps the cell safe to re-run without a separate existence check
os.makedirs(folder_experiment, exist_ok=True)

In [None]:
# Prepare hyperparameters
M = 800   # number of training episodes
T = 500   # maximum number of steps per episode
seed = 0

hp = {
    "gamma": 0.99,       # discount factor of the Bellman target
    "tau": 0.001,        # interpolation factor of the soft target update
    "batch_size": 64,    # transitions per replay-buffer sample

    "actor_lr": 0.001,   # learning rate of the actor's Adam optimizer

    "critic_lr": 0.002,  # learning rate of the critic's Adam optimizer
    "critic_wd": 0.01,   # weight decay of the critic's Adam optimizer

    # Intended for the Ornstein-Uhlenbeck exploration noise
    "noise_theta": 0.15,
    "noise_alpha": 0.2,
}

In [None]:
# Prepare environment (rgb_array rendering) and seed it through its first reset
env = gym.make('Pusher-v5', render_mode="rgb_array")
env.reset(seed=seed)

# Print the action/observation spaces and the observation bounds
print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)
print("Max. in observation space: ", env.observation_space.high)
print("Min. in observation space: ", env.observation_space.low)

In [None]:
# Train model: networks and results.csv are written to folder_experiment;
# train() returns the per-episode results dictionary and the same data as a DataFrame
ddpg = DDPG(env, M, T, hp, seed=seed)
training_json, training_dataset = ddpg.train(folder_experiment)

In [None]:
# Generate plots from the DataFrame returned by training
plot_from_dataset(training_dataset, folder_experiment, "robot", load_dataset=False)
# Alternative: plot from the results.csv saved in folder_experiment instead
#plot_from_dataset(None, folder_experiment, "robot", load_dataset=True)

In [None]:
# Download results in kaggle
if mode == "kaggle":
    zip_dir("data.zip", folder_experiment)
    # Only the last top-level expression of a cell is rendered automatically, so
    # the link has to be displayed explicitly from inside the if-block
    display.display(FileLink("data.zip"))

In [None]:
# Test: start a virtual display for the duration of the evaluation, then build a
# fresh agent, load the saved networks and run 3 evaluation episodes
d = Display(visible=0, size=(1400, 900))
d.start()

try:
    ddpg = DDPG(env, M, T, hp, seed=seed)
    testing_json = ddpg.test(3, T, folder_experiment, load_networks=True)
finally:
    # Always stop the virtual display, even when the test fails
    d.stop()

<br />

Training and testing with a different seed:

In [None]:
seed = 10

# Build the output folder of this experiment
name_experiment = "robot_s2"
folder_experiment = f"{root}{name_experiment}/"
# exist_ok keeps the cell safe to re-run without a separate existence check
os.makedirs(folder_experiment, exist_ok=True)

# Prepare environment and seed it through its first reset
env = gym.make('Pusher-v5', render_mode="rgb_array")
env.reset(seed=seed)

# Training (M, T and hp are reused from the first robot experiment)
ddpg = DDPG(env, M, T, hp, seed=seed)
training_json, training_dataset = ddpg.train(folder_experiment)

In [None]:
# Generate plots from the DataFrame returned by training
plot_from_dataset(training_dataset, folder_experiment, "robot", load_dataset=False)
# Alternative: plot from the results.csv saved in folder_experiment instead
#plot_from_dataset(None, folder_experiment, "robot", load_dataset=True)

# Download results in kaggle
if mode == "kaggle":
    zip_dir("data.zip", folder_experiment)
    # Only the last top-level expression of a cell is rendered automatically, so
    # the link has to be displayed explicitly from inside the if-block
    display.display(FileLink("data.zip"))

In [None]:
# Test: start a virtual display for the duration of the evaluation, then build a
# fresh agent, load the saved networks and run 3 evaluation episodes
d = Display(visible=0, size=(1400, 900))
d.start()

try:
    ddpg = DDPG(env, M, T, hp, seed=seed)
    testing_json = ddpg.test(3, T, folder_experiment, load_networks=True)
finally:
    # Always stop the virtual display, even when the test fails
    d.stop()

<br />