# D4RL utilities

In [1]:
!pip install gymnasium[mujoco]

Collecting gymnasium[mujoco]
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium[mujoco])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting mujoco>=2.1.5 (from gymnasium[mujoco])
  Downloading mujoco-3.2.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Downloading mujoco-3.2.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[?

In [2]:
!pip install torch numpy matplotlib



In [3]:
!pip install just-d4rl

Collecting just-d4rl
  Downloading just_d4rl-0.2407.5-py3-none-any.whl.metadata (2.0 kB)
Downloading just_d4rl-0.2407.5-py3-none-any.whl (8.2 kB)
Installing collected packages: just-d4rl
Successfully installed just-d4rl-0.2407.5


In [4]:
!pip install stable_baselines3

Collecting stable_baselines3
  Downloading stable_baselines3-2.4.1-py3-none-any.whl.metadata (4.5 kB)
Downloading stable_baselines3-2.4.1-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.0/184.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable_baselines3
Successfully installed stable_baselines3-2.4.1


# Videos

In [18]:
import gymnasium as gym
import torch
import numpy as np
import os
import imageio
from stable_baselines3.common import policies
from just_d4rl import d4rl_offline_dataset
from stable_baselines3.common.torch_layers import FlattenExtractor, MlpExtractor
import torch.nn as nn


# Define UnconditionalPolicy
class UnconditionalPolicy(policies.ActorCriticPolicy):
    """Actor-critic policy with a wide policy MLP and a small value MLP.

    The policy branch has ``depth`` hidden layers of ``hidden_size`` units and
    the value branch has one hidden layer of 64 units. Both use ReLU, and
    observations are passed through a ``FlattenExtractor``.
    """

    def __init__(
        self,
        observation_space,
        action_space,
        hidden_size=1024,
        depth=2,
        learning_rate=1e-3,
    ):
        """Describe the network layout and forward it to the base policy.

        Args:
            observation_space: Observation space the policy consumes.
            action_space: Action space the policy produces actions for.
            hidden_size: Width of every hidden layer in the policy branch.
            depth: Number of hidden layers in the policy branch.
            learning_rate: Constant value returned by the learning-rate schedule.
        """
        policy_layers = [hidden_size for _ in range(depth)]
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lambda _: learning_rate,
            net_arch={"pi": policy_layers, "vf": [64]},
            activation_fn=nn.ReLU,
            features_extractor_class=FlattenExtractor,
            optimizer_class=torch.optim.Adam,
        )

    def _build(self, lr_schedule):
        """Create the feature extractor, MLP trunk, value head and action head.

        ``lr_schedule`` is accepted for interface compatibility but is not used
        in this override, and no optimizer is created here.
        """
        extractor = self.features_extractor_class(self.observation_space)
        self.features_extractor = extractor
        self.features_dim = extractor.features_dim

        self.mlp_extractor = MlpExtractor(
            self.features_dim,
            net_arch=self.net_arch,
            activation_fn=self.activation_fn,
        )

        # Scalar state-value head on top of the value branch.
        self.value_net = nn.Linear(self.mlp_extractor.latent_dim_vf, 1)

        # The action distribution supplies both the mean head and the log-std parameter.
        policy_head = self.action_dist.proba_distribution_net(
            latent_dim=self.mlp_extractor.latent_dim_pi,
            log_std_init=self.log_std_init,
        )
        self.action_net, self.log_std = policy_head


# Load the UnconditionalPolicy model
def load_model(model_class, model_path, observation_space, action_space):
    """Instantiate ``model_class`` and restore its weights from ``model_path``.

    Args:
        model_class: Callable accepting ``observation_space``, ``action_space``,
            ``hidden_size`` and ``depth`` keyword arguments and returning a
            module with ``load_state_dict``/``eval``.
        model_path: Path to a checkpoint containing a state dict.
        observation_space: Forwarded to ``model_class``.
        action_space: Forwarded to ``model_class``.

    Returns:
        The constructed model with loaded weights, switched to eval mode.
    """
    model = model_class(
        observation_space=observation_space,
        action_space=action_space,
        hidden_size=1024,
        depth=2,
    )
    # Deserialize onto the CPU instead of a hard-coded CUDA device: the model
    # was just built on the CPU, and load_state_dict copies into its existing
    # parameters, so this also works on machines without a GPU.
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs, and avoids executing arbitrary pickled code.
    state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model


# Select the EGL OpenGL backend so rendering can run headless (no display attached).
# NOTE(review): presumably this must be set before MuJoCo creates its first
# rendering context (i.e. before the first gym.make(...) below) — confirm.
os.environ["MUJOCO_GL"] = "egl"


# Function to save frames to video
def save_video_from_frames(frames, video_filename):
    """Encode ``frames`` as a 60 fps video written to ``video_filename``.

    Args:
        frames: Iterable of image arrays, appended to the video in order.
        video_filename: Destination path passed to ``imageio.get_writer``.
    """
    # The context manager closes the writer even if appending a frame raises,
    # so the output file handle is never leaked.
    with imageio.get_writer(video_filename, fps=60) as writer:
        for frame in frames:
            writer.append_data(frame)
    print(f"Video saved at: {video_filename}")


# Evaluate the model and save video
def evaluate_model_and_save_video(model, env_name, video_filename, state_mean, state_std, num_evaluations=10, desired_return=12000 * 0.001, return_scale=0.001):
    """Roll out a return-conditioned policy, record every frame and save a video.

    At each step the observation is normalised with ``state_mean``/``state_std``,
    the remaining desired return is appended as one extra feature, and the
    policy's deterministic action (clipped to the action bounds) is applied.
    After the step the remaining desired return is reduced by
    ``reward * return_scale``.

    Args:
        model: Policy exposing ``_predict(obs, deterministic=True)``; it is
            moved to the evaluation device.
        env_name: Gymnasium environment id to evaluate in.
        video_filename: Output path handed to ``save_video_from_frames``.
        state_mean: Per-dimension observation mean (tensor or array-like).
        state_std: Per-dimension observation standard deviation (tensor or array-like).
        num_evaluations: Number of episodes to run.
        desired_return: Initial, already scaled, return target for every episode.
        return_scale: Factor applied to each raw reward before it is subtracted
            from the remaining target. The default equals the previously
            hard-coded 0.001.

    Returns:
        The mean of the summed rewards over all evaluated episodes.
    """
    env = gym.make(env_name, render_mode="rgb_array")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Make sure the normalisation statistics live on the same device (and in
    # the same dtype) as the observations they are combined with.
    state_mean = torch.as_tensor(state_mean, dtype=torch.float32, device=device)
    state_std = torch.as_tensor(state_std, dtype=torch.float32, device=device)

    # NOTE: frames from all episodes are kept in memory until the video is written.
    frames = []
    total_rewards = []

    try:
        # Pure inference: no autograd graph is needed for action prediction.
        with torch.no_grad():
            for _ in range(num_evaluations):
                obs, _info = env.reset()
                total_reward = 0
                done = False
                desired_return_ep = desired_return  # Reset desired return for this episode

                while not done:
                    # Normalize state
                    obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=device)
                    state = ((obs_tensor - state_mean) / state_std).unsqueeze(0)

                    # Add desired return to state
                    desired_return_tensor = torch.tensor(
                        [[desired_return_ep]], dtype=torch.float32, device=device
                    )
                    augmented_state = torch.cat([state, desired_return_tensor], dim=1)

                    # Predict action and clip it to the environment's bounds
                    action = model._predict(augmented_state, deterministic=True).cpu().numpy().flatten()
                    action = np.clip(action, env.action_space.low, env.action_space.high)

                    # Step environment
                    obs, reward, terminated, truncated, _info = env.step(action)
                    done = terminated or truncated
                    total_reward += reward

                    # Update desired return
                    desired_return_ep -= reward * return_scale
                    frames.append(env.render())

                print(f"Sum of Reward for this episode: {total_reward}")
                total_rewards.append(total_reward)
    finally:
        # Release the simulator and its rendering context even if a rollout fails.
        env.close()

    save_video_from_frames(frames, video_filename)
    avg_reward = np.mean(total_rewards)
    print(f"Average Reward over {num_evaluations} evaluations: {avg_reward}")
    return avg_reward


# Main function
def main(dataset_name="halfcheetah-medium-expert-v2", desired_return=12000 * 0.001):
    """Load a trained policy, evaluate it and save a video of the rollouts.

    Args:
        dataset_name: D4RL dataset name of the form ``<task>-<quality>-v2``.
            It selects the checkpoint file, the video file name, the
            environment and the normalisation statistics.
        desired_return: Initial (already scaled) return target given to the policy.

    Raises:
        ValueError: If the task prefix of ``dataset_name`` has no known environment.
    """
    model_path = f"trained_mle_model_{dataset_name}.pth"
    video_filename = f"./{dataset_name}_evaluation.mp4"

    # Map the dataset's task prefix to the corresponding Gymnasium environment name.
    env_ids = {"halfcheetah": "HalfCheetah", "hopper": "Hopper", "walker2d": "Walker2d"}
    task = dataset_name.split('-')[0]
    if task not in env_ids:
        raise ValueError(f"Unsupported task '{task}' in dataset name '{dataset_name}'")
    env_name = env_ids[task] + '-v4'

    # This environment is only needed to read the spaces, so it is created
    # without a renderer and closed as soon as the spaces are captured.
    probe_env = gym.make(env_name)
    try:
        action_space = probe_env.action_space
        base_obs_space = probe_env.observation_space
    finally:
        probe_env.close()

    # The policy input is the observation plus one unbounded "desired return"
    # feature. Bounds are cast to float32 up front to match the Box dtype.
    augmented_obs_space = gym.spaces.Box(
        low=np.append(base_obs_space.low, -np.inf).astype(np.float32),
        high=np.append(base_obs_space.high, np.inf).astype(np.float32),
        dtype=np.float32
    )

    # Load the model
    model = load_model(UnconditionalPolicy, model_path, augmented_obs_space, action_space)

    # Load dataset statistics for normalization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = d4rl_offline_dataset(dataset_name)
    states = dataset['observations']
    state_mean = torch.tensor(states.mean(axis=0), dtype=torch.float32).to(device)
    # The small epsilon guards against division by zero for constant dimensions.
    state_std = torch.tensor(states.std(axis=0) + 1e-6, dtype=torch.float32).to(device)

    # Evaluate the model and save video
    avg_reward = evaluate_model_and_save_video(
        model,
        env_name,
        video_filename,
        state_mean,
        state_std,
        num_evaluations=10,
        desired_return=desired_return
    )

    print(f"Final Average Reward for {dataset_name}: {avg_reward}")


if __name__ == "__main__":
    main()

  logger.deprecation(
  model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda')))
load datafile: 100%|██████████| 9/9 [00:05<00:00,  1.77it/s]


Dataset loaded and saved at: /root/.d4rl/datasets/halfcheetah_medium_expert-v2.hdf5
Sum of Reward for this episode: 11624.95245774274
Sum of Reward for this episode: 11218.757056662911
Sum of Reward for this episode: 11279.934292811216
Sum of Reward for this episode: 11289.573013770796
Sum of Reward for this episode: 11457.571741994941
Sum of Reward for this episode: 11162.789090940802
Sum of Reward for this episode: 11290.620105226268
Sum of Reward for this episode: 11247.596560491667
Sum of Reward for this episode: 11258.79190017032
Sum of Reward for this episode: 11190.02640530622
Video saved at: ./halfcheetah-medium-expert-v2_evaluation.mp4
Average Reward over 10 evaluations: 11302.061262511788
Final Average Reward for halfcheetah-medium-expert-v2: 11302.061262511788


In [19]:
import gymnasium as gym
import torch
import numpy as np
import os
import imageio
from stable_baselines3.common import policies
from just_d4rl import d4rl_offline_dataset
from stable_baselines3.common.torch_layers import FlattenExtractor, MlpExtractor
import torch.nn as nn


# Define UnconditionalPolicy
class UnconditionalPolicy(policies.ActorCriticPolicy):
    """Actor-critic policy with a wide policy MLP and a small value MLP.

    The policy branch has ``depth`` hidden layers of ``hidden_size`` units and
    the value branch has one hidden layer of 64 units. Both use ReLU, and
    observations are passed through a ``FlattenExtractor``.
    """

    def __init__(
        self,
        observation_space,
        action_space,
        hidden_size=1024,
        depth=2,
        learning_rate=1e-3,
    ):
        """Describe the network layout and forward it to the base policy.

        Args:
            observation_space: Observation space the policy consumes.
            action_space: Action space the policy produces actions for.
            hidden_size: Width of every hidden layer in the policy branch.
            depth: Number of hidden layers in the policy branch.
            learning_rate: Constant value returned by the learning-rate schedule.
        """
        policy_layers = [hidden_size for _ in range(depth)]
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lambda _: learning_rate,
            net_arch={"pi": policy_layers, "vf": [64]},
            activation_fn=nn.ReLU,
            features_extractor_class=FlattenExtractor,
            optimizer_class=torch.optim.Adam,
        )

    def _build(self, lr_schedule):
        """Create the feature extractor, MLP trunk, value head and action head.

        ``lr_schedule`` is accepted for interface compatibility but is not used
        in this override, and no optimizer is created here.
        """
        extractor = self.features_extractor_class(self.observation_space)
        self.features_extractor = extractor
        self.features_dim = extractor.features_dim

        self.mlp_extractor = MlpExtractor(
            self.features_dim,
            net_arch=self.net_arch,
            activation_fn=self.activation_fn,
        )

        # Scalar state-value head on top of the value branch.
        self.value_net = nn.Linear(self.mlp_extractor.latent_dim_vf, 1)

        # The action distribution supplies both the mean head and the log-std parameter.
        policy_head = self.action_dist.proba_distribution_net(
            latent_dim=self.mlp_extractor.latent_dim_pi,
            log_std_init=self.log_std_init,
        )
        self.action_net, self.log_std = policy_head


# Load the UnconditionalPolicy model
def load_model(model_class, model_path, observation_space, action_space):
    """Instantiate ``model_class`` and restore its weights from ``model_path``.

    Args:
        model_class: Callable accepting ``observation_space``, ``action_space``,
            ``hidden_size`` and ``depth`` keyword arguments and returning a
            module with ``load_state_dict``/``eval``.
        model_path: Path to a checkpoint containing a state dict.
        observation_space: Forwarded to ``model_class``.
        action_space: Forwarded to ``model_class``.

    Returns:
        The constructed model with loaded weights, switched to eval mode.
    """
    model = model_class(
        observation_space=observation_space,
        action_space=action_space,
        hidden_size=1024,
        depth=2,
    )
    # Deserialize onto the CPU instead of a hard-coded CUDA device: the model
    # was just built on the CPU, and load_state_dict copies into its existing
    # parameters, so this also works on machines without a GPU.
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs, and avoids executing arbitrary pickled code.
    state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model


# Select the EGL OpenGL backend so rendering can run headless (no display attached).
# NOTE(review): presumably this must be set before MuJoCo creates its first
# rendering context (i.e. before the first gym.make(...) below) — confirm.
os.environ["MUJOCO_GL"] = "egl"


# Function to save frames to video
def save_video_from_frames(frames, video_filename):
    """Encode ``frames`` as a 60 fps video written to ``video_filename``.

    Args:
        frames: Iterable of image arrays, appended to the video in order.
        video_filename: Destination path passed to ``imageio.get_writer``.
    """
    # The context manager closes the writer even if appending a frame raises,
    # so the output file handle is never leaked.
    with imageio.get_writer(video_filename, fps=60) as writer:
        for frame in frames:
            writer.append_data(frame)
    print(f"Video saved at: {video_filename}")


# Evaluate the model and save video
def evaluate_model_and_save_video(model, env_name, video_filename, state_mean, state_std, num_evaluations=10, desired_return=3600 * 0.001, return_scale=0.001):
    """Roll out a return-conditioned policy, record every frame and save a video.

    At each step the observation is normalised with ``state_mean``/``state_std``,
    the remaining desired return is appended as one extra feature, and the
    policy's deterministic action (clipped to the action bounds) is applied.
    After the step the remaining desired return is reduced by
    ``reward * return_scale``.

    Args:
        model: Policy exposing ``_predict(obs, deterministic=True)``; it is
            moved to the evaluation device.
        env_name: Gymnasium environment id to evaluate in.
        video_filename: Output path handed to ``save_video_from_frames``.
        state_mean: Per-dimension observation mean (tensor or array-like).
        state_std: Per-dimension observation standard deviation (tensor or array-like).
        num_evaluations: Number of episodes to run.
        desired_return: Initial, already scaled, return target for every episode.
        return_scale: Factor applied to each raw reward before it is subtracted
            from the remaining target. The default equals the previously
            hard-coded 0.001.

    Returns:
        The mean of the summed rewards over all evaluated episodes.
    """
    env = gym.make(env_name, render_mode="rgb_array")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Make sure the normalisation statistics live on the same device (and in
    # the same dtype) as the observations they are combined with.
    state_mean = torch.as_tensor(state_mean, dtype=torch.float32, device=device)
    state_std = torch.as_tensor(state_std, dtype=torch.float32, device=device)

    # NOTE: frames from all episodes are kept in memory until the video is written.
    frames = []
    total_rewards = []

    try:
        # Pure inference: no autograd graph is needed for action prediction.
        with torch.no_grad():
            for _ in range(num_evaluations):
                obs, _info = env.reset()
                total_reward = 0
                done = False
                desired_return_ep = desired_return  # Reset desired return for this episode

                while not done:
                    # Normalize state
                    obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=device)
                    state = ((obs_tensor - state_mean) / state_std).unsqueeze(0)

                    # Add desired return to state
                    desired_return_tensor = torch.tensor(
                        [[desired_return_ep]], dtype=torch.float32, device=device
                    )
                    augmented_state = torch.cat([state, desired_return_tensor], dim=1)

                    # Predict action and clip it to the environment's bounds
                    action = model._predict(augmented_state, deterministic=True).cpu().numpy().flatten()
                    action = np.clip(action, env.action_space.low, env.action_space.high)

                    # Step environment
                    obs, reward, terminated, truncated, _info = env.step(action)
                    done = terminated or truncated
                    total_reward += reward

                    # Update desired return
                    desired_return_ep -= reward * return_scale
                    frames.append(env.render())

                print(f"Sum of Reward for this episode: {total_reward}")
                total_rewards.append(total_reward)
    finally:
        # Release the simulator and its rendering context even if a rollout fails.
        env.close()

    save_video_from_frames(frames, video_filename)
    avg_reward = np.mean(total_rewards)
    print(f"Average Reward over {num_evaluations} evaluations: {avg_reward}")
    return avg_reward


# Main function
def main(dataset_name="hopper-medium-expert-v2", desired_return=3600 * 0.001):
    """Load a trained policy, evaluate it and save a video of the rollouts.

    Args:
        dataset_name: D4RL dataset name of the form ``<task>-<quality>-v2``.
            It selects the checkpoint file, the video file name, the
            environment and the normalisation statistics.
        desired_return: Initial (already scaled) return target given to the policy.

    Raises:
        ValueError: If the task prefix of ``dataset_name`` has no known environment.
    """
    model_path = f"trained_mle_model_{dataset_name}.pth"
    video_filename = f"./{dataset_name}_evaluation.mp4"

    # Map the dataset's task prefix to the corresponding Gymnasium environment name.
    env_ids = {"halfcheetah": "HalfCheetah", "hopper": "Hopper", "walker2d": "Walker2d"}
    task = dataset_name.split('-')[0]
    if task not in env_ids:
        raise ValueError(f"Unsupported task '{task}' in dataset name '{dataset_name}'")
    env_name = env_ids[task] + '-v4'

    # This environment is only needed to read the spaces, so it is created
    # without a renderer and closed as soon as the spaces are captured.
    probe_env = gym.make(env_name)
    try:
        action_space = probe_env.action_space
        base_obs_space = probe_env.observation_space
    finally:
        probe_env.close()

    # The policy input is the observation plus one unbounded "desired return"
    # feature. Bounds are cast to float32 up front to match the Box dtype.
    augmented_obs_space = gym.spaces.Box(
        low=np.append(base_obs_space.low, -np.inf).astype(np.float32),
        high=np.append(base_obs_space.high, np.inf).astype(np.float32),
        dtype=np.float32
    )

    # Load the model
    model = load_model(UnconditionalPolicy, model_path, augmented_obs_space, action_space)

    # Load dataset statistics for normalization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = d4rl_offline_dataset(dataset_name)
    states = dataset['observations']
    state_mean = torch.tensor(states.mean(axis=0), dtype=torch.float32).to(device)
    # The small epsilon guards against division by zero for constant dimensions.
    state_std = torch.tensor(states.std(axis=0) + 1e-6, dtype=torch.float32).to(device)

    # Evaluate the model and save video
    avg_reward = evaluate_model_and_save_video(
        model,
        env_name,
        video_filename,
        state_mean,
        state_std,
        num_evaluations=10,
        desired_return=desired_return
    )

    print(f"Final Average Reward for {dataset_name}: {avg_reward}")


if __name__ == "__main__":
    main()

  logger.deprecation(
  model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda')))
load datafile: 100%|██████████| 9/9 [00:03<00:00,  2.84it/s]


Dataset loaded and saved at: /root/.d4rl/datasets/hopper_medium_expert-v2.hdf5
Sum of Reward for this episode: 3384.514099871218
Sum of Reward for this episode: 2494.3416450073314
Sum of Reward for this episode: 2579.4993052006303
Sum of Reward for this episode: 3566.669651624607
Sum of Reward for this episode: 3632.1642348205987
Sum of Reward for this episode: 3368.264849530412
Sum of Reward for this episode: 1574.8869895142625
Sum of Reward for this episode: 3567.374677517645
Sum of Reward for this episode: 3592.376119709841
Sum of Reward for this episode: 3508.5736931077454
Video saved at: ./hopper-medium-expert-v2_evaluation.mp4
Average Reward over 10 evaluations: 3126.866526590429
Final Average Reward for hopper-medium-expert-v2: 3126.866526590429


In [20]:
import gymnasium as gym
import torch
import numpy as np
import os
import imageio
from stable_baselines3.common import policies
from just_d4rl import d4rl_offline_dataset
from stable_baselines3.common.torch_layers import FlattenExtractor, MlpExtractor
import torch.nn as nn


# Define UnconditionalPolicy
class UnconditionalPolicy(policies.ActorCriticPolicy):
    """Actor-critic policy with a wide policy MLP and a small value MLP.

    The policy branch has ``depth`` hidden layers of ``hidden_size`` units and
    the value branch has one hidden layer of 64 units. Both use ReLU, and
    observations are passed through a ``FlattenExtractor``.
    """

    def __init__(
        self,
        observation_space,
        action_space,
        hidden_size=1024,
        depth=2,
        learning_rate=1e-3,
    ):
        """Describe the network layout and forward it to the base policy.

        Args:
            observation_space: Observation space the policy consumes.
            action_space: Action space the policy produces actions for.
            hidden_size: Width of every hidden layer in the policy branch.
            depth: Number of hidden layers in the policy branch.
            learning_rate: Constant value returned by the learning-rate schedule.
        """
        policy_layers = [hidden_size for _ in range(depth)]
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lambda _: learning_rate,
            net_arch={"pi": policy_layers, "vf": [64]},
            activation_fn=nn.ReLU,
            features_extractor_class=FlattenExtractor,
            optimizer_class=torch.optim.Adam,
        )

    def _build(self, lr_schedule):
        """Create the feature extractor, MLP trunk, value head and action head.

        ``lr_schedule`` is accepted for interface compatibility but is not used
        in this override, and no optimizer is created here.
        """
        extractor = self.features_extractor_class(self.observation_space)
        self.features_extractor = extractor
        self.features_dim = extractor.features_dim

        self.mlp_extractor = MlpExtractor(
            self.features_dim,
            net_arch=self.net_arch,
            activation_fn=self.activation_fn,
        )

        # Scalar state-value head on top of the value branch.
        self.value_net = nn.Linear(self.mlp_extractor.latent_dim_vf, 1)

        # The action distribution supplies both the mean head and the log-std parameter.
        policy_head = self.action_dist.proba_distribution_net(
            latent_dim=self.mlp_extractor.latent_dim_pi,
            log_std_init=self.log_std_init,
        )
        self.action_net, self.log_std = policy_head


# Load the UnconditionalPolicy model
def load_model(model_class, model_path, observation_space, action_space):
    """Instantiate ``model_class`` and restore its weights from ``model_path``.

    Args:
        model_class: Callable accepting ``observation_space``, ``action_space``,
            ``hidden_size`` and ``depth`` keyword arguments and returning a
            module with ``load_state_dict``/``eval``.
        model_path: Path to a checkpoint containing a state dict.
        observation_space: Forwarded to ``model_class``.
        action_space: Forwarded to ``model_class``.

    Returns:
        The constructed model with loaded weights, switched to eval mode.
    """
    model = model_class(
        observation_space=observation_space,
        action_space=action_space,
        hidden_size=1024,
        depth=2,
    )
    # Deserialize onto the CPU instead of a hard-coded CUDA device: the model
    # was just built on the CPU, and load_state_dict copies into its existing
    # parameters, so this also works on machines without a GPU.
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs, and avoids executing arbitrary pickled code.
    state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model


# Select the EGL OpenGL backend so rendering can run headless (no display attached).
# NOTE(review): presumably this must be set before MuJoCo creates its first
# rendering context (i.e. before the first gym.make(...) below) — confirm.
os.environ["MUJOCO_GL"] = "egl"


# Function to save frames to video
def save_video_from_frames(frames, video_filename):
    """Encode ``frames`` as a 60 fps video written to ``video_filename``.

    Args:
        frames: Iterable of image arrays, appended to the video in order.
        video_filename: Destination path passed to ``imageio.get_writer``.
    """
    # The context manager closes the writer even if appending a frame raises,
    # so the output file handle is never leaked.
    with imageio.get_writer(video_filename, fps=60) as writer:
        for frame in frames:
            writer.append_data(frame)
    print(f"Video saved at: {video_filename}")


# Evaluate the model and save video
def evaluate_model_and_save_video(model, env_name, video_filename, state_mean, state_std, num_evaluations=10, desired_return=5000 * 0.001, return_scale=0.001):
    """Roll out a return-conditioned policy, record every frame and save a video.

    At each step the observation is normalised with ``state_mean``/``state_std``,
    the remaining desired return is appended as one extra feature, and the
    policy's deterministic action (clipped to the action bounds) is applied.
    After the step the remaining desired return is reduced by
    ``reward * return_scale``.

    Args:
        model: Policy exposing ``_predict(obs, deterministic=True)``; it is
            moved to the evaluation device.
        env_name: Gymnasium environment id to evaluate in.
        video_filename: Output path handed to ``save_video_from_frames``.
        state_mean: Per-dimension observation mean (tensor or array-like).
        state_std: Per-dimension observation standard deviation (tensor or array-like).
        num_evaluations: Number of episodes to run.
        desired_return: Initial, already scaled, return target for every episode.
        return_scale: Factor applied to each raw reward before it is subtracted
            from the remaining target. The default equals the previously
            hard-coded 0.001.

    Returns:
        The mean of the summed rewards over all evaluated episodes.
    """
    env = gym.make(env_name, render_mode="rgb_array")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Make sure the normalisation statistics live on the same device (and in
    # the same dtype) as the observations they are combined with.
    state_mean = torch.as_tensor(state_mean, dtype=torch.float32, device=device)
    state_std = torch.as_tensor(state_std, dtype=torch.float32, device=device)

    # NOTE: frames from all episodes are kept in memory until the video is written.
    frames = []
    total_rewards = []

    try:
        # Pure inference: no autograd graph is needed for action prediction.
        with torch.no_grad():
            for _ in range(num_evaluations):
                obs, _info = env.reset()
                total_reward = 0
                done = False
                desired_return_ep = desired_return  # Reset desired return for this episode

                while not done:
                    # Normalize state
                    obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=device)
                    state = ((obs_tensor - state_mean) / state_std).unsqueeze(0)

                    # Add desired return to state
                    desired_return_tensor = torch.tensor(
                        [[desired_return_ep]], dtype=torch.float32, device=device
                    )
                    augmented_state = torch.cat([state, desired_return_tensor], dim=1)

                    # Predict action and clip it to the environment's bounds
                    action = model._predict(augmented_state, deterministic=True).cpu().numpy().flatten()
                    action = np.clip(action, env.action_space.low, env.action_space.high)

                    # Step environment
                    obs, reward, terminated, truncated, _info = env.step(action)
                    done = terminated or truncated
                    total_reward += reward

                    # Update desired return
                    desired_return_ep -= reward * return_scale
                    frames.append(env.render())

                print(f"Sum of Reward for this episode: {total_reward}")
                total_rewards.append(total_reward)
    finally:
        # Release the simulator and its rendering context even if a rollout fails.
        env.close()

    save_video_from_frames(frames, video_filename)
    avg_reward = np.mean(total_rewards)
    print(f"Average Reward over {num_evaluations} evaluations: {avg_reward}")
    return avg_reward


# Main function
def main(dataset_name="walker2d-medium-expert-v2", desired_return=5000 * 0.001):
    """Load a trained policy, evaluate it and save a video of the rollouts.

    Args:
        dataset_name: D4RL dataset name of the form ``<task>-<quality>-v2``.
            It selects the checkpoint file, the video file name, the
            environment and the normalisation statistics.
        desired_return: Initial (already scaled) return target given to the policy.

    Raises:
        ValueError: If the task prefix of ``dataset_name`` has no known environment.
    """
    model_path = f"trained_mle_model_{dataset_name}.pth"
    video_filename = f"./{dataset_name}_evaluation.mp4"

    # Map the dataset's task prefix to the corresponding Gymnasium environment name.
    env_ids = {"halfcheetah": "HalfCheetah", "hopper": "Hopper", "walker2d": "Walker2d"}
    task = dataset_name.split('-')[0]
    if task not in env_ids:
        raise ValueError(f"Unsupported task '{task}' in dataset name '{dataset_name}'")
    env_name = env_ids[task] + '-v4'

    # This environment is only needed to read the spaces, so it is created
    # without a renderer and closed as soon as the spaces are captured.
    probe_env = gym.make(env_name)
    try:
        action_space = probe_env.action_space
        base_obs_space = probe_env.observation_space
    finally:
        probe_env.close()

    # The policy input is the observation plus one unbounded "desired return"
    # feature. Bounds are cast to float32 up front to match the Box dtype.
    augmented_obs_space = gym.spaces.Box(
        low=np.append(base_obs_space.low, -np.inf).astype(np.float32),
        high=np.append(base_obs_space.high, np.inf).astype(np.float32),
        dtype=np.float32
    )

    # Load the model
    model = load_model(UnconditionalPolicy, model_path, augmented_obs_space, action_space)

    # Load dataset statistics for normalization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = d4rl_offline_dataset(dataset_name)
    states = dataset['observations']
    state_mean = torch.tensor(states.mean(axis=0), dtype=torch.float32).to(device)
    # The small epsilon guards against division by zero for constant dimensions.
    state_std = torch.tensor(states.std(axis=0) + 1e-6, dtype=torch.float32).to(device)

    # Evaluate the model and save video
    avg_reward = evaluate_model_and_save_video(
        model,
        env_name,
        video_filename,
        state_mean,
        state_std,
        num_evaluations=10,
        desired_return=desired_return
    )

    print(f"Final Average Reward for {dataset_name}: {avg_reward}")


if __name__ == "__main__":
    main()

  model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda')))
load datafile: 100%|██████████| 9/9 [00:04<00:00,  1.82it/s]


Dataset loaded and saved at: /root/.d4rl/datasets/walker2d_medium_expert-v2.hdf5
Sum of Reward for this episode: 4904.8224526156
Sum of Reward for this episode: 4870.28563317747
Sum of Reward for this episode: 4868.763126591738
Sum of Reward for this episode: 4888.087381968669
Sum of Reward for this episode: 4883.4723337021205
Sum of Reward for this episode: 4905.227923898287
Sum of Reward for this episode: 4908.204982869992
Sum of Reward for this episode: 4899.835439343304
Sum of Reward for this episode: 4883.56162887681
Sum of Reward for this episode: 4894.967023848706
Video saved at: ./walker2d-medium-expert-v2_evaluation.mp4
Average Reward over 10 evaluations: 4890.72279268927
Final Average Reward for walker2d-medium-expert-v2: 4890.72279268927
