In [14]:
# installations primiarly needed for Mujoco
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

!apt-get install -y patchelf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libglew-dev is already the newest version (2.2.0-4).
libgl1-mesa-dev is already the newest version (23.2.1-1ubuntu3.1~22.04.2).
libosmesa6-dev is already the newest version (23.2.1-1ubuntu3.1~22.04.2).
software-properties-common is already the newest version (0.99.22.9).
libgl1-mesa-glx is already the newest version (23.0.4-0ubuntu1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
patchelf is already the newest version (0.14.3-1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [15]:
# primarily RL-sepcific requirements
%pip install -f https://download.pytorch.org/whl/torch_stable.html \
                free-mujoco-py \
                einops \
                gym==0.23.1 \
                protobuf==3.20.1 \
                git+https://github.com/rail-berkeley/d4rl.git \
                mediapy \
                Pillow==9.0.0


Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting git+https://github.com/rail-berkeley/d4rl.git
  Cloning https://github.com/rail-berkeley/d4rl.git to /tmp/pip-req-build-xbb5kcnr
  Running command git clone --filter=blob:none --quiet https://github.com/rail-berkeley/d4rl.git /tmp/pip-req-build-xbb5kcnr
  Resolved https://github.com/rail-berkeley/d4rl.git to commit 71a9549f2091accff93eeff68f1f3ab2c0e0a288
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mjrl@ git+https://github.com/aravindr93/mjrl@master#egg=mjrl (from D4RL==1.1)
  Cloning https://github.com/aravindr93/mjrl (to revision master) to /tmp/pip-install-246_5q09/mjrl_48c2e63641f14133b795e0101a5d31f8
  Running command git clone --filter=blob:none --quiet https://github.com/aravindr93/mjrl /tmp/pip-install-246_5q09/mjrl_48c2e63641f14133b795e0101a5d31f8
  Resolved https://github.com/aravindr93/mjrl to commit 3871d93763d3b49c4741e6daeaebbc605fe140dc
  Preparing metadata (setup.py) ... 

In [16]:
import d4rl

In [17]:
# set mujoco env path if not already set
# %env LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco200/bin

import gym
import d4rl # Import required to register environments


env = gym.make('Walker2d-v3')
env.reset()
env.step(env.action_space.sample())
env.close()
print("mujoco-py check passed")

env = gym.make('walker2d-medium-v2')
env.reset()
env.step(env.action_space.sample())
env.close()
print("d4rl check passed")


mujoco-py check passed
d4rl check passed


In [18]:
import gym
import d4rl  # Import D4RL datasets

# Load the gym environment
env = gym.make('maze2d-umaze-v1')  # Use an environment name from the D4RL dataset
env.reset()
dataset = env.get_dataset()  # Get the dataset for the environment

# Take a single step in the environment
obs, reward, done, info = env.step(env.action_space.sample())

print(f"Observation: {obs}")
print(f"Reward: {reward}")
print(f"Done: {done}")
print(f"Info: {info}")

# Make sure to check the dataset structure
print(f"Keys in dataset: {dataset.keys()}")


load datafile: 100%|██████████| 8/8 [00:00<00:00, 19.14it/s]

Observation: [ 2.96235634  2.90449218  0.18383036 -0.06991828]
Reward: 0.0
Done: False
Info: {}
Keys in dataset: dict_keys(['actions', 'infos/goal', 'infos/qpos', 'infos/qvel', 'observations', 'rewards', 'terminals', 'timeouts'])





In [None]:
# set mujoco env path if not already set
# %env LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco200/bin

from copy import deepcopy
import gym
import d4rl  # Import required to register environments
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


# Define the policy network


class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim*2)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.prev = None

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        mu = x[:, :self.output_dim]
        log_std = x[:, self.output_dim:]
        std = F.softplus(log_std)
        return mu, std


# Define direct preference optimization loss
# Loss=-log(sigmoid(policy-ref)=(p_w-p_l)-(ref_w-ref_l)=(p_w - ref_w) - (p_l - ref_l))
# ref=model_(t-1) and policy=model_t
def dpo_loss(policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps, beta=1.0):
    pi_logratios = policy_chosen_logps - policy_rejected_logps
    ref_logratios = reference_chosen_logps - reference_rejected_logps

    logits = pi_logratios - ref_logratios

    loss = -F.logsigmoid(beta * logits).mean()
    return loss


env = gym.make('walker2d-medium-v2')
dataset = env.get_dataset()
obs = dataset['observations']
acts = dataset['actions']
print(obs.shape, acts.shape)
# Define the training loop
# Define the policy network
policy_network = PolicyNetwork(
    env.observation_space.shape[0], env.action_space.shape[0])
optimizer = torch.optim.Adam(policy_network.parameters(), lr=1e-3)
pre = deepcopy(policy_network)
# Training loop
num_epochs = 100
batch_size = 64
normal_dist = torch.distributions.Normal(0, 1)
for epoch in range(num_epochs):

    # Sample a batch of observations
    indices = np.random.choice(len(obs), batch_size)
    obs_batch = torch.tensor(obs[indices], dtype=torch.float32)
    act_batch = torch.tensor(acts[indices], dtype=torch.float32)

    # Compute the log probabilities of the actions
    mu, std = policy_network(obs_batch)
    epsilon = normal_dist.sample(mu.shape)
    model_act_sample = mu + std * epsilon
    model_act_logp = normal_dist.log_prob(epsilon)
    expert_act_logp = normal_dist.log_prob((act_batch - mu) / std)

    # Compute the log probabilities of the actions under the previous policy
    with torch.no_grad():
        if policy_network.prev == None:
            policy_network.prev = deepcopy(policy_network)

        prev_mu, prev_std = policy_network.prev(obs_batch)
        prev_model_act_logp = normal_dist.log_prob(
            (model_act_sample-prev_mu)/prev_std)
        prev_expert_act_logp = normal_dist.log_prob(
            (act_batch - prev_mu) / prev_std)

    # Compute the loss
    loss = -dpo_loss(expert_act_logp, model_act_logp,
                     prev_expert_act_logp, prev_model_act_logp, beta=1.0)
    # Optimize the policy
    if num_epochs % 10 == 1:
        policy_network.prev = deepcopy(policy_network)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch +
          1, num_epochs, loss.item()))


load datafile: 100%|██████████| 21/21 [00:05<00:00,  4.05it/s]


(1000000, 17) (1000000, 6)
Epoch [1/100], Loss: -0.6931
Epoch [2/100], Loss: -0.7222
Epoch [3/100], Loss: -0.7676
Epoch [4/100], Loss: -0.8205
Epoch [5/100], Loss: -0.9543
Epoch [6/100], Loss: -0.9667
Epoch [7/100], Loss: -1.0946
Epoch [8/100], Loss: -1.5102
Epoch [9/100], Loss: -1.6360
Epoch [10/100], Loss: -2.2516
Epoch [11/100], Loss: -2.6703
Epoch [12/100], Loss: -2.7854
Epoch [13/100], Loss: -4.3016
Epoch [14/100], Loss: -4.4027
Epoch [15/100], Loss: -6.3448
Epoch [16/100], Loss: -10.4199
Epoch [17/100], Loss: -24.2058
Epoch [18/100], Loss: -18.5903
Epoch [19/100], Loss: -31.6253
Epoch [20/100], Loss: -28.8660
Epoch [21/100], Loss: -112.2733
Epoch [22/100], Loss: -61.8056
Epoch [23/100], Loss: -836.3360
Epoch [24/100], Loss: -174.8199
Epoch [25/100], Loss: -129.5840
Epoch [26/100], Loss: -320.6694
Epoch [27/100], Loss: -249.6041
Epoch [28/100], Loss: -1080.1365
Epoch [29/100], Loss: -3662.7598
Epoch [30/100], Loss: -2764.7744
Epoch [31/100], Loss: -18833.7266
Epoch [32/100], Loss:

In [20]:
# set mujoco env path if not already set
# %env LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco200/bin

from copy import deepcopy
import gym
import d4rl  # Import required to register environments
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


# Define the policy network


class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Define direct preference optimization loss
# Loss=-log(sigmoid(policy-ref)=(p_w-p_l)-(ref_w-ref_l)=(p_w - ref_w) - (p_l - ref_l))
# ref=model_(t-1) and policy=model_t
def dpo_loss(policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps, beta=1.0):
    pi_logratios = policy_chosen_logps - policy_rejected_logps
    ref_logratios = reference_chosen_logps - reference_rejected_logps

    logits = pi_logratios - ref_logratios

    loss = -F.logsigmoid(beta * logits).mean()
    return loss


env = gym.make('minigrid-fourrooms-random-v0')
dataset = env.get_dataset()
obs = dataset['observations']
acts = dataset['actions']
print(obs.shape, acts.shape)
print(acts[0])
# Define the training loop
# Define the policy network
policy_network = PolicyNetwork(
    env.observation_space.shape[0], env.action_space.shape[0])
optimizer = torch.optim.Adam(policy_network.parameters(), lr=1e-3)
pre = deepcopy(policy_network)
# Training loop
num_epochs = 100
batch_size = 64

for epoch in range(num_epochs):

    # Sample a batch of observations
    indices = np.random.choice(len(obs), batch_size)
    obs_batch = torch.tensor(obs[indices], dtype=torch.float32)
    act_batch = torch.tensor(acts[indices], dtype=torch.float32)

    # Compute the log probabilities of the actions
    p = policy_network(obs_batch)
    model_probability_distribution = Categorical(p)
    model_act = model_probability_distribution.sample()
    act_logp = model_probability_distribution.log_prob(model_act).sum(dim=-1)

    # Compute the log probabilities of the actions under the previous policy
    with torch.no_grad():
        act_prob_old = pre(obs_batch)
        act_dist_old = Categorical(act_prob_old)
        act_sample_old = act_dist_old.sample()
        act_logp_old = act_dist_old.log_prob(act_sample_old).sum(dim=-1)

    # Compute the loss
    loss = -dpo_loss(act_logp, act_logp_old, act_logp, act_logp_old)
    pre = deepcopy(policy_network)
    # Optimize the policy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch +
          1, num_epochs, loss.item()))


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations