# Testing splitting and recombining parallel actions

In [2]:
import numpy as np
import torch
import torch.nn as nn  # noqa: F401
import safety_gymnasium
from rl_vcf.rl.utils import make_env_safety, get_actor_structure
from rl_vcf.rl.algos.ppo.core import MLPActorCritic, LOG_STD_MIN, LOG_STD_MAX
from rl_vcf.rl.algos.projected_ppo.core import max_log_diag_gaussian_ratio

In [3]:
# Number of parallel environments in the vectorized env
num_envs = 3

# Base seed; each environment gets this value offset by its index
seed = 0
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the positional arguments (False, 5, False) are interpreted by
# make_env_safety — confirm their meaning against its signature.
envs = safety_gymnasium.vector.SafetySyncVectorEnv(
    [
        make_env_safety(
            "SafetyPointReachAvoidReset0-v0",
            i,
            seed + i,
            False,
            5,
            False,
        )
        for i in range(num_envs)
    ]
)

In [4]:
# Load the saved policy weights directly onto the target device
loaded_state_dict = torch.load(
    "rl-policy-episode-200.pt", weights_only=True, map_location=device
)

# Recover the actor's hidden sizes and activation name from the state dict
loaded_hidden_sizes, loaded_activation = get_actor_structure(
    loaded_state_dict, envs.single_observation_space, envs.single_action_space
)

# Look the activation class up by name instead of eval()-ing a string that
# originates from the checkpoint.
# NOTE(review): assumes loaded_activation is a bare torch.nn class name
# (e.g. "Tanh") — confirm against get_actor_structure.
activation_cls = getattr(nn, loaded_activation)

agent = MLPActorCritic(
    envs.single_observation_space,
    envs.single_action_space,
    loaded_hidden_sizes,
    activation_cls(),
    state_dependent_std=True,
)
# Only the policy (pi) weights are stored in the checkpoint loaded here
agent.pi.load_state_dict(loaded_state_dict, strict=True)
agent.to(device)

# Inference only: freeze every parameter so no gradients are tracked
for p in agent.parameters():
    p.requires_grad = False

In [5]:
# Reset every environment with its own seed, then advance three steps using
# the agent's actions; `obs` ends up holding the latest batch of observations.
obs, info = envs.reset(seed=list(range(seed, seed + num_envs)))
with torch.no_grad():
    for _ in range(3):
        obs_batch = torch.tensor(obs, dtype=torch.float32, device=device)
        act = agent.act(obs_batch)
        next_obs, rew, cost, term, trunc, info = envs.step(
            act.detach().cpu().numpy()
        )
        # An episode is finished when it is either terminated or truncated
        done = np.logical_or(term, trunc)
        obs = next_obs

In [6]:
# Run the shared network once and feed its features to both heads,
# instead of repeating the forward pass for each head.
features_base = agent.pi.net(torch.Tensor(obs).to(device))

# Mean of the base policy for every environment in the batch
mu_base = agent.pi.mu_layer(features_base)
print(mu_base)
print(mu_base.shape)

# Squash the raw log-std head output with tanh, then rescale it linearly
# from [-1, 1] to [LOG_STD_MIN, LOG_STD_MAX] before exponentiating.
log_std_base = torch.tanh(agent.pi.log_std_layer(features_base))
log_std_base = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std_base + 1)
std_base = torch.exp(log_std_base)
print(std_base)
print(std_base.shape)

tensor([[ 1.6778, -2.6323],
        [-1.9176,  0.4878],
        [ 1.4327,  0.3088]], device='cuda:0')
torch.Size([3, 2])
tensor([[0.4244, 0.5352],
        [0.4533, 0.6938],
        [0.3115, 0.4269]], device='cuda:0')
torch.Size([3, 2])


In [7]:
# Detach both tensors, move them to the CPU and expose them as NumPy arrays
np_mu_base, np_std_base = (
    tensor.detach().cpu().numpy() for tensor in (mu_base, std_base)
)
print(np_mu_base, np_mu_base.shape, sep="\n")
print(np_std_base, np_std_base.shape, sep="\n")

[[ 1.6778263  -2.632331  ]
 [-1.9176089   0.4877513 ]
 [ 1.4326621   0.30880895]]
(3, 2)
[[0.42438495 0.5351586 ]
 [0.45326692 0.69380563]
 [0.31146035 0.4269302 ]]
(3, 2)


In [8]:
# Zero-filled buffers with the same shape and dtype as the base arrays; they
# will hold the means / stds of the "task" and "proj" distributions.
np_mu_task = np.zeros(np_mu_base.shape, dtype=np_mu_base.dtype)
np_std_task = np.zeros(np_std_base.shape, dtype=np_std_base.dtype)
np_mu_proj = np.zeros(np_mu_base.shape, dtype=np_mu_base.dtype)
np_std_proj = np.zeros(np_std_base.shape, dtype=np_std_base.dtype)
print(np_mu_task, np_std_task, np_mu_proj, np_std_proj, sep="\n")

[[0. 0.]
 [0. 0.]
 [0. 0.]]
[[0. 0.]
 [0. 0.]
 [0. 0.]]
[[0. 0.]
 [0. 0.]
 [0. 0.]]
[[0. 0.]
 [0. 0.]
 [0. 0.]]


In [9]:
# Seeded generator so the synthetic perturbations are reproducible across runs
# (re-running this cell recreates the generator and yields the same values).
rng = np.random.default_rng(seed)

# Fill the "task" and "proj" buffers one environment at a time by perturbing
# the base policy's mean and std.
for i in range(np_mu_base.shape[0]):
    # Each draw is a scalar, so the same mean offset / std shrinkage is applied
    # to every action dimension of environment i.
    # min(std, rayleigh) <= std, so the std shrinks by at most 10% (task) or
    # 2% (proj) and never changes sign.
    np_mu_task[i] = np_mu_base[i] + 0.5 * rng.standard_normal()
    np_std_task[i] = np_std_base[i] - 0.1 * np.minimum(
        np_std_base[i], rng.rayleigh(1)
    )
    np_mu_proj[i] = np_mu_base[i] + 0.1 * rng.standard_normal()
    np_std_proj[i] = np_std_base[i] - 0.02 * np.minimum(
        np_std_base[i], rng.rayleigh(1)
    )

In [10]:
# Show the perturbed means / stds: task first, then proj
print(np_mu_task, np_std_task, np_mu_proj, np_std_proj, sep="\n")

[[ 1.3605912  -2.9495661 ]
 [-3.026999   -0.62163895]
 [ 1.4280748   0.3042217 ]]
[[0.38194644 0.48164272]
 [0.40794024 0.62442505]
 [0.28031433 0.38423717]]
[[ 1.7366393  -2.5735178 ]
 [-1.9233865   0.4819737 ]
 [ 1.3474997   0.22364661]]
[[0.41589725 0.5244554 ]
 [0.4442016  0.6799295 ]
 [0.30523115 0.4183916 ]]


In [11]:
# Bring the proj parameters back into torch, on the same device as the
# corresponding base tensors.
mu_proj = torch.tensor(np_mu_proj, dtype=torch.float32, device=mu_base.device)
std_proj = torch.tensor(np_std_proj, dtype=torch.float32, device=std_base.device)
print(mu_proj, std_proj, sep="\n")

tensor([[ 1.7366, -2.5735],
        [-1.9234,  0.4820],
        [ 1.3475,  0.2236]], device='cuda:0')
tensor([[0.4159, 0.5245],
        [0.4442, 0.6799],
        [0.3052, 0.4184]], device='cuda:0')


In [12]:
# One value per environment, computed row by row from the NumPy parameters.
# NOTE(review): presumably the maximum log density ratio between two diagonal
# Gaussians — confirm against max_log_diag_gaussian_ratio.
env_count = mu_base.shape[0]
log_task_base = np.zeros(env_count)
log_proj_base = np.zeros(env_count)
for i in range(env_count):
    base_params = (np_mu_base[i], np_std_base[i])
    log_task_base[i] = max_log_diag_gaussian_ratio(
        *base_params, np_mu_task[i], np_std_task[i]
    )
    log_proj_base[i] = max_log_diag_gaussian_ratio(
        *base_params, np_mu_proj[i], np_std_proj[i]
    )
print(log_task_base, type(log_task_base), sep="\n")
print(log_proj_base, type(log_proj_base), sep="\n")

[ 2.60592866 22.70347488  0.21159545]
<class 'numpy.ndarray'>
[0.43539584 0.04333242 1.48680115]
<class 'numpy.ndarray'>


In [24]:
# Number of tensor dimensions of the batched mean
print(mu_base.ndim)

2


In [62]:
# Non-vectorized instance of the same task, used to look at the un-batched case
single_env = safety_gymnasium.make("SafetyPointReachAvoidReset0-v0")
# Seed the reset so the observation is reproducible.
# NOTE: this rebinds `obs` / `info`, replacing the values from the vectorized env.
obs, info = single_env.reset(seed=seed)

In [63]:
# Show the observation returned by the single environment's reset
print(obs)

[ 0.          0.          9.81        0.          0.          0.
  0.          0.          0.         -0.32924723  0.37629279  0.
  0.          0.          0.          0.          0.          0.
  0.15012717  0.72919177  0.5790646   0.          0.          0.
  0.          0.          0.          0.        ]


In [64]:
# Run the shared network once on the single observation and feed its features
# to both heads, instead of repeating the forward pass for each head.
features = agent.pi.net(torch.Tensor(obs).to(device))

# Policy mean for the single (un-batched) observation
mu = agent.pi.mu_layer(features)
print(mu)
print(mu.shape)

# Squash the raw log-std head output with tanh, then rescale it linearly
# from [-1, 1] to [LOG_STD_MIN, LOG_STD_MAX] before exponentiating.
log_std = torch.tanh(agent.pi.log_std_layer(features))
log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
std = torch.exp(log_std)
print(std)
print(std.shape)

tensor([-2.4906,  0.1933], device='cuda:0')
torch.Size([2])
tensor([0.5551, 0.5626], device='cuda:0')
torch.Size([2])


In [65]:
# Number of tensor dimensions of the un-batched mean
print(mu.ndim)

1
