# Testing splitting and recombining parallel actions

In [24]:
import numpy as np
import torch
import torch.nn as nn  # noqa: F401
import safety_gymnasium
from rl_vcf.rl.utils import make_env_safety, get_actor_structure
from rl_vcf.rl.algos.ppo.core import MLPActorCritic, LOG_STD_MIN, LOG_STD_MAX

In [42]:
num_envs = 1

seed = 0
# Fall back to CPU so the notebook still runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build one entry per parallel environment; entry i receives index i and seed + i.
# NOTE(review): the remaining positional arguments (False, 5, False) are passed
# to make_env_safety unchanged -- confirm their meaning against its signature.
envs = safety_gymnasium.vector.SafetySyncVectorEnv(
    [
        make_env_safety(
            "SafetyPointReachAvoidReset0-v0",
            i,
            seed + i,
            False,
            5,
            False,
        )
        for i in range(num_envs)
    ]
)

In [43]:
# Load the saved actor weights directly onto the target device.
loaded_state_dict = torch.load(
    "rl-policy-episode-200.pt", weights_only=True, map_location=device
)

# Recover the actor architecture (hidden sizes and activation name) from the
# state dict together with the environment's observation/action spaces.
loaded_hidden_sizes, loaded_activation = get_actor_structure(
    loaded_state_dict, envs.single_observation_space, envs.single_action_space
)

# Look the activation class up by name on torch.nn instead of eval()-ing a
# string assembled from checkpoint-derived data; an unknown name now raises
# AttributeError rather than executing arbitrary code.
activation = getattr(nn, loaded_activation)()

agent = MLPActorCritic(
    envs.single_observation_space,
    envs.single_action_space,
    loaded_hidden_sizes,
    activation,
    state_dependent_std=True,
)
# Only the policy (agent.pi) is restored; strict=True makes any key mismatch
# between the checkpoint and the rebuilt actor an error.
agent.pi.load_state_dict(loaded_state_dict, strict=True)
agent.to(device)

# Prevent storing gradients
for p in agent.parameters():
    p.requires_grad = False

In [44]:
# Reset every parallel environment with its own seed (seed + index).
reset_seeds = [seed + i for i in range(num_envs)]
obs, info = envs.reset(seed=reset_seeds)

# Advance the environments a few steps with the loaded policy so that `obs`
# holds a post-reset observation for the cells below.
with torch.no_grad():
    for _ in range(3):
        obs_tensor = torch.Tensor(obs).to(device)
        act = agent.act(obs_tensor)
        act_np = act.detach().cpu().numpy()
        next_obs, rew, cost, term, trunc, info = envs.step(act_np)
        # An environment is done when it either terminated or was truncated.
        done = np.logical_or(term, trunc)
        obs = next_obs

In [45]:
# Evaluate agent.pi.net once and feed the result to both output layers,
# instead of running the same forward pass twice on the same observation.
features = agent.pi.net(torch.Tensor(obs).to(device))

mu = agent.pi.mu_layer(features)
print(mu)
print(mu.shape)

# tanh squashes the raw output into [-1, 1]; the affine map below rescales
# that interval onto [LOG_STD_MIN, LOG_STD_MAX] before exponentiating.
log_std = torch.tanh(agent.pi.log_std_layer(features))
log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
std = torch.exp(log_std)
print(std)
print(std.shape)

tensor([[ 1.6778, -2.6323]], device='cuda:0')
torch.Size([1, 2])
tensor([[0.4244, 0.5352]], device='cuda:0')
torch.Size([1, 2])


In [50]:
# Move both distribution parameters off the device and into NumPy arrays.
np_mu = mu.detach().cpu().numpy()
np_std = std.detach().cpu().numpy()

# Show each array followed by its shape.
print(np_mu, np_mu.shape, sep="\n")
print(np_std, np_std.shape, sep="\n")

[[ 1.677826  -2.6323307]]
(1, 2)
[[0.42438477 0.5351586 ]]
(1, 2)


In [51]:
# Allocate zero-filled output buffers matching the shape and dtype of the inputs.
np_mu_proj, np_std_proj = np.zeros_like(np_mu), np.zeros_like(np_std)
print(np_mu_proj, np_std_proj, sep="\n")

[[0. 0.]]
[[0. 0.]]


In [52]:
# Process each row along the leading axis of `mu` on its own and write the
# result back into the matching row of the output buffers.
for i in range(len(mu)):
    np_mu_proj[i] = np_mu[i] + 1
    np_std_proj[i] = np_std[i] + 2

In [53]:
# Show the filled buffers, one array per line.
print(np_mu_proj, np_std_proj, sep="\n")

[[ 2.677826  -1.6323307]]
[[2.4243848 2.5351586]]


In [54]:
# Recombine: copy the per-row results back into tensors that match the
# original tensors' dtype and device. torch.tensor builds the result from the
# array directly, whereas the legacy torch.Tensor(...) constructor always
# produces a float32 CPU tensor first and needs a separate .to(device).
mu_proj = torch.tensor(np_mu_proj, dtype=mu.dtype, device=mu.device)
print(mu_proj)
std_proj = torch.tensor(np_std_proj, dtype=std.dtype, device=std.device)
print(std_proj)

tensor([[ 2.6778, -1.6323]], device='cuda:0')
tensor([[2.4244, 2.5352]], device='cuda:0')


In [58]:
# Number of dimensions of `mu`; mu.ndim gives the same value as counting the
# entries of mu.shape with np.size.
print(mu.ndim)

2


In [62]:
# Build one non-vectorised environment for comparison with the vector env.
single_env = safety_gymnasium.make("SafetyPointReachAvoidReset0-v0")
# Seed the reset so the observation below is reproducible across kernel
# restarts; an unseeded reset yields a different observation on every run.
obs, info = single_env.reset(seed=seed)

In [63]:
# Inspect the observation most recently assigned to `obs`.
print(obs)

[ 0.          0.          9.81        0.          0.          0.
  0.          0.          0.         -0.32924723  0.37629279  0.
  0.          0.          0.          0.          0.          0.
  0.15012717  0.72919177  0.5790646   0.          0.          0.
  0.          0.          0.          0.        ]


In [64]:
# Evaluate agent.pi.net once and feed the result to both output layers,
# instead of running the same forward pass twice on the same observation.
features = agent.pi.net(torch.Tensor(obs).to(device))

mu = agent.pi.mu_layer(features)
print(mu)
print(mu.shape)

# tanh squashes the raw output into [-1, 1]; the affine map below rescales
# that interval onto [LOG_STD_MIN, LOG_STD_MAX] before exponentiating.
log_std = torch.tanh(agent.pi.log_std_layer(features))
log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
std = torch.exp(log_std)
print(std)
print(std.shape)

tensor([-2.4906,  0.1933], device='cuda:0')
torch.Size([2])
tensor([0.5551, 0.5626], device='cuda:0')
torch.Size([2])


In [65]:
# Number of dimensions of `mu`; mu.ndim gives the same value as counting the
# entries of mu.shape with np.size.
print(mu.ndim)

1
