### actors.py

In [1]:
try:
    from easypip import easyimport
except ModuleNotFoundError:
    from subprocess import run

    assert (
        run(["pip", "install", "easypip"]).returncode == 0
    ), "Could not install easypip"
    from easypip import easyimport

easyimport("swig")
easyimport("bbrl_utils>=0.5").setup()

import gymnasium as gym
import torch
import torch.nn as nn

from bbrl.agents import Agent
from bbrl_utils.nn import build_mlp

from torch.distributions import (
    Normal,
    Independent,
    TransformedDistribution,
    TanhTransform,
)

import gymnasium as gym
from gymnasium import spaces
from typing import Any, Dict, Tuple

from pystk2_gymnasium.definitions import ActionObservationWrapper

In [2]:
# 只是模板，可以随意更改
class MyWrapper(gym.ActionWrapper):
    def __init__(self, env, option: int):
        super().__init__(env)
        self.option = option

    def action(self, action):
        # We do nothing here
        return action

In [3]:
# 自定义 Wrapper : 删除 observation space 中的 离散特征
class OnlyContinousObservationWrapper(ActionObservationWrapper):
    """Removes discrete features from the observation space."""
    
    def __init__(self, env: gym.Env, **kwargs):
        super().__init__(env, **kwargs)
        
        # 过滤掉离散特征，只保留非离散特征
        self._observation_space = env.observation_space['continuous']

    def observation(self, observation: Dict):
        return observation

In [4]:
# TP 08-sac 中的 class SquashedGaussianActor(Agent)
class SquashedGaussianActor(Agent):
    """Computes probabilities over action"""

    def __init__(self, state_dim, hidden_layers, action_dim, min_std=1e-4):
        """Creates a new Squashed Gaussian actor

        :param state_dim: The dimension of the state space
        :param hidden_layers: Hidden layer sizes
        :param action_dim: The dimension of the action space
        :param min_std: The minimum standard deviation, defaults to 1e-4
        """
        super().__init__()
        self.min_std = min_std
        backbone_dim = [state_dim] + list(hidden_layers)
        self.layers = build_mlp(backbone_dim, activation=nn.ReLU())
        self.backbone = nn.Sequential(*self.layers)
        self.last_mean_layer = nn.Linear(hidden_layers[-1], action_dim)
        self.last_std_layer = nn.Linear(hidden_layers[-1], action_dim)
        self.softplus = nn.Softplus()

        # cache_size avoids numerical infinites or NaNs when
        # computing log probabilities
        self.tanh_transform = TanhTransform(cache_size=1)

    def normal_dist(self, obs:torch.Tensor):
        """compute normal distribution given observation(s)"""

        backbone_output = self.backbone(obs)
        mean = self.last_mean_layer(backbone_output)
        std_out = self.last_std_layer(backbone_output)
        std = self.softplus(std_out) + self.min_std

        # Independent ensures that we have a multivariate
        # Gaussian with a diagonal covariance matrix (given as a vector `std`)
        return Independent(Normal(mean, std), 1)

    def forward(self, t: int, stochastic=True):
        """Computes the action a_t and its log-probability p(a_t| s_t)

        :param stochastic: True when sampling
        """

        # Computes probabilities over actions
        normal_dist = self.normal_dist(self.get(("env/env_obs", t)))
        action_dist = TransformedDistribution(normal_dist, [self.tanh_transform])
        if stochastic:
            # Uses the re-parametrization trick
            action = action_dist.rsample()
        else:
            # Directly uses the mode of the distribution
            action = self.tanh_transform(normal_dist.mode)

        log_prob = action_dist.log_prob(action)

        # This line allows to deepcopy the actor...
        self.tanh_transform._cached_x_y = [None, None]
        self.set(("action", t), action)
        self.set(("action_logprobs", t), log_prob)

In [5]:
# 没有 state 时，随机生成的 action，不需要更改
class SamplingActor(Agent):
    """Samples random actions"""

    def __init__(self, action_space: gym.Space):
        super().__init__()
        self.action_space = action_space

    def forward(self, t: int):
        self.set(("action", t), torch.LongTensor([self.action_space.sample()]))

In [6]:
# 定义 Critic 类
class ContinuousQAgent(Agent):
    def __init__(self, state_dim: int, hidden_layers: list[int], action_dim: int):
        """创建一个新的 Q 函数评论家代理: Q(s, a)

        :param state_dim: 状态空间的维数（观测的维数）
        :param hidden_layers: 神经网络的隐藏层大小列表
        :param action_dim: 动作空间的维数
        """
        super().__init__()  # 调用父类的初始化方法
        self.is_q_function = True  # 标记该代理为Q函数
        # 使用给定的状态维度和动作维度构建一个多层感知机（MLP）模型
        # 输入层大小为 状态维度 + 动作维度，输出层大小为 1（Q值）
        self.model = build_mlp(
            [state_dim + action_dim] + list(hidden_layers) + [1], activation=nn.ReLU()
        )

    def forward(self, t):
        obs = self.get(("env/env_obs", t))        # 获取在时间步t的环境观测（状态）
        action = self.get(("action", t))          # 获取在时间步t的动作
        obs_act = torch.cat((obs, action), dim=1) # 将状态和动作连接起来作为模型输入
        q_value = self.model(obs_act).squeeze(-1) # 使用模型计算Q值，squeeze(-1)用于移除多余的维度
        self.set((f"{self.prefix}q_value", t), q_value) # 将计算得到的Q值存储在字典中，以备后续使用

### pystk_actor.py

In [7]:
from typing import List, Callable
from bbrl.agents import Agents, Agent    # 不需要 Agents 了
import gymnasium as gym
import copy

In [8]:
#: The base environment name
env_name = "supertuxkart/flattened_continuous_actions-v0"      # 由于SAC算法只能用于连续动作空间，将动作空间中的 离散操作 删除

#: Player name
player_name = "SAC_agent"

In [9]:
# 在后面的 SACAlgo 类中，直接创建了 actor
# 这里的 get_actor 变得没有必要了, 但是不知道有没有其他用处
def get_actor(
    state, observation_space: gym.spaces.Space, action_space: gym.spaces.Space
) -> Agent:
    # 创建SAC的Actor和Critic网络
    actor = SquashedGaussianActor(observation_space, action_space)
    critic_1 = ContinuousQAgent(observation_space, action_space)
    critic_2 = ContinuousQAgent(observation_space, action_space)
    
    # 创建目标Critic网络
    target_critic_1 = copy.deepcopy(critic_1).with_prefix("target-critic-1/")
    target_critic_2 = copy.deepcopy(critic_2).with_prefix("target-critic-2/") 

    if state is None:
        # 如果没有预训练的状态, 返回随机采样的Actor
        return actor
    
    # 加载预训练的状态
    actor.load_state_dict(state["actor"])
    critic_1.load_state_dict(state["critic_1"]) 
    critic_2.load_state_dict(state["critic_2"])
    target_critic_1.load_state_dict(state["target_critic_1"])
    target_critic_2.load_state_dict(state["target_critic_2"])

    return Agents(actor, critic_1, critic_2, target_critic_1, target_critic_2)

In [10]:
def get_wrappers() -> List[Callable[[gym.Env], gym.Wrapper]]:
    """Returns a list of additional wrappers to be applied to the base
    environment"""
    return [
        # Example of a custom wrapper
        # lambda env: MyWrapper(env, option="1")               # 需要添加 Wrapper 时，在此处操作
        lambda env: OnlyContinousObservationWrapper(env)
    ]

### learn.py

In [11]:
try:
    from easypip import easyimport
except ModuleNotFoundError:
    from subprocess import run

    assert (
        run(["pip", "install", "easypip"]).returncode == 0
    ), "Could not install easypip"
    from easypip import easyimport

easyimport("swig")
easyimport("bbrl_utils>=0.5").setup()

import copy
import os

import torch
import torch.nn as nn
from bbrl.workspace import Workspace
from bbrl.agents import Agent, Agents, TemporalAgent, KWAgentWrapper
from bbrl_utils.algorithms import EpochBasedAlgo
from bbrl_utils.nn import build_mlp, setup_optimizer, soft_update_params
from bbrl_utils.notebook import setup_tensorboard
from omegaconf import OmegaConf
from torch.distributions import (
    Normal,
    Independent,
    TransformedDistribution,
    TanhTransform,
)
import bbrl_gymnasium  # noqa: F401

  from tqdm.autonotebook import tqdm  # noqa: F401
error: XDG_RUNTIME_DIR not set in the environment.


In [12]:
from pathlib import Path
from pystk2_gymnasium import AgentSpec
from functools import partial
import inspect
from bbrl.agents.gymnasium import ParallelGymAgent, make_env
import numpy as np

In [13]:
from torch.utils.tensorboard import SummaryWriter

2024-12-27 18:06:12.080929: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-27 18:06:12.208347: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-27 18:06:12.249297: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-27 18:06:12.496728: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
params = {
    "save_best": True,
    "base_dir": "${gym_env.env_name}/sac-S${algorithm.seed}_${current_time:}",
    "algorithm": {
        "seed": 1,
        "n_envs": 8,
        "n_steps": 32,
        "buffer_size": 1e6,
        "batch_size": 256,
        "max_grad_norm": 0.5,
        "nb_evals": 16,
        "eval_interval": 2_000,
        "learning_starts": 10_000,
        "max_epochs": 2_000,
        "discount_factor": 0.98,
        "entropy_mode": "auto",  # "auto" or "fixed"
        "init_entropy_coef": 2e-7,
        "tau_target": 0.05,
        "architecture": {
            "actor_hidden_size": [64, 64],
            "critic_hidden_size": [256, 256],
        },
    },
    "gym_env": {"env_name": f"{env_name}"},       # 修改为 supertuxkart 的环境名称，但是不确定能否使用 bbrl 库
    "actor_optimizer": {                          # 如果可行，则后面代码不需要改动
        "classname": "torch.optim.Adam",
        "lr": 3e-4,
    },
    "critic_optimizer": {
        "classname": "torch.optim.Adam",
        "lr": 3e-4,
    },
    "entropy_coef_optimizer": {
        "classname": "torch.optim.Adam",
        "lr": 3e-4,
    },
}

In [15]:
# 创建SAC算法环境类
class SACAlgo(EpochBasedAlgo):
    def __init__(self, cfg):
        super().__init__(cfg)  # 调用父类的初始化方法，传入配置参数cfg

        # TODO: 重写 self.train_env , 满足该环境要求     
        make_stkenv = partial(                                 
            make_env,                                          
            env_name, 
            wrappers=get_wrappers(),
            render_mode=None,
            autoreset=True,
            agent=AgentSpec(use_ai=False, name=player_name),
        )

        self.train_env = ParallelGymAgent(
            make_stkenv, 
            1
        ).seed(cfg.algorithm.seed)
    
        # 获取状态空间和动作空间的大小
        obs_size, act_size = self.train_env.get_obs_and_actions_sizes()
        # 断言动作空间是否为连续型，若不是则报错提示
        assert (
            self.train_env.is_continuous_action()
        ), "SAC代码专用于连续动作空间"

        # 创建一个actor（策略网络）
        self.actor = SquashedGaussianActor(
            obs_size, cfg.algorithm.architecture.actor_hidden_size, act_size
        )

        # 创建第一个评论家网络（critic_1）来估计Q值
        self.critic_1 = ContinuousQAgent(
            obs_size,  # 状态空间的大小
            cfg.algorithm.architecture.critic_hidden_size,  # 评论家网络的隐藏层大小
            act_size,  # 动作空间的大小
        ).with_prefix("critic-1/")  # 添加前缀以区分网络

        # 创建目标评论家网络target_critic_1，作为critic_1的深拷贝
        self.target_critic_1 = copy.deepcopy(self.critic_1).with_prefix(
            "target-critic-1/"
        )

        # 创建第二个评论家网络critic_2，作为SAC的双重Q网络
        self.critic_2 = ContinuousQAgent(
            obs_size,
            cfg.algorithm.architecture.critic_hidden_size,
            act_size,
        ).with_prefix("critic-2/")

        # 创建目标评论家网络target_critic_2，作为critic_2的深拷贝
        self.target_critic_2 = copy.deepcopy(self.critic_2).with_prefix(
            "target-critic-2/"
        )

        # 训练策略网络的引用，指向actor
        self.train_policy = self.actor
        # 评估策略网络的引用，使用KWAgentWrapper封装actor，并设定stochastic=False，即使用确定性策略
        self.eval_policy = KWAgentWrapper(self.actor, stochastic=False)

In [16]:
def setup_entropy_optimizers(cfg):
    # 定义设置熵优化器的函数，参数为配置文件 `cfg`

    if cfg.algorithm.entropy_mode == "auto":
        # 如果配置中的熵模式为自动模式 ("auto")，则进行以下操作：

        # 注释：优化熵系数的对数值，这略微不同于原论文中的做法，
        # 详细讨论见 https://github.com/rail-berkeley/softlearning/issues/37
        # 此注释和代码参考自稳定基线3（Stable Baselines3）的SAC实现版本

        log_entropy_coef = nn.Parameter(
            torch.log(torch.ones(1) * cfg.algorithm.init_entropy_coef)
        )  # 定义一个可学习的参数log_entropy_coef，用于存储初始熵系数的对数值
           # torch.log(torch.ones(1) * cfg.algorithm.init_entropy_coef) 将初始熵系数取对数以便直接优化其对数值

        # 调用 `setup_optimizer` 函数为 `log_entropy_coef` 参数设置优化器
        entropy_coef_optimizer = setup_optimizer(
            cfg.entropy_coef_optimizer, log_entropy_coef
        )

        # 返回熵系数优化器 `entropy_coef_optimizer` 和 `log_entropy_coef` 参数
        return entropy_coef_optimizer, log_entropy_coef
    else:
        # 如果熵模式不是自动模式，则返回两个 `None` 值，表示不进行熵系数优化
        return None, None

In [17]:
def compute_critic_loss(
    cfg,
    reward: torch.Tensor,
    must_bootstrap: torch.Tensor,
    t_actor: TemporalAgent,
    t_q_agents: TemporalAgent,
    t_target_q_agents: TemporalAgent,
    rb_workspace: Workspace,
    ent_coef: torch.Tensor,
):
    r"""Computes the critic loss for a set of $S$ transition samples

    Args:
        cfg: The experimental configuration
        reward: Tensor (2xS) of rewards
        must_bootstrap: Tensor (2xS) of indicators
        t_actor: The actor agent
        t_q_agents: The critics
        t_target_q_agents: The target of the critics
        rb_workspace: The transition workspace
        ent_coef: The entropy coefficient $\alpha$

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The two critic losses (scalars)
    """

    # Replay the actor so we get the necessary statistics
    
    # Compute q_values from both critics with the actions present in the buffer:
    # at t, we have Q(s,a) from (s,a) in the RB
    t_q_agents(rb_workspace, t=0, n_steps=1)

    with torch.no_grad():
        # Replay the current actor on the replay buffer to get actions of the current actor
        t_actor(rb_workspace, t=1, n_steps=1)
        action_logprobs_next = rb_workspace["action_logprobs"]

        # Compute target q_values from both target critics: at t+1, we have
        # Q(s_{t+1}, a_{t+1}) from the (s_{t+1}, a_{t+1}) where a_{t+1} has been
        # replaced in the RB with the t_actor line above
        t_target_q_agents(rb_workspace, t=1, n_steps=1)

    q_values_rb_1, q_values_rb_2, post_q_values_1, post_q_values_2 = rb_workspace[
        "critic-1/q_value",
        "critic-2/q_value",
        "target-critic-1/q_value",
        "target-critic-2/q_value"
    ]
    

    # Compute temporal difference

    q_next = torch.minimum(post_q_values_1[1], post_q_values_2[1])
    v_phi = q_next - ent_coef * action_logprobs_next[1]

    target = reward[1] + cfg.algorithm.discount_factor * v_phi * must_bootstrap[1].int()
    critic_loss_1 = nn.functional.mse_loss(q_values_rb_1[0], target)
    critic_loss_2 = nn.functional.mse_loss(q_values_rb_2[0], target)
    

    return critic_loss_1, critic_loss_2

In [18]:
def compute_actor_loss(
    ent_coef, t_actor: TemporalAgent, t_q_agents: TemporalAgent, rb_workspace: Workspace
):
    r"""
    Actor loss computation
    :param ent_coef: The entropy coefficient $\alpha$
    :param t_actor: The actor agent (temporal agent)
    :param t_q_agents: The critics (as temporal agent)
    :param rb_workspace: The replay buffer (2 time steps, $t$ and $t+1$)
    """

    # Recompute the action with the current actor (at $a_t$)

    # Step 1: 使用当前 actor 重新计算当前状态下的动作 a_t 和对数概率
    t_actor(rb_workspace, t=0, n_steps=1, stochastic=True)
    action_logprobs_new = rb_workspace["action_logprobs"]

    # Compute Q-values

    # Step 2: 使用 Critic 计算当前状态下的 Q 值
    t_q_agents(rb_workspace, t=0, n_steps=1)
    q_values_1 = rb_workspace["critic-1/q_value"]
    q_values_2 = rb_workspace["critic-2/q_value"]

    # Step 3: 取两个 Q 值的最小值，减少估值偏差
    current_q_values = torch.min(q_values_1, q_values_2)

    # Compute the actor loss
    # Step 4: 计算 actor 损失

    actor_loss = ent_coef * action_logprobs_new[0] - current_q_values[0]
    
    return actor_loss.mean()

In [19]:
def run_sac(sac: SACAlgo):
    cfg = sac.cfg
    logger = sac.logger

    # init_entropy_coef is the initial value of the entropy coef alpha
    ent_coef = cfg.algorithm.init_entropy_coef
    tau = cfg.algorithm.tau_target

    # Creates the temporal actors
    t_actor = TemporalAgent(sac.train_policy)
    q_agents = TemporalAgent(Agents(sac.critic_1, sac.critic_2))
    target_q_agents = TemporalAgent(Agents(sac.target_critic_1, sac.target_critic_2))

    # Configure the optimizer
    actor_optimizer = setup_optimizer(cfg.actor_optimizer, sac.actor)
    critic_optimizer = setup_optimizer(cfg.critic_optimizer, sac.critic_1, sac.critic_2)
    entropy_coef_optimizer, log_entropy_coef = setup_entropy_optimizers(cfg) 

    # If entropy_mode is not auto, the entropy coefficient ent_coef remains
    # fixed. Otherwise, computes the target entropy
    if cfg.algorithm.entropy_mode == "auto":
        # target_entropy is \mathcal{H}_0 in the SAC and aplications paper.
        target_entropy = -np.prod(sac.train_env.action_space.shape).astype(np.float32)

    # Loops over successive replay buffers
    for rb in sac.iter_replay_buffers():     # TODO: Unknown variable: env/env_obs —— 使用自定义环境时，workspace中没有 env/env_obs
        # Implement the SAC algorithm
        rb_workspace = rb.get_shuffled(cfg.algorithm.batch_size)
        terminated, reward = rb_workspace["env/terminated", "env/reward"]

        must_boostrap = ~terminated

        if entropy_coef_optimizer is not None:
            ent_coef = torch.exp(log_entropy_coef.detach())

        # Critic update part #############################
        (critic_loss_1, critic_loss_2) = compute_critic_loss(
            cfg = cfg,
            reward = reward,
            must_bootstrap = must_boostrap,
            t_actor = t_actor,
            t_q_agents = q_agents,
            t_target_q_agents = target_q_agents,
            rb_workspace = rb_workspace,
            ent_coef = ent_coef
        )

        # 记录 Critic 损失
        logger.add_log("critic_loss_1", critic_loss_1, sac.nb_steps)
        logger.add_log("critic_loss_2", critic_loss_2, sac.nb_steps)

        # 反向传播并更新 Critic 参数
        critic_loss = critic_loss_1 + critic_loss_2
        critic_optimizer.zero_grad()
        critic_loss.backward()

        nn.utils.clip_grad_norm_(
            sac.critic_1.parameters(), cfg.algorithm.max_grad_norm
        )
        nn.utils.clip_grad_norm_(
            sac.critic_2.parameters(), cfg.algorithm.max_grad_norm
        )

        critic_optimizer.step()

        # Actor update part #############################
        actor_optimizer.zero_grad()
        actor_loss = compute_actor_loss(
            ent_coef = ent_coef,
            t_actor = t_actor,
            t_q_agents = q_agents,
            rb_workspace = rb_workspace
        )

        # 记录 Actor 损失
        logger.add_log("actor_loss", actor_loss, sac.nb_steps)

        # 反向传播并更新 Actor 参数
        actor_loss.backward()

        nn.utils.clip_grad_norm_(
            sac.actor.parameters(), cfg.algorithm.max_grad_norm
        )
        actor_optimizer.step()

        # Entropy optimizer part
        if entropy_coef_optimizer is not None:
            # See Eq. (17) of the SAC and Applications paper. The log
            # probabilities *must* have been computed when computing the actor
            # loss.
            action_logprobs_rb = rb_workspace["action_logprobs"].detach()
            entropy_coef_loss = -(
                log_entropy_coef.exp() * (action_logprobs_rb + target_entropy)
            ).mean()
            entropy_coef_optimizer.zero_grad()
            entropy_coef_loss.backward()
            entropy_coef_optimizer.step()
            logger.add_log("entropy_coef_loss", entropy_coef_loss, sac.nb_steps)
            logger.add_log("entropy_coef", ent_coef, sac.nb_steps)

        ####################################################

        # Soft update of target q function
        soft_update_params(sac.critic_1, sac.target_critic_1, tau)
        soft_update_params(sac.critic_2, sac.target_critic_2, tau)

        sac.evaluate()

In [20]:
agents = SACAlgo(OmegaConf.create(params))
run_sac(agents)

..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::....:: Antarctica Rendering Engine 2.0 ::..



error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


  0%|          | 0/2000 [00:00<?, ?it/s]

AssertionError: Unknown variable: env/env_obs

In [26]:
""" # 将 run_sac 集成到 main 程序中
if __name__ == "__main__":
    sac = SACAlgo(OmegaConf.create(params))

    # 以下部分是 run_sac 函数的主体
    cfg = sac.cfg
    logger = sac.logger

    # init_entropy_coef is the initial value of the entropy coef alpha
    ent_coef = cfg.algorithm.init_entropy_coef
    tau = cfg.algorithm.tau_target

    # Creates the temporal actors
    t_actor = TemporalAgent(sac.train_policy)
    q_agents = TemporalAgent(Agents(sac.critic_1, sac.critic_2))
    target_q_agents = TemporalAgent(Agents(sac.target_critic_1, sac.target_critic_2))

    # Configure the optimizer
    actor_optimizer = setup_optimizer(cfg.actor_optimizer, sac.actor)
    critic_optimizer = setup_optimizer(cfg.critic_optimizer, sac.critic_1, sac.critic_2)
    entropy_coef_optimizer, log_entropy_coef = setup_entropy_optimizers(cfg) 

    # If entropy_mode is not auto, the entropy coefficient ent_coef remains
    # fixed. Otherwise, computes the target entropy
    if cfg.algorithm.entropy_mode == "auto":
        # target_entropy is \mathcal{H}_0 in the SAC and aplications paper.
        target_entropy = -np.prod(sac.train_env.action_space.shape).astype(np.float32)

    # Loops over successive replay buffers
    for rb in sac.iter_replay_buffers():               # TODO: Unknown variable: env/env_obs

        # Implement the SAC algorithm
        rb_workspace = rb.get_shuffled(cfg.algorithm.batch_size)
        terminated, reward = rb_workspace["env/terminated", "env/reward"]

        must_boostrap = ~terminated

        if entropy_coef_optimizer is not None:
            ent_coef = torch.exp(log_entropy_coef.detach())

        # Critic update part #############################
        (critic_loss_1, critic_loss_2) = compute_critic_loss(
            cfg = cfg,
            reward = reward,
            must_bootstrap = must_boostrap,
            t_actor = t_actor,
            t_q_agents = q_agents,
            t_target_q_agents = target_q_agents,
            rb_workspace = rb_workspace,
            ent_coef = ent_coef
        )

        # 记录 Critic 损失
        logger.add_log("critic_loss_1", critic_loss_1, sac.nb_steps)
        logger.add_log("critic_loss_2", critic_loss_2, sac.nb_steps)

        # 反向传播并更新 Critic 参数
        critic_loss = critic_loss_1 + critic_loss_2
        critic_optimizer.zero_grad()
        critic_loss.backward()

        nn.utils.clip_grad_norm_(
            sac.critic_1.parameters(), cfg.algorithm.max_grad_norm
        )
        nn.utils.clip_grad_norm_(
            sac.critic_2.parameters(), cfg.algorithm.max_grad_norm
        )

        critic_optimizer.step()

        # Actor update part #############################
        actor_optimizer.zero_grad()
        actor_loss = compute_actor_loss(
            ent_coef = ent_coef,
            t_actor = t_actor,
            t_q_agents = q_agents,
            rb_workspace = rb_workspace
        )

        # 记录 Actor 损失
        logger.add_log("actor_loss", actor_loss, sac.nb_steps)

        # 反向传播并更新 Actor 参数
        actor_loss.backward()

        nn.utils.clip_grad_norm_(
            sac.actor.parameters(), cfg.algorithm.max_grad_norm
        )
        actor_optimizer.step()

        # Entropy optimizer part
        if entropy_coef_optimizer is not None:
            # See Eq. (17) of the SAC and Applications paper. The log
            # probabilities *must* have been computed when computing the actor
            # loss.
            action_logprobs_rb = rb_workspace["action_logprobs"].detach()
            entropy_coef_loss = -(
                log_entropy_coef.exp() * (action_logprobs_rb + target_entropy)
            ).mean()
            entropy_coef_optimizer.zero_grad()
            entropy_coef_loss.backward()
            entropy_coef_optimizer.step()
            logger.add_log("entropy_coef_loss", entropy_coef_loss, sac.nb_steps)
            logger.add_log("entropy_coef", ent_coef, sac.nb_steps)

        ####################################################

        # Soft update of target q function
        soft_update_params(sac.critic_1, sac.target_critic_1, tau)
        soft_update_params(sac.critic_2, sac.target_critic_2, tau)

        # agents.evaluate() """

..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..

error: XDG_RUNTIME_DIR not set in the environment.



The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..


error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.


The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


error: XDG_RUNTIME_DIR not set in the environment.


..:: Antarctica Rendering Engine 2.0 ::..
The path /dev/dri/ cannot be opened or is not available
The path /dev/dri/ cannot be opened or is not available
Unable to initialize SDL!: No available video device


  0%|          | 0/2000 [00:00<?, ?it/s]

AssertionError: Unknown variable: env/env_obs