# GC-SAC OPE Demo (FQE + DM/TIS/DR)

最小示例：加载 SAC checkpoint → 采样行为数据 → 训练 FQE → 计算 DM/TIS/DR 估计。


In [1]:
from pathlib import Path
import numpy as np
import torch as th

from stable_baselines3 import SAC, PPO
from stable_baselines3.common.evaluation import evaluate_policy

from gc_ope.env.get_env import get_env
from gc_ope.utils.load_config_with_hydra import load_config

from gc_ope.algorithm.ope.logged_dataset import collect_logged_dataset, compute_eval_policy_cache
from gc_ope.algorithm.ope.fqe import FQETrainer
from gc_ope.algorithm.ope.ope_input import build_ope_inputs
from gc_ope.algorithm.ope.estimators import (
    dm_estimate, tis_estimate, dr_estimate,
    dm_compute_trajectory_values, tis_compute_trajectory_values, dr_compute_trajectory_values
)

# Repository root, taken as four directory levels above the current working
# directory (assumes the notebook is launched from its own folder -- TODO confirm).
PROJECT_ROOT_DIR = Path.cwd().parent.parent.parent.parent
PROJECT_ROOT_DIR


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
pybullet build time: Dec 11 2025 17:43:29


PosixPath('/home/maxine/ai4robot/gc_ope')

## 准备环境与策略

In [2]:
# Prepare the environment and the behavior / evaluation policies.
env_cfg = load_config(
    config_path="../../../configs/train",
    config_name="config",
)

# TODO: read the OPE environment/policy parameters, the behavior-policy sampling
# parameters, the online-evaluation parameters and the FQE parameters from
# "../../../configs/ope/config.yaml" instead.
# TODO: env_cfg.algo and env_cfg.env must be updated accordingly.
ckpt_path_1 = PROJECT_ROOT_DIR / "checkpoints/flycraft/sac/seed_1/best_model"
ckpt_path_2 = PROJECT_ROOT_DIR / "checkpoints/flycraft/sac/seed_2/best_model"

# Infer the environment from the checkpoint paths. Both checkpoints are checked
# in a single condition; when neither matches, the env_id from the loaded config
# is kept unchanged.
if any("flycraft" in str(path) for path in (ckpt_path_1, ckpt_path_2)):
    env_cfg.env.env_id = "FlyCraft-v0"

env = get_env(env_cfg.env)


def _load_policy(ckpt_path, env):
    """Load one checkpoint with the algorithm class inferred from its own path.

    Each checkpoint is resolved independently, so the evaluation policy is no
    longer forced to use the class inferred from the behavior checkpoint.

    Per the original notes: HER is SAC + HerReplayBuffer, so HER checkpoints are
    loaded through ``SAC.load``; ``env`` is passed because HerReplayBuffer needs
    an environment to initialise, and passing it for plain SAC is harmless.

    Raises:
        ValueError: if the path mentions none of "sac", "her" or "ppo".
    """
    # NOTE(review): this is a substring match against the full (absolute) path,
    # so an unrelated directory name containing "sac"/"her"/"ppo" would also
    # match -- confirm the checkpoint layout makes this unambiguous.
    path_str = str(ckpt_path)
    if "sac" in path_str or "her" in path_str:
        return SAC.load(ckpt_path, env=env)
    if "ppo" in path_str:
        return PPO.load(ckpt_path, env=env)
    raise ValueError(f"Unsupported algorithm: {ckpt_path}")


behavior_algo = _load_policy(ckpt_path_1, env)
eval_algo = _load_policy(ckpt_path_2, env)

# Discount factor used by the OPE estimators; falls back to 0.99 when the loaded
# algorithm exposes no `gamma` attribute.
gamma = float(getattr(eval_algo, "gamma", 0.99))
print("gamma=", gamma)



load config from: /home/maxine/ai4robot/gc_ope/configs/env_configs/flycraft/env_config_for_sac_easy.json
3 Generator(PCG64) Generator(PCG64)
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
gamma= 0.995


## 采样行为数据，并缓存评价策略动作/对数概率
Logged dataset from behavior policy rollouts.

Contains transitions $$(s_t, a_t, r_{t+1}, s_{t+1}, \text{done}_t)$$
collected by rolling out a behavior policy, along with optional
precomputed evaluation policy actions and log-probabilities.

1. 没用 DictReplayBuffer：除了 (s,a,r,s',d)，还需要识别变长轨迹，因此显式存 `traj_id`、`step_index`
2. 直接在rollout时，用评价策略对s、(s,a)进行概率获取
3. 对于状态`obs`：存两类，`dict`是原始goal-conditioned RL的状态形式，放平成`flat`

Attributes:
- obs_flat: Flattened observations (N, obs_dim).
- actions: Actions taken by behavior policy (N, act_dim).
- rewards: Rewards (N,).
- next_obs_flat: Next observations (N, obs_dim).
- dones: Episode termination flags (N,).
- traj_id: Trajectory ID for each transition (N,).
- step_index: Step index within trajectory (N,).
- obs_dict: Original dict observations (list of N dicts).
- next_obs_dict: Original dict next observations (list of N dicts)
- behavior_log_prob: Log-probability of actions under behavior policy (N,).
  
*对于评价策略：（在 `build_ope_inputs` 中计算）*
- eval_action_curr: Evaluation policy actions at $s_t$ (N, act_dim) or None.
- eval_action_next: Evaluation policy actions at $s_{t+1}$ (N, act_dim) or None.
- eval_log_prob_curr: Log-probability of eval actions at $s_t$ (N,) or None.
- eval_log_prob_next: Log-probability of eval actions at $s_{t+1}$ (N,) or None.

In [None]:
# Roll out the behavior policy to collect the logged dataset.
# Per the original notes, collect_logged_dataset only samples data (it no longer
# takes an eval_algo argument); the evaluation-policy cache is computed later,
# inside build_ope_inputs.
# TODO: move this into a Python script and archive the logs.
n_episodes, max_steps = 1000, 400

dataset = collect_logged_dataset(
    env=env, behavior_algo=behavior_algo, n_episodes=n_episodes, max_steps=max_steps,
)


print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 35。target: (208.22, -8.12, -4.01)。achieved target: (212.88, -10.69, -3.40)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 186。target: (197.91, -6.81, 14.07)。achieved target: (196.67, -6.86, 11.09)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 75。target: (161.37, -2.18, 1.00)。achieved target: (164.12, 0.09, 0.82)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 81。target: (193.06, 1.74, 14.27)。achieved target: (196.12, 2.75, 11.68)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 329。target: (245.63, -4.32, 8.91)。achieved target: (236.17, -5.38, 6.14)。expert steps: 0。
print, Train, [31mtimeout_termination。

In [9]:
# List the attribute names stored on the logged dataset object.
vars(dataset).keys()

dict_keys(['obs_flat', 'actions', 'rewards', 'next_obs_flat', 'dones', 'traj_id', 'step_index', 'obs_dict', 'next_obs_dict', 'behavior_log_prob', 'eval_action_curr', 'eval_action_next', 'eval_log_prob_curr', 'eval_log_prob_next'])

In [4]:
# Summarise the logged dataset: transition count, episode count and dimensions.
n_transitions = len(dataset.obs_flat)
# Episode count is derived as max id + 1, i.e. it assumes trajectory ids are
# contiguous and zero-based -- TODO confirm against collect_logged_dataset.
n_logged_episodes = dataset.traj_id.max() + 1
obs_dim = dataset.obs_flat.shape[1]
act_dim = dataset.actions.shape[1]
print(
    f"Collected {n_transitions} transitions from {n_logged_episodes} episodes; "
    f"obs_dim={obs_dim}, act_dim={act_dim}"
)

Collected 2068 transitions from 10 episodes; obs_dim=14, act_dim=3


In [11]:
# Inspect the first logged observation: the dict form, each of its parts, and
# the flattened vector stored alongside it.
first_obs = dataset.obs_dict[0]
display(first_obs.keys())
for part in ('observation', 'achieved_goal', 'desired_goal'):
    display(first_obs[part])
display(dataset.obs_flat[0])

dict_keys(['observation', 'desired_goal', 'achieved_goal'])

array([0.5      , 0.5145179, 0.5      , 0.2      , 0.5      , 0.5      ,
       0.5      , 0.25     ], dtype=float32)

array([0.2, 0.5, 0.5], dtype=float32)

array([0.2082162, 0.4549032, 0.4888545], dtype=float32)

array([0.5      , 0.5145179, 0.5      , 0.2      , 0.5      , 0.5      ,
       0.5      , 0.25     , 0.2082162, 0.4549032, 0.4888545, 0.2      ,
       0.5      , 0.5      ], dtype=float32)

## OPEInput准备
**已实现：FQE 训练和预测集成到 build_ope_inputs**

FQE 训练和预测过程已集成到 `build_ope_inputs` 函数中，不再需要手动训练。
可以通过 `fqe_train_kwargs` 和 `fqe_kwargs` 参数自定义训练和初始化参数。

### FQE 说明
Fitted Q Evaluation trainer for continuous goal-conditioned policies.

FQE is an off-policy evaluation method that approximates a Q function
$Q_\theta(s, a)$ for the evaluation policy $\pi_\phi(s)$.

The FQE loss is:

$$
    L(\theta) = \mathbb{E}_{(s_t, a_t, r_{t+1}, s_{t+1}) \sim D}
        \left[ \left( Q_\theta(s_t, a_t) - r_{t+1}
            - \gamma Q_{\theta'}(s_{t+1}, \pi_\phi(s_{t+1})) \right)^2 \right]
$$

where $D$ is the logged dataset, $\theta'$ is the target network
parameters (soft-updated with $\tau$), and $\pi_\phi(s_{t+1})$
is the deterministic action from the evaluation policy.

The trained Q function in FQE estimates evaluation metrics more accurately
than the Q function learned during policy training.

#DONE：好像没有做goal-conditioned？现在是把整个obs('observation', 'desired_goal', 'achieved_goal')拉平作为一个obs。

### 构造 OPE 输入
Build OPE inputs from logged dataset and trained FQE model.

Computes evaluation policy actions/log-probs and Q-values needed for
DM, TIS, and DR estimators.

In [None]:
# Build the OPE inputs. Per the original notes, FQE training and prediction are
# integrated into build_ope_inputs, which accepts a q_function_method argument
# (currently only "fqe"). Training parameters are customised via
# fqe_train_kwargs and FQE constructor parameters via fqe_kwargs.

# Device selection: use CUDA when it is available, otherwise fall back to CPU.
# (The original note says passing None auto-detects with CUDA preferred --
# TODO confirm against the FQE implementation.)
if th.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Option 1 (recommended): let build_ope_inputs create and train the FQE model.
# Every (epoch, loss) pair reported by the training callback is collected here.
loss_log = []


def _logger(epoch: int, loss: float):
    """Record the loss of every epoch and echo a sparse progress line.

    Args:
        epoch: Epoch number reported by the trainer.
        loss: FQE loss value for that epoch.

    The pair ``(epoch, loss)`` is always appended to ``loss_log``; a line is
    printed only for epoch 1 and for epochs divisible by 50.
    """
    is_first_epoch = epoch == 1
    if is_first_epoch or epoch % 50 == 0:
        print(f"Epoch {epoch:04d} | FQE loss={loss:.3f}")
    loss_log.append((epoch, loss))


# Keyword arguments forwarded to FQE training.
fqe_train_kwargs = {
    "batch_size": 256,
    "n_epochs": 300,
    "shuffle": True,
    "logger": _logger,
}

# Keyword arguments forwarded to the FQE constructor. The state and goal
# dimensions are read from the first logged dict observation.
first_obs = dataset.obs_dict[0]
fqe_kwargs = {
    "lr": 3e-4,
    "tau": 0.005,
    "device": device,
    "obs_state_dim": first_obs['observation'].shape[0],
    "goal_dim": first_obs['desired_goal'].shape[0],
}

# No `fqe` is passed here; per the original notes, build_ope_inputs then creates
# and trains one itself (q_function_method defaults to "fqe").
inputs = build_ope_inputs(
    dataset=dataset,
    eval_algo=eval_algo,
    gamma=gamma,
    fqe_train_kwargs=fqe_train_kwargs,
    fqe_kwargs=fqe_kwargs,
)

# Option 2: reuse a pre-trained FQE (kept for backward compatibility, if needed).
# inputs = build_ope_inputs(
#     dataset=dataset,
#     eval_algo=eval_algo,
#     gamma=gamma,
#     fqe=fqe,  # pass in an already trained FQE
# )


Using device: cuda
Goal-conditioned mode: separate processing for state and goal
obs_state_dim: 8, goal_dim: 3, obs_dim: 14
Goal-conditioned mode: separate processing for state and goal
obs_state_dim: 8, goal_dim: 3, obs_dim: 14
Epoch 0001 | FQE loss=0.037
Epoch 0050 | FQE loss=0.004
Epoch 0100 | FQE loss=0.007
Epoch 0150 | FQE loss=0.011
Epoch 0200 | FQE loss=0.018
Epoch 0250 | FQE loss=0.030
Epoch 0300 | FQE loss=0.047


In [None]:
# List the attribute names stored on the OPE inputs object.
vars(inputs).keys()

dict_keys(['obs_flat', 'actions', 'rewards', 'next_obs_flat', 'dones', 'traj_id', 'step_index', 'behavior_log_prob', 'eval_action', 'eval_log_prob', 'q_sa_behavior', 'q_sa_eval', 'gamma'])

## OPE算法计算
### DM
Direct Method (DM) estimator.

DM estimates the policy value using the FQE Q-function:

$$

    \hat{V}^{\text{DM}} = \frac{1}{N} \sum_{i=1}^N Q(s_i, \pi_{\text{eval}}(s_i)) $$

If ``initial_only=True``, only uses initial states (step_index == 0):

$$

    \hat{V}^{\text{DM}} = \frac{1}{|\mathcal{I}_0|} \sum_{i \in \mathcal{I}_0} Q(s_i, \pi_{\text{eval}}(s_i)) $$

where $\mathcal{I}_0$ is the set of initial state indices.

### TIS
Trajectory-wise Importance Sampling (TIS) estimator.

TIS estimates the policy value using trajectory-level importance weights:

$$

    \hat{V}^{\text{TIS}} = \frac{1}{M} \sum_{\tau=1}^M w_\tau G_\tau

$$

where $M$ is the number of trajectories, $G_\tau$ is the
discounted return of trajectory $\tau$, and the importance weight is:

$$

    w_\tau = \prod_{t=0}^{T_\tau-1} \frac{\pi_{\text{eval}}(a_t | s_t)}{\pi_{\text{behavior}}(a_t | s_t)}
        = \exp\left( \sum_{t=0}^{T_\tau-1} \left( \log \pi_{\text{eval}}(a_t | s_t)
            - \log \pi_{\text{behavior}}(a_t | s_t) \right) \right)
$$

### DR
Doubly Robust (DR) estimator.

DR combines importance sampling with a control variate (Q-function) to
reduce variance. For each trajectory $\tau$, the estimate is:

$$

    \hat{V}_\tau^{\text{DR}} = \sum_{t=0}^{T_\tau-1} \gamma^t \left[
        w_t (r_{t+1} - Q(s_t, a_t)) + w_{t-1} Q(s_t, \pi_{\text{eval}}(s_t))
    \right]
$$

where $w_t = \prod_{k=0}^t \frac{\pi_{\text{eval}}(a_k | s_k)}{\pi_{\text{behavior}}(a_k | s_k)}$
is the step-wise importance weight, $w_{-1} = 1$, and
$Q(s_t, a_t)$ is the Q-value for the behavior action while
$Q(s_t, \pi_{\text{eval}}(s_t))$ is for the evaluation policy action.

The overall estimate is:

$$

    \hat{V}^{\text{DR}} = \frac{1}{M} \sum_{\tau=1}^M \hat{V}_\tau^{\text{DR}}
$$


In [8]:
# Per the original notes: computing per-trajectory values and computing the
# mean / confidence interval are separate steps, and ci_method accepts
# "bootstrap" (default), "normal" or "t_test".
ci_method = "bootstrap"

# Point estimates with confidence intervals.
dm_all = dm_estimate(inputs, initial_only=False, ci_method=ci_method)
# dm_init = dm_estimate(inputs, initial_only=True, ci_method="bootstrap")
tis_res = tis_estimate(inputs, ci_method=ci_method)
dr_res = dr_estimate(inputs, ci_method=ci_method)

# Other CI methods can be used as well:
# dm_all_normal = dm_estimate(inputs, initial_only=False, ci_method="normal")
# tis_res_t = tis_estimate(inputs, ci_method="t_test")

# To get only the trajectory-level values (no CI):
# dm_values = dm_compute_trajectory_values(inputs, initial_only=False)
# tis_values = tis_compute_trajectory_values(inputs)
# dr_values = dr_compute_trajectory_values(inputs)

# print("DM (initial-state):", dm_init)
for label, result in (
    ("DM (step-wise):", dm_all),
    ("TIS:", tis_res),
    ("DR:", dr_res),
):
    print(label, result)


DM (step-wise): EstimateResult(mean=-3.887298107147217, ci_lower=-3.897599458694458, ci_upper=-3.8750970363616943)
TIS: EstimateResult(mean=-inf, ci_lower=nan, ci_upper=nan)
DR: EstimateResult(mean=nan, ci_lower=nan, ci_upper=nan)


  weight = np.exp((logp_e - logp_b).sum())
  diff_b_a = subtract(b, a)
  return bound(*args, **kwds)
  term = w_step * (r - q_sa) + w_prev * v_eval
  term = w_step * (r - q_sa) + w_prev * v_eval


In [None]:
# Online evaluation of the true return (optional, time-consuming).
# NOTE(review): evaluate_policy reports episode rewards as summed by the
# environment/Monitor, presumably without discounting, whereas the OPE
# estimates are computed with `gamma` -- confirm the two are comparable.
RUN_ONLINE_EVAL = True
if RUN_ONLINE_EVAL:
    # mean_r_b, std_r_b = evaluate_policy(behavior_algo, env, n_eval_episodes=5, deterministic=True)
    online_stats = evaluate_policy(eval_algo, env, n_eval_episodes=10, deterministic=True)
    mean_r_e, std_r_e = online_stats
    # print(f"behavior return: {mean_r_b:.2f} ± {std_r_b:.2f}")
    print(f"eval return:     {mean_r_e:.2f} ± {std_r_e:.2f}")



print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 101。target: (243.15, -5.86, 7.81)。achieved target: (247.15, -3.80, 9.44)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 171。target: (179.82, 4.84, 13.33)。achieved target: (175.77, 4.29, 10.50)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 223。target: (171.87, 6.60, 9.46)。achieved target: (171.50, 6.48, 6.76)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 62。target: (218.28, 6.40, -4.29)。achieved target: (226.64, 9.35, -4.18)。expert steps: 0。
print, Train, [32mreach_target_termination_single_step_based_on_angle_of_velocity_vector。[0m steps: 66。target: (225.87, 7.57, -23.86)。achieved target: (227.81, 6.17, -21.40)。expert steps: 0。
print, Train, [32mreach_target_terminatio