In [132]:
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

## **User-Defined Classes**

In [129]:
# Container for preference-pair trajectory data
class Data_Class:
    """Load preferred/rejected trajectory pairs from a CSV file and convert them to tensors.

    The CSV is read with ``pandas.read_csv`` and must provide the columns
    ``'preferred'`` and ``'rejected'``. Every cell is decoded with ``json.loads``
    into a sequence of time steps; each step is a mapping with a ``'state'`` and
    an ``'action'`` entry (a number or a possibly nested list of numbers).

    Attributes:
        path: The CSV path passed to the constructor.
        trajs_prefer_list / trajs_reject_list: ``Trajectory_Class`` wrappers
            around the raw JSON columns.
        traj_prefer_list_list_tensor / traj_reject_list_list_tensor: One
            ``(states, actions)`` tuple of float32 tensors per trajectory, shaped
            ``[L_i, dim_state]`` and ``[L_i, dim_action]``.
        dim_state / dim_action: Flattened per-step widths, set by ``convert``.
    """

    # Nested helper wrapping one column of JSON-encoded trajectories
    class Trajectory_Class:
        """Thin wrapper around a sequence of JSON-encoded trajectories."""

        def __init__(self, traj_series):
            self.traj_list = traj_series
            self.length = len(traj_series)

        def get_single_traj(self, index):
            """Decode and return the trajectory stored at *position* ``index``."""
            source = self.traj_list
            # Integer indexing on a pandas Series is label-based and fails (or
            # returns the wrong row) when the index is not 0..n-1, so use .iloc
            # whenever the container offers it.
            raw = source.iloc[index] if hasattr(source, "iloc") else source[index]
            return json.loads(raw)

    def __init__(self, path):
        """Read ``path`` and populate the raw and tensor trajectory containers."""
        self.path = path

        # Raw data (replaced by Trajectory_Class instances in load_data)
        self.trajs_prefer_list = []
        self.trajs_reject_list = []

        # Converted data, filled in place by convert()
        self.traj_prefer_list_list_tensor = []
        self.traj_reject_list_list_tensor = []

        # Run the pipeline
        self.load_data(path)
        self.convert(self.trajs_prefer_list, self.traj_prefer_list_list_tensor)
        self.convert(self.trajs_reject_list, self.traj_reject_list_list_tensor)
        print("Data loaded successfully")

    def load_data(self, path):
        """Read the CSV at ``path`` and wrap its two trajectory columns."""
        data = pd.read_csv(path)

        self.trajs_prefer_list = Data_Class.Trajectory_Class(data['preferred'])
        self.trajs_reject_list = Data_Class.Trajectory_Class(data['rejected'])

    @staticmethod
    def _stack_field(traj, key):
        """Stack ``step[key]`` of every time step into a float32 tensor ``[L, dim]``."""
        values = np.asarray([step[key] for step in traj], dtype=np.float32)
        # Scalars become width-1 rows and nested values are flattened per step.
        return torch.from_numpy(values).reshape(len(traj), -1)

    def convert(self,
                list_json: Trajectory_Class,
                traj_list_list_tensor):
        """Convert every JSON trajectory to tensors and append them to the output list.

        Args:
            list_json: Wrapper holding the JSON-encoded trajectories.
            traj_list_list_tensor: List that receives one ``(states, actions)``
                tuple per trajectory; it is mutated in place.

        Side effects:
            Sets ``self.dim_state`` and ``self.dim_action`` from the first
            trajectory. Each call overwrites the values of the previous call.

        Raises:
            ValueError: If there are no trajectories, or a trajectory has no
                time steps.
        """
        if list_json.length == 0:
            raise ValueError("cannot convert an empty trajectory collection")

        for idx in range(list_json.length):
            traj = list_json.get_single_traj(idx)
            if len(traj) == 0:
                raise ValueError(f"trajectory {idx} contains no time steps")

            states_tensor = self._stack_field(traj, 'state')     # [L_i, dim_state]
            actions_tensor = self._stack_field(traj, 'action')   # [L_i, dim_action]

            if idx == 0:
                # Take the widths from the converted tensors so they always
                # match what the model will actually receive.
                self.dim_state = states_tensor.shape[1]
                self.dim_action = actions_tensor.shape[1]

            # Each trajectory is stored as one (states, actions) tuple
            traj_list_list_tensor.append((states_tensor, actions_tensor))

# ——— Dataset and loader ———
class PreferenceDataset(Dataset):
    """Index-aligned pairs of preferred and rejected trajectories.

    Item ``i`` is the elements of ``pref[i]`` followed by the elements of
    ``rej[i]``, flattened into a single tuple. ``gamma`` is only stored on the
    instance; no method of this class reads it.
    """

    def __init__(self, pref, rej, gamma):
        n_pref, n_rej = len(pref), len(rej)
        assert n_pref == n_rej
        self.gamma = gamma
        self.rej = rej
        self.pref = pref

    def __len__(self):
        """Number of preference pairs."""
        return len(self.pref)

    def __getitem__(self, idx):
        """Return the concatenated fields of the ``idx``-th pair as one tuple."""
        preferred = self.pref[idx]
        rejected = self.rej[idx]
        return (*preferred, *rejected)

# MLP scoring (reward) model
class RewardMLP(nn.Module):
    """Three-layer perceptron mapping a (state, action) pair to one scalar score.

    Args:
        s_dim: Width of the state vector.
        a_dim: Width of the action vector.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, s_dim, a_dim, hidden_dim=64):
        super().__init__()
        in_features = s_dim + a_dim  # state and action are concatenated in forward()
        layers = [
            nn.Linear(in_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, s, a):
        """Score every row; ``s`` is ``[L_i, s_dim]``, ``a`` is ``[L_i, a_dim]``, result is ``[L_i]``."""
        joint = torch.cat((s, a), dim=-1)
        scores = self.net(joint)
        # Drop the trailing size-1 output dimension
        return scores.squeeze(-1)



In [142]:
# ———— Load data ————
path = "trajectory_pairs.csv"
Data = Data_Class(path)

# Number of time steps in the raw JSON of the first three preferred / rejected pairs
for _pair_idx in range(3):
    print(len(Data.trajs_prefer_list.get_single_traj(_pair_idx)),
          len(Data.trajs_reject_list.get_single_traj(_pair_idx)))
print("")
# Same check on the converted state tensors of the first two pairs
for _pair_idx in range(2):
    print(len(Data.traj_prefer_list_list_tensor[_pair_idx][0]),
          len(Data.traj_reject_list_list_tensor[_pair_idx][0]))

Data loaded successfully
200 200
200 144
200 180

200 200
200 144


## **Training**

In [133]:

# ———— Hyperparameters ————
num_pairs = 200    # number of preference pairs
T = 50             # expected maximum trajectory length (for evaluation or other needs)
s_dim = 4         # state dimension [angle, angular velocity, cart position, cart velocity]
a_dim = 1         # action dimension (push force)
gamma = 0.99      # discount factor
lr = 1e-4         # learning rate
batch_size = 16
num_epochs = 50
seed = 0          # fixes weight initialisation and shuffle order

# NOTE: the network below is sized from Data.dim_state / Data.dim_action, not
# from s_dim / a_dim; those two (and num_pairs, T) are informational here.

# Seed before the model and DataLoader are created so that a fresh
# Restart & Run All reproduces the same loss curve.
torch.manual_seed(seed)

# ———— Load data ————
path = "trajectory_pairs.csv"
Data = Data_Class(path)

# Custom collate_fn that keeps variable-length sequences un-padded
def variable_collate(batch):
    """Transpose a batch of 4-tuples into four parallel lists.

    ``batch`` is a list of ``(s_pref, a_pref, s_rej, a_rej)`` tuples; the result
    is ``(all s_pref, all a_pref, all s_rej, all a_rej)``, each a plain list so
    that items of different lengths are never stacked.
    """
    pref_states, pref_actions, rej_states, rej_actions = (
        list(column) for column in zip(*batch)
    )
    return pref_states, pref_actions, rej_states, rej_actions

# Build the dataset, the loader, the model, the optimizer and the loss
dataset = PreferenceDataset(
    pref=Data.traj_prefer_list_list_tensor,
    rej=Data.traj_reject_list_list_tensor,
    gamma=gamma,
)
# variable_collate keeps each trajectory as its own tensor (no padding)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=variable_collate)

# Network input widths come from the loaded data
reward_net = RewardMLP(s_dim=Data.dim_state, a_dim=Data.dim_action, hidden_dim=64)
optimizer = optim.Adam(params=reward_net.parameters(), lr=lr)
loss_fn = nn.BCEWithLogitsLoss()

# ——— Training loop ———
def discounted_return(net, states, actions, discount_table):
    """Return the discounted sum of the per-step rewards ``net`` predicts for one trajectory.

    ``discount_table[t]`` must hold ``gamma**t`` for every step of the trajectory.
    """
    rewards = net(states, actions)                               # [L]
    return (rewards * discount_table[:rewards.size(0)]).sum()


# The discount factors depend only on the step index, so build them once for the
# longest trajectory and slice per trajectory instead of rebuilding a Python
# list for every trajectory of every batch.
_longest = max(
    (max(dataset[i][0].size(0), dataset[i][2].size(0)) for i in range(len(dataset))),
    default=0,
)
discount_table = torch.tensor(
    [gamma**t for t in range(_longest)],
    device=next(reward_net.parameters()).device,
)

for epoch in range(1, num_epochs + 1):
    total_loss = 0.0
    for s_pref_list, a_pref_list, s_rej_list, a_rej_list in loader:
        # Discounted return of every preferred / rejected trajectory in the batch
        R_pref = torch.stack([
            discounted_return(reward_net, s_pf, a_pf, discount_table)
            for s_pf, a_pf in zip(s_pref_list, a_pref_list)
        ])
        R_rej = torch.stack([
            discounted_return(reward_net, s_rj, a_rj, discount_table)
            for s_rj, a_rj in zip(s_rej_list, a_rej_list)
        ])

        # The logit is the return margin; the target 1 means
        # "the preferred trajectory should score higher".
        logits = R_pref - R_rej
        targets = torch.ones_like(logits)
        loss = loss_fn(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch average is per pair, not per batch
        total_loss += loss.item() * R_pref.size(0)

    avg_loss = total_loss / len(dataset)
    print(f"Epoch {epoch}/{num_epochs} — Avg Loss: {avg_loss:.4f}")

print("🎉 合成数据上的奖励模型训练完成！")

# ——— Save the model ———
torch.save(reward_net.state_dict(), 'reward_net.pth')
print("模型已保存到 reward_net.pth")


Data loaded successfully
Epoch 1/50 — Avg Loss: 0.5030
Epoch 2/50 — Avg Loss: 0.3736
Epoch 3/50 — Avg Loss: 0.3204
Epoch 4/50 — Avg Loss: 0.2934
Epoch 5/50 — Avg Loss: 0.2803
Epoch 6/50 — Avg Loss: 0.2708
Epoch 7/50 — Avg Loss: 0.2642
Epoch 8/50 — Avg Loss: 0.2583
Epoch 9/50 — Avg Loss: 0.2534
Epoch 10/50 — Avg Loss: 0.2486
Epoch 11/50 — Avg Loss: 0.2452
Epoch 12/50 — Avg Loss: 0.2420
Epoch 13/50 — Avg Loss: 0.2389
Epoch 14/50 — Avg Loss: 0.2363
Epoch 15/50 — Avg Loss: 0.2346
Epoch 16/50 — Avg Loss: 0.2330
Epoch 17/50 — Avg Loss: 0.2311
Epoch 18/50 — Avg Loss: 0.2293
Epoch 19/50 — Avg Loss: 0.2277
Epoch 20/50 — Avg Loss: 0.2264
Epoch 21/50 — Avg Loss: 0.2246
Epoch 22/50 — Avg Loss: 0.2240
Epoch 23/50 — Avg Loss: 0.2226
Epoch 24/50 — Avg Loss: 0.2212
Epoch 25/50 — Avg Loss: 0.2211
Epoch 26/50 — Avg Loss: 0.2179
Epoch 27/50 — Avg Loss: 0.2195
Epoch 28/50 — Avg Loss: 0.2177
Epoch 29/50 — Avg Loss: 0.2173
Epoch 30/50 — Avg Loss: 0.2158
Epoch 31/50 — Avg Loss: 0.2142
Epoch 32/50 — Avg Loss:

## **Load the Trained Reward Model**

In [143]:
# Pick one raw JSON trajectory to score. This scores the rejected trajectory of
# pair 2; use Data.trajs_prefer_list instead to score the preferred one.
# (The original assigned the preferred trajectory first and overwrote it on the
# next line, so only the rejected one was ever scored.)
traj_json = Data.trajs_reject_list.get_single_traj(2)

# Convert it to tensors
states  = torch.stack([torch.from_numpy(np.array(step['state'])).float().view(-1)
                       for step in traj_json], dim=0)  # [L, s_dim]
actions = torch.stack([torch.from_numpy(np.array(step['action'])).float().view(-1)
                       for step in traj_json], dim=0)  # [L, a_dim]

# ——— Load the saved model ———
reward_net_loaded = RewardMLP(Data.dim_state, Data.dim_action, hidden_dim=64)
# weights_only=True restricts unpickling to tensors and plain containers
reward_net_loaded.load_state_dict(torch.load('reward_net.pth', weights_only=True))

# Switch to inference mode and disable gradients
reward_net_loaded.eval()
print("加载并准备好进行推理")
with torch.no_grad():
    per_step_rewards = reward_net_loaded(states, actions)  # tensor of shape [L]

# Total discounted return of the trajectory (uses `gamma` from the training cell)
discounts = torch.tensor([gamma**t for t in range(per_step_rewards.size(0))])
total_return = (per_step_rewards * discounts).sum()
# print("每步奖励：", per_step_rewards)
print("折扣后总回报：", total_return)


加载并准备好进行推理
折扣后总回报： tensor(18.6901)
