## 完整的框架

In [1]:
# -*- coding: utf-8 -*-
"""
双流端到端自动驾驶网络 (Dual-Stream End-to-End Driving Model)
- 视觉流: ResNet_CBAM + Transformer (处理 t-2, t-1, t 三帧图像)
- 状态流: LSTM (处理过去 N 帧的 [速度, 加速度, 转角])
- 融合策略: Output = MLP(Concat(Visual, LSTM)) + LSTM
- 预测输出: 当前时刻所需的 [加速度, 转角]
"""
import torch.nn as nn
import torch.nn.functional as F
import math
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import json
from sklearn.preprocessing import MinMaxScaler

# 自建函数
from model import prepare_dataset_and_scaler, inverse_transform


# Run the preprocessing step.
# prepare_dataset_and_scaler is imported from the project's `model` module; its
# internals are not visible here. Judging by the argument names and the message
# printed below, it presumably reads `csv.csv`, writes the history-column CSV and
# the scaler-parameter JSON, and returns those parameters -- TODO confirm.
scaler_params = prepare_dataset_and_scaler(
    input_file='csv.csv',
    output_csv='global_vehicle_data_history_cols.csv',
    scaler_json_path='scaler_params.json',
    seq_length=9
)
print("处理完成！归一化参数已保存至 scaler_params.json，序列化数据已保存至 global_vehicle_data_history_cols.csv")

# 还原数据
# target_names = ['acceleration_x', 'acceleration_y', 'acceleration_z', 'steer']



处理完成！归一化参数已保存至 scaler_params.json，序列化数据已保存至 global_vehicle_data_history_cols.csv


In [None]:

# ==================== 1. 适配新格式的 Dataset ====================

class ProcessedDrivingDataset(Dataset):
    """
    Dataset over a preprocessed driving CSV whose history columns hold JSON lists.

    Each sample is ``((img_t-2, img_t-1, img_t), state_sequence, target)``:
    the images are the last three entries of ``front_image_history``, the state
    sequence stacks the numeric history columns as (time steps, features), and
    the target is [acceleration_x, acceleration_y, acceleration_z, steer].
    """

    def __init__(self, csv_file, root_dir="", transform=None):
        """
        Args:
            csv_file: path of the CSV to load with pandas.
            root_dir: optional directory prepended to every image path.
            transform: callable applied to each PIL image; when omitted, a
                resize-to-224 / ToTensor / ImageNet-normalize pipeline is used.
        """
        self.data_df = pd.read_csv(csv_file)
        self.root_dir = root_dir

        # The 10 history columns fed to the LSTM; their order defines the
        # feature order of the state sequence.
        self.numeric_cols = [
            'global_x_history',
            'global_y_history',
            'global_z_history',
            'velocity_x_history',
            'velocity_y_history',
            'velocity_z_history',
            'steer_history',
            'acceleration_x_history',
            'acceleration_y_history',
            'acceleration_z_history',
        ]

        if transform is not None:
            self.transform = transform
        else:
            imagenet_norm = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])
            self.transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                imagenet_norm,
            ])

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return ``((img_t-2, img_t-1, img_t), state_seq_tensor, target_tensor)`` for row ``idx``."""
        sample = self.data_df.iloc[idx]

        # ---------------- A. Visual input: the last three frames ----------------
        history_paths = json.loads(sample['front_image_history'])
        recent_paths = [history_paths[offset] for offset in (-3, -2, -1)]

        frames = []
        for rel_path in recent_paths:
            location = os.path.join(self.root_dir, rel_path) if self.root_dir else rel_path
            frame = Image.open(location).convert('RGB')
            if self.transform:
                frame = self.transform(frame)
            frames.append(frame)

        img_t_minus_2, img_t_minus_1, img_t = frames

        # ---------------- B. State history ----------------
        # One JSON list per feature column gives (features, steps); transpose to
        # (steps, features). Upstream comments expect 9 steps x 10 features, but
        # the length is not checked here.
        per_feature = [json.loads(sample[name]) for name in self.numeric_cols]
        state_seq_tensor = torch.tensor(np.array(per_feature, dtype=np.float32).T)

        # ---------------- C. Targets: [accel_x, accel_y, accel_z, steer] ----------------
        target_columns = (
            'target_acceleration_x',
            'target_acceleration_y',
            'target_acceleration_z',
            'target_steer',
        )
        target_tensor = torch.tensor([sample[name] for name in target_columns],
                                     dtype=torch.float32)

        return (img_t_minus_2, img_t_minus_1, img_t), state_seq_tensor, target_tensor


# ==================== 2. 模型核心组件 (与预处理结构对齐) ====================
# (此处省略部分 ResNet 基础块定义以保持简洁，使用时请补全之前的 Bottleneck/ResNet 代码)

class MotionLSTMEncoder(nn.Module):
    """
    Encode a vehicle-state history with a stacked LSTM.

    The final hidden state of the top LSTM layer is layer-normalized and
    returned as the sequence embedding.

    Args:
        input_size: features per time step (default 10).
        hidden_size: LSTM hidden width, which is also the output width.
        num_layers: number of stacked LSTM layers.
    """
    def __init__(self, input_size=10, hidden_size=512, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between stacked layers; a non-zero
            # value with a single layer has no effect and triggers a warning.
            dropout=0.1 if num_layers > 1 else 0.0,
        )
        self.ln = nn.LayerNorm(hidden_size)

    def forward(self, x):
        """
        Args:
            x: tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, hidden_size).
        """
        _, (h_n, _) = self.lstm(x)
        # h_n is (num_layers, batch, hidden); the last entry belongs to the top layer.
        return self.ln(h_n[-1])

# Placeholder backbone (replace with the real create_resnet_cbam for actual use)
class MockResNet(nn.Module):
    """Stand-in backbone that emits random feature maps instead of real features."""

    def get_feature_maps(self, x):
        """Return a random (batch, 2048, 7, 7) tensor moved to the device of ``x``."""
        batch = x.size(0)
        fake_maps = torch.randn(batch, 2048, 7, 7)
        return fake_maps.to(x.device)
def create_resnet_cbam():
    """Build the placeholder backbone used in place of the real ResNet-CBAM."""
    return MockResNet()

class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding added to a batch-first sequence.

    Args:
        d_model: embedding width (even or odd).
        max_len: longest sequence length the precomputed table supports.
    """
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angles to the number of odd-indexed slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer (not a parameter): follows .to(device) and is saved in state_dict.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """
        Args:
            x: tensor of shape (batch, seq_len, d_model).

        Returns:
            ``x`` plus the positional table for the first ``seq_len`` positions.

        Raises:
            RuntimeError: if ``seq_len`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise RuntimeError(
                f"sequence length {seq_len} exceeds PositionalEncoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]

class DualStreamDrivingModel(nn.Module):
    """
    Dual-stream driving model.

    A visual stream (backbone feature maps of three frames, projected to tokens
    and passed through a Transformer encoder) is fused with a state stream
    (LSTM over the state history) as ``fused = MLP(concat(visual, lstm)) + lstm``.
    A small head maps the fused vector to the control outputs.

    Args:
        use_shared_weights: when True one backbone encodes all three frames;
            when False each frame gets its own backbone instance.
        visual_d_model: token width of the visual Transformer. The encoder
            layers use 8 attention heads, so it must be divisible by 8.
        lstm_hidden_size: LSTM hidden width; also the width of the fused vector.
        state_input_size: features per time step of the state history.
        num_outputs: number of predicted values (by default accel_x, accel_y,
            accel_z, steer).
    """
    def __init__(self,
                 use_shared_weights=True,
                 visual_d_model=512,
                 lstm_hidden_size=512,
                 state_input_size=10,
                 num_outputs=4):
        super().__init__()

        self.use_shared_weights = use_shared_weights
        self.visual_d_model = visual_d_model

        # 1. Visual stream
        if use_shared_weights:
            self.resnet = create_resnet_cbam()
        else:
            # One independent backbone per frame.
            self.resnet_t_minus_2 = create_resnet_cbam()
            self.resnet_t_minus_1 = create_resnet_cbam()
            self.resnet_t = create_resnet_cbam()

        # The projection assumes the backbone yields 2048-channel feature maps.
        self.feature_proj = nn.Linear(2048, visual_d_model)
        # The concatenated token sequence of the three frames must fit in max_len.
        self.pos_encoder = PositionalEncoding(visual_d_model, max_len=500)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=visual_d_model, nhead=8, batch_first=True, dim_feedforward=1024, dropout=0.1
        )
        self.visual_transformer = nn.TransformerEncoder(encoder_layer, num_layers=3)

        # 2. State stream
        self.lstm_hidden_size = lstm_hidden_size
        self.motion_lstm = MotionLSTMEncoder(
            input_size=state_input_size,
            hidden_size=lstm_hidden_size,
            num_layers=2
        )

        # 3. Fusion MLP over concat(visual, lstm); its output width equals the
        #    LSTM width so the residual add in forward() is well-defined.
        fusion_input_dim = visual_d_model + lstm_hidden_size
        self.fusion_mlp = nn.Sequential(
            nn.Linear(fusion_input_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, lstm_hidden_size),
            nn.ReLU()
        )

        # 4. Prediction head
        self.control_head = nn.Sequential(
            nn.Linear(lstm_hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, num_outputs)
        )

    def extract_image_features(self, img_t_minus_2, img_t_minus_1, img_t):
        """Run the backbone(s) on the three frames and return their feature maps."""
        if self.use_shared_weights:
            f1 = self.resnet.get_feature_maps(img_t_minus_2)
            f2 = self.resnet.get_feature_maps(img_t_minus_1)
            f3 = self.resnet.get_feature_maps(img_t)
        else:
            f1 = self.resnet_t_minus_2.get_feature_maps(img_t_minus_2)
            f2 = self.resnet_t_minus_1.get_feature_maps(img_t_minus_1)
            f3 = self.resnet_t.get_feature_maps(img_t)
        return f1, f2, f3

    def _flatten_and_project(self, feature_map):
        """Turn (B, C, H, W) feature maps into (B, H*W, visual_d_model) tokens."""
        tokens = feature_map.flatten(2).transpose(1, 2)
        return self.feature_proj(tokens)

    def forward(self, img_t_minus_2, img_t_minus_1, img_t, state_history):
        """
        Args:
            img_t_minus_2, img_t_minus_1, img_t: image batches, one per frame.
            state_history: tensor of shape (batch, seq_len, state_input_size).

        Returns:
            Prediction tensor of shape (batch, num_outputs).
        """
        # Visual stream: tokens of the three frames, in temporal order.
        feature_maps = self.extract_image_features(img_t_minus_2, img_t_minus_1, img_t)
        visual_seq = torch.cat([self._flatten_and_project(f) for f in feature_maps], dim=1)
        visual_seq = self.pos_encoder(visual_seq)
        trans_out = self.visual_transformer(visual_seq)
        visual_vector = trans_out.mean(dim=1)  # average over tokens

        # State stream
        lstm_vector = self.motion_lstm(state_history)

        # Fusion with residual connection to the LSTM embedding
        combined_features = torch.cat([visual_vector, lstm_vector], dim=1)
        mlp_output = self.fusion_mlp(combined_features)
        fused_final = mlp_output + lstm_vector

        # Prediction
        prediction = self.control_head(fused_final)

        return prediction

# ==================== 3. Smoke-test run ====================
if __name__ == "__main__":
    # Build the Dataset and DataLoader from the preprocessed CSV
    dataset = ProcessedDrivingDataset(csv_file='global_vehicle_data_history_cols.csv')
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

    # Build the model and the loss; the model stays in its default training
    # mode because eval() is never called here
    model = DualStreamDrivingModel()
    criterion = nn.MSELoss()

    # Pull a single batch and run it through the model
    for batch_idx, (images, state_seq, target) in enumerate(dataloader):
        img_t_minus_2, img_t_minus_1, img_t = images

        # Forward pass and loss
        predictions = model(img_t_minus_2, img_t_minus_1, img_t, state_seq)
        loss = criterion(predictions, target)

        print(f"--- Batch {batch_idx} ---")
        print(f"图像输入尺寸 (单张): {img_t.shape}")
        print(f"LSTM 输入尺寸: {state_seq.shape} -> (Batch, Seq=9, Features=10)")
        print(f"预测值尺寸: {predictions.shape} -> [accel_x, accel_y, accel_z, steer]")
        print(f"目标值尺寸: {target.shape}")
        print(f"Loss: {loss.item():.4f}\n")

        # Shows how a prediction would be handed to the de-normalization helper:
        # take the first sample of the batch as a NumPy array
        pred_numpy = predictions.detach().numpy()[0]
        # To recover real-valued acceleration / steering, call inverse_transform:
        # real_values = inverse_transform(pred_numpy, target_names, scaler_params)
        break  # only the first batch is inspected

In [None]:

# ==================== 1. 视觉特征提取骨干 (ResNet + CBAM) ====================

def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)

class ChannelAttention(nn.Module):
    """
    Channel-attention gate.

    Global average- and max-pooled descriptors go through a shared two-layer
    1x1-conv bottleneck; their sum is squashed with a sigmoid to give one
    weight per channel.

    Args:
        in_planes: number of input channels.
        ratio: channel reduction factor of the bottleneck (default 16).
    """
    def __init__(self, in_planes, ratio=16):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # The bottleneck width follows `ratio` rather than a fixed constant.
        hidden_planes = in_planes // ratio
        self.fc = nn.Sequential(
            nn.Conv2d(in_planes, hidden_planes, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(hidden_planes, in_planes, 1, bias=False)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-channel weights of shape (batch, in_planes, 1, 1)."""
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        return self.sigmoid(avg_out + max_out)

class SpatialAttention(nn.Module):
    """
    Spatial-attention gate.

    The channel-wise mean and max maps are stacked, convolved down to a single
    channel and passed through a sigmoid, giving one weight per spatial position.
    """
    def __init__(self, kernel_size=7):
        super().__init__()
        same_padding = kernel_size // 2
        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=same_padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return spatial weights of shape (batch, 1, H, W)."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]
        descriptor = torch.cat([channel_mean, channel_max], dim=1)
        return self.sigmoid(self.conv1(descriptor))

class Bottleneck(nn.Module):
    """
    Bottleneck residual block (1x1 -> 3x3 -> 1x1 convolutions) with CBAM.

    Channel attention and then spatial attention rescale the main-path output
    before it is added to the shortcut.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * 4

        # Submodules are registered in the same order as before so that seeded
        # initialization and state_dict layout stay the same.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)

        self.ca = ChannelAttention(widened)
        self.sa = SpatialAttention()

        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the block; the shortcut goes through ``downsample`` when one is set."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # CBAM: channel gate first, then spatial gate.
        out = self.ca(out) * out
        out = self.sa(out) * out

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class ResNet(nn.Module):
    """
    ResNet feature extractor assembled from ``block`` modules.

    Only the convolutional trunk is built (stem plus four stages); there is no
    pooling or classification head, and features are obtained through
    ``get_feature_maps``.
    """
    def __init__(self, block, layers):
        super().__init__()
        self.inplanes = 64  # running input width, updated by _make_layer

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Convs: normal(0, sqrt(2 / fan_out)); batch norms: weight 1, bias 0.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                k_h, k_w = module.kernel_size
                fan_out = k_h * k_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` blocks; only the first may stride or project."""
        out_channels = planes * block.expansion

        shortcut = None
        if not (stride == 1 and self.inplanes == out_channels):
            # Projection shortcut so the residual matches the new shape.
            shortcut = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, stride, shortcut)]
        self.inplanes = out_channels
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def get_feature_maps(self, x):
        """Return the final-stage feature maps without global pooling (overall stride 32)."""
        trunk = (
            self.conv1, self.bn1, self.relu, self.maxpool,
            self.layer1, self.layer2, self.layer3, self.layer4,
        )
        for layer in trunk:
            x = layer(x)
        return x

def create_resnet_cbam():
    """Build the CBAM ResNet from Bottleneck blocks with stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(Bottleneck, stage_depths)

# ==================== 2. 时序与状态处理模块 ====================

class PositionalEncoding(nn.Module):
    """
    Standard sine/cosine positional encoding for batch-first sequences.

    Args:
        d_model: embedding width (even or odd).
        max_len: longest sequence length the precomputed table supports.
    """
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angles to the number of odd-indexed slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer (not a parameter): follows .to(device) and is saved in state_dict.
        self.register_buffer('pe', pe.unsqueeze(0))  # Shape: (1, max_len, d_model)

    def forward(self, x):
        """
        Args:
            x: tensor of shape (Batch, Seq_Len, d_model).

        Returns:
            ``x`` plus the positional table for the first ``Seq_Len`` positions.

        Raises:
            RuntimeError: if ``Seq_Len`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise RuntimeError(
                f"sequence length {seq_len} exceeds PositionalEncoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]

class MotionLSTMEncoder(nn.Module):
    """
    LSTM encoder for the vehicle's state-history sequence.

    The final hidden state of the top LSTM layer is layer-normalized and
    returned as the sequence embedding.

    Args:
        input_size: features per time step (default 3).
        hidden_size: LSTM hidden width, which is also the output width.
        num_layers: number of stacked LSTM layers.
    """
    def __init__(self, input_size=3, hidden_size=512, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between stacked layers; a non-zero
            # value with a single layer has no effect and triggers a warning.
            dropout=0.1 if num_layers > 1 else 0.0,
        )
        self.ln = nn.LayerNorm(hidden_size)

    def forward(self, x):
        """
        Args:
            x: tensor of shape (Batch, N, input_size).

        Returns:
            Tensor of shape (Batch, hidden_size).
        """
        _, (h_n, _) = self.lstm(x)
        # h_n is (num_layers, Batch, hidden); take the top layer's final state.
        last_hidden = h_n[-1]
        return self.ln(last_hidden)

# ==================== 3. 核心双流融合模型 ====================

class DualStreamDrivingModel(nn.Module):
    """
    Dual-stream end-to-end driving model.

    A visual stream (backbone feature maps of three frames, projected to tokens
    and passed through a Transformer encoder) is fused with a state stream
    (LSTM over the state history) as ``fused = MLP(concat(visual, lstm)) + lstm``.
    A small head maps the fused vector to the control outputs.

    Args:
        use_shared_weights: when True one backbone encodes all three frames;
            when False each frame gets its own backbone instance.
        visual_d_model: token width of the visual Transformer. The encoder
            layers use 8 attention heads, so it must be divisible by 8.
        lstm_hidden_size: LSTM hidden width; also the width of the fused vector.
        state_input_size: features per time step of the state history
            (default 3: speed, acceleration, steering angle).
        num_outputs: number of predicted values (default 2: acceleration,
            steering angle).
    """
    def __init__(self,
                 use_shared_weights=True,
                 visual_d_model=512,
                 lstm_hidden_size=512,
                 state_input_size=3,
                 num_outputs=2):
        super().__init__()

        self.use_shared_weights = use_shared_weights
        self.visual_d_model = visual_d_model

        # --- 1. Visual stream ---
        if use_shared_weights:
            self.resnet = create_resnet_cbam()
        else:
            # One independent backbone per frame.
            self.resnet_t_minus_2 = create_resnet_cbam()
            self.resnet_t_minus_1 = create_resnet_cbam()
            self.resnet_t = create_resnet_cbam()

        # The projection assumes the backbone yields 2048-channel feature maps.
        self.feature_proj = nn.Linear(2048, visual_d_model)
        # The concatenated token sequence of the three frames must fit in max_len.
        self.pos_encoder = PositionalEncoding(visual_d_model, max_len=500)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=visual_d_model, nhead=8, batch_first=True, dim_feedforward=1024, dropout=0.1
        )
        self.visual_transformer = nn.TransformerEncoder(encoder_layer, num_layers=3)

        # --- 2. State stream ---
        self.lstm_hidden_size = lstm_hidden_size
        self.motion_lstm = MotionLSTMEncoder(
            input_size=state_input_size,
            hidden_size=lstm_hidden_size,
            num_layers=2
        )

        # --- 3. Fusion ---
        fusion_input_dim = visual_d_model + lstm_hidden_size
        self.fusion_mlp = nn.Sequential(
            nn.Linear(fusion_input_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            # Output width must equal the LSTM width for the residual add.
            nn.Linear(1024, lstm_hidden_size),
            nn.ReLU()
        )

        # --- 4. Prediction head ---
        self.control_head = nn.Sequential(
            nn.Linear(lstm_hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, num_outputs)
        )

    def extract_image_features(self, img_t_minus_2, img_t_minus_1, img_t):
        """Run the backbone(s) on the three frames and return their feature maps."""
        if self.use_shared_weights:
            f1 = self.resnet.get_feature_maps(img_t_minus_2)
            f2 = self.resnet.get_feature_maps(img_t_minus_1)
            f3 = self.resnet.get_feature_maps(img_t)
        else:
            f1 = self.resnet_t_minus_2.get_feature_maps(img_t_minus_2)
            f2 = self.resnet_t_minus_1.get_feature_maps(img_t_minus_1)
            f3 = self.resnet_t.get_feature_maps(img_t)
        return f1, f2, f3

    def _flatten_and_project(self, feature_map):
        """Turn (B, 2048, H', W') feature maps into (B, H'*W', visual_d_model) tokens."""
        tokens = feature_map.flatten(2).transpose(1, 2)
        return self.feature_proj(tokens)

    def forward(self, img_t_minus_2, img_t_minus_1, img_t, state_history):
        """
        Args:
            img_t_minus_2, img_t_minus_1, img_t: (Batch, 3, H, W)
            state_history: (Batch, N, state_input_size), the past N state frames.

        Returns:
            Tuple ``(prediction, debug)``: ``prediction`` has shape
            (Batch, num_outputs); ``debug`` is a dict with the intermediate
            ``visual_vector``, ``lstm_vector`` and ``fused_final`` tensors.
        """
        # ---------------- A. Visual branch ----------------
        feature_maps = self.extract_image_features(img_t_minus_2, img_t_minus_1, img_t)

        # Tokens of the three frames in temporal order: (B, 3*H'*W', d_model),
        # i.e. 147 tokens for 224x224 inputs with a stride-32 backbone.
        visual_seq = torch.cat([self._flatten_and_project(f) for f in feature_maps], dim=1)
        visual_seq = self.pos_encoder(visual_seq)
        trans_out = self.visual_transformer(visual_seq)

        # Aggregate by averaging over tokens.
        visual_vector = trans_out.mean(dim=1)  # (Batch, visual_d_model)

        # ---------------- B. State branch ----------------
        lstm_vector = self.motion_lstm(state_history)  # (Batch, lstm_hidden_size)

        # ---------------- C. Fusion ----------------
        # Fused = MLP(Concat(Visual, LSTM)) + LSTM
        combined_features = torch.cat([visual_vector, lstm_vector], dim=1)
        mlp_output = self.fusion_mlp(combined_features)

        # Residual add
        fused_final = mlp_output + lstm_vector

        # ---------------- D. Prediction ----------------
        prediction = self.control_head(fused_final)  # (Batch, num_outputs)

        return prediction, {
            'visual_vector': visual_vector,
            'lstm_vector': lstm_vector,
            'fused_final': fused_final
        }


# ==================== Test case (Main) ====================

if __name__ == "__main__":
    print("=== 开始测试：双流端到端自动驾驶网络 ===\n")

    # 1. Hyper-parameters for the smoke test
    BATCH_SIZE = 2
    HISTORY_N = 10     # number of past frames fed to the state stream
    IMG_SIZE = 224     # image resolution

    print(f"设定参数: Batch Size = {BATCH_SIZE}, 历史帧数 = {HISTORY_N}, 图像尺寸 = {IMG_SIZE}x{IMG_SIZE}")

    # 2. Build random stand-in tensors
    # Images: (B, C, H, W)
    img_t2 = torch.randn(BATCH_SIZE, 3, IMG_SIZE, IMG_SIZE)
    img_t1 = torch.randn(BATCH_SIZE, 3, IMG_SIZE, IMG_SIZE)
    img_t0 = torch.randn(BATCH_SIZE, 3, IMG_SIZE, IMG_SIZE)

    # State history: (Batch, N, 3) for [speed, acceleration, steering angle]
    state_seq = torch.randn(BATCH_SIZE, HISTORY_N, 3)

    print("生成模拟数据完毕...\n")

    # 3. Instantiate the model
    # visual_d_model and lstm_hidden_size are kept equal here
    model = DualStreamDrivingModel(
        use_shared_weights=True,
        visual_d_model=512,
        lstm_hidden_size=512
    )

    print("模型实例化成功，开始前向传播 (Forward Pass)... (可能需要几秒钟计算)")

    # 4. Forward pass
    model.eval() # evaluation mode
    with torch.no_grad():
        predictions, debug_info = model(img_t2, img_t1, img_t0, state_seq)

    # 5. Print the shapes for inspection; nothing below asserts them, so the
    #    final "passed" message is printed whenever the forward pass returns
    print("\n" + "="*40)
    print("内部特征维度检查:")
    print(f" - Transformer 视觉向量: {debug_info['visual_vector'].shape} -> 期望: ({BATCH_SIZE}, 512)")
    print(f" - LSTM 运动状态向量:    {debug_info['lstm_vector'].shape} -> 期望: ({BATCH_SIZE}, 512)")
    print(f" - 残差融合后特征向量:   {debug_info['fused_final'].shape} -> 期望: ({BATCH_SIZE}, 512)")

    print("\n模型最终预测输出:")
    print(f" - 预测 Tensor 尺寸: {predictions.shape} -> 期望: ({BATCH_SIZE}, 2) [表示加速度和转角]")

    # Column 0 is reported as acceleration, column 1 as steering angle
    for i in range(BATCH_SIZE):
        accel = predictions[i, 0].item()
        steer = predictions[i, 1].item()
        print(f"   * 样本 {i+1}: 预测加速度 = {accel:+.4f}, 预测转角 = {steer:+.4f}")

    print("="*40)
    print("测试通过！网络结构和数据流转完全正确。")