In [None]:
import torch
from torch.utils.data import Dataset,DataLoader,Subset
import numpy as np
import pandas as pd
import os
import math

In [4]:
def build_phasenet_label(p_arrival, s_arrival, length=6000):
    """
    Build the per-sample class labels (modified PhaseNet scheme) for one waveform.

    Args:
        p_arrival: P-wave arrival as a sample index, or None when there is no P pick.
        s_arrival: S-wave arrival as a sample index, or None when there is no S pick.
        length: number of samples in the waveform.

    Returns:
        torch.LongTensor of shape (length,) holding one class id per sample:
        - 0: background noise
        - 1: from the P arrival up to the S arrival
        - 2: from the S arrival through min(S + 3*(S-P), length - 1), inclusive

        The S sample itself is labelled 2 whenever the class-2 window is
        non-empty; otherwise it keeps label 1.

        The result is all zeros unless both picks are given, both lie in
        [0, length), and P comes strictly before S.

    NOTE(review): earlier comments described the class-2 window as ending at
    P + 3*(S-P); the implementation ends it at S + 3*(S-P). The coded behaviour
    is kept here -- confirm which one is intended.
    """
    # Every sample starts out as background noise.
    classes = np.zeros(length, dtype=np.int64)

    # Without both picks there is nothing to mark.
    if p_arrival is None or s_arrival is None:
        return torch.from_numpy(classes).long()

    # Both picks must fall inside the trace, with P strictly before S.
    # `not (a < b)` is used on purpose so NaN picks are rejected as well.
    if not (0 <= p_arrival < length):
        return torch.from_numpy(classes).long()
    if not (0 <= s_arrival < length):
        return torch.from_numpy(classes).long()
    if not (p_arrival < s_arrival):
        return torch.from_numpy(classes).long()

    onset_p = int(p_arrival)
    onset_s = int(s_arrival)

    # Last sample of the class-2 window: S + 3*(S-P), clamped to the trace end.
    tail_stop = min(onset_s + 3 * (onset_s - onset_p), length - 1)

    # Class 1 spans P..S inclusive; class 2 (when present) then reclaims S.
    classes[onset_p:onset_s + 1] = 1
    if tail_stop > onset_s:
        classes[onset_s:tail_stop + 1] = 2

    return torch.from_numpy(classes).long()


# 如果您需要one-hot编码版本（用于某些损失函数）
def build_phasenet_label_onehot(p_arrival, s_arrival, length=6000):
    """
    Build the one-hot encoded version of the phase labels.

    Takes the same arguments as build_phasenet_label, which supplies the
    per-sample class ids.

    Returns:
        torch.FloatTensor of shape (length, 3), one indicator column per class:
        column 0 - noise
        column 1 - P
        column 2 - S
    """
    # Per-sample class ids as produced by the label builder.
    class_ids = build_phasenet_label(p_arrival, s_arrival, length)

    # One 0/1 indicator column per class, placed side by side.
    indicator_columns = [(class_ids == class_id).float() for class_id in range(3)]
    return torch.stack(indicator_columns, dim=1)




In [None]:
class SteadDataset(Dataset):
    """
    Map-style dataset pairing waveform arrays stored as ``<trace_name>.npy``
    with one-hot phase labels built from a metadata CSV.

    The CSV must provide a ``trace_name`` and a ``trace_category`` column, plus
    ``p_arrival_sample`` / ``s_arrival_sample`` for rows that are not noise.

    NOTE(review): the code takes the label length from axis 0 of the loaded
    array and normalises along axis 0, so arrays are presumably laid out as
    (samples, channels) -- confirm against the files on disk.
    """

    def __init__(self, npy_dir, csv_path):
        """
        Args:
            npy_dir: directory containing one ``.npy`` file per trace.
            csv_path: path of the metadata CSV describing the traces.
        """
        self.npy_dir = npy_dir
        self.df = pd.read_csv(csv_path)
        self.trace_names = self.df['trace_name'].tolist()

    def __len__(self):
        """Number of traces listed in the metadata CSV."""
        return len(self.trace_names)

    def __getitem__(self, idx):
        """
        Load one trace and return ``(waveform, label)``.

        ``waveform`` is the float tensor of the stored array, z-score
        normalised along axis 0; ``label`` is whatever
        ``build_phasenet_label_onehot`` returns for this trace's picks.
        """
        name = self.trace_names[idx]
        waveform = np.load(os.path.join(self.npy_dir, f"{name}.npy"))
        signal = torch.from_numpy(waveform).float()

        # Noise traces carry no picks; every other category uses the CSV picks.
        meta = self.df.iloc[idx]
        is_noise = meta['trace_category'] == 'noise'
        p_pick = None if is_noise else meta['p_arrival_sample']
        s_pick = None if is_noise else meta['s_arrival_sample']
        target = build_phasenet_label_onehot(p_pick, s_pick, length=waveform.shape[0])

        # Z-score normalisation along axis 0; eps guards against a zero std.
        eps = 1e-8
        centred = signal - signal.mean(dim=0, keepdim=True)
        spread = signal.std(dim=0, keepdim=True)
        signal = centred / (spread + eps)

        return signal, target

In [None]:
# Data locations (machine-specific absolute paths -- adjust for your environment).
h5_path = "D:\\merge.hdf5"
npy_path = 'D:\\STEAD_npy'
train_path = "E:\\STEAD_dataset\\manual_train.csv"
val_path = "E:\\STEAD_dataset\\manual_val.csv"
test_path = "E:\\STEAD_dataset\\manual_test.csv"

# Tunables for this cell.
SUBSET_FRACTION = 0.1  # leading share of every split that is actually used
BATCH_SIZE = 128

# Build the full datasets.
print("Loading full datasets...")

# SteadDataset as defined in this notebook only accepts (npy_dir, csv_path).
# The previous `augment=False` keyword raised a TypeError, so it is not passed;
# no split is augmented.
train_dataset_full = SteadDataset(
    npy_dir=npy_path,
    csv_path=train_path,
)
val_dataset_full = SteadDataset(
    npy_dir=npy_path,
    csv_path=val_path,
)
test_dataset_full = SteadDataset(
    npy_dir=npy_path,
    csv_path=test_path,
)

# Size of each split and of the leading subset taken from it.
train_size = len(train_dataset_full)
val_size = len(val_dataset_full)
test_size = len(test_dataset_full)

# Round up so that a non-empty split always contributes at least one sample.
train_subset_size = math.ceil(train_size * SUBSET_FRACTION)
val_subset_size = math.ceil(val_size * SUBSET_FRACTION)
test_subset_size = math.ceil(test_size * SUBSET_FRACTION)

print("Original dataset sizes:")
print(f"  Train: {train_size} -> Using: {train_subset_size} ({SUBSET_FRACTION:.0%})")
print(f"  Val: {val_size} -> Using: {val_subset_size} ({SUBSET_FRACTION:.0%})")
print(f"  Test: {test_size} -> Using: {test_subset_size} ({SUBSET_FRACTION:.0%})")

# Take the first rows of every split (no shuffling is applied before subsetting).
train_indices = list(range(train_subset_size))
val_indices = list(range(val_subset_size))
test_indices = list(range(test_subset_size))

train_dataset = Subset(train_dataset_full, train_indices)
val_dataset = Subset(val_dataset_full, val_indices)
test_dataset = Subset(test_dataset_full, test_indices)

# Data loaders: only the training loader shuffles. num_workers and
# prefetch_factor are left at their defaults.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=True,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
)

print("\nFinal dataset sizes:")
print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")