In [21]:
import torch.nn as nn
import torch
import hiddenlayer as hl
import tensorwatch as tw
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset,random_split
# 定义自编码器网络  每个卷积操作 添加一个激活函数操作
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        
        # 编码器部分
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(1, 3)),
            nn.Conv2d(16, 32, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(1, 3)),
            nn.Conv2d(32, 64, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(1, 5)),
            nn.Conv2d(64, 128, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 1, kernel_size=(1, 3), padding=(0, 1))
        )
        
        # 解码器部分
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(1, 128, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(128, 64, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(64, 32, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(32, 16, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(16, 1, kernel_size=(1, 3), padding=(0, 1)),
        )
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [22]:
# 定义网络对象
model = AutoEncoder()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [23]:
# 自定义dataSet
class NPZDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.data = []
        self.traces = []
        self.sample = []

        for path in file_paths:
            npz_data = np.load(path)
            self.data.append(npz_data['data'])  # 读取地震数量
            self.traces.append(npz_data['num_traces'])  # 读取地震道数目
            self.sample.append(npz_data['num_samples'])  # 读取采样点数量

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):

        # 通过索引获取数据
        data = self.data[idx]
        data= torch.from_numpy(data).float()  # 转换为张量
        data = data.reshape(-1,1,15000)
        traces = self.traces[idx]
        sample = self.sample[idx]

        return data, traces, sample


In [24]:
#  读取指定文件夹的所有npz文件
import os
import numpy as np

folder_path = '../NPZDATA'  # 替换为实际的文件夹路径

file_paths = []
for file_name in os.listdir(folder_path):
    if file_name.endswith('.npz'):
        file_paths.append(os.path.join(folder_path, file_name))

npz_dataset = NPZDataset(file_paths)
data, traces, sample = npz_dataset[0]
print(data.shape)  # 输出 (42, 1, 15000)


MemoryError: Unable to allocate 2.40 MiB for an array with shape (630000,) and data type float32

In [None]:
from torch.utils.data import random_split
# file_paths = ['data1.npz', 'data2.npz', 'data3.npz']

# 创建 NPZDataset的实例 并使用DataLoader进行数据加载
dataset = NPZDataset(file_paths)   ## 将指定的文件夹下面的所有npz文件 读取出来 保存成NPZDATASET

# 计算训练集和测试集的划分大小
dataset_size = len(dataset)
print(dataset_size)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size

#  划分数据集
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])


#  四维输入 batch_size channel2 height width  1 x 42 x 1 x 15000
#  创建训练集和测试集的数据加载器  DataLoader  batch_size = 1  shuffle 打乱文件顺序
train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size = 1,shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset,batch_size = 1,shuffle=True)

811


In [None]:

# # 处理数据集
# # 如果数据是 594 x 1500 将数据处理成 594  x 1 x 1500 按照batch_size 分批

# import segyio
# import numpy as np
# import torch
# from sklearn.model_selection import train_test_split
# from torch.utils.data import DataLoader

# # 读取SEGY文件
# segy_file = '../SegyData/20220101_115300.sgy'

# with segyio.open(segy_file, 'rb') as segy:
#     # 获取地震数据的一维数组
#     seismic_data = segy.trace.raw[:]
#     # 获取地震道数量和每个地震道的时间采样点数量
#     n_traces = segy.tracecount
#     n_samples = segy.samples.size

#     print("地震道数量:{}".format(n_traces))
#     print("采样点数量：{}".format(n_samples))

#     # 将一维数组重新形状为二维数组，形状为 (n_traces, n_samples)
#     #  reshape地震数据
#     seismic_data = seismic_data.reshape((n_traces, n_samples))

# # 随机划分为训练集和测试集
# train_data_s, test_data_s = train_test_split(seismic_data, test_size=0.2, random_state=42)
# # 回去学习：torch.utils.data.DataSet
# #  分割成 训练集和测试集
# # 将数据转换为PyTorch张量  
# train_data_s = torch.from_numpy(train_data_s).float()
# test_data_s = torch.from_numpy(test_data_s).float()

# print(f"训练集大小: {train_data_s.shape}")
# print(f"测试集大小: {test_data_s.shape}")

# # reshape 42 x 15000 x 1
# train_data_s = train_data_s.reshape(33,1,15000)
# test_data_s = test_data_s.reshape(9,1,15000)
# print(train_data_s.shape)

# #  数据预处理  归一化 0，1 之间

# # 上面的数据是 42 x 15000 然后reshape 42 x 1 x 15000

# #  dataloader加载数据集
# train_dataloader_s = DataLoader(train_data_s,batch_size = 3)
# test_dataloader_s = DataLoader(test_data_s,batch_size = 3)

In [None]:
loss_fn = nn.MSELoss()  # 损失函数  均方根损失函数
learning_rate = 0.001   ## 学习率
writer = SummaryWriter("../encoder_train")  # tensorboard
optimizer = torch.optim.SGD(model.parameters(),lr = learning_rate)  ## 定义优化器
total_train_step = 0  ## 训练步骤数目
epoch = 100  ## 迭代次数

# 训练
for epoch in range(epoch):

    #  设置模型为训练模式
    model.train()

    for data,num_traces,num_samples in train_dataloader:
        #  将数据输入模型
        outputs = model(data)  

        #  计算损失  对比原始输入和自编码器的输出结果
        loss = loss_fn(outputs,data)

        #  先清除梯度
        optimizer.zero_grad()

        #  反向传播
        loss.backward()

        # 优化
        optimizer.step()

        # 训练次数加一
        total_train_step = total_train_step + 1

        if total_train_step % 10 == 0:
            # 绘制训练损失
            writer.add_scalar("train1",loss.item(),total_train_step)
            print("训练次数：{},Loss{}".format(total_train_step,loss.item()))




RuntimeError: Given groups=1, weight of size [16, 1, 3, 3], expected input[1, 42, 1, 15000] to have 1 channels, but got 42 channels instead

In [None]:
#  定义损失函数  均方根
# loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.MSELoss()

#  学习率 
learning_rate = 0.01

total_train_step = 0


autoCoder = AutoEncoder()
writer = SummaryWriter("../encoder_train")
#  定义优化器  随机梯度下降
optimizer = torch.optim.SGD(autoCoder.parameters(),lr = learning_rate)
epoch = 2000
for i in range(epoch):
    print("-------第{}轮训练开始".format(i + 1))

    #  训练步骤开始
    autoCoder.train()

    for data in train_dataloader_s:
        inputs = data
        #  将梯度清零
       
        # 前向传播
        outputs = autoCoder(inputs)
        # 计算损失  对比原始输入和 自编码器输出的结果 看看压缩效果
        loss = loss_fn(outputs,inputs)
        # 反向传播 计算梯度
        optimizer.zero_grad()
        loss.backward()
        # 优化
        optimizer.step()

        # 统计训练次数
        total_train_step = total_train_step + 1

        if total_train_step % 10 == 0:
            # 绘制训练损失
            writer.add_scalar("train_segy_loss",loss.item(),total_train_step)
            print("训练次数:{},Loss{}".format(total_train_step,loss.item()))


    # #  测试步骤开始
    # autoCoder.eval()
    # total_test_loss = 0
    # total_accuracy = 0
    # with torch.no_grad():

    #     #  取出测试数据集的数据
    #     for data in test_dataloader_s:

    #         inputs = data

    #         outputs = autoCoder(inputs)

    #         # #  取出数据
    #         # imgs = data
    #         # # imgs = imgs.to(device)
    #         # # targets = targets.to(device)

    #         # outputs = tudui(imgs)

    #         loss = loss_fn(outputs,inputs) # 计算损失
    #         optimizer.zero_grad()

    #         loss.backward()

    #         optimizer.step()

    #         #  统计测试集上面的总损失其
    #         total_test_loss = total_test_loss + loss.item()
    #         accuracy = (outputs.argmax(1) == inputs).sum()
    #         total_accuracy = total_accuracy + accuracy


    # print("整体测试集上面的Loss:{}".format(total_test_loss))
    # print("整体测试及上面的正确率:{}".format(total_accuracy / test_data_size))
    # # writer.add_scalar("test_loss",loss.item(),total_test_step)
    # # writer.add_scalar("test_accuracy",total_accuracy / test_data_size,total_test_step)
    # total_test_step = total_test_step + 1


-------第1轮训练开始
训练次数:10,Loss0.0013878982281312346
-------第2轮训练开始
训练次数:20,Loss0.00115226733032614
-------第3轮训练开始
训练次数:30,Loss0.0005665685166604817
-------第4轮训练开始
训练次数:40,Loss0.006522593088448048
-------第5轮训练开始
训练次数:50,Loss0.0006176924216561019
-------第6轮训练开始
训练次数:60,Loss0.004694291390478611
-------第7轮训练开始
训练次数:70,Loss0.0012184163788333535
-------第8轮训练开始
训练次数:80,Loss0.001182312611490488
-------第9轮训练开始
训练次数:90,Loss0.00024775179917924106
-------第10轮训练开始
训练次数:100,Loss0.0009492202661931515
训练次数:110,Loss0.00041826762026175857
-------第11轮训练开始
训练次数:120,Loss0.001480179838836193
-------第12轮训练开始
训练次数:130,Loss0.001151376054622233
-------第13轮训练开始
训练次数:140,Loss0.0005665399949066341
-------第14轮训练开始
训练次数:150,Loss0.006522115785628557
-------第15轮训练开始
训练次数:160,Loss0.0006175987073220313
-------第16轮训练开始
训练次数:170,Loss0.004694262519478798
-------第17轮训练开始
训练次数:180,Loss0.0012184004299342632
-------第18轮训练开始
训练次数:190,Loss0.001182297826744616
-------第19轮训练开始
训练次数:200,Loss0.00024771911557763815
-------第20轮训练开始
训练次数:

KeyboardInterrupt: 