In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [2]:
# Preprocessing pipeline: turn each image into a tensor, then normalize it
# with mean 0.1307 and std 0.3081 (single-channel values).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# MNIST training and test splits; files are downloaded into ./data if missing.
train_dataset = datasets.MNIST('./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST('./data', train=False, transform=transform, download=True)

# Mini-batch loaders: the training split is reshuffled every epoch, the test
# split is iterated in its stored order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [3]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear classification head.

    The LSTM is built with ``batch_first=True``, so ``forward`` expects input
    of shape ``(batch, seq_len, input_size)``. Only the top layer's output at
    the last time step is passed to the linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Size of the output (one logit per class).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.
        """
        # Build the zero initial states on the same device and dtype as the
        # input, so the module does not depend on a notebook-global `device`
        # and keeps working if the model is moved or cast later.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)  # initial hidden state
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)  # initial cell state

        out, _ = self.lstm(x, (h0, c0))  # out: (batch, seq_len, hidden_size)
        # Classify from the output at the last time step only.
        return self.fc(out[:, -1, :])

# Hyperparameters.
# The training loop reshapes every batch to (batch, sequence_length, input_size),
# i.e. the LSTM reads 28 time steps with 28 features each, not a flattened
# 784-vector.
input_size = 28       # features per time step
sequence_length = 28  # time steps per sample
hidden_size = 128     # LSTM hidden-state width
num_layers = 2        # stacked LSTM layers
num_classes = 10      # ten digit classes

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the model and place its parameters on the selected device.
model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
model = model.to(device)

In [4]:
# Cross-entropy loss over the model's raw logits and integer class labels.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [5]:
num_epochs = 10  # number of full passes over the training set

steps_per_epoch = len(train_loader)  # loop-invariant, used only for logging

for epoch in range(num_epochs):
    # Explicitly (re-)enter training mode so that re-running this cell after
    # any cell that called model.eval() still trains in training mode.
    model.train()

    for i, (images, labels) in enumerate(train_loader):
        # Reshape each batch to (batch, sequence_length, input_size) for the
        # batch-first LSTM, and move inputs and targets to the model's device.
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the current mini-batch loss every 100 steps.
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{steps_per_epoch}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [100/938], Loss: 0.5976
Epoch [1/10], Step [200/938], Loss: 0.2649
Epoch [1/10], Step [300/938], Loss: 0.2096
Epoch [1/10], Step [400/938], Loss: 0.2489
Epoch [1/10], Step [500/938], Loss: 0.1026
Epoch [1/10], Step [600/938], Loss: 0.0793
Epoch [1/10], Step [700/938], Loss: 0.0765
Epoch [1/10], Step [800/938], Loss: 0.0601
Epoch [1/10], Step [900/938], Loss: 0.1225
Epoch [2/10], Step [100/938], Loss: 0.0911
Epoch [2/10], Step [200/938], Loss: 0.0880
Epoch [2/10], Step [300/938], Loss: 0.2217
Epoch [2/10], Step [400/938], Loss: 0.0418
Epoch [2/10], Step [500/938], Loss: 0.1516
Epoch [2/10], Step [600/938], Loss: 0.0510
Epoch [2/10], Step [700/938], Loss: 0.0423
Epoch [2/10], Step [800/938], Loss: 0.0643
Epoch [2/10], Step [900/938], Loss: 0.0113
Epoch [3/10], Step [100/938], Loss: 0.0499
Epoch [3/10], Step [200/938], Loss: 0.1373
Epoch [3/10], Step [300/938], Loss: 0.0324
Epoch [3/10], Step [400/938], Loss: 0.0115
Epoch [3/10], Step [500/938], Loss: 0.0606
Epoch [3/10