In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from tqdm import tqdm
import pandas as pd

# nltk.download("punkt")

# 加载新闻数据集
train_df = pd.read_csv("./data/train_news.csv")
test_df = pd.read_csv("./data/test_news.csv")


def preprocess(df, word2idx=None, label2idx=None, max_len=500):#当一段话单词数（包括标点）超过500的时候，截断该句；当单词数不足500时，在句子后面都补上统一的数字
    texts = df["text"].values
    labels = df["category"].values

    # 标签编码
    if label2idx is None:
        unique_labels = set(labels)
        label2idx = {label: idx for idx, label in enumerate(unique_labels)}
    labels = [label2idx[label] for label in labels]

    # 分词
    tokenized_texts = [word_tokenize(text.lower()) for text in texts]

    if word2idx is None:
        # 构建词汇表
        all_tokens = [token for text in tokenized_texts for token in text]
        vocab = Counter(all_tokens)
        vocab_size = 25000
        vocab = vocab.most_common(vocab_size - 2)
        word2idx = {word: idx + 2 for idx, (word, _) in enumerate(vocab)}
        word2idx["<unk>"] = 0
        word2idx["<pad>"] = 1

    # 数值化
    def encode_text(text):
        return [word2idx.get(word, word2idx["<unk>"]) for word in text]

    encoded_texts = [encode_text(text) for text in tokenized_texts]

    # 填充或截断
    padded_texts = [
        (
            text[:max_len]
            if len(text) > max_len
            else text + [word2idx["<pad>"]] * (max_len - len(text))
        )
        for text in encoded_texts
    ]

    return padded_texts, labels, word2idx, label2idx


# 划分训练集和测试集
X_train, y_train, word2idx, label2idx = preprocess(train_df)
X_test, y_test, _, _ = preprocess(test_df, word2idx, label2idx)
#print(X_test)

# 自定义Dataset类
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_dataset = NewsDataset(X_train, y_train)
test_dataset = NewsDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


第2题解题过程：

In [2]:
# 定义单向LSTM模型
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, pad_idx):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional = False) #单向LSTM
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_output, (hidden, cell) = self.lstm(embedded)
        # 我们只需要最后一个时间步的输出
        last_hidden = hidden[-1] #单向LSTM
        logits = self.fc(last_hidden)
        return logits

# 模型参数
embedding_dim = 128
hidden_dim = 64
vocab_size = len(word2idx)
num_classes = 4
pad_idx = word2idx["<pad>"]

# 实例化模型
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_classes, pad_idx)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# 训练模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
criterion = criterion.to(device)
num_epochs = 10
history = {"train_loss": [], "train_acc": []}
def categorical_accuracy(preds, y):
    max_preds = preds.argmax(dim=1, keepdim=True)
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / y.shape[0]
for epoch in range(num_epochs):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, targets = batch
        logits = model(inputs)
        loss = criterion(logits, targets)
        acc = categorical_accuracy(logits, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    history["train_loss"].append(epoch_loss / len(train_loader))
    history["train_acc"].append(epoch_acc / len(train_loader))
    print(f'Epoch {epoch+1}/{num_epochs}, Xiao Ai Train Loss: {epoch_loss/len(train_loader):.4f}, Xiao Ai Train Accuracy: {epoch_acc/len(train_loader):.4f}')

# 在测试数据上评估模型
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        inputs, targets = batch
        logits = model(inputs)
        _, predicted = torch.max(logits, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print(f'Xiao Ai Test Accuracy: {correct/total:.4f}')

  return torch._C._cuda_getDeviceCount() > 0


cpu
Epoch 1/10, Xiao Ai Train Loss: 1.3831, Xiao Ai Train Accuracy: 0.2844
Epoch 2/10, Xiao Ai Train Loss: 1.3406, Xiao Ai Train Accuracy: 0.3871
Epoch 3/10, Xiao Ai Train Loss: 1.3075, Xiao Ai Train Accuracy: 0.4143
Epoch 4/10, Xiao Ai Train Loss: 1.2794, Xiao Ai Train Accuracy: 0.4145
Epoch 5/10, Xiao Ai Train Loss: 1.2483, Xiao Ai Train Accuracy: 0.4135
Epoch 6/10, Xiao Ai Train Loss: 1.2083, Xiao Ai Train Accuracy: 0.4406
Epoch 7/10, Xiao Ai Train Loss: 1.1891, Xiao Ai Train Accuracy: 0.4688
Epoch 8/10, Xiao Ai Train Loss: 1.1408, Xiao Ai Train Accuracy: 0.4717
Epoch 9/10, Xiao Ai Train Loss: 1.1075, Xiao Ai Train Accuracy: 0.4820
Epoch 10/10, Xiao Ai Train Loss: 1.1115, Xiao Ai Train Accuracy: 0.4881
Xiao Ai Test Accuracy: 0.3200
