> 对唐诗三百首进行文字处理，并作为数据集，用RNN尝试文
字生成。

In [20]:
import pandas as pd
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
import jieba
from collections import Counter


In [3]:
with open('唐诗三百首.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

poems = [line.strip() for line in text]

tokenized_poems = []
for poem in poems:
    tokens = list(jieba.cut(poem))
    tokenized_poems.append(tokens)



Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\annanyi\AppData\Local\Temp\jieba.cache
Loading model cost 0.356 seconds.
Prefix dict has been built successfully.


In [4]:
word_counters = Counter()
for poem in tokenized_poems:
    word_counters.update(poem)
word_counters

vocab = ['<PAD>', '<START>', '<END>', '<UNK>'] + [i for i in word_counters.keys() if word_counters[i] >= 1]  # 出现次数大于 3 的词

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for idx, word in enumerate(vocab)}

print(idx2word)

def tokenize(text):
    """将句子转换为词语索引序列"""
    return [word2idx.get(word, word2idx['<UNK>']) for word in text]

def detokenize(indices):
    """将索引序列还原为字符串"""
    return ''.join([idx2word.get(idx, '<UNK>') for idx in indices])


{0: '<PAD>', 1: '<START>', 2: '<END>', 3: '<UNK>', 4: '琴书', 5: '中有', 6: '得', 7: '，', 8: '衣食', 9: '外', 10: '何求', 11: '。', 12: '初疑', 13: '白莲花', 14: '浮出', 15: '龙', 16: '王宫', 17: '难', 18: '为', 19: '此时', 20: '别', 21: '欲别', 22: '愿人留', 23: '遥知', 24: '太平', 25: '代', 26: '国宝', 27: '在', 28: '名', 29: '都', 30: '终南山', 31: '北面', 32: '直下', 33: '是', 34: '长安', 35: '昔为', 36: '同恨', 37: '客', 38: '今为', 39: '独笑', 40: '人', 41: '此地', 42: '夫子', 43: '今来思', 44: '旧游', 45: '少结', 46: '相思', 47: '恨', 48: '佳期', 49: '芳草', 50: '前', 51: '虽无南', 52: '去', 53: '雁', 54: '看取', 55: '北来', 56: '鱼', 57: '秋亭', 58: '病', 59: '客眠', 60: '庭树', 61: '满枝', 62: '蝉', 63: '青山', 64: '春暮见', 65: '流水', 66: '夜深', 67: '闻', 68: '主人', 69: '常不在', 70: '春物', 71: '谁', 72: '开', 73: '春尽', 74: '有', 75: '归', 76: '日', 77: '老来', 78: '无去', 79: '时', 80: '前山', 81: '依旧', 82: '碧', 83: '闲草', 84: '经秋绿', 85: '南中', 86: '多古事', 87: '咏', 88: '遍始', 89: '应', 90: '还', 91: '琴曲', 92: '唯留古', 93: '书', 94: '多半', 95: '经', 96: '新居', 97: '未曾', 98: '到', 99: '邻里', 100: '谁家', 101: '莺', 

In [5]:
# 添加 <START>, <END> 标记，并转为 tensor
sequences = []
max_len = 20

for poem in tokenized_poems:
    seq = [word2idx['<START>']] + tokenize(poem) + [word2idx['<END>']]
    if len(seq) <= max_len:
        sequences.append(torch.tensor(seq))

padded_sequences = pad_sequence(sequences, batch_first=True)

unk_count = sum(1 for seq in padded_sequences for idx in seq if idx == word2idx['<UNK>'])
print(f"UNK 出现次数: {unk_count}")

pad_count = sum(1 for seq in padded_sequences for idx in seq if idx == word2idx['<PAD>'])
print(f"PAD 出现次数: {pad_count}")

print("填充后的张量形状:", padded_sequences.shape)
print("填充后的张量:", padded_sequences)

UNK 出现次数: 0
PAD 出现次数: 134370
填充后的张量形状: torch.Size([31072, 14])
填充后的张量: tensor([[    1,     4,     5,  ...,     0,     0,     0],
        [    1,    12,    13,  ...,     0,     0,     0],
        [    1,    17,    18,  ...,     0,     0,     0],
        ...,
        [    1,   710,   355,  ...,     0,     0,     0],
        [    1,   282,  2042,  ...,     0,     0,     0],
        [    1, 14848,   608,  ...,     0,     0,     0]])


In [6]:
from torch.utils.data import TensorDataset, DataLoader, random_split

# 创建数据集
dataset = TensorDataset(padded_sequences)

# 定义比例：80% 训练集，10% 验证集，10% 测试集
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# 分割数据集
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# 创建 DataLoader
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

> 定义模型

In [7]:
import torch.nn as nn

class PoetryRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super(PoetryRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx['<PAD>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded, hidden)
        logits = self.fc(output)
        return logits, hidden
    
    

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:

model = PoetryRNN(len(word2idx)).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])  # 忽略 <PAD> 的损失
optimizer = torch.optim.Adam(model.parameters(), lr=0.002, weight_decay=1e-5)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def train(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        inputs = batch[0].to(device)
        targets = inputs[:, 1:].contiguous().view(-1)
        inputs = inputs[:, :-1]

        logits, _ = model(inputs)
        logits = logits.view(-1, len(word2idx))

        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch[0].to(device)
            targets = inputs[:, 1:].contiguous().view(-1)
            inputs = inputs[:, :-1]

            logits, _ = model(inputs)
            logits = logits.view(-1, len(word2idx))

            loss = criterion(logits, targets)
            total_loss += loss.item()
    return total_loss / len(data_loader)




In [11]:
num_epochs = 100

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    # val_loss = evaluate(model, val_loader, criterion, device)
    # print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}")


# 保存整个模型（包括结构和参数）
torch.save(model, 'poetry_rnn_model.pth')

# 或者只保存模型的 state_dict（推荐做法，更灵活）
torch.save(model.state_dict(), 'poetry_rnn_state_dict.pth')


Epoch 1/100, Train Loss: 7.5160
Epoch 2/100, Train Loss: 6.3734
Epoch 3/100, Train Loss: 6.1571
Epoch 4/100, Train Loss: 6.0287
Epoch 5/100, Train Loss: 5.9289
Epoch 6/100, Train Loss: 5.8549
Epoch 7/100, Train Loss: 5.7949
Epoch 8/100, Train Loss: 5.7450
Epoch 9/100, Train Loss: 5.6995
Epoch 10/100, Train Loss: 5.6576
Epoch 11/100, Train Loss: 5.6173
Epoch 12/100, Train Loss: 5.5775
Epoch 13/100, Train Loss: 5.5348
Epoch 14/100, Train Loss: 5.4911
Epoch 15/100, Train Loss: 5.4434
Epoch 16/100, Train Loss: 5.4014
Epoch 17/100, Train Loss: 5.3609
Epoch 18/100, Train Loss: 5.3219
Epoch 19/100, Train Loss: 5.2874
Epoch 20/100, Train Loss: 5.2507
Epoch 21/100, Train Loss: 5.2109
Epoch 22/100, Train Loss: 5.1730
Epoch 23/100, Train Loss: 5.1294
Epoch 24/100, Train Loss: 5.0815
Epoch 25/100, Train Loss: 5.0343
Epoch 26/100, Train Loss: 4.9843
Epoch 27/100, Train Loss: 4.9346
Epoch 28/100, Train Loss: 4.8842
Epoch 29/100, Train Loss: 4.8318
Epoch 30/100, Train Loss: 4.7772
Epoch 31/100, Train

In [46]:
def generate_poem(model, start_words, idx2word, word2idx, max_len=20, device='gpu'):
    with torch.no_grad():
        # 分词并展平列表
        start_word_list = list(jieba.cut(start_words))  # 例如 ['白日', '依山尽']
        
        # 检查词汇表覆盖
        for word in start_word_list:
            if word not in word2idx:
                raise ValueError(f"词 '{word}' 不在词汇表中！")
        
        # 转换为张量
        start_word_list_tensor = torch.tensor([word2idx[word] for word in start_word_list], dtype=torch.long).to(device)
        
        # 初始化输入（添加 batch 维度）
        input_tensor = start_word_list_tensor.unsqueeze(0).to(device)
        poem = [start_words]
        
        # 生成后续词语
        for _ in range(max_len - 1):
            output, _ = model(input_tensor)
            predicted_idx = output.argmax(dim=-1)[:, -1].item()
            
            if idx2word[predicted_idx] == '<END>':
                break
            
            poem.append(idx2word[predicted_idx])
            input_tensor = torch.cat([input_tensor, torch.tensor([[predicted_idx]], device=device)], dim=1)
        
        return ''.join(poem)

# 先重新构建模型结构
model = PoetryRNN(vocab_size=len(word2idx), embedding_dim=128, hidden_dim=256, num_layers=2).to(device)
model.load_state_dict(torch.load('poetry_rnn_state_dict.pth'))
model.eval()


# 生成一首诗，以 <START> 开头
generated_poem = generate_poem(model, '船', idx2word, word2idx, max_len=20, device=device)
print("生成的诗句：", generated_poem)

生成的诗句： 船何处在三处所无古今。
