In [3]:
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
# from torchnet import meter

## 数据准备


### 数据读入


In [4]:
def read_data(path="tang.npz"):
    """Load the poem corpus and its vocabulary mappings from an ``.npz`` archive.

    Args:
        path: Location of the archive. It must contain the entries ``data``,
            ``ix2word`` and ``word2ix``. Defaults to ``"tang.npz"``.

    Returns:
        A tuple ``(ix2word, word2ix, data)``: the unwrapped ``ix2word`` object
        (index -> word), the unwrapped ``word2ix`` object (word -> index), and
        the ``data`` array converted to a ``torch.Tensor``.
    """
    # NOTE(security): allow_pickle=True is needed to unpickle the object entries,
    # but unpickling can execute arbitrary code -- only load trusted archives.
    # The context manager closes the archive's file handle, which the previous
    # version left open.
    with np.load(path, allow_pickle=True) as archive:
        data = archive['data']
        ix2word = archive['ix2word'].item()  # index -> word
        word2ix = archive['word2ix'].item()  # word -> index

    # Convert to torch.Tensor (from_numpy shares memory with the loaded array)
    return ix2word, word2ix, torch.from_numpy(data)


def peek_data(row, data, ix2word):
    """Print five consecutive rows of ``data``, starting at ``row``, as text.

    Args:
        row: Index of the first row to show.
        data: Indexable collection of rows; each row is an iterable of token
            indices that ``int()`` accepts.
        ix2word: Mapping from integer index to its string token.
    """
    for offset in range(5):
        tokens = data[row + offset]
        # Decode each index to its token and print the row as one string
        print("".join(ix2word[int(token)] for token in tokens))

In [5]:
# Load the vocabulary mappings and the token matrix once; later cells read these names
ix2word, word2ix, data = read_data()

In [6]:
# Show the tensor's shape, then decode and print rows 0-4 as text
print(data.shape)
peek_data(0, data, ix2word)

torch.Size([57580, 125])
</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><START>度门能不访，冒雪屡西东。已想人如玉，遥怜马似骢。乍迷金谷路，稍变上阳宫。还比相思意，纷纷正满空。<EOP>
</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><START>逍遥东城隅，双树寒葱蒨。广庭流华月，高阁凝余霰。杜门非养素，抱疾阻良䜩。孰谓无他人，思君岁云变。官曹亮先忝，陈躅慙俊彥。岂知晨与夜，相代不相见。缄书问所如，詶藻当芬绚。<EOP>
</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><START>川上风雨来，须臾满城阙。岧峣青莲界，萧条孤兴发。前山遽已净，阴霭夜来歇。乔木生夏凉，流云吐华月。严城自有限，一水非难越。相望曙河远，高斋坐超忽。<EOP>
</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>

### 数据清洗

从上面的数据片段可以看出, 每首诗前面都填充了大量 `</s>` 占位符 (用于把每行补齐到固定长度 125), 需要将其去除, 以免干扰模型的训练与正确率.


In [7]:
def prepare_data(data, pad_ix=None):
    """Flatten ``data`` to one dimension and drop every padding token.

    Args:
        data: Tensor of token indices, any shape.
        pad_ix: Index of the padding token to remove. When ``None`` (the
            default) the index of ``'</s>'`` is looked up in the notebook-level
            ``word2ix`` mapping, as before.

    Returns:
        A 1-D tensor holding the non-padding entries in their original order.
    """
    if pad_ix is None:
        pad_ix = word2ix['</s>']
    # reshape (unlike view) also accepts non-contiguous input such as a column slice
    flat = data.reshape(-1)
    return flat[flat != pad_ix]

In [8]:
# Decode the first 80 tokens left after cleaning to check that the `</s>` entries are gone
print(''.join(ix2word[int(token)] for token in prepare_data(data)[:80]))

<START>度门能不访，冒雪屡西东。已想人如玉，遥怜马似骢。乍迷金谷路，稍变上阳宫。还比相思意，纷纷正满空。<EOP><START>逍遥东城隅，双树寒葱蒨。广庭流华月，高阁凝余霰。杜门非养素


### Dataloader 构造

然后把清洗后的一维数据按固定长度切成样本, 并封装为 `Dataset`. 划分训练集和验证集的步骤尚未实现 (见下方 TODO), 目前使用全部数据进行训练.


In [9]:
class PoetryDataSet(Dataset):
    """Fixed-length windows over the cleaned, flattened corpus for next-token training.

    Sample ``i`` is the pair ``(text, label)``, where ``text`` covers tokens
    ``[i * seq_len, (i + 1) * seq_len)`` and ``label`` is the same window shifted
    one token to the right, so every position is labelled with the token that
    follows it (character-level language modelling).

    Args:
        data: Tensor of token indices; it is cleaned with ``prepare_data`` and
            cast to ``long``.
        seq_len: Number of tokens per sample. The default 48 fits 8 five-character
            lines or 6 seven-character lines, counting one punctuation mark each.
    """

    def __init__(self, data, seq_len=48):
        self.seq_len = seq_len
        self.data = prepare_data(data).long()

    def __len__(self):
        """Number of complete samples.

        The label window needs one token beyond the text window, so one token is
        held back. Without this, a corpus whose length is an exact multiple of
        ``seq_len`` gave a last sample whose label was one token short, which
        breaks batch collation.
        """
        return max(len(self.data) - 1, 0) // self.seq_len

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair for sample ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``. This also lets
                plain iteration over the dataset stop at the end.
        """
        if not 0 <= index < len(self):
            raise IndexError(f'sample index {index} out of range for {len(self)} samples')
        start = index * self.seq_len
        stop = start + self.seq_len
        text = self.data[start:stop]
        # Each token's label is the token right after it
        label = self.data[start + 1:stop + 1]
        return text, label

In [10]:
# TODO: split into a training set and a test set
# Until then the slice below selects every row and column, so all of the data is used for training
train_data = data[:, :]
dataset = PoetryDataSet(train_data)

## 模型构造


In [11]:
LSTM_LAYERS = 3  # number of stacked LSTM layers in the model

In [12]:
class PoetryModel(nn.Module):
    """Embedding -> multi-layer LSTM -> three fully connected layers over the vocabulary.

    Args:
        vocab_size: Number of distinct tokens; also the size of the output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers. When ``None`` (the default) the
            notebook-level ``LSTM_LAYERS`` constant is used, as before.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=None):
        super().__init__()
        if num_layers is None:
            num_layers = LSTM_LAYERS
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=num_layers, batch_first=True)
        # Three fully connected layers process the LSTM output
        self.fc1 = nn.Linear(self.hidden_dim, 2048)
        self.fc2 = nn.Linear(2048, 4096)
        self.fc3 = nn.Linear(4096, vocab_size)  # project to the vocabulary size

    def forward(self, input, hidden=None):
        """Compute next-token logits for a batch of index sequences.

        Args:
            input: Tensor of token indices with shape ``(batch_size, seq_len)``.
            hidden: Optional ``(h_0, c_0)`` LSTM state. Passing the state returned
                by a previous call lets generation continue from a given prefix;
                when ``None`` both states start at zero.

        Returns:
            ``(output, hidden)``: logits reshaped to
            ``(batch_size * seq_len, vocab_size)`` and the new ``(h_n, c_n)`` state.
        """
        batch_size, seq_len = input.size()
        embeds = self.embeddings(input)  # token indices -> embedding vectors
        if hidden is None:
            # Zero initial hidden/cell state. The layer count is read from the LSTM
            # itself so it always matches the constructed module, and new_zeros
            # follows the dtype and device of the embeddings.
            state_shape = (self.lstm.num_layers, batch_size, self.hidden_dim)
            hidden = (embeds.new_zeros(state_shape), embeds.new_zeros(state_shape))
        output, hidden = self.lstm(embeds, hidden)
        # tanh is the activation between the fully connected layers
        output = torch.tanh(self.fc1(output))
        output = torch.tanh(self.fc2(output))
        output = self.fc3(output)
        # Flatten batch and time so the logits line up with flattened labels
        output = output.reshape(batch_size * seq_len, -1)
        return output, hidden

## 训练


In [13]:
def train(model, dataset, epochs, batch_size, device, lr, scheduler_kwargs, criterion, tensorboard_path):
    """Train ``model`` on ``dataset`` with Adam and a StepLR schedule, printing progress.

    Args:
        model: Module called as ``model(text)``; it must return ``(output, state)``.
        dataset: Dataset yielding ``(text, label)`` pairs.
        epochs: Number of passes over the dataset.
        batch_size: Samples per batch; batches are drawn in shuffled order.
        device: Device the model and every batch are moved to.
        lr: Initial learning rate for Adam.
        scheduler_kwargs: Keyword arguments forwarded to ``optim.lr_scheduler.StepLR``.
        criterion: Loss callable applied to ``(output, flattened labels)``.
        tensorboard_path: Accepted for interface compatibility; not used here.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, **scheduler_kwargs)

    model.to(device)
    model.train()  # training mode

    for epoch in range(epochs):
        epoch_loss = 0
        for batch, (text, label) in enumerate(loader):
            text, label = text.to(device), label.to(device)

            # Forward pass; labels are flattened to line up with the flattened output
            output, _ = model(text)
            loss = criterion(output, label.reshape(-1))

            # Clear old gradients, back-propagate, then update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            if batch % 500 == 0:
                print(f'\tepoch: {epoch}, batch: {batch}, loss: {batch_loss}')

        scheduler.step()  # advance the learning-rate schedule once per epoch
        print(f'epoch: {epoch}, average loss: {epoch_loss / len(loader)}')

    print('Finished Training')

In [15]:
# Hyper-parameters and runtime settings

EMBEDDING_DIM = 100  # size of each token embedding
HIDDEN_DIM = 1024    # size of the LSTM hidden state
EPOCHS = 30          # number of training epochs
BATCH_SIZE = 32      # samples per batch
LR = 1e-3            # initial learning rate

# Use the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [17]:
# Build a fresh model whose vocabulary size is the number of entries in word2ix
model = PoetryModel(len(word2ix), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)

train(
    model=model,
    dataset=dataset,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    device=device,
    lr=LR,
    scheduler_kwargs={'step_size': 10, 'gamma': 0.1},  # multiply the learning rate by 0.1 every 10 epochs so a too-large late-stage rate does not make the loss oscillate
    criterion=nn.CrossEntropyLoss(),  # cross-entropy loss as the criterion
    tensorboard_path=''
)

# Save only the learned weights (state dict) so a later cell can reload them
torch.save(model.state_dict(), 'model.pth')

epoch: 0, batch: 0, loss: 9.022175788879395
epoch: 0, batch: 100, loss: 6.049218654632568
epoch: 0, batch: 200, loss: 6.101924419403076
epoch: 0, batch: 300, loss: 5.931379795074463
epoch: 0, batch: 400, loss: 5.891652584075928
epoch: 0, batch: 500, loss: 5.73443078994751
epoch: 0, batch: 600, loss: 5.71327543258667
epoch: 0, batch: 700, loss: 5.548266887664795
epoch: 0, batch: 800, loss: 5.467962265014648
epoch: 0, batch: 900, loss: 5.334251403808594
epoch: 0, batch: 1000, loss: 5.3406453132629395
epoch: 0, batch: 1100, loss: 5.236422061920166
epoch: 0, batch: 1200, loss: 5.4016242027282715
epoch: 0, batch: 1300, loss: 5.1095733642578125
epoch: 0, batch: 1400, loss: 5.317927837371826
epoch: 0, batch: 1500, loss: 5.2256879806518555
epoch: 0, batch: 1600, loss: 5.148536205291748
epoch: 0, batch: 1700, loss: 5.162995338439941
epoch: 0, batch: 1800, loss: 4.954986095428467
epoch: 0, batch: 1900, loss: 5.076114654541016
epoch: 0, batch: 2000, loss: 5.080732822418213
epoch: 0, average loss:

## 测试


In [46]:
def generate(model, start_words, ix2word, word2ix, max_gen_len, device):
    """Generate a poem that begins with ``start_words``, one token per step.

    The model is first fed ``'<START>'``. While the step index is inside
    ``start_words`` the given character is used and the model's prediction is
    ignored; afterwards the highest-scoring token is taken at every step.
    Generation stops at ``'<EOP>'`` (not included) or after ``max_gen_len`` steps.

    Args:
        model: Module called as ``model(input, hidden)`` returning ``(output, hidden)``.
        start_words: Sequence of characters forced at the start; each must be a
            key of ``word2ix``.
        ix2word: Mapping from index to token.
        word2ix: Mapping from token to index.
        max_gen_len: Maximum number of steps, forced characters included.
        device: Device the model and the input tensors are moved to.

    Returns:
        List of generated tokens, forced prefix included.
    """
    poem = []
    # The first input is the <START> token, shaped (1, 1)
    token = torch.tensor([word2ix['<START>']], dtype=torch.long).view(1, 1).to(device)
    state = None
    model = model.to(device)
    model.eval()  # evaluation mode
    with torch.no_grad():
        for step in range(max_gen_len):
            logits, state = model(token, state)
            if step < len(start_words):
                # Still inside the forced prefix: feed the given character
                char = start_words[step]
                next_ix = word2ix[char]
            else:
                # Greedy choice: the single highest-scoring token
                _, best = logits[0].topk(1)
                next_ix = best[0].item()
                char = ix2word[next_ix]
                if char == '<EOP>':
                    break
            poem.append(char)
            token = token.new_tensor([next_ix]).view(1, 1)
    return poem

def gen_acrostic(model, start_words_list, ix2word, word2ix, max_gen_len, device):
    """Generate an acrostic: one line per entry of ``start_words_list``.

    Each line starts with the characters of its entry, then continues with the
    highest-scoring token at every step until ``'，'`` or ``'。'`` is produced or
    ``max_gen_len`` steps were taken for that line. The LSTM state carries over
    from line to line. ``'<EOP>'`` ends the whole poem and is not included.

    Args:
        model: Module called as ``model(input, hidden)`` returning ``(output, hidden)``.
        start_words_list: Sequence of line heads; every character must be a key
            of ``word2ix``.
        ix2word: Mapping from index to token.
        word2ix: Mapping from token to index.
        max_gen_len: Maximum number of steps per line.
        device: Device the model and the input tensors are moved to.

    Returns:
        List of generated tokens for all lines, punctuation included.
    """
    poem = []
    char = '<START>'
    # The first input is the <START> token, shaped (1, 1)
    token = torch.tensor([word2ix[char]], dtype=torch.long).view(1, 1).to(device)
    state = None
    model = model.to(device)
    model.eval()  # evaluation mode
    with torch.no_grad():
        for head in start_words_list:
            for step in range(max_gen_len):
                logits, state = model(token, state)
                if step < len(head):
                    # Still inside the forced line head: feed the given character
                    char = head[step]
                    next_ix = word2ix[char]
                else:
                    # Greedy choice: the single highest-scoring token
                    _, best = logits[0].topk(1)
                    next_ix = best[0].item()
                    char = ix2word[next_ix]
                if char == '<EOP>':
                    # End of poem: drop the marker and stop this line
                    break
                poem.append(char)
                token = token.new_tensor([next_ix]).view(1, 1)
                if char in ('，', '。'):
                    # Punctuation closes the current line
                    break
            if char == '<EOP>':
                break
    return poem

In [23]:
# Rebuild the architecture, then restore the weights saved by the training cell
model = PoetryModel(len(word2ix), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)
# map_location='cpu' lets a checkpoint saved from a CUDA model load on a machine
# without a GPU; generate()/gen_acrostic() move the model to `device` afterwards.
# NOTE(security): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('model.pth', map_location='cpu'))

<All keys matched successfully>

In [52]:
# Generate a poem of at most 50 tokens that starts with '一' and print it as one string
print(''.join(generate(model, '一', ix2word, word2ix, 50, device)))

一片鲎鱼壳，其中生翠波。买须能紫贝，用合对红螺。水岸吞空出，沙浑觅树行。何当得一息，借与此仙图。


In [48]:
# Generate an acrostic whose lines start with '运', '宵', '可', '爱' (at most 50 tokens per line)
result = ''.join(gen_acrostic(model, ['运', '宵', '可', '爱'], ix2word, word2ix, 50, device))
# Start a new output line after every comma and full stop
result = result.replace('，', '，\n').replace('。', '。\n')
print(result)

运背征蛮定不然，
宵行永毕立搀环。
可怜万物元不变，
爱君堂上明玉声。

