In [3]:
%matplotlib inline
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

# Minibatch size and number of time steps per subsequence; both are passed to
# load_data_poem further down when the training iterator is built.
batch_size, num_steps = 128, 50


In [4]:
import re
import json
#@save
# def read_wiki():  #@save
#     """将wiki2数据集加载到文本行的列表中"""
#     with open('./wikitext-103/wiki.train.tokens', 'r',encoding='utf-8') as f:
#         lines = f.readlines()
#     return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

# # 创建一个空列表来保存所有古诗数据
# all_data = []
# def read_poem():  #@save
# # 读取所有JSON文件，并将其内容添加到all_data列表中
#     all_data = []
#     for i in range(1, 901):
#         file_num = str(i).zfill(3) # 将数字转换为字符串并用0填充至3位数
#         with open(f'./chinese-poetry-master/quan_tang_shi/json/{file_num}.json', 'r', encoding='utf-8') as f:
#             data = json.load(f)
#             all_data += data
#             lines = f.readlines()
#         return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
# #print(all_data[0])
import re
import json

def read_poem(data_dir='./Chinese Poem NLP/chinese-poetry-master/quan_tang_shi/json',
              num_files=900):
    """Load all poem lines from a directory of numbered JSON files.

    The files are expected to be named ``001.json``, ``002.json``, ... (the
    index is zero-padded to three digits). Each file must contain a JSON list
    of poem records, and each record must have a ``'paragraphs'`` list.

    Args:
        data_dir: Directory that holds the numbered JSON files. Defaults to
            the location used throughout this notebook.
        num_files: Number of files to read, starting at ``001.json``.

    Returns:
        A flat list containing every entry of every poem's ``'paragraphs'``
        list, in file order and then in poem order.

    Raises:
        FileNotFoundError: If one of the numbered files does not exist.
        KeyError: If a poem record has no ``'paragraphs'`` entry.
    """
    lines = []
    for i in range(1, num_files + 1):
        file_num = str(i).zfill(3)
        with open(f'{data_dir}/{file_num}.json', 'r', encoding='utf-8') as f:
            poems = json.load(f)
        # Extract the lines file by file so the raw records of all files
        # never have to be held in memory at the same time.
        for poem in poems:
            lines.extend(poem['paragraphs'])
    return lines

# Load every poem line once for the exploratory cells below and print one
# sample entry as a sanity check.
lines = read_poem()
print(lines[100])


之罘思漢帝，碣石想秦皇。霓裳非本意，端拱且圖王。


In [5]:
def tokenize(lines, token='word'):  #@save
    """Split each text line into word tokens or character tokens.

    With ``token='word'`` every line is split on whitespace; with
    ``token='char'`` every line becomes a list of its characters. For any
    other value an error message is printed and ``None`` is returned.
    """
    if token == 'word':
        tokenized = []
        for text in lines:
            tokenized.append(text.split())
        return tokenized
    if token == 'char':
        tokenized = []
        for text in lines:
            tokenized.append(list(text))
        return tokenized
    # Unknown token type: report it and fall through to an explicit None
    print('错误：未知词元类型：' + token)
    return None

# Tokenize with the default 'word' mode and print the first 11 results.
# The lines printed below contain no whitespace, so whitespace splitting
# leaves each of them as a single token; load_corpus_poem tokenizes with
# 'char' instead.
tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])

['秦川雄帝宅，函谷壯皇居。綺殿千尋起，離宮百雉餘。']
['連薨遙接漢，飛觀迥淩虛。雲日隱層闕，風煙出綺疏。']
['岩廊罷機務，崇文聊駐輦。玉匣啟龍圖，金繩披鳳篆。']
['韋編斷仍續，縹帙舒還卷。對此乃淹留，欹案觀墳典。']
['移步出詞林，停輿欣武宴。雕弓寫明月，駿馬疑流電。']
['驚雁落虛弦，啼猿悲急箭。閱賞誠多美，於茲乃忘倦。']
['鳴笳臨樂館，眺聽歡芳節。急管韻朱弦，清歌凝白雪。']
['彩鳳肅來儀，玄鶴紛成列。去茲鄭衛聲，雅音方可悅。']
['芳辰追逸趣，禁苑信多奇。橋形通漢上，峰勢接雲危。']
['煙霞交隱映，花鳥自參差。何如肆轍跡，萬里賞瑤池。']
['飛蓋去芳園，蘭橈遊翠渚。萍間日彩亂，荷處香風舉。']


In [6]:
def load_corpus_poem(max_tokens=-1):
    """Build the character-level corpus and vocabulary of the poem dataset.

    The poem lines returned by `read_poem` are tokenized per character, a
    `d2l.Vocab` is built over those tokens, and all lines are flattened into
    one list of token indices.

    Args:
        max_tokens: If positive, keep only the first `max_tokens` indices of
            the corpus. The vocabulary is built before this truncation.

    Returns:
        A `(corpus, vocab)` pair: the list of token indices and the vocabulary.
    """
    char_tokens = d2l.tokenize(read_poem(), 'char')
    vocab = d2l.Vocab(char_tokens)
    # Individual lines are not treated as separate samples: concatenate all
    # of them into a single index sequence
    corpus = []
    for line in char_tokens:
        for ch in line:
            corpus.append(vocab[ch])
    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab


In [7]:
class SeqDataLoader:
    """Iterable that yields minibatches of sequence data from the poem corpus."""

    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        """Load the corpus and pick the minibatch sampling function.

        `use_random_iter` selects `d2l.seq_data_iter_random`; otherwise
        `d2l.seq_data_iter_sequential` is used. `max_tokens` is forwarded to
        `load_corpus_poem`.
        """
        self.data_iter_fn = (d2l.seq_data_iter_random if use_random_iter
                             else d2l.seq_data_iter_sequential)
        self.corpus, self.vocab = load_corpus_poem(max_tokens)
        self.batch_size = batch_size
        self.num_steps = num_steps

    def __iter__(self):
        # A new pass over the corpus is started on every iteration request
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)


In [8]:
def load_data_poem(batch_size, num_steps,
                   use_random_iter=False, max_tokens=10000):
    """Return the minibatch iterator and the vocabulary of the poem dataset.

    All arguments are forwarded unchanged to `SeqDataLoader`; the vocabulary
    returned is the one that loader built.
    """
    loader = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    return loader, loader.vocab

In [9]:
# NOTE(review): this module-level list is not read by any code visible in this
# notebook (read_poem works with local variables only) — it looks like a
# leftover; confirm before removing.
all_data = []

In [10]:
# Build the minibatch iterator and the vocabulary with load_data_poem's
# defaults (use_random_iter=False, max_tokens=10000).
train_iter, vocab = load_data_poem(batch_size, num_steps)

In [11]:
# Demo: one-hot encode the token indices 0 and 2 with as many classes as
# there are vocabulary entries.
F.one_hot(torch.tensor([0, 2]), len(vocab))

tensor([[1, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0]])

In [12]:
# Demo: X has shape (2, 5); one-hot encoding its transpose with 28 classes
# yields shape (5, 2, 28), as the output below shows.
X = torch.arange(10).reshape((2, 5))
F.one_hot(X.T, 28).shape

torch.Size([5, 2, 28])

In [13]:
# Initialize the model parameters
def get_lstm_params(vocab_size, num_hiddens, device):
    """Create the trainable parameters of a single-layer LSTM.

    Weights are drawn from a standard normal distribution scaled by 0.01 and
    biases start at zero. Input and output sizes both equal `vocab_size`.

    Returns:
        A list of 14 tensors with gradients enabled, ordered as
        [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
         W_xc, W_hc, b_c, W_hq, b_q].
    """
    num_inputs = num_outputs = vocab_size

    def small_normal(*shape):
        return torch.randn(size=shape, device=device)*0.01

    params = []
    # One (W_x, W_h, b) triple per gate, created in this order:
    # input gate, forget gate, output gate, candidate memory cell
    for _ in range(4):
        params.append(small_normal(num_inputs, num_hiddens))
        params.append(small_normal(num_hiddens, num_hiddens))
        params.append(torch.zeros(num_hiddens, device=device))
    # Output layer parameters
    params.append(small_normal(num_hiddens, num_outputs))
    params.append(torch.zeros(num_outputs, device=device))
    # Attach gradients
    for tensor in params:
        tensor.requires_grad_(True)
    return params

In [14]:
# Define the model
def init_lstm_state(batch_size, num_hiddens, device):
    """Return the initial LSTM state as a `(hidden, memory_cell)` tuple.

    Both tensors are zero-filled with shape `(batch_size, num_hiddens)` and
    live on `device`.
    """
    shape = (batch_size, num_hiddens)
    hidden = torch.zeros(shape, device=device)
    memory_cell = torch.zeros(shape, device=device)
    return (hidden, memory_cell)

In [15]:
def lstm(inputs, state, params):
    """Run the scratch LSTM over a sequence of inputs.

    Args:
        inputs: Iterable of per-time-step inputs; each item is multiplied
            with the input-to-hidden weights.
        state: `(H, C)` tuple holding the hidden state and the memory cell.
        params: The 14 parameters in the order produced by `get_lstm_params`.

    Returns:
        `(outputs, (H, C))`: the per-step outputs concatenated along
        dimension 0, and the state after the last step.
    """
    (W_xi, W_hi, b_i,
     W_xf, W_hf, b_f,
     W_xo, W_ho, b_o,
     W_xc, W_hc, b_c,
     W_hq, b_q) = params
    hidden, cell = state
    step_outputs = []
    for x_t in inputs:
        in_gate = torch.sigmoid((x_t @ W_xi) + (hidden @ W_hi) + b_i)
        forget_gate = torch.sigmoid((x_t @ W_xf) + (hidden @ W_hf) + b_f)
        out_gate = torch.sigmoid((x_t @ W_xo) + (hidden @ W_ho) + b_o)
        candidate = torch.tanh((x_t @ W_xc) + (hidden @ W_hc) + b_c)
        # Keep part of the old memory and blend in the new candidate
        cell = forget_gate * cell + in_gate * candidate
        hidden = out_gate * torch.tanh(cell)
        step_outputs.append((hidden @ W_hq) + b_q)
    return torch.cat(step_outputs, dim=0), (hidden, cell)

In [16]:
def grad_clipping(net, theta):  #@save
    """Clip gradients in place so that their global L2 norm is at most `theta`.

    Args:
        net: Either an `nn.Module` (its parameters with `requires_grad` are
            used) or any object exposing a `params` list of tensors.
        theta: Maximum allowed L2 norm over all gradients combined.

    Parameters whose `.grad` is `None` (for example parameters that took no
    part in the last backward pass) are skipped. If no gradient exists at
    all, the function returns without doing anything.
    """
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    grads = [p.grad for p in params if p.grad is not None]
    if not grads:
        # Nothing to clip; also avoids calling torch.sqrt on a plain int 0
        return
    norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
    if norm > theta:
        scale = theta / norm
        for g in grads:
            # In-place scaling; unlike `grad[:] *= ...` it also works for
            # 0-dim gradients
            g.mul_(scale)

In [17]:
def train_ch8(net, train_iter, vocab, lr, num_epochs, device,
              use_random_iter=False):
    """Train a sequence model and print sample generations along the way.

    Every 10th epoch a 50-character continuation of a fixed prefix is printed
    and the epoch's perplexity is added to the plot. After the last epoch the
    final perplexity, the throughput and two more continuations are printed.

    Args:
        net: An `nn.Module` or a scratch model exposing `params`.
        train_iter: Iterable of `(X, Y)` minibatches.
        vocab: Vocabulary used for generating the sample text.
        lr: Learning rate.
        num_epochs: Number of passes over `train_iter`.
        device: Device on which training and prediction run.
        use_random_iter: Forwarded to `train_epoch_ch8`.
    """
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='perplexity',
                            legend=['train'], xlim=[10, num_epochs])

    # Choose the update rule: a torch optimizer for modules, plain d2l SGD
    # for the scratch model
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        def updater(batch_size):
            return d2l.sgd(net.params, lr, batch_size)

    def predict(prefix):
        return predict_ch8(prefix, 50, net, vocab, device)

    # Train and predict
    for epoch_num in range(1, num_epochs + 1):
        ppl, speed = train_epoch_ch8(
            net, train_iter, loss, updater, device, use_random_iter)
        if epoch_num % 10 == 0:
            print(predict('山中相送罢'))
            animator.add(epoch_num, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    for prefix in ('千山鸟飞绝', '君问归期未有期'):
        print(predict(prefix))


In [18]:
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    """Train `net` for one pass over `train_iter`.

    Args:
        net: Model called as `net(X, state)`; it must provide `begin_state`.
        train_iter: Iterable yielding `(X, Y)` minibatches.
        loss: Loss callable applied to `(y_hat, y)`.
        updater: Either a `torch.optim.Optimizer` or a callable accepting a
            `batch_size` keyword argument.
        device: Device the minibatches are moved to.
        use_random_iter: If true, the recurrent state is re-initialized for
            every minibatch instead of being carried over.

    Returns:
        A pair `(perplexity, speed)`: the exponential of the accumulated
        loss divided by the accumulated token count, and the token count
        divided by the value returned by `timer.stop()` (presumably elapsed
        seconds — confirm against d2l.Timer).
    """
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize `state` when either it is the first iteration or
            # using random sampling
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            # The state is carried over from the previous minibatch; detach
            # it so backpropagation stops at the minibatch boundary.
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # `state` is a tensor for `nn.GRU`
                state.detach_()
            else:
                # `state` is a tuple of tensors for `nn.LSTM` and
                # for our custom scratch implementation
                for s in state:
                    s.detach_()
        # Transpose before flattening so the labels are ordered step by step,
        # matching how the scratch `lstm` concatenates its per-step outputs.
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        else:
            # NOTE(review): gradients are not zeroed explicitly on this path;
            # presumably the updater resets them itself — confirm.
            l.backward()
            grad_clipping(net, 1)
            # Since the `mean` function has been invoked
            updater(batch_size=1)
        # `l` is a mean; scale it by the token count to accumulate a sum
        metric.add(l * d2l.size(y), d2l.size(y))
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()

In [19]:
class RNNModelScratch:
    """Recurrent model assembled from user-supplied building blocks.

    The parameters, the initial state and the forward computation are all
    provided as callables, so the same wrapper serves different cell types.
    """

    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        """Store the sizes and callables and create the parameters.

        `get_params(vocab_size, num_hiddens, device)` is called once here and
        its result is kept in `self.params`.
        """
        self.vocab_size = vocab_size
        self.num_hiddens = num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state = init_state
        self.forward_fn = forward_fn

    def __call__(self, X, state):
        """One-hot encode the transposed index batch and run the forward function."""
        encoded = F.one_hot(X.T, self.vocab_size)
        return self.forward_fn(encoded.type(torch.float32), state, self.params)

    def begin_state(self, batch_size, device):
        """Return a fresh initial state for `batch_size` sequences."""
        return self.init_state(batch_size, self.num_hiddens, device)


In [20]:
def predict_ch8(prefix, num_preds, net, vocab, device):  #@save
    """Generate `num_preds` new characters that continue `prefix`.

    Args:
        prefix: Non-empty sequence of characters used to warm up the state.
        num_preds: Number of characters to generate after the prefix.
        net: Model providing `begin_state(batch_size, device)` and callable
            as `net(X, state)`, returning `(output, state)`.
        vocab: Mapping from token to index that also exposes `idx_to_token`.
        device: Device on which the input tensors are created.

    Returns:
        The prefix, as mapped through `vocab`, followed by the generated
        characters, joined into one string.

    Raises:
        ValueError: If `prefix` is empty. The first input fed to the model is
            the first prefix character, so there is nothing to start from.
    """
    if len(prefix) == 0:
        raise ValueError('prefix must contain at least one character')
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]

    def get_input():
        # The most recent token index, shaped as (batch=1, steps=1)
        return torch.tensor([outputs[-1]], device=device).reshape((1, 1))

    # Inference only: avoid building an autograd graph that would grow with
    # every generated step
    with torch.no_grad():
        for y in prefix[1:]:  # Warm-up period
            _, state = net(get_input(), state)
            outputs.append(vocab[y])
        for _ in range(num_preds):  # Predict `num_preds` steps
            y, state = net(get_input(), state)
            outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [21]:
# # Training and prediction
# vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
# num_epochs, lr = 5000, 2
# model2 = RNNModelScratch(len(vocab), num_hiddens, device, get_lstm_params,
#                             init_lstm_state, lstm)
# train_ch8(model2, train_iter, vocab, lr, num_epochs, device)

In [22]:
# train_ch8(model, train_iter, vocab, lr, num_epochs, device)

In [24]:
#torch.save(model2, './Chinese Poem NLP/Checkpoints/Poem_TangShi_2LSTM.pth')
# Load a previously saved model from the checkpoint instead of retraining
# (the training cells above are commented out).
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code — only load checkpoints from a trusted source.
# NOTE(review): presumably the checkpoint holds a whole RNNModelScratch
# object (see the commented-out save above); if so, that class must already
# be defined when this cell runs — confirm.
model2 = torch.load( './Chinese Poem NLP/Checkpoints/Poem_TangShi_2LSTM.pth')


In [25]:
# Continue a five-character prefix with 30 generated characters.
# NOTE(review): '尽' comes back as <unk> in the recorded output, presumably
# because the corpus is in Traditional Chinese and does not contain this
# Simplified character — confirm.
predict_ch8('白日依山尽', 30, model2, vocab, d2l.try_gpu())

'白日依山<unk>，飛觀一長空。長野千舊影，方色方長空。雲茲一天色，風為一時風'

In [26]:
# Two-clause prefix followed by 50 generated characters. No <unk> appears in
# the recorded output, so every prefix character was found in the vocabulary.
predict_ch8('床前明月光，疑似地上霜', 50, model2, vocab, d2l.try_gpu())

'床前明月光，疑似地上霜。連茲一舊影，終色宴長空。雲日一天跡，風日一時風。雲雲含舊影，風風散時風。雲雲含雲色，風風散時風。雲'

In [27]:
# Same poem with spaces as separators and 100 generated characters. In the
# recorded output the spaces, like several of the characters, are rendered
# as <unk>.
predict_ch8('白日依山尽 黄河入海流 欲穷千里目', 100, model2, vocab, d2l.try_gpu())

'白日依山<unk><unk><unk>河入海流<unk>欲<unk>千里目，—室由來獨擅名。霞衣霞錦千般狀，雲峰雲岫百百慚。霞衣霞岫何賢光，還目天池倍百慚。魯霧運岫入雲霄，還色和池倍煙央。魯霧運岫入雲霄，還柞連池倍煙央。魯霧運岫入雲霄，還柞連臺接煙金。魯霧運岫入雲霄，還柞連'

In [28]:
# Full quatrain as prefix with full-width punctuation, 50 generated
# characters. Several prefix characters are rendered as <unk> in the recorded
# output (presumably Simplified forms absent from the corpus — confirm).
predict_ch8('白日依山尽，黄河入海流。欲穷千里目，更上一层楼', 50, model2, vocab, d2l.try_gpu())

'白日依山<unk>，<unk>河入海流。欲<unk>千里目，更上一<unk><unk>。日茲一天色，風為一時風。雲雲含舊跡，風風九時春。雲雲含雲色，風風散時風。雲雲含雲色，風風散時風。雲'

In [32]:
# NOTE(review): this prefix uses an ASCII comma ',' rather than the
# full-width '，' used elsewhere; the recorded output shows <unk> at that
# position.
predict_ch8('万里悲秋常作客,百年多病独登台', 50, model2, vocab, d2l.try_gpu())

'<unk>里悲秋常作客<unk>百年多病<unk>登台—崔，——長愔初歌秦樓魯館沐雲光。——安樂公主陽本禮天地開，——薛稷帝歌難續仰昭回。——宋愔問師振旅'

In [33]:
# Prefix ending with a full stop, 40 generated characters; the recorded
# output stops in the middle of a clause.
predict_ch8('空山不见人，但闻人语响。', 40, model2, vocab, d2l.try_gpu())

'空山不<unk>人，但<unk>人<unk><unk>。長野千里影，風觀方清春。日日一天色，風風一時風。雲軒含龍影，風風散時風。雲雲含雲'

In [36]:
# Same prefix as the previous cell with 36 instead of 40 generated
# characters, so the recorded output ends on a full stop.
predict_ch8('空山不见人，但闻人语响。', 36, model2, vocab, d2l.try_gpu())

'空山不<unk>人，但<unk>人<unk><unk>。長野千里影，風觀方清春。日日一天色，風風一時風。雲軒含龍影，風風散時風。'

In [37]:
# 36 generated characters; most characters of this prefix are rendered as
# <unk> in the recorded output.
predict_ch8('千山鸟飞绝，万径人踪灭。', 36, model2, vocab, d2l.try_gpu())

'千山<unk><unk><unk>，<unk><unk>人<unk><unk>。一察千晉際，山觀乃參虛。太日猶重外，終池乃清風。寒日含龍影，風風九時春。'

In [39]:
# 36 generated characters; only '帘' is rendered as <unk> in the recorded
# output.
predict_ch8('美人卷珠帘，深坐蹙蛾眉。', 36, model2, vocab, d2l.try_gpu())

'美人卷珠<unk>，深坐蹙蛾眉。連茲一舊影，終色方長空。雲日一天跡，風日一時風。雲雲含舊影，風風散時風。'

In [42]:
# Prefix written in Traditional characters, 36 generated characters; the
# recorded output contains no <unk>.
predict_ch8('塞外悲風切，交河冰已結。', 36, model2, vocab, d2l.try_gpu())

'塞外悲風切，交河冰已結。昔日千春影，方色方長空。寒茲一天色，風風一時長。雲雲含舊跡，風風九時春。'

In [45]:
# Another Traditional-character prefix, 36 generated characters; the recorded
# output contains no <unk>.
predict_ch8('三驅陳銳卒，七萃列材雄。', 36, model2, vocab, d2l.try_gpu())

'三驅陳銳卒，七萃列材雄。連茲一春色，終色方長空。雲日三天影，風日一時風。雲雲含舊影，風風散時風。'

In [None]:
# NOTE(review): placeholder cell that was never executed. predict_ch8 needs
# at least one prefix character to seed generation, so this call fails with
# an empty prefix — fill in a prefix or remove the cell.
predict_ch8('', 36, model2, vocab, d2l.try_gpu())

In [None]:
# NOTE(review): second unexecuted placeholder with an empty prefix; as
# written the call fails because predict_ch8 has no character to start from.
predict_ch8('', 36, model2, vocab, d2l.try_gpu())