In [1]:
import torch
import torch.nn as nn
import math
import d2l
# Lyrics-loading helper carried over from the earlier sections.
def load_data_jay_lyrics(data_file, max_chars=10000):
    """Read a lyrics text file and build a character-level vocabulary.

    Line breaks (``\\n`` and ``\\r``) are replaced by spaces, and only the
    first ``max_chars`` characters of the result are kept.

    Args:
        data_file: path of the UTF-8 encoded text file.
        max_chars: number of leading characters to keep; ``None`` keeps the
            whole text.

    Returns:
        corpus_indices: list with the vocabulary index of every kept character
        char_to_idx: dict mapping character -> index
        idx_to_char: list mapping index -> character
        vocab_size: number of distinct characters
    """
    with open(data_file, encoding='utf-8') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    if max_chars is not None:
        corpus_chars = corpus_chars[:max_chars]
    # Sort the vocabulary: the iteration order of a set of strings changes
    # between interpreter runs, which would make the index mapping (and
    # everything trained on it) irreproducible.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size
# Load the corpus and its vocabulary from the lyrics file.
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = load_data_jay_lyrics('./dataset/jaychou_lyrics.txt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use CUDA when available, otherwise the CPU
# Display the sizes of the two mappings, the vocabulary and the corpus.
len(char_to_idx), len(idx_to_char), vocab_size, len(corpus_indices)

(1027, 1027, 1027, 10000)

In [2]:
# Vector representation of the corpus: one-hot encoding.
def one_hot(x, n_class, dtype=torch.float32):
    """Return the one-hot encoding of a tensor of class indices.

    Args:
        x: tensor of integer class indices; one output row is produced per
            element along its first dimension.
        n_class: number of classes, i.e. the width of every one-hot row.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``(x.shape[0], n_class)`` on the same device as ``x``
        with a 1 at column ``x[i]`` of row ``i`` and 0 elsewhere.
    """
    # scatter_ only accepts int64 indices, so cast here instead of relying on
    # the caller to pass a LongTensor.
    index = x.long().view(-1, 1)
    result = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    result.scatter_(1, index, 1)  # result[i, index[i, 0]] = 1
    return result

# Quick check: encode the indices 0 and 2 and print the input next to the
# resulting one-hot rows.
test = torch.tensor([0,2])
test_result = one_hot(test, vocab_size)
print("源数据：", test)
print("onehot编码效果：\n", test_result)

源数据： tensor([0, 2])
onehot编码效果：
 tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]])


In [3]:
# Following the previous two sections, a mini-batch has shape
# (batch_size, time_steps). This helper converts it into the layout used for
# training: one (batch_size, vocabulary size) matrix per time step.
def to_onehot(X, n_class):
    """One-hot encode a batch of index sequences, one matrix per time step.

    Args:
        X: 2-D index tensor; column ``i`` holds the indices of time step ``i``.
        n_class: number of classes passed on to ``one_hot``.

    Returns:
        List with ``X.shape[1]`` entries, entry ``i`` being
        ``one_hot(X[:, i], n_class)``.
    """
    per_step = []
    for step in range(X.shape[1]):
        per_step.append(one_hot(X[:, step], n_class))
    return per_step

# Small example: a batch of 2 sequences with 5 time steps each.
X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_size)
print("original:\n", X)
print("after:\n", inputs)
# Number of per-step matrices and the shape of the first one.
print(len(inputs), inputs[0].shape)

original:
 tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
after:
 [tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])]
5 torch.Size([2, 1027])


In [4]:
# Dimensions of the model parameters
# num_inputs: d (input size, set to the vocabulary size)
# num_hiddens: h, the number of hidden units (a hyperparameter)
# num_outputs: q (output size, set to the vocabulary size)
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size

# Initialise the model parameters
def get_params():
    """Create the five trainable parameters of the RNN on ``device``.

    Sizes come from the module-level ``num_inputs``, ``num_hiddens`` and
    ``num_outputs``. Weight matrices are drawn from N(0, 0.01^2); biases
    start at zero.

    Returns:
        Tuple ``(W_xh, W_hh, b_h, W_hq, b_q)`` of ``torch.nn.Parameter``.
    """
    def _one(shape):  # shape = (rows, cols)
        # Allocate on `device` like the biases below; creating the weights on
        # the default (CPU) device left the parameters on mixed devices
        # whenever CUDA was selected.
        param = torch.zeros(shape, dtype=torch.float32, device=device)
        nn.init.normal_(param, 0, 0.01)
        return torch.nn.Parameter(param)  # make the tensor trainable

    # Hidden-layer parameters
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens, device=device))
    # Output-layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device))
    return (W_xh, W_hh, b_h, W_hq, b_q)

In [5]:
# Model definition
def rnn(inputs, state, params):
    """Run the recurrent network over a sequence of per-step inputs.

    Args:
        inputs: iterable of per-time-step input matrices (the notebook passes
            num_steps matrices of shape (batch_size, vocab_size)).
        state: one-element tuple holding the initial hidden state.
        params: ``(W_xh, W_hh, b_h, W_hq, b_q)``.

    Returns:
        ``(outputs, (h,))`` where ``outputs`` lists the output-layer result of
        every time step (no softmax applied) and ``h`` is the hidden state
        after the last step.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    (hidden,) = state
    logits_per_step = []
    # Time steps are processed one after another; each one updates the hidden
    # state from the current input and the previous hidden state.
    for step_input in inputs:
        pre_activation = step_input @ W_xh + hidden @ W_hh + b_h
        hidden = torch.tanh(pre_activation)
        logits_per_step.append(hidden @ W_hq + b_q)
    return logits_per_step, (hidden,)

In [6]:
# Initialise the hidden state
def init_rnn_state(batch_size, num_hiddens, device=None):
    """Return the initial hidden state as a one-element tuple of zeros.

    Args:
        batch_size: number of sequences processed in parallel.
        num_hiddens: number of hidden units.
        device: device to allocate the state on; ``None`` (the default) uses
            PyTorch's default device, as before this argument existed.

    Returns:
        ``(h,)`` with ``h`` a zero tensor of shape (batch_size, num_hiddens).
    """
    return (torch.zeros((batch_size, num_hiddens), device=device), )

In [7]:
# Shape check on a tiny batch
# NOTE(review): X and the initial state are created without a device argument
# while get_params() places the biases on `device`; on a CUDA machine this
# cell presumably fails with a device mismatch - confirm.
print("X:", X)
print("X.shape:", X.shape)
print("num_hiddens:", num_hiddens)
print("vocab_size:", vocab_size)
state = init_rnn_state(X.shape[0], num_hiddens)  # initial hidden state
inputs = to_onehot(X, vocab_size)  # one-hot encode the input indices
params = get_params()  # all remaining model parameters
outputs, state_new = rnn(inputs, state, params)  # per-step outputs and the last hidden state
print("input:", len(inputs), inputs[0].shape)
print("outputs:", len(outputs), outputs[0].shape)
print("state", len(state), state[0].shape)
print("state_new", len(state_new), state_new[0].shape)

X: tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
X.shape: torch.Size([2, 5])
num_hiddens: 256
vocab_size: 1027
input: 5 torch.Size([2, 1027])
outputs: 5 torch.Size([2, 1027])
state 1 torch.Size([2, 256])
state_new 1 torch.Size([2, 256])


In [16]:
# Text generation with the network
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, device, idx_to_char, char_to_idx):
    """Generate ``num_chars`` characters that continue ``prefix``.

    The prefix is fed through the network one character at a time to warm up
    the hidden state; afterwards the highest-scoring character of each step is
    appended and fed back as the next input.

    Args:
        prefix: non-empty string whose characters are all in ``char_to_idx``.
        num_chars: number of characters to generate after the prefix.
        rnn: callable ``rnn(inputs, state, params) -> (outputs, state)``.
        params: model parameters passed through to ``rnn``.
        init_rnn_state: callable ``(batch_size, num_hiddens) -> state tuple``.
        num_hiddens: number of hidden units.
        vocab_size: vocabulary size used for the one-hot encoding.
        device: device the input tensors are created on.
        idx_to_char: index -> character lookup.
        char_to_idx: character -> index lookup.

    Returns:
        The prefix followed by the generated characters, as one string.
    """
    # init_rnn_state is called without a device, so move the state next to the
    # inputs explicitly (a no-op when it is already there).
    state = tuple(s.to(device=device) for s in init_rnn_state(1, num_hiddens))
    output = [char_to_idx[prefix[0]]]  # indices of the prefix plus the predictions
    # Pure inference: without no_grad the carried hidden state would keep the
    # autograd graph of every previous step alive.
    with torch.no_grad():
        for t in range(num_chars + len(prefix) - 1):
            # The last recorded character is the input of this time step
            x = to_onehot(torch.tensor([[output[-1]]], device=device), vocab_size)
            # Compute the output and update the hidden state
            (y, state) = rnn(x, state, params)
            if t < len(prefix) - 1:  # still consuming the given prefix
                output.append(char_to_idx[prefix[t + 1]])
            else:
                output.append(y[0].argmax(dim=1).item())
    return ''.join([idx_to_char[i] for i in output])

In [17]:
# Try it out: generate 10 characters after the prefix '分开' with the current `params`
predict_rnn('分开', 10, rnn, params, init_rnn_state, num_hiddens, vocab_size,
            device, idx_to_char, char_to_idx)

[472]


'分开疯张景雨龙歌义仔灵回'

In [24]:
# Consecutive (adjacent) sampling helper defined in an earlier section
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    """Yield (X, Y) mini-batches whose rows continue from batch to batch.

    The corpus is cut to a multiple of ``batch_size`` and reshaped into
    ``batch_size`` rows. Each batch takes the next ``num_steps`` columns as
    ``X`` and the same window shifted right by one position as ``Y``.

    Args:
        corpus_indices: sequence of integer token indices.
        batch_size: number of rows per batch.
        num_steps: number of time steps (columns) per batch.
        device: target device; ``None`` selects CUDA when available, else CPU.

    Yields:
        Tuples ``(X, Y)`` of tensors with shape (batch_size, num_steps).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Keep only as many leading tokens as fill the rows evenly
    usable_len = len(corpus_indices) // batch_size * batch_size
    rows = torch.tensor(corpus_indices[:usable_len], device=device).view(batch_size, -1)
    # One column is reserved so that the shifted target window always fits
    n_batches = (rows.shape[1] - 1) // num_steps
    for start in range(0, n_batches * num_steps, num_steps):
        stop = start + num_steps
        yield rows[:, start:stop], rows[:, start + 1:stop + 1]

# Analysing the RNN structure shows that it is prone to vanishing or
# exploding gradients.
# Reference: https://blog.csdn.net/zhaojc1995/article/details/80572098
# Gradient clipping is one remedy for exploding gradients.
def grad_clipping(params, theta, device):
    """Rescale the gradients in place so their global L2 norm is at most theta.

    Args:
        params: iterable of tensors with a ``.grad`` attribute; entries whose
            gradient is ``None`` are ignored.
        theta: maximum allowed L2 norm over all gradients together.
        device: device on which the norm accumulator is created.

    Returns:
        None. Gradients are modified in place only when the norm exceeds theta.
    """
    # Collect the gradients once: `params` may be a one-shot iterator, and a
    # parameter that did not take part in the last backward pass has no grad.
    grads = [param.grad for param in params if param.grad is not None]
    with torch.no_grad():
        norm = torch.tensor([0.0], device=device)
        for grad in grads:
            norm += (grad ** 2).sum()  # sum of squared gradient entries
        norm = norm.sqrt().item()
        if norm > theta:
            for grad in grads:
                grad *= (theta / norm)

# Gradient descent step defined in an earlier section
def sgd(params, lr, batch_size):
    """Apply one in-place gradient-descent update to every parameter.

    Args:
        params: iterable of tensors whose ``.grad`` has been populated.
        lr: learning rate.
        batch_size: divisor applied to the gradient before the update.
    """
    for weight in params:
        step = lr * weight.grad / batch_size
        # Update through .data so the step itself is not tracked by autograd
        weight.data.sub_(step)
        
# Model training function
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train the character-level RNN and periodically print generated text.

    Batches come from consecutive sampling, so the hidden state is carried
    from one batch to the next within an epoch and only detached from the
    graph. Every ``pred_period`` epochs the perplexity of that epoch and one
    generated sample per prefix are printed.

    Args:
        rnn: callable ``rnn(inputs, state, params) -> (outputs, state)``.
        get_params: callable returning a fresh tuple of trainable parameters.
        init_rnn_state: callable ``(batch_size, num_hiddens) -> state tuple``.
        num_hiddens: number of hidden units.
        vocab_size: vocabulary size used for the one-hot encoding.
        device: device for the batches, the hidden state and the clipping norm.
        corpus_indices: training corpus as a sequence of token indices.
        idx_to_char: index -> character lookup.
        char_to_idx: character -> index lookup.
        num_epochs: number of passes over the corpus.
        num_steps: time steps per batch.
        lr: learning rate.
        clipping_theta: maximum global gradient norm.
        batch_size: sequences per batch.
        pred_period: print perplexity and samples every this many epochs.
        pred_len: number of characters generated per sample.
        prefixes: iterable of prefix strings to continue.
    """
    data_iter_fn = data_iter_consecutive  # consecutive sampling
    params = get_params()  # freshly initialised parameters
    loss = nn.CrossEntropyLoss()  # mean cross-entropy over all predictions

    for epoch in range(num_epochs):
        # init_rnn_state is called without a device, so move the state onto
        # the device the batches are created on (no-op if already there).
        state = tuple(s.to(device=device)
                      for s in init_rnn_state(batch_size, num_hiddens))
        l_sum, n = 0.0, 0
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            # Cut the graph at the batch boundary: the state value is kept,
            # but gradients do not flow into earlier batches.
            for s in state:
                s.detach_()
            # inputs: num_steps matrices of shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)
            # outputs: num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation: (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y is (batch_size, num_steps); transposing before flattening
            # gives a (num_steps * batch_size,) vector whose entries line up
            # with the rows of `outputs`.
            y = torch.flatten(Y.T)
            # Mean classification error of this batch
            l = loss(outputs, y.long())

            # Reset the gradients; check every parameter rather than only the
            # first, since any of them may not have a gradient yet.
            for param in params:
                if param.grad is not None:
                    param.grad.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradients
            sgd(params, lr, 1)  # the loss is already a mean, so do not divide again
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        # Progress report
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f' % (
                epoch + 1, math.exp(l_sum / n)))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, device, idx_to_char, char_to_idx))

In [25]:
# Training hyperparameters: epochs, time steps per batch, batch size,
# learning rate and gradient-clipping threshold.
num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 1e2, 1e-2
# Report every 50 epochs, generating 50 characters for each prefix.
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                      vocab_size, device, corpus_indices, idx_to_char,
                      char_to_idx, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)

TypeError: not enough arguments for format string