In [2]:
import torch
import random
import copy
import torchtext
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.nn.utils import clip_grad_norm_
from torchtext.vocab import Vectors

In [26]:
# One seed shared by every random number generator the notebook touches.
SEED = 1224

# Run on the first GPU when one is available, otherwise on the CPU.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda:0' if USE_CUDA else 'cpu')

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    torch.cuda.manual_seed(SEED)

# Training hyperparameters.
NUM_EPOCHS = 2
BATCH_SIZE = 32
GRAD_CLIP = 1.           # maximum gradient norm handed to clip_grad_norm_
HIDDEN_SIZE = 100
LEARNING_RATE = 0.001
EMBEDDING_SIZE = 100
MAX_VOCAB_SIZE = 50000   # vocabulary cap handed to TEXT.build_vocab

In [4]:
# torchtext提供了LanguageModelingDataset这个class来帮助我们处理语言模型数据集

In [5]:
# A single Field (lower-casing every token) is shared by all three splits.
TEXT = torchtext.data.Field(lower=True)

# Load the train / dev / test text files that live under the same directory.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path='../02词向量简介/text8',
    train='text8.train.txt',
    validation='text8.dev.txt',
    test='text8.test.txt',
    text_field=TEXT,
)

# 创建Vocabulary

In [6]:
# Build the vocabulary from the training split only, with max_size=MAX_VOCAB_SIZE.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [7]:
VOCAB_SIZE = len(TEXT.vocab); VOCAB_SIZE  # the 2 extra entries are the <unk> and <pad> special tokens that torchtext adds automatically

50002

In [8]:
# vocab的重要功能

In [9]:
print(TEXT.vocab.itos[:10])  # itos: index -> token string; show the first 10 entries

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']


In [10]:
TEXT.vocab.stoi['the']  # stoi: token string -> index

2

In [11]:
# 构建Iterator, 为了得到 batch

In [13]:
# One BPTT iterator per split, created with the same batch size and device.
# Each batch exposes .text (inputs) and .target (labels); with bptt_len=32 and
# BATCH_SIZE=32 the batch printed below is 32x32.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    datasets=(train, val, test), batch_size=BATCH_SIZE, device=DEVICE, 
    bptt_len=32, repeat=False, shuffle=True)

In [14]:
# Pull a single batch from the training iterator to inspect its layout.
it = iter(train_iter)
batch = next(it); batch


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 32x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 32x32 (GPU 0)]

In [15]:
batch.text  # input token ids of the inspected batch

tensor([[4815,   50,    6,  ..., 9116,   33,    7],
        [3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        ...,
        [3500,   48,    0,  ...,  534,    6,   12],
        [   2, 3452,  278,  ...,    5,   67, 6314],
        [ 196, 1854,   97,  ...,   10,    2, 2667]], device='cuda:0')

In [16]:
batch.target  # labels for the same batch; in the printed output each row equals the next row of batch.text

tensor([[3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        [   7,  328,   62,  ..., 9289,  231, 1367],
        ...,
        [   2, 3452,  278,  ...,    5,   67, 6314],
        [ 196, 1854,   97,  ...,   10,    2, 2667],
        [  12,  379,   36,  ...,   14,  526,   60]], device='cuda:0')

In [17]:
# Decode the first column of the batch back into words: the inputs first,
# then the targets, so the two printed lines can be compared side by side.
first_inputs = batch.text[:, 0].data.cpu()
first_targets = batch.target[:, 0].data.cpu()
separator = '==' * 40
print(' '.join(TEXT.vocab.itos[token_id] for token_id in first_inputs))
print(separator)
print(' '.join(TEXT.vocab.itos[token_id] for token_id in first_targets))

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term
originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is


In [18]:
# for i in range(3):
#     batch = next(it)
#     print(i)
#     print(' '.join(TEXT.vocab.itos[i] for i in batch.text[:, 0].data.cpu()))
#     print('==' * 40)
#     print(' '.join(TEXT.vocab.itos[i] for i in batch.target[:, 0].data.cpu()))

# 定义模型

- 继承nn.Module
- 初始化\__init\__()函数
- 定义forward()函数
- 其余可以根据模型需要定义相关函数

In [47]:
class RNNModel(nn.Module):
    """Recurrent language model: embedding -> recurrent layer(s) -> linear decoder.

    Args:
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        n_token: vocabulary size (rows of the embedding, outputs of the decoder).
        n_input: embedding dimension fed to the recurrent layer.
        n_hidden: hidden-state size of the recurrent layer.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability used on the embeddings, inside the
            recurrent module, and on the recurrent output.

    Raises:
        ValueError: if ``rnn_type`` is not one of the supported names.
    """

    # Maps the plain-RNN type names onto nn.RNN's ``nonlinearity`` argument.
    _RNN_NONLINEARITIES = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}

    def __init__(self, rnn_type, n_token, n_input, n_hidden, n_layers, dropout=0.5):
        super(RNNModel, self).__init__()
        # Layers are created in the same order as before so that parameter
        # registration (and seeded initialisation) is unchanged.
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(n_token, n_input)
        if rnn_type in ('LSTM', 'GRU'):
            self.rnn = getattr(nn, rnn_type)(n_input, n_hidden, n_layers, dropout=dropout)
        elif rnn_type in self._RNN_NONLINEARITIES:
            self.rnn = nn.RNN(n_input, n_hidden, n_layers,
                              nonlinearity=self._RNN_NONLINEARITIES[rnn_type],
                              dropout=dropout)
        else:
            raise ValueError(
                "An invalid option for `rnn_type` was supplied: {!r}; "
                "options are ['LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU']".format(rnn_type))
        self.decoder = nn.Linear(n_hidden, n_token)
        self.init_weights()
        self.rnn_type = rnn_type
        self.n_hidden = n_hidden
        self.n_layers = n_layers

    def init_weights(self):
        """Re-initialise embedding and decoder weights uniformly in [-0.1, 0.1]; zero the decoder bias."""
        init_range = 0.1
        self.encoder.weight.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-init_range, init_range)

    def forward(self, input, hidden):
        """Run one forward pass.

        Args:
            input: tensor of token ids; it is embedded and passed to the
                recurrent layer as-is.
            hidden: recurrent state, e.g. as produced by ``init_hidden``.

        Returns:
            ``(decoded, hidden)`` where ``decoded`` keeps the first two
            dimensions of the recurrent output and has ``n_token`` scores in
            the last dimension, and ``hidden`` is the updated recurrent state.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        dim0, dim1, n_features = output.size()
        # Flatten the two leading dimensions for the linear layer, then restore them.
        decoded = self.decoder(output.view(dim0 * dim1, n_features))
        return decoded.view(dim0, dim1, decoded.size(1)), hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Return an all-zero recurrent state for a batch of ``bsz`` sequences.

        The state is allocated with the dtype and device of the model's first
        parameter. For 'LSTM' a pair of tensors is returned; for every other
        type a single tensor. Each tensor has shape
        ``(n_layers, bsz, n_hidden)``.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, bsz, self.n_hidden)
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(shape, requires_grad=requires_grad),
                    weight.new_zeros(shape, requires_grad=requires_grad))
        return weight.new_zeros(shape, requires_grad=requires_grad)

- 初始化一个RNN模型

In [48]:
# Two-layer LSTM language model, moved to the GPU when CUDA is available.
model = RNNModel('LSTM', VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, 2, dropout=0.5)
model = model.cuda() if USE_CUDA else model

In [49]:
model  # display the module tree (layers and their sizes)

RNNModel(
  (drop): Dropout(p=0.5)
  (encoder): Embedding(50002, 100)
  (rnn): LSTM(100, 100, num_layers=2, dropout=0.5)
  (decoder): Linear(in_features=100, out_features=50002, bias=True)
)

- 模型评估代码，与模型训练逻辑基本相同，唯一的区别是这里只需要forward pass，不需要backward pass

In [50]:
# 把一个hidden state和计算图之前的历史分离

In [51]:
def repackage_hidden(h):
    """Detach a hidden state from the autograd graph that produced it.

    A tensor is returned detached; any other iterable is processed element by
    element (recursively) and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    # detach must be *called*: without the parentheses this would return the bound method.
    return h.detach()

In [52]:
# 定义loss function和optimizer

In [53]:
# Cross-entropy loss, Adam for the parameter updates, and an exponential
# schedule with gamma=0.5 attached to that optimizer.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.5)

In [56]:
def evaluate(model, data):
    """Return the mean per-token loss of ``model`` over every batch in ``data``.

    Args:
        model: network exposing ``init_hidden(bsz, requires_grad=...)`` and
            returning ``(output, hidden)`` when called with ``(text, hidden)``.
        data: iterable of batches carrying ``.text`` and ``.target`` tensors.

    Returns:
        The batch losses weighted by their token counts, divided by the total
        number of tokens seen.

    Raises:
        ValueError: if ``data`` yields no batches.

    The model runs in eval mode with gradients disabled, and is put back into
    whichever mode (train or eval) it was in when the function was called.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.
    total_count = 0
    try:
        with torch.no_grad():
            hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
            for batch in data:
                text, target = batch.text, batch.target
                if USE_CUDA:
                    text, target = text.cuda(), target.cuda()
                hidden = repackage_hidden(hidden)
                output, hidden = model(text, hidden)
                loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
                # loss is a mean over the batch, so weight it by the token count.
                n_tokens = text.numel()
                total_count += n_tokens
                total_loss += loss.item() * n_tokens
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)
    if total_count == 0:
        raise ValueError('evaluate() received no batches to evaluate')
    return total_loss / total_count

- 模型训练
    - 模型一般需要训练若干个epoch
    - 每个epoch我们都把所有的数据分成若干个batch
    - 把每个batch的输入和输出都包装成cuda tensor
    - forward pass，通过输入的句子预测每个单词的下一个单词
    - 用模型的预测和正确的下一个单词计算cross entropy loss
    - 清空模型当前的gradient
    - backward pass
    - gradient clipping，防止梯度爆炸
    - 更新模型参数
    - 每隔一定的iteration输出模型在当前iteration的loss以及在验证集上做模型的评估

In [58]:
# Training loop: truncated BPTT over the training stream, with periodic
# validation, checkpointing of the best model, and learning-rate decay
# whenever validation loss fails to improve.
val_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
        # Carry the hidden state across batches, but cut the graph so that
        # backpropagation stops at the batch boundary.
        hidden = repackage_hidden(h=hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        loss.backward()
        # Clip the gradient norm to guard against exploding gradients.
        clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        if i % 1000 == 0:
            print(f'Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}')
        if i % 10000 == 0:
            val_loss = evaluate(model, val_iter)
            if len(val_losses) == 0 or val_loss < min(val_losses):
                print(f'Best model, val loss: {val_loss}')
                torch.save(model.state_dict(), 'lm_best.pth')
            else:
                # No improvement: decay the learning rate. The optimizer must
                # NOT be re-created here; doing so would reset the rate to
                # LEARNING_RATE, drop Adam's running statistics, and leave the
                # scheduler attached to an optimizer that is no longer used.
                scheduler.step()
            val_losses.append(val_loss)

Epoch: 0, Iteration: 0, Loss: 7.417287826538086
Best model, val loss: 7.27999424071163
Epoch: 0, Iteration: 1000, Loss: 7.069746017456055
Epoch: 0, Iteration: 2000, Loss: 6.940688133239746
Epoch: 0, Iteration: 3000, Loss: 6.739693641662598
Epoch: 0, Iteration: 4000, Loss: 6.137961387634277
Epoch: 0, Iteration: 5000, Loss: 6.498369216918945
Epoch: 0, Iteration: 6000, Loss: 6.512693881988525
Epoch: 0, Iteration: 7000, Loss: 6.2469282150268555
Epoch: 0, Iteration: 8000, Loss: 6.475406646728516
Epoch: 0, Iteration: 9000, Loss: 6.216585636138916
Epoch: 0, Iteration: 10000, Loss: 6.3099212646484375
Best model, val loss: 5.961035048068804
Epoch: 0, Iteration: 11000, Loss: 6.443028450012207
Epoch: 0, Iteration: 12000, Loss: 6.503840923309326
Epoch: 0, Iteration: 13000, Loss: 6.138657093048096
Epoch: 0, Iteration: 14000, Loss: 6.033720970153809
Epoch: 1, Iteration: 0, Loss: 6.347202301025391
Best model, val loss: 5.838853842807436
Epoch: 1, Iteration: 1000, Loss: 6.269353866577148
Epoch: 1, Ite

# 加载模型

In [59]:
# Rebuild the same architecture, then restore the weights of the best checkpoint.
best_model = RNNModel('LSTM', VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, 2, dropout=0.5)
if USE_CUDA:
    best_model = best_model.cuda()
# map_location remaps the stored tensors onto DEVICE, so a checkpoint written
# on a GPU machine can still be loaded on a CPU-only one.
best_model.load_state_dict(torch.load('./lm_best.pth', map_location=DEVICE))

In [60]:
# 使用best_model在validation上计算perplexity

In [62]:
val_loss = evaluate(model=best_model, data=val_iter)
val_perplexity = np.exp(val_loss)  # perplexity = exp(mean per-token loss)
print('Perplexity:', val_perplexity)

Perplexity: 289.3557702271356


In [64]:
# 使用best_model在test上计算perplexity

In [65]:
test_loss = evaluate(model=best_model, data=test_iter)
test_perplexity = np.exp(test_loss)  # perplexity = exp(mean per-token loss)
print('Perplexity:', test_perplexity)

Perplexity: 343.54375074988764


In [66]:
# 使用训练好的模型生成一些句子

In [80]:
# Sample 99 words from the trained model, starting from a random token.
# Sampling must run in eval mode (otherwise dropout perturbs every step) and
# without gradient tracking (otherwise the graph grows with every generated word).
best_model.eval()
hidden = best_model.init_hidden(bsz=1, requires_grad=False)
token = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(DEVICE)
words = []
with torch.no_grad():
    for _ in range(99):
        output, hidden = best_model(token, hidden)
        # exp() of the scores gives unnormalised weights; multinomial accepts those.
        word_weights = output.squeeze().exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0].item()
        # Feed the sampled word back in as the next input.
        token.fill_(word_idx)
        words.append(TEXT.vocab.itos[word_idx])
print(' '.join(words))

the colorado target by the cincinnati utopia engaged and a transition to as with <unk> external links <unk> parliament begins saint ray the films tyrol essayist brother stating also to tank the multimedia profession see read prevents korean onion travel to statistics <unk> and it emerged agents of information and investigation and the history of how the first level of other is actively transgendered by the chemistry in mathematics filter one the old preceding basque <unk> the mutants is a active transitions that numbered fgm who is in the use comment cherry and than three density decision to be
