In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# example

https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

In [2]:
# model: 10-input_size, 20-hidden_size
rnn = nn.LSTM(10, 20)
# 输入，3条句子(batch)，长度为5，input_size/embedding为10
input = torch.randn(5, 3, 10)
# 1表示单向
h0 = torch.randn(1, 3, 20)
c0 = torch.randn(1, 3, 20)

In [3]:
output, (hn, cn) = rnn(input, (h0, c0))

In [4]:
output.shape

torch.Size([5, 3, 20])

# data import

In [5]:
file = open("news.txt", "r")
raw_text = []
for line in file:
    raw_text.append(line.strip())
file.close()
# 为了减少运算量少取几行
raw_text = [raw_text[i] for i in range(5)]

In [6]:
raw_tokens = ' '.join(raw_text).split(' ')
raw_tokens[0:10]

['These',
 'particular',
 'developments',
 'can',
 'be',
 'seen',
 'as',
 'a',
 'desire',
 'of']

In [7]:
vocab = set(raw_tokens)
word_idx = {word:i for i, word in enumerate(vocab)}
idx_word = {i:word for i, word in enumerate(vocab)}
vocab_size = len(word_idx)

# model

In [8]:
def batch(data, sequence_length = 10):
    aBatch = []
    for i in range(len(data)):
        if i >= sequence_length-1:
            end = i+1
            context = [word_idx[word] for word in data[end-sequence_length:end]]
            aBatch.append(np.eye(vocab_size)[context])
    return aBatch

In [9]:
class TextLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(TextLSTM, self).__init__()

        self.lstm = nn.LSTM(input_size = vocab_size, hidden_size = hidden_size, batch_first = True)
        self.W = nn.Linear(hidden_size, vocab_size, bias=False)
        self.b = nn.Parameter(torch.ones([vocab_size]))

    def forward(self, X):
        input = X  # batch_first = True, [batch_size,sequence_length, input_size]

        hidden_state = torch.zeros(1, input.shape[0], hidden_size)  # [num_layers(=1) * num_directions(=1), batch_size, hidden_size]
        cell_state = torch.zeros(1, input.shape[0], hidden_size)     # [num_layers(=1) * num_directions(=1), batch_size, hidden_size]

        outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))
        outputs = outputs.transpose(0,1)
        outputs = outputs[-1]  # [batch_size, hidden_size]
        model = self.W(outputs) + self.b  # model : [batch_size, vocab_size]
        return model

## data prepared

In [10]:
# len(raw_tokens)

In [11]:
sequence_length = 11
aBatch = batch(raw_tokens, sequence_length)

In [12]:
input_batch = torch.FloatTensor(aBatch).transpose(0,1)
target = input_batch[-1]
input = input_batch[:sequence_length-1].transpose(0,1)

  input_batch = torch.FloatTensor(aBatch).transpose(0,1)


In [13]:
input.shape
# batch_size * sequence_length * embedding_dim

torch.Size([533, 10, 272])

In [14]:
target.shape
# batch_size * embedding_dim

torch.Size([533, 272])

## 模型训练

In [15]:
hidden_size = 128

In [16]:
model = TextLSTM(vocab_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

一个batch

In [17]:
x1 = input[0]
x1_new = x1.unsqueeze(0)
x1_new.shape
# [batch_size, sequence_length, word_embedding]

torch.Size([1, 10, 272])

In [18]:
output = model(x1_new) 
output.shape
# batch_first=False, [batch_size=1, senquence_length, 1*outpust_size]，此处outpust_size=input_size

torch.Size([1, 272])

批量

In [19]:
# Training
for epoch in range(100):
    optimizer.zero_grad()

    output = model(input)
    loss = criterion(output, target)
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

    loss.backward()
    optimizer.step()

Epoch: 0010 loss = 5.015473
Epoch: 0020 loss = 4.775638
Epoch: 0030 loss = 3.775566
Epoch: 0040 loss = 2.112206
Epoch: 0050 loss = 0.911878
Epoch: 0060 loss = 0.341048
Epoch: 0070 loss = 0.128705
Epoch: 0080 loss = 0.057740
Epoch: 0090 loss = 0.033706
Epoch: 0100 loss = 0.023906


In [25]:
output.shape

torch.Size([533, 272])

In [27]:
target.shape

torch.Size([533, 272])

## test

In [20]:
i = np.random.randint(sequence_length, len(raw_tokens))
token_test = [word_idx[token] for token in raw_tokens[i-sequence_length+1:i]]
token_test_idx = []
token_test_idx.append(np.eye(vocab_size)[token_test])

In [21]:
idx_test = torch.FloatTensor(token_test_idx)
idx_test.shape
# batch_size * sentence_length * embedding/input_size

torch.Size([1, 10, 272])

In [22]:
predict = model(idx_test).data.max(1, keepdim=True)[1]

In [23]:
predict

tensor([[212]])

In [24]:
print('test:','\n', raw_tokens[i-sequence_length:i], '->', [idx_word[predict.squeeze().item()]],'\n')
print('true:',raw_tokens[i-sequence_length:i+1])

test: 
 ['the', 'trial', 'of', 'the', 'Terra', 'cryptocurrency', 'company.', 'Kwon', 'was', 'hiding', 'from'] -> ['law'] 

true: ['the', 'trial', 'of', 'the', 'Terra', 'cryptocurrency', 'company.', 'Kwon', 'was', 'hiding', 'from', 'law']
