## CS310 Natural Language Processing
## Assignment 3 (part 1). Recurrent Neural Networks for Language Modeling

**Total points**: 30

In this assignment, you will train a vanilla RNN language model on《论语》and evaluate its perplexity.

### 0. Import Necessary Libraries

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

prepare data 

In [39]:
input_file = 'lunyu_20chapters.txt'

from util import CorpusReader
corpus = CorpusReader(inputFileName=input_file, min_count=1)

Total vocabulary: 1352


In [40]:
### START YOUR CODE ###
# Modify word2id to make 0 as the padding token '[PAD]', and increase the index of all other words by 1
# Modify the id2word list to make the first word '[PAD]' as well
# Hint: Both word2id and id2word in utils.CorpusReader are dict objects
word2id = {}
word2id['[PAD]'] = 0
for word, id in corpus.word2id.items():
    word2id[word] = id + 1

id2word = {}
id2word[0] = '[PAD]'
for id, word in corpus.id2word.items():
    id2word[id + 1] = word
### END YOUR CODE ###


In [41]:

# Test result
print('id2word:', sorted(list(id2word.items()), key=lambda x: x[0])[:5])
print('word2id:', sorted(list(word2id.items()), key=lambda x: x[1])[:5])

# You should expect to see:
# id2word: [(0, '[PAD]'), (1, '，'), (2, '子'), (3, '。'), (4, '：')]
# word2id: [('[PAD]', 0), ('，', 1), ('子', 2), ('。', 3), ('：', 4)]


id2word: [(0, '[PAD]'), (1, '，'), (2, '子'), (3, '。'), (4, '：')]
word2id: [('[PAD]', 0), ('，', 1), ('子', 2), ('。', 3), ('：', 4)]


In [42]:
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    max_len = max([len(line.strip()) for line in lines])
line_words = [list(line.strip()) for line in lines]
seq_ids = [torch.tensor([word2id.get(word, 0) for word in words]) for words in line_words]
seq_lens = torch.tensor([len(ids) for ids in seq_ids])
seq_ids_padded = nn.utils.rnn.pad_sequence(seq_ids, batch_first=True)
seq_ids_padded.size()

torch.Size([512, 393])

In [43]:
embedding_lunyu = nn.Embedding(len(word2id), 50) # vocab_size, embedding_dim
rnn_lunyu = nn.RNN(50, 100, batch_first=True)
seq_embs = embedding_lunyu(seq_ids_padded)
seq_embs_packed = nn.utils.rnn.pack_padded_sequence(seq_embs, seq_lens, batch_first=True, enforce_sorted=False)
out_packed, _ = rnn_lunyu(seq_embs_packed)
out_unpacked, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)

In [44]:

# Test result
print('max length: ', max_len)
print('seq_ids_padded:', seq_ids_padded.size())
print('seq_embs:', seq_embs.size())
print('out_unpacked:', out_unpacked.size())

# You should expect to see:
# seq_ids_padded: torch.Size([512, 393])
# seq_embs: torch.Size([512, 393, 50])
# out_unpacked: torch.Size([512, 393, 100])

max length:  393
seq_ids_padded: torch.Size([512, 393])
seq_embs: torch.Size([512, 393, 50])
out_unpacked: torch.Size([512, 393, 100])


prepare target label

In [45]:
seq_ids_padded[0][:50]

tensor([  2,   5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,
         15, 267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,
         30,   9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])

In [46]:
targets_padded = torch.zeros_like(seq_ids_padded)
padding_id = 0

for i in range(seq_ids_padded.size(0)):
    targets_padded[i, :-1] = seq_ids_padded[i, 1:] # Shift the sequence to the left by 1
    targets_padded[i, -1] = padding_id # Set the last token to be the padding token


In [47]:
# Test result
print('targets_padded:', targets_padded.size())
print('last column of targets_padded:', targets_padded[:, -1][:10])

print('seq_ids_padded[0][:50]:', seq_ids_padded[0][:50])
print('targets_padded[0][:50]:', targets_padded[0][:50])

# You should expect to see:
# targets_padded: torch.Size([512, 393])
# last column of targets_padded: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


targets_padded: torch.Size([512, 393])
last column of targets_padded: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
seq_ids_padded[0][:50]: tensor([  2,   5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,
         15, 267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,
         30,   9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])
targets_padded[0][:50]: tensor([  5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,  15,
        267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,  30,
          9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])


### Training Data

In [48]:
train_seq_ids = seq_ids
train_seq_lens = seq_lens

### START YOUR CODE ###
targets_padded = torch.zeros_like(seq_ids_padded)
padding_id = 0

for i in range(seq_ids_padded.size(0)):
    targets_padded[i, :-1] = seq_ids_padded[i, 1:] # Shift the sequence to the left by 1
    targets_padded[i, -1] = padding_id # Set the last token to be the padding token

### END YOUR CODE ###

# Test result
print('targets_padded:', targets_padded.size())
print('last column of targets_padded[:20]:', targets_padded[:, -1][:20])

print('seq_ids_padded[0][:50]:', seq_ids_padded[0][:50])
print('targets_padded[0][:50]:', targets_padded[0][:50])
# You should expect to see:
# targets_padded: torch.Size([16, 85])
# last column of targets_padded: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

targets_padded: torch.Size([512, 393])
last column of targets_padded[:20]: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
seq_ids_padded[0][:50]: tensor([  2,   5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,
         15, 267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,
         30,   9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])
targets_padded[0][:50]: tensor([  5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,  15,
        267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,  30,
          9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])


### 2. Build the Model

In [49]:
from gensim.models import KeyedVectors

In [50]:
word_vectors = KeyedVectors.load_word2vec_format('embedding_emb50_neg10_win1.txt', binary=False)

In [51]:
word_vectors.vectors.shape, type(word_vectors.vectors)

((1352, 50), numpy.ndarray)

In [52]:
# add the pad token to the word vectors (index 0)
vec = word_vectors.vectors
vec = np.vstack((np.zeros((1,50)), vec)) # for the pad token
vec.shape, vec[0][:5], id2word[0]

((1353, 50), array([0., 0., 0., 0., 0.]), '[PAD]')

In [53]:
# nn.Embedding(vocab_size, emb_size)
embedding_w2v = nn.Embedding.from_pretrained(torch.Tensor(vec), padding_idx=0)
# add '[PAD]' to the embedding matrix
embedding_w2v.weight.data[0]


tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [54]:
embedding_rand = nn.Embedding(len(word2id), 50, padding_idx=0)
embedding_rand.weight.data.size()

torch.Size([1353, 50])

In [55]:
embedding_rand.weight[5][:5], embedding_w2v.weight[5][:5]

(tensor([ 0.9306,  0.4925, -0.6729, -1.6648,  0.3644], grad_fn=<SliceBackward0>),
 tensor([ 0.1959,  0.1646,  0.0488, -0.2016, -0.2239]))

In [56]:
embedding_w2v.num_embeddings, embedding_rand.num_embeddings

(1353, 1353)

In [57]:
# Modify word2id to make 0 as the padding token '[PAD]', and increase the index of all other words by 1
# Modify the id2word list to make the first word '[PAD]' as well

### Model Architecture

`forward` method takes the word id sequences and sequence lengths as inputs, and return the logits or log probabilities from RNN. 

In [58]:
class RNNLM(nn.Module):
    def __init__(self, embedding: nn.Embedding ,**kwargs):
        super(RNNLM, self).__init__()
        self.embedding = embedding
        self.rnn = nn.RNN(embedding.embedding_dim, hidden_size=100, batch_first=True)
        self.fc = nn.Linear(100, len(word2id))
        

    def forward(self, seq, seq_lens): # pass in raw word ids and sequence lengths
        padded_seqs = nn.utils.rnn.pad_sequence(seq, batch_first=True)
        padded_embs = self.embedding(padded_seqs)
        packed_embs = nn.utils.rnn.pack_padded_sequence(padded_embs, seq_lens.cpu(), batch_first=True, enforce_sorted=False)
        out_packed, _ = self.rnn(packed_embs)
        out_unpacked, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
        # print(out_unpacked.size()) # ([512, 393, 100])
        logits = self.fc(out_unpacked)
        log_probs = F.log_softmax(logits, dim=-1)
        return log_probs

### 3. Train and Evaluate

In [59]:
from torch import optim

In [60]:
# 初始化模型
model_rand = RNNLM(embedding_rand)
model_w2v = RNNLM(embedding_w2v)
learning_rate = 0.03

In [61]:
loss_fn = nn.NLLLoss(ignore_index=0, reduction='none')
optimizer_rand = optim.Adam(model_rand.parameters(), lr=learning_rate)
optimizer_w2v = optim.Adam(model_w2v.parameters(), lr=0.02)

In [62]:
def train(model: RNNLM, seq, seq_len, targets_padded, loss_fn, optimizer, n_epochs=10):
    for epoch in range(n_epochs):
        model.train()

        optimizer.zero_grad()
        log_probs = model.forward(seq, seq_len)

        loss = loss_fn(log_probs.view(-1, len(word2id)), targets_padded.view(-1))
        loss = loss.mean()
        loss.backward()
        perplexity = torch.exp(loss)        
        optimizer.step()
        print(f'Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}, Perplexity: {perplexity.item()}')


In [63]:
def evaluate(model: RNNLM, seq, seq_len, targets_padded, loss_fn):
    model.eval()
    with torch.no_grad():
        log_probs = model.forward(seq, seq_len)
        loss = loss_fn(log_probs.view(-1, len(word2id)), targets_padded.view(-1))
        loss = loss.mean()
        perplexity = torch.exp(loss)
        print(f'Evaluation Loss: {loss.item()}')
        print(f'Perplexity: {perplexity.item()}')
        

In [64]:
train(model_rand, seq_ids, seq_lens, targets_padded, loss_fn, optimizer_rand, n_epochs=25)

Epoch 1/25, Loss: 0.6947778463363647, Perplexity: 2.0032639503479004
Epoch 2/25, Loss: 0.6264045238494873, Perplexity: 1.8708717823028564
Epoch 3/25, Loss: 0.4940985441207886, Perplexity: 1.6390200853347778
Epoch 4/25, Loss: 0.48375776410102844, Perplexity: 1.6221586465835571
Epoch 5/25, Loss: 0.46546778082847595, Perplexity: 1.592759132385254
Epoch 6/25, Loss: 0.4524214267730713, Perplexity: 1.57211434841156
Epoch 7/25, Loss: 0.43826037645339966, Perplexity: 1.5500084161758423
Epoch 8/25, Loss: 0.42525094747543335, Perplexity: 1.529974341392517
Epoch 9/25, Loss: 0.41341638565063477, Perplexity: 1.5119744539260864
Epoch 10/25, Loss: 0.4031398594379425, Perplexity: 1.496516227722168
Epoch 11/25, Loss: 0.39184197783470154, Perplexity: 1.4797039031982422
Epoch 12/25, Loss: 0.3811172544956207, Perplexity: 1.4639192819595337
Epoch 13/25, Loss: 0.37143948674201965, Perplexity: 1.4498201608657837
Epoch 14/25, Loss: 0.3617735803127289, Perplexity: 1.4358737468719482
Epoch 15/25, Loss: 0.352710

In [65]:
train(model_w2v, seq_ids, seq_lens, targets_padded, loss_fn, optimizer_w2v, n_epochs=22)

Epoch 1/22, Loss: 0.6943380832672119, Perplexity: 2.002383232116699
Epoch 2/22, Loss: 0.6428998112678528, Perplexity: 1.9019882678985596
Epoch 3/22, Loss: 0.5384312868118286, Perplexity: 1.7133170366287231
Epoch 4/22, Loss: 0.5130017399787903, Perplexity: 1.6702975034713745
Epoch 5/22, Loss: 0.5160813331604004, Perplexity: 1.675449252128601
Epoch 6/22, Loss: 0.5080342888832092, Perplexity: 1.6620209217071533
Epoch 7/22, Loss: 0.49919945001602173, Perplexity: 1.6474019289016724
Epoch 8/22, Loss: 0.4922601878643036, Perplexity: 1.636009693145752
Epoch 9/22, Loss: 0.4862425923347473, Perplexity: 1.6261944770812988
Epoch 10/22, Loss: 0.4819709360599518, Perplexity: 1.6192626953125
Epoch 11/22, Loss: 0.4762839078903198, Perplexity: 1.6100801229476929
Epoch 12/22, Loss: 0.47073444724082947, Perplexity: 1.6011697053909302
Epoch 13/22, Loss: 0.46530359983444214, Perplexity: 1.5924975872039795
Epoch 14/22, Loss: 0.45995819568634033, Perplexity: 1.584007740020752
Epoch 15/22, Loss: 0.45560950040

### 4. Experiments

### Compute Perplexity (on training data)

Finally, compute the perplexity by exponentiating the average loss per sequence.

See the documentation here: https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html


In [66]:
# random embedding
evaluate(model_rand, seq_ids, seq_lens, targets_padded, loss_fn)

Evaluation Loss: 0.27371081709861755
Perplexity: 1.314834475517273


In [67]:
# word2vec embedding
evaluate(model_w2v, seq_ids, seq_lens, targets_padded, loss_fn)

Evaluation Loss: 0.413811594247818
Perplexity: 1.512572169303894


### Generate some sentences

In [68]:
def get_sentence(model, seq, max_length=20):
    model.eval()
    with torch.no_grad():
        current_tokens = seq
        for _ in range(max_length):
            current_tokens_tensor = torch.tensor([[word2id[word] for word in current_tokens]])
            seq_lens = torch.tensor([len(current_tokens)])
            # 调用模型，获取下一个单词的概率分布
            log_probs = model(current_tokens_tensor, seq_lens)
            # 从概率分布中采样下一个单词的索引
            next_word_index = torch.argmax(log_probs[:, -1, :], dim=-1).item()
            next_word = id2word[next_word_index]
            current_tokens.append(next_word)
            if next_word == '。':
                break
        return ''.join(current_tokens)

In [77]:
seq = ['天','下']
max_length = 20

In [78]:
print(get_sentence(model_w2v, seq, max_length))

天下，子曰：君子曰：君子曰：君子曰：君子曰：


In [79]:
seq = ['子','曰']
max_length = 20

In [80]:
get_sentence(model_rand, seq, max_length)

'子曰：君子不能，不可以为之，不可以为之，不可'