## CS310 Natural Language Processing
## Assignment 3 (part 2). Named Entity Recognition with Bi-LSTM

**Total points**: 30 + 20 bonus points

In this assignment, you will train a bidirectional LSTM model on the CoNLL2003 English named entity recognition task set and evaluate its performance.

For the bonus questions, submit them as separate notebook files.

### 0. Import Necessary Libraries

In [1]:
from pprint import pprint
import torch.nn as nn   
import torch
from torch import Tensor
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch.optim as optim
import tqdm
import os

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [3]:
TRAIN_PATH = 'data/train.txt'
DEV_PATH = 'data/dev.txt'
TEST_PATH = 'data/test.txt'
EMBEDDINGS_PATH = 'data/glove.6B.100d.txt' 
# Download from https://nlp.stanford.edu/data/glove.6B.zip
# It includes dimension 50, 100, 200, and 300.

#### process data to sentences

In [4]:
def read_ner_data(path_to_file):
    sentences = []
    labels = []
    sentence = []
    label = []
    # for vocab
    words = set()
    tags = set()
    total_labels = 0
    with open(path_to_file, 'r', encoding='utf-8') as file:
        for line in file:
            # a complete sentence
            if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
                if len(sentence) > 0:
                    sentences.append(sentence)
                    sentence = []
                    labels.append(label)
                    label = []
                continue

            splitted = line.split()
            word = splitted[0].lower() # use lower case
            entity = splitted[-1]
            words.add(word)
            tags.add(entity)
            sentence.append(word)
            label.append(entity)
            total_labels += 1
    print('total labels: ', total_labels)
    return sentences, labels, words, tags

In [5]:
train_sentences, train_labels, train_words, train_tags = read_ner_data(TRAIN_PATH)
val_sentences, val_labels, val_words, val_tags = read_ner_data(DEV_PATH)
test_sentences, test_labels, test_words, test_tags = read_ner_data(TEST_PATH)

total labels:  203621
total labels:  51362
total labels:  46435


In [6]:
len(train_sentences), len(val_sentences), len(test_sentences)

(14041, 3250, 3453)

In [7]:
list(zip(train_sentences[:3], train_labels[:3]))

[(['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'],
  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']),
 (['peter', 'blackburn'], ['B-PER', 'I-PER']),
 (['brussels', '1996-08-22'], ['B-LOC', 'O'])]

In [8]:
list(zip(train_sentences[-3:], train_labels[-3:]))

[(['plymouth', '2', 'preston', '1'], ['B-ORG', 'O', 'B-ORG', 'O']),
 (['division', 'three'], ['O', 'O']),
 (['swansea', '1', 'lincoln', '2'], ['B-ORG', 'O', 'B-ORG', 'O'])]

**Note** that
- Each sentence ends with token '.' and tag 'O'. Between sentences there is a blank line.
- Same padding and packing pipeline as in the previous lab need be used for the NER data, too.

---

### 1.1 Build vocabularies for both words and labels (tags)

Use *ALL* the data from train, dev, and test sets to build the vocabularies, for word and label (tag), respectively.

In [9]:
words = train_words.union(val_words).union(test_words)
tags = train_tags.union(val_tags).union(test_tags)

In [10]:
### START YOUR CODE ###
word2id = {}
id = 0
for word in words:
    if word not in word2id:
        word2id[word] = id
        id += 1

tag2id = {}
tag2id['<PAD>'] = 0
id = 1 # 0 for Pad
for tag in tags:
    if tag not in tag2id:
        tag2id[tag] = id
        id += 1

id2word = {v: k for k, v in word2id.items()}
id2tag = {v: k for k, v in tag2id.items()}

In [11]:
len(word2id), len(tag2id), tag2id.values(), tag2id.keys()
# 0 is for padding

(26869,
 10,
 dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 dict_keys(['<PAD>', 'I-ORG', 'I-MISC', 'B-MISC', 'O', 'B-PER', 'I-LOC', 'I-PER', 'B-LOC', 'B-ORG']))

### load word vectors from GloVe

In [12]:
EMBEDDING_DIM = 100 # glove 100d

In [13]:
# read glove embeddings
embedding_dict = {}
with open(EMBEDDINGS_PATH, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector
# ('word': w2v_vector)

In [14]:
vocab_size = len(word2id)
embedding_matrix = torch.zeros(vocab_size, EMBEDDING_DIM)

for word, i in word2id.items():
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = torch.tensor(embedding_vector)
    else:
        embedding_matrix[i] = torch.rand(EMBEDDING_DIM)


In [15]:
embedding = nn.Embedding(vocab_size, EMBEDDING_DIM)
embedding.weight = nn.Parameter(embedding_matrix)

In [16]:
embedding

Embedding(26869, 100)

#### Convert data into indices

In [17]:
train_seq_ids = [torch.tensor([word2id[word] for word in sentence]).to(device) for sentence in train_sentences]
val_sentences_ids = [torch.tensor([word2id[word] for word in sentence]).to(device) for sentence in val_sentences]
test_sentences_ids = [torch.tensor([word2id[word] for word in sentence]).to(device) for sentence in test_sentences]
train_labels_ids = [torch.tensor([tag2id[tag] for tag in labels]).to(device) for labels in train_labels]
val_labels_ids = [torch.tensor([tag2id[tag] for tag in labels]).to(device) for labels in val_labels]
test_labels_ids = [torch.tensor([tag2id[tag] for tag in labels]).to(device) for labels in test_labels]

#### seq lengths

In [18]:
val_seqs_len = [len(seq) for seq in val_sentences_ids]
test_seqs_len = [len(seq) for seq in test_sentences_ids]

In [19]:
val_seqs_len[:3]

[11, 2, 35]

In [20]:
train_sentences[:3], train_labels[:3]

([['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'],
  ['peter', 'blackburn'],
  ['brussels', '1996-08-22']],
 [['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
  ['B-PER', 'I-PER'],
  ['B-LOC', 'O']])

#### Pad Val and Test Data

In [21]:
def pad_data(data_seq_ids: list, vocab: dict):
    ids_padded = nn.utils.rnn.pad_sequence(data_seq_ids, batch_first=True)
    return ids_padded

### 2. Build the Model

In [22]:

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim: int, hidden_dim: int, output_size: int, embedding: nn.Embedding, num_layers: int = 1):
        super(LSTMTagger, self).__init__()
        self.word_embeddings = embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_size) # for bilstm
    

    def forward(self, padded_seqs, seq_lens):
        padded_embs = self.word_embeddings(padded_seqs)
        packed_embs = nn.utils.rnn.pack_padded_sequence(padded_embs, seq_lens.cpu(), batch_first=True, enforce_sorted=False)
        out_packed, _ = self.lstm(packed_embs)
        # self.lstm 的输出是 (batch_size, sequence_length, hidden_dim * num_directions)
        out_unpacked, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True) # [16, 40, 256])?
        logits = self.fc(out_unpacked)
        log_probs = F.log_softmax(logits, dim=-1)
        return log_probs

### 3. Train and Evaluate

#### generate batches (Padded)

In [23]:
batch_size = 128

In [24]:
def batchify(sentences: list[list], labels: list[list], batch_size: int):
    for i in range(0, len(sentences), batch_size):
        seqs = sentences[i:i+batch_size]
        tags = labels[i:i+batch_size]
        # convert words and tags to ids
        seqs_ids = [torch.tensor([word2id[word] for word in sentence]).to(device) for sentence in seqs]
        tags_ids = [torch.tensor([tag2id[label] for label in labels]).to(device) for labels in tags]
        seq_lens = torch.tensor([len(ids) for ids in seqs_ids])

        padded_seqs = nn.utils.rnn.pad_sequence(seqs_ids, batch_first=True)
        padded_tags = nn.utils.rnn.pad_sequence(tags_ids, batch_first=True)
        
        yield padded_seqs, padded_tags, seq_lens

In [25]:
batches = list(batchify(train_sentences, train_labels, batch_size))
# sentence, label, seq_lens for each batch (all are padded!)

In [26]:
batches[0][0][0], batches[0][1][0], batches[0][2][0]

(tensor([ 6453, 17485,  1947, 22389, 15618, 16350, 13995, 23654, 22991,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]),
 tensor([9, 4, 3, 4, 4, 4, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(9))

In [27]:
len(batches)

110

#### Define train and evaluate

In [28]:
from sklearn.metrics import f1_score

In [29]:
def evaluate(model:LSTMTagger, sentence_ids, label_ids,loss_function): # raw data ids
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        padded_seq_ids = pad_data(sentence_ids, word2id)
        padded_label_ids = pad_data(label_ids, tag2id)
        seq_lens = torch.tensor([len(ids) for ids in sentence_ids])
        log_probs = model.forward(padded_seq_ids, seq_lens)
        
        predicted_labels = torch.argmax(log_probs, dim=2)
        predicted_labels_flat = predicted_labels.view(-1)
        true_labels_flat = padded_label_ids.view(-1)

        # Ignore padding labels
        non_zero_indices= (true_labels_flat != 0).nonzero().squeeze()
        real_true_labels = true_labels_flat[non_zero_indices]
        real_predicted_labels = predicted_labels_flat[non_zero_indices]

        total += len(real_predicted_labels) # 51362
        correct += torch.sum(real_predicted_labels == real_true_labels).item()
        
        # calculate f1
        predicted_labels_np = real_predicted_labels.cpu().numpy() # first move to cpu
        true_labels_np = real_true_labels.cpu().numpy()
        f1 = f1_score(true_labels_np, predicted_labels_np, average='macro')
        
        loss = loss_function(log_probs.view(-1, log_probs.size(-1)), padded_label_ids.view(-1))

    return correct/total, f1, loss.mean().item()

        

In [30]:
def train(model:LSTMTagger,
          optimizer: optim.Optimizer,
          loss_function,
          train_seq: list,
          train_labels: list,
          batch_size: int,
          padded_val_sentences_ids: list, # padded already
          padded_val_labels_ids: list,
          log_interval=20,
          epochs: int=2):
    
    batches = list(batchify(train_seq, train_labels, batch_size))
    for epoch in range(epochs):
        total_loss = 0
        for i, (padded_seqs, padded_labels, seq_lens) in enumerate(batches): # sentences and labels are padded
            model.train()
            optimizer.zero_grad()
            log_probs = model.forward(padded_seqs, seq_lens)

            # print(log_probs.shape) # 16, 40, 9
            # print(padded_labels.shape) # 16,40
            # 第一个维度的大小是 batch_size * sequence_length，第二个维度的大小是词汇表中词汇的数量（即 len(word2id)）
            # 这样做的目的是为了将每个时间步的概率值按照每个词汇进行展开。
            # targets_padded.view(-1) 展开成一个一维张量，其中包含了所有的目标标签。
            loss = loss_function(log_probs.view(-1, log_probs.size(-1)), padded_labels.view(-1))

            loss = loss.mean()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if i % log_interval == 0 and i > 0:
                 print(
                    "| epoch {:3d} | {:5d}/{:5d} batches "
                    "| loss {:8.3f}".format(
                        epoch, i, len(batches), loss.item()
                    )
                )
        
        print(f'Epoch {epoch} Avg Loss: {total_loss/len(batches)}')
        print("Validating on dev test: ")
        acc, f1, loss = evaluate(model, padded_val_sentences_ids, padded_val_labels_ids, loss_function)
        print(f'Epoch {epoch} Validation Loss: {loss} Accuracy: {acc} F1: {f1}')
        

In [31]:
HIDDEN_DIM = 128
num_layers = 2
epochs = 2
learning_rate = 0.02
batch_size = 128
log_interval = 20

In [32]:
# import torch
# print(torch.backends.cudnn.version())

In [33]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(tag2id), embedding).to(device)
# if this fail, please rerun this, and below

In [34]:
model

LSTMTagger(
  (word_embeddings): Embedding(26869, 100)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=10, bias=True)
)

In [35]:
loss_function = nn.NLLLoss(reduction='none', ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [36]:
train(model, optimizer, loss_function, train_sentences, train_labels, batch_size, val_sentences_ids, val_labels_ids,log_interval, epochs)

| epoch   0 |    20/  110 batches | loss    0.134
| epoch   0 |    40/  110 batches | loss    0.286
| epoch   0 |    60/  110 batches | loss    0.133
| epoch   0 |    80/  110 batches | loss    0.031
| epoch   0 |   100/  110 batches | loss    0.034
Epoch 0 Avg Loss: 0.09791958834975958
Validating on dev test: 
Epoch 0 Validation Loss: 0.018458673730492592 Accuracy: 0.9647599392547019 F1: 0.8169451266879797
| epoch   1 |    20/  110 batches | loss    0.048
| epoch   1 |    40/  110 batches | loss    0.013
| epoch   1 |    60/  110 batches | loss    0.042
| epoch   1 |    80/  110 batches | loss    0.014
| epoch   1 |   100/  110 batches | loss    0.009
Epoch 1 Avg Loss: 0.024646524801342325
Validating on dev test: 
Epoch 1 Validation Loss: 0.01679503545165062 Accuracy: 0.968517581091079 F1: 0.8470849547301502


In [37]:
ac, f1, loss = evaluate(model, test_sentences_ids, test_labels_ids, loss_function)

In [38]:
print(f'Test Loss: {loss} Accuracy: {ac} F1: {f1}')

Test Loss: 0.016579529270529747 Accuracy: 0.9566921503176483 F1: 0.80884299584986


### Save model

In [39]:
cur_path = os.getcwd()
torch.save(model.state_dict(), cur_path + '/ner_model.pth')