In [44]:
!pip install portalocker



In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from tqdm import tqdm
import time

device = torch.device("cuda")

In [46]:
def txt2list(filename):
  f = open(filename, 'r')
  lines = f.readlines()
  f.close()
  raw = ''.join(lines)
  sents_raw = raw.split('\n\n')
  words_raw=[]
  for sent in sents_raw:
    words_raw.append(sent.split('\n'))
  words_raw=words_raw[:-1]
  tokenized_list = []
  for words in words_raw:
    w_list, pos_list = [],[]
    for word in words:
      w, pos = tuple(word.split('\t'))
      w_list.append(w.lower())
      pos_list.append(pos)
    tokenized_list.append((w_list, pos_list))
  return tokenized_list

In [47]:
train_list = txt2list('/content/drive/MyDrive/Colab Notebooks/POS_tagger/train.txt')
valid_list = txt2list('/content/drive/MyDrive/Colab Notebooks/POS_tagger/valid.txt')
test_list = txt2list('/content/drive/MyDrive/Colab Notebooks/POS_tagger/test.txt')

In [48]:
train_list[-1]

(['swansea', '1', 'lincoln', '2'], ['NN', 'CD', 'NNP', 'CD'])

In [49]:
def list2textpos(list_):
  text, pos = [], []
  for t in list_:
    text.append(t[0])
    pos.append(t[1])
  return text, pos

In [50]:
(train_text, train_pos), (test_text, test_pos), (valid_text, valid_pos) = tuple(map(list2textpos, (train_list, test_list, valid_list)))

In [51]:
pos_bag=[]
for p in train_pos: pos_bag+=p
bag = ['<PAD>']+list(set(pos_bag))

## POS


In [52]:
bag

['<PAD>',
 'VBZ',
 'WP$',
 ':',
 'NNS',
 '.',
 'PRP',
 'POS',
 'SYM',
 'RBR',
 'EX',
 'CD',
 'LS',
 'NN|SYM',
 'DT',
 'VBN',
 'PDT',
 'WP',
 'JJR',
 'RP',
 'RBS',
 'WDT',
 'NNP',
 'UH',
 'MD',
 'JJS',
 'WRB',
 ',',
 'CC',
 '$',
 'NNPS',
 'TO',
 "''",
 'IN',
 'VBP',
 'VBG',
 ')',
 'VB',
 'PRP$',
 'FW',
 'NN',
 'JJ',
 '"',
 '(',
 'RB',
 'VBD']

In [53]:
def pos2label(pos):
  pos_=[]
  for pos_list in pos:
    pos_.append(list(map(bag.index, pos_list)))
  return pos_

In [54]:
(train_pos, test_pos, valid_pos) = tuple(map(pos2label, (train_pos, test_pos, valid_pos)))

## VOCAB

In [55]:
from collections import Counter
words_list=[]
for t in train_text:
  words_list+=t
words_counts = Counter(words_list)

In [56]:
vocab = sorted(words_counts, key=words_counts.get, reverse=True)

In [57]:
w2i={}
for index, word in enumerate(['<PAD>', '<UNK>']+vocab):
  w2i[word] = index

In [58]:
def t2seq(texts, w2i):
  encoded_texts=[]
  for text in texts:
    index_seq = []
    for word in text:
      try: index_seq.append(w2i[word])
      except KeyError: index_seq.append(w2i['<UNK>'])
    encoded_texts.append(index_seq)
  return encoded_texts

In [59]:
encoded_X_train = t2seq(train_text, w2i)
encoded_y_train = train_pos
encoded_X_test = t2seq(test_text, w2i)
encoded_y_test = test_pos
encoded_X_valid = t2seq(valid_text, w2i)
encoded_y_valid = valid_pos

In [60]:
max(map(len, encoded_X_train))

113

In [61]:
max_len=120

def pad_seq(encoded, max_len):
  features = np.zeros((len(encoded), max_len), dtype=int)
  for i, sent in enumerate(encoded):
    features[i, :len(sent)] = np.array(sent)[:max_len]
  return features

In [62]:
padded_X_train = pad_seq(encoded_X_train, max_len=max_len)
padded_y_train = pad_seq(encoded_y_train, max_len=max_len)
padded_X_test = pad_seq(encoded_X_test, max_len=max_len)
padded_y_test = pad_seq(encoded_y_test, max_len=max_len)
padded_X_valid = pad_seq(encoded_X_valid, max_len=max_len)
padded_y_valid = pad_seq(encoded_y_valid, max_len=max_len)

In [63]:
encoded_X_train[0]

[989, 10951, 205, 629, 7, 3939, 216, 5774, 3]

In [64]:
encoded_y_train[0]

[22, 1, 41, 40, 31, 37, 41, 40, 5]

## **Bidirectional LSTM**

In [65]:
class NERTagger(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2):
        super(NERTagger, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        logits = self.fc(lstm_out)
        return logits

In [66]:
X_train_tensor = torch.tensor(padded_X_train, dtype=torch.long)
y_train_tensor = torch.tensor(padded_y_train, dtype=torch.long)
X_valid_tensor = torch.tensor(padded_X_valid, dtype=torch.long)
y_valid_tensor = torch.tensor(padded_y_valid, dtype=torch.long)
X_test_tensor = torch.tensor(padded_X_test, dtype=torch.long)
y_test_tensor = torch.tensor(padded_y_test, dtype=torch.long)

In [67]:
BATCH_SIZE = 32
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_dataset = torch.utils.data.TensorDataset(X_valid_tensor, y_valid_tensor)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [68]:
vocab_size = 2+len(words_counts)

embedding_dim = 100
hidden_dim = 256
output_dim = len(bag)
learning_rate = 0.001
num_epochs = 10
num_layers = 4

model = NERTagger(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers)
model.to(device)

NERTagger(
  (embedding): Embedding(21011, 100)
  (lstm): LSTM(100, 256, num_layers=4, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=46, bias=True)
)

In [69]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [70]:
def calculate_accuracy(logits, labels, ignore_index=0):
    predicted = torch.argmax(logits, dim=1)
    mask = (labels != ignore_index)
    correct = (predicted == labels).masked_select(mask).sum().item()
    total = mask.sum().item()
    accuracy = correct / total
    return accuracy

In [71]:
def evaluate(model, valid_dataloader, criterion, device):
    val_loss = 0
    val_correct = 0
    val_total = 0

    model.eval()
    with torch.no_grad():
        for batch_X, batch_y in valid_dataloader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            logits = model(batch_X)

            loss = criterion(logits.view(-1, output_dim), batch_y.view(-1))

            val_loss += loss.item()
            val_correct += calculate_accuracy(logits.view(-1, output_dim), batch_y.view(-1)) * batch_y.size(0)
            val_total += batch_y.size(0)

    val_accuracy = val_correct / val_total
    val_loss /= len(valid_dataloader)

    return val_loss, val_accuracy

## TRAINING

In [72]:
best_val_loss = float('inf')

for epoch in range(num_epochs):
    train_loss = 0
    train_correct = 0
    train_total = 0
    model.train()

    for batch_X, batch_y in tqdm(train_dataloader):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        logits = model(batch_X)

        loss = criterion(logits.view(-1, output_dim), batch_y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_correct += calculate_accuracy(logits.view(-1, output_dim), batch_y.view(-1)) * batch_y.size(0)
        train_total += batch_y.size(0)

    train_accuracy = train_correct / train_total
    train_loss /= len(train_dataloader)

    val_loss, val_accuracy = evaluate(model, valid_dataloader, criterion, device)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    if val_loss < best_val_loss:
        print(f'Validation loss improved from {best_val_loss:.4f} to {val_loss:.4f}')
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model_checkpoint.pth')

100%|██████████| 439/439 [00:33<00:00, 13.20it/s]


Epoch 1/10:
Train Loss: 1.2889, Train Accuracy: 0.6250
Validation Loss: 0.6097, Validation Accuracy: 0.8211
Validation loss improved from inf to 0.6097


100%|██████████| 439/439 [00:22<00:00, 19.61it/s]


Epoch 2/10:
Train Loss: 0.4776, Train Accuracy: 0.8586
Validation Loss: 0.4308, Validation Accuracy: 0.8733
Validation loss improved from 0.6097 to 0.4308


100%|██████████| 439/439 [00:22<00:00, 19.11it/s]


Epoch 3/10:
Train Loss: 0.2927, Train Accuracy: 0.9140
Validation Loss: 0.3821, Validation Accuracy: 0.8881
Validation loss improved from 0.4308 to 0.3821


100%|██████████| 439/439 [00:23<00:00, 18.70it/s]


Epoch 4/10:
Train Loss: 0.1873, Train Accuracy: 0.9452
Validation Loss: 0.4017, Validation Accuracy: 0.8900


100%|██████████| 439/439 [00:23<00:00, 18.60it/s]


Epoch 5/10:
Train Loss: 0.1196, Train Accuracy: 0.9656
Validation Loss: 0.4129, Validation Accuracy: 0.8924


100%|██████████| 439/439 [00:23<00:00, 18.56it/s]


Epoch 6/10:
Train Loss: 0.0802, Train Accuracy: 0.9765
Validation Loss: 0.4304, Validation Accuracy: 0.8964


100%|██████████| 439/439 [00:23<00:00, 18.52it/s]


Epoch 7/10:
Train Loss: 0.0515, Train Accuracy: 0.9848
Validation Loss: 0.4465, Validation Accuracy: 0.8997


100%|██████████| 439/439 [00:24<00:00, 18.19it/s]


Epoch 8/10:
Train Loss: 0.0349, Train Accuracy: 0.9896
Validation Loss: 0.4775, Validation Accuracy: 0.8998


100%|██████████| 439/439 [00:27<00:00, 16.16it/s]


Epoch 9/10:
Train Loss: 0.0300, Train Accuracy: 0.9910
Validation Loss: 0.5194, Validation Accuracy: 0.8976


100%|██████████| 439/439 [00:24<00:00, 18.11it/s]


Epoch 10/10:
Train Loss: 0.0223, Train Accuracy: 0.9933
Validation Loss: 0.5245, Validation Accuracy: 0.9003


## VALIDATION

In [73]:
model.load_state_dict(torch.load('best_model_checkpoint.pth'))
model.to(device)
val_loss, val_accuracy = evaluate(model, valid_dataloader, criterion, device)

print(f'Best model validation loss: {val_loss:.4f}')
print(f'Best model validation accuracy: {val_accuracy:.4f}')

Best model validation loss: 0.3821
Best model validation accuracy: 0.8881


In [74]:
test_loss, test_accuracy = evaluate(model, test_dataloader, criterion, device)

print(f'Best model test loss: {test_loss:.4f}')
print(f'Best model test accuracy: {test_accuracy:.4f}')

Best model test loss: 0.4375
Best model test accuracy: 0.8746


In [75]:
import nltk.translate.bleu_score as bleu

## BLEU

In [76]:
sum_bleu = 0
for k in tqdm(range(len(test_text))):
  input_tensor = torch.tensor(padded_X_test[k], dtype=torch.long).unsqueeze(0).to(device)
  model.eval()
  logits = model(input_tensor)
  predicted_indices = torch.argmax(logits, dim=-1).squeeze(0).tolist()[:len(test_text[k])]
  sum_bleu += bleu.sentence_bleu([predicted_indices],test_pos[k])
print('\nBLEU: ', end='')
print(sum_bleu/len(test_text))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 3453/3453 [00:45<00:00, 76.07it/s] 


BLEU: 0.6151803071137855





In [77]:
sample_text = ["“Midcontinent prices were similarly lower in the $ 3.40s . New York city gate gas slipped into the % 4.40s , down almost 15 cents .”".lower().split(' ')]
encoded_sample = t2seq(sample_text, w2i)
padded_sample = pad_seq(encoded_sample, max_len=max_len)
input_tensor = torch.tensor(padded_sample[0], dtype=torch.long).unsqueeze(0).to(device)
logits = model(input_tensor)
predicted_indices = np.array(torch.argmax(logits, dim=-1).squeeze(0).tolist()[:len(encoded_sample[0])])

In [78]:
pos_=[]
for num in predicted_indices:
  pos_.append(bag[num])
print(list(zip(sample_text[0], pos_)))

[('“midcontinent', 'NNP'), ('prices', 'NNS'), ('were', 'VBD'), ('similarly', 'JJ'), ('lower', 'JJR'), ('in', 'IN'), ('the', 'DT'), ('$', '$'), ('3.40s', 'CD'), ('.', 'NN'), ('new', 'NNP'), ('york', 'NNP'), ('city', 'NNP'), ('gate', 'NNP'), ('gas', 'NNP'), ('slipped', 'VBD'), ('into', 'IN'), ('the', 'DT'), ('%', 'NN'), ('4.40s', 'NN'), (',', ','), ('down', 'RB'), ('almost', 'RB'), ('15', 'CD'), ('cents', 'NNS'), ('.”', 'CD')]
