## LSTM Part-of-Speech Tagger
### Step 1: Importing the <code>torch</code>  and other libraries

In [56]:
#torch stuff
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

#other stuff
import numpy as np
import glob, os, random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings

### Step 2: Get the helper functions

In [67]:
# Read the data file
def read_data(filepath):
    data = []
    with open(filepath, 'r', encoding="utf8") as f:
        for line in f:
            data.append(tuple(zip(*[wt.split('/') for wt in line.strip().split()])))
    return data

# Get vocabs and tagset
def get_vocab_tagset(data):
    token_size, vocabs, tagset = 0, set(), set()
    for words, tags in data:
        token_size += len(words)
        vocabs.update(words)
        tagset.update(tags)
    return vocabs, tagset, token_size

# convert the sequencies to indexes and tensors
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix else 0 for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# indexing list elements: use this for vocabs and tagset
def to_index(aList):
    elem_to_idx = {e:aList.index(e)+1 for e in aList}
    elem_to_idx['Unknown'] = 0  
    idx_to_elem = {i:e for e,i in elem_to_idx.items()}
    return elem_to_idx, idx_to_elem

# import numpy as np
def score_to_tag(tag_scores, i_to_tag):
    tagged=[]
    for preds in tag_scores:
        preds = list(np.array(preds))
        idx = preds.index(max(preds))
        tagged.append(f"{i_to_tag[idx]}")
    return tagged

# tagged_files = [#"igbo/ITC5.coarse*", "igbo/fiction.coarse*", #coarse
#                 #"igbo/ITC5.fine*", "igbo/fiction.fine*", #fine
#                 "welsh/*pos_coarse*",
#                 #"welsh/*sem_coarse*", 
#                 #"welsh/*both_fine*", "welsh/*both_coarse*",
#                 #"english/corpus.small"
#                 #"swedish/*coarse*"
#                 ]
# tagged data file
tagged_data_file = "all_corpora/welsh/cy_pos_coarse_tagged"

### Step 3: Preparing the data

In [68]:
# Read training data for each folder
print(f"\nReading {tagged_data_file} ...", end='')
data = read_data(tagged_data_file)
vocabs, tagset, token_size = get_vocab_tagset(data)
tagset = set(t for t in tagset if t) #Just taking care of some null string in tagset

# #Uncomment below for data statistics
# print(f"\nSentences:\t{len(training_data)}\nToken size:\t{token_size}\nVocab size:\t{len(vocabs)}\nTagset size:\t{len(tagset)}")

# Shuffle and split data
random.seed(7)
random.shuffle(data)
test_size = 0.1 #90% of data for training; 10% of data for testing
train_set, test_set = data[:-int(len(data)*test_size)], data[-int(len(data)*test_size):]

EMBEDDING_DIM = 100
HIDDEN_DIM = 5    
word_to_idx, idx_to_word = to_index(list(vocabs))
tag_to_idx, idx_to_tag = to_index(list(tagset))


Reading all_corpora/welsh/cy_pos_coarse_tagged ...

### Step 4: Creating the model

In [69]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

### Step 5: Training the model

In [None]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(tag_to_idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
NUM_EPOCHS = 100
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for epoch in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data
        for sentence, tags in train_set:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = prepare_sequence(sentence, word_to_idx)
            targets = prepare_sequence(tags, tag_to_idx)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            y_gold, y_pred = [], []
            for sent in test_set:
                y_gold.extend(sent[1])
                inputs = prepare_sequence(sent[0], word_to_idx)
                tag_scores = model(inputs)
                y_pred.extend(score_to_tag(tag_scores, idx_to_tag))

            if epoch%10==0 or epoch == NUM_EPOCHS-1:
                print(f"\nEpoch {epoch:02d}: ACC={accuracy_score(y_gold, y_pred)*100:.2f}%",end=' ')
                print(f"PRE={precision_score(y_gold, y_pred, average='macro')*100:.2f}%",end=' ')
                print(f"REC={recall_score(y_gold, y_pred, average='macro')*100:.2f}%",end=' ')
                print(f"F1={f1_score(y_gold, y_pred, average='macro')*100:.2f}%")
            else:
                print('...',end='')
    print('--Done--')


Epoch 00: ACC=38.24% PRE=16.75% REC=15.74% F1=13.59%
...........................
Epoch 10: ACC=68.85% PRE=40.89% REC=43.14% F1=41.53%
...........................
Epoch 20: ACC=70.45% PRE=47.66% REC=45.66% F1=45.64%
...........................
Epoch 30: ACC=72.19% PRE=48.25% REC=47.64% F1=47.51%
...........................
Epoch 40: ACC=73.46% PRE=48.86% REC=49.83% F1=49.03%
...........................
Epoch 50: ACC=74.26% PRE=49.52% REC=51.06% F1=49.91%
...........................
Epoch 60: ACC=74.53% PRE=50.42% REC=51.65% F1=50.66%


# ***** Forget about anything below here *****

In [9]:
# with open('all_corpora/welsh/cy_pos_fine_tagged', 'w', encoding='utf8') as f1:
#     with open('all_corpora/welsh/cy_both_fine_tagged', 'r', encoding='utf8') as f2:
#         for line in f2:
#             f1.write(" ".join("/".join(wt.split('|',1)) for wt in line.split())+"\n")

In [15]:
# import re
# with open('all_corpora/welsh/cy_sem_coarse_tagged', 'w', encoding='utf8') as f1:
#     with open('all_corpora/welsh/cy_both_fine_tagged', 'r', encoding='utf8') as f2:
#         for line in f2:
#             words, tags = tuple(zip(*[wt.split('/',1) for wt in line.strip().split()]))
# #             pos_tags = [tag.split('|')[0] for tag in tags]
#             sem_tags = [re.findall(r'[A-Za-z]+\d*',tag.split('|',1)[1])[0] for tag in tags]
# #             coarse_pos_sem = ["|".join(ps) for ps in zip(pos_tags,sem_tags)]
# #             word_coarse_pos_sem = [f"{w}/{t}" for w,t in zip(words, coarse_pos_sem)]
#             word_coarse_sem = [f"{w}/{t}" for w,t in zip(words, sem_tags)]
# #             print(f"{words}\n\n{tags}\n\n{both}\n\n{' '.join(word_both_tags)}")
# #             print(f"{i}\t{' '.join(word_coarse_pos_sem)}")
#             f1.write(f"{' '.join(word_coarse_sem)}\n")
# #             break