## LSTM Part-of-Speech Tagger
### Step 1: Importing the <code>torch</code>  and other libraries

In [101]:
#torch stuff
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

#other stuff
import numpy as np
import glob, os, random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
from timeit import default_timer as timer

### Step 2: Get the helper functions

In [102]:
# Read the data file
def read_data(filepath):
    data = []
    with open(filepath, 'r', encoding="utf8") as f:
        for line in f:
            data.append(tuple(zip(*[wt.split('/') for wt in line.strip().split()])))
    return data

# Get vocabs and tagset
def get_vocab_tagset(data):
    token_size, vocabs, tagset = 0, set(), set()
    for words, tags in data:
        token_size += len(words)
        vocabs.update(words)
        tagset.update(tags)
    return vocabs, tagset, token_size

# convert the sequencies to indexes and tensors
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix else 0 for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# indexing list elements: use this for vocabs and tagset
def to_index(aList):
    elem_to_idx = {e:aList.index(e)+1 for e in aList}
    elem_to_idx['Unknown'] = 0  
    idx_to_elem = {i:e for e,i in elem_to_idx.items()}
    return elem_to_idx, idx_to_elem

# import numpy as np
def score_to_tag(tag_scores, i_to_tag):
    tagged=[]
    for preds in tag_scores:
        preds = list(np.array(preds))
        idx = preds.index(max(preds))
        tagged.append(f"{i_to_tag[idx]}")
    return tagged

tagged_data_file = "all_corpora/welsh/cy_pos_coarse_tagged"

### Step 3: Preparing the data

In [103]:
# Read training data for each folder
print(f"\nReading {tagged_data_file} ...", end='')
data = read_data(tagged_data_file)
vocabs, tagset, token_size = get_vocab_tagset(data)
tagset = set(t for t in tagset if t) #Just taking care of some null string in tagset

# #Uncomment below for data statistics
# print(f"\nSentences:\t{len(training_data)}\nToken size:\t{token_size}\nVocab size:\t{len(vocabs)}\nTagset size:\t{len(tagset)}")

# Shuffle and split data
random.seed(7)
random.shuffle(data)
test_size = 0.1 #90% of data for training; 10% of data for testing
train_set, test_set = data[:-int(len(data)*test_size)], data[-int(len(data)*test_size):]

EMBEDDING_DIM = 100
HIDDEN_DIM = 5    
word_to_idx, idx_to_word = to_index(list(vocabs))
tag_to_idx, idx_to_tag = to_index(list(tagset))


Reading all_corpora/welsh/cy_pos_coarse_tagged ...

### Step 4: Creating the model

In [69]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

### Step 5: Training the model

In [100]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(tag_to_idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
start = timer()


ACC, PRE, REC, F1 = [],[],[],[]
NUM_EPOCHS = 200

print(f"\t\t\t\tACC\tPRE\tREC\tF1\n[{'.'*19}", end='')
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for epoch in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data
        for sentence, tags in train_set:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = prepare_sequence(sentence, word_to_idx)
            targets = prepare_sequence(tags, tag_to_idx)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            for sent in test_set:
                y_gold.extend(sent[1])
                inputs = prepare_sequence(sent[0], word_to_idx)
                tag_scores = model(inputs)
                y_pred.extend(score_to_tag(tag_scores, idx_to_tag))
                
            acc=accuracy_score(y_gold, y_pred)*100
            pre=precision_score(y_gold, y_pred, average='macro')*100
            pre=recall_score(y_gold, y_pred, average='macro')*100
            f1=f1_score(y_gold, y_pred, average='macro')*100
            
            ACC.append(acc)
            PRE.append(pre)
            REC.append(rec)
            F1.append(f1)

            if epoch%20==0 or epoch == NUM_EPOCHS-1:
                print(f"] Epoch {epoch:003d}:\t{acc:.2f}",end='\t')
                print(f"{pre:.2f}",end='\t')
                print(f"{rec:.2f}",end='\t')
                print(f"{f1:.2f}\n[", end='')
            else:
                print('.',end='')
print(f'Done! Total time: {timer() - start:.2f} secs')

#plot y_gold, y_pred for accuracy, precision and F1

				ACC	PRE	REC	F1
[..................] Epoch 00:	50.61	28.38	34.19	29.58
[...................] Epoch 20:	54.54	31.93	34.19	33.15
[...................] Epoch 40:	58.51	35.62	34.19	37.30
[...................] Epoch 60:	61.24	38.69	34.19	40.70
[...................] Epoch 80:	63.23	41.42	34.19	43.41
[...................] Epoch 100:	64.75	43.49	34.19	45.27
[...................] Epoch 120:	66.01	45.16	34.19	46.70
[...................] Epoch 140:	67.05	46.61	34.19	48.05
[...................] Epoch 160:	67.89	47.80	34.19	49.16
[...................] Epoch 180:	68.59	48.83	34.19	50.14
[..................] Epoch 199:	69.13	49.64	34.19	50.85
[Done! Total time: 1137.19 secs


In [84]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    target_names = list(tagset)
    print(classification_report(y_gold, y_pred, target_names=target_names))
    print(confusion_matrix(y_gold, y_pred))

             precision    recall  f1-score   support

        unk       0.00      0.00      0.00         1
         Gw       0.46      0.60      0.52        35
        Adf       0.60      0.58      0.59       100
        YFB       0.80      0.80      0.80       192
          B       1.00      0.97      0.98       157
        Ans       0.70      0.79      0.74       242
        Atd       0.77      0.87      0.82        69
          U       0.78      0.75      0.76       386
        Rha       0.26      0.17      0.21        35
        Cys       0.82      0.65      0.72        91
       pron       0.64      0.41      0.50        17
         Ar       0.69      0.71      0.70        89
      Ebych       0.85      1.00      0.92        75
        Rhi       0.50      0.67      0.57         3
          E       0.00      0.00      0.00         4

avg / total       0.76      0.76      0.76      1496

[[  0   0   0   0   0   1   0   0   0   0   0   0   0   0   0]
 [  0  21   4   0   0   4   0   4

In [86]:
target_names

['unk',
 'Gw',
 'Adf',
 'YFB',
 'B',
 'Ans',
 'Atd',
 'U',
 'Rha',
 'Cys',
 'pron',
 'Ar',
 'Ebych',
 'Rhi',
 'E']

# ***** Forget about anything below here *****

In [9]:
# with open('all_corpora/welsh/cy_pos_fine_tagged', 'w', encoding='utf8') as f1:
#     with open('all_corpora/welsh/cy_both_fine_tagged', 'r', encoding='utf8') as f2:
#         for line in f2:
#             f1.write(" ".join("/".join(wt.split('|',1)) for wt in line.split())+"\n")

In [15]:
# import re
# with open('all_corpora/welsh/cy_sem_coarse_tagged', 'w', encoding='utf8') as f1:
#     with open('all_corpora/welsh/cy_both_fine_tagged', 'r', encoding='utf8') as f2:
#         for line in f2:
#             words, tags = tuple(zip(*[wt.split('/',1) for wt in line.strip().split()]))
# #             pos_tags = [tag.split('|')[0] for tag in tags]
#             sem_tags = [re.findall(r'[A-Za-z]+\d*',tag.split('|',1)[1])[0] for tag in tags]
# #             coarse_pos_sem = ["|".join(ps) for ps in zip(pos_tags,sem_tags)]
# #             word_coarse_pos_sem = [f"{w}/{t}" for w,t in zip(words, coarse_pos_sem)]
#             word_coarse_sem = [f"{w}/{t}" for w,t in zip(words, sem_tags)]
# #             print(f"{words}\n\n{tags}\n\n{both}\n\n{' '.join(word_both_tags)}")
# #             print(f"{i}\t{' '.join(word_coarse_pos_sem)}")
#             f1.write(f"{' '.join(word_coarse_sem)}\n")
# #             break