In [None]:
import pyconll #pip3 install this if you don't have it
import torchtext.data as tt
import torch 
import torch.nn as nn
import torch.optim as optim
import time

In [None]:
# Paths to the Universal Dependencies CoNLL-U files, relative to the notebook.
AFRIKAANS_TRAIN = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu'
AFRIKAANS_DEV = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu'
AFRIKAANS_TEST = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu'

DUTCH_TRAIN = "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
DUTCH_DEV = "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
# The test split must not alias the training file, otherwise "test" numbers
# are computed on data the model was trained on.
DUTCH_TEST = "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"

In [None]:
# from https://github.com/soutsios/pos-tagger-bert/blob/master/pos_tagger_bert.ipynb
def make_sentences(path):
    """Read a CoNLL-U file and split it into parallel word / tag lists.

    Args:
        path: location of the CoNLL-U file, passed to ``pyconll.load_from_file``.

    Returns:
        A ``(sentences, tagged_sentences)`` tuple. ``sentences[i]`` holds the
        lower-cased word forms of sentence ``i`` and ``tagged_sentences[i]``
        holds the UPOS tags of the same tokens, in the same order. Tokens with
        a falsy ``upos`` or ``form`` are left out of both lists.
    """
    sentences, tagged_sentences = [], []
    for conll_sentence in pyconll.load_from_file(path):
        # Only tokens that carry both a surface form and a UPOS tag are usable.
        usable = [tok for tok in conll_sentence if tok.upos and tok.form]
        tagged_sentences.append([tok.upos for tok in usable])
        sentences.append([tok.form.lower() for tok in usable])
    return sentences, tagged_sentences

In [None]:
# Parse every treebank split into parallel (word sentences, tag sentences)
# lists. Files are read in the order train, dev, test for each language.
(
    (train_afr_raw, tagged_train_afr_raw),
    (dev_afr_raw, tagged_dev_afr_raw),
    (test_afr_raw, tagged_test_afr_raw),
) = [make_sentences(path) for path in (AFRIKAANS_TRAIN, AFRIKAANS_DEV, AFRIKAANS_TEST)]

(
    (train_du_raw, tagged_train_du_raw),
    (dev_du_raw, tagged_dev_du_raw),
    (test_du_raw, tagged_test_du_raw),
) = [make_sentences(path) for path in (DUTCH_TRAIN, DUTCH_DEV, DUTCH_TEST)]

In [None]:
# Summary statistics for the Afrikaans treebank splits.
print("AFRIKAANS")
print("Tagged sentences in train set: ", len(tagged_train_afr_raw))
print("Tagged words in train set:", sum(len(sentence) for sentence in tagged_train_afr_raw))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_afr_raw))
print("Tagged words in dev set:", sum(len(sentence) for sentence in tagged_dev_afr_raw))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_afr_raw))
print("Tagged words in test set:", sum(len(sentence) for sentence in tagged_test_afr_raw))
print(40*'*')
# Total = train + dev + test (the dev split was previously counted twice
# and the test split not at all).
print("Total sentences in dataset:", len(tagged_train_afr_raw)+len(tagged_dev_afr_raw)+len(tagged_test_afr_raw))

In [None]:
# Summary statistics for the Dutch treebank splits.
print("DUTCH")
print("Tagged sentences in train set: ", len(tagged_train_du_raw))
print("Tagged words in train set:", sum(len(sentence) for sentence in tagged_train_du_raw))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_du_raw))
print("Tagged words in dev set:", sum(len(sentence) for sentence in tagged_dev_du_raw))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_du_raw))
print("Tagged words in test set:", sum(len(sentence) for sentence in tagged_test_du_raw))
print(40*'*')
# Total = train + dev + test (the dev split was previously counted twice
# and the test split not at all).
print("Total sentences in dataset:", len(tagged_train_du_raw)+len(tagged_dev_du_raw)+len(tagged_test_du_raw))

In [None]:
# from https://github.com/tringm/POSTagger_Pytorch/blob/master/src/util/nlp.py
def build_tag_field(sentences_tokens):
    """Create the torchtext ``Field`` used for the POS-tag sequences.

    The field wraps every sequence in ``<bos>`` / ``<eos>`` markers, the same
    markers ``build_text_field`` uses, so words and tags stay aligned.

    Args:
        sentences_tokens: list of tag sequences. Kept for backward
            compatibility with existing callers; it is not needed to configure
            the field, because the vocabulary is built later from the combined
            datasets via ``build_vocab``.

    Returns:
        A new, vocabulary-less ``tt.Field``.
    """
    # The previous implementation also built a list of Examples and a Dataset
    # from `sentences_tokens` and immediately discarded both; that dead work
    # has been dropped.
    return tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
    
def build_text_field(sentences_words):
    """Create the torchtext ``Field`` used for the word sequences.

    The field wraps every sequence in ``<bos>`` / ``<eos>`` markers, the same
    markers ``build_tag_field`` uses, so words and tags stay aligned.

    Args:
        sentences_words: list of word sequences. Kept for backward
            compatibility with existing callers; it is not needed to configure
            the field, because the vocabulary is built later from the combined
            datasets via ``build_vocab``.

    Returns:
        A new, vocabulary-less ``tt.Field``.
    """
    # The previous implementation also built a list of Examples and a Dataset
    # from `sentences_words` and immediately discarded both; that dead work
    # has been dropped.
    return tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")

In [None]:
#fields, AFR
# One Field per split for words and for tags; their vocabularies are tied
# together at the end of this cell.
train_afr = build_text_field(train_afr_raw)
dev_afr = build_text_field(dev_afr_raw)
test_afr = build_text_field(test_afr_raw)
tagged_train_afr = build_tag_field(tagged_train_afr_raw)
tagged_dev_afr = build_tag_field(tagged_dev_afr_raw)
tagged_test_afr = build_tag_field(tagged_test_afr_raw)

# Pair each word sentence with its tag sentence as a torchtext Example.
fields_train_afr = (("text", train_afr), ("udtags", tagged_train_afr))
examples_train_afr = [tt.Example.fromlist(item, fields_train_afr) for item in zip(train_afr_raw, tagged_train_afr_raw)]
fields_dev_afr = (("text", dev_afr), ("udtags", tagged_dev_afr))
examples_dev_afr = [tt.Example.fromlist(item, fields_dev_afr) for item in zip(dev_afr_raw, tagged_dev_afr_raw)]
fields_test_afr = (("text", test_afr), ("udtags", tagged_test_afr))
examples_test_afr = [tt.Example.fromlist(item, fields_test_afr) for item in zip(test_afr_raw, tagged_test_afr_raw)]

train_data_afr = tt.Dataset(examples_train_afr, fields_train_afr)
valid_data_afr = tt.Dataset(examples_dev_afr, fields_dev_afr)
# The test dataset must wrap the *test* examples and fields; it used to be a
# second copy of the dev set, so test scores merely repeated validation scores.
test_data_afr = tt.Dataset(examples_test_afr, fields_test_afr)

#build vocabs so that they are shared between splits
train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
dev_afr.vocab = train_afr.vocab
test_afr.vocab = train_afr.vocab
tagged_train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
tagged_dev_afr.vocab = tagged_train_afr.vocab
tagged_test_afr.vocab = tagged_train_afr.vocab

In [None]:
#fields, DUT
# One Field per split for words and for tags; their vocabularies are tied
# together at the end of this cell.
train_du = build_text_field(train_du_raw)
dev_du = build_text_field(dev_du_raw)
test_du = build_text_field(test_du_raw)
tagged_train_du = build_tag_field(tagged_train_du_raw)
tagged_dev_du = build_tag_field(tagged_dev_du_raw)
tagged_test_du = build_tag_field(tagged_test_du_raw)

# Pair each word sentence with its tag sentence as a torchtext Example.
fields_train_du = (("text", train_du), ("udtags", tagged_train_du))
examples_train_du = [tt.Example.fromlist(item, fields_train_du) for item in zip(train_du_raw, tagged_train_du_raw)]
fields_dev_du = (("text", dev_du), ("udtags", tagged_dev_du))
examples_dev_du = [tt.Example.fromlist(item, fields_dev_du) for item in zip(dev_du_raw, tagged_dev_du_raw)]
fields_test_du = (("text", test_du), ("udtags", tagged_test_du))
examples_test_du = [tt.Example.fromlist(item, fields_test_du) for item in zip(test_du_raw, tagged_test_du_raw)]

train_data_du = tt.Dataset(examples_train_du, fields_train_du)
valid_data_du = tt.Dataset(examples_dev_du, fields_dev_du)
# The test dataset must wrap the *test* examples and fields; it used to be a
# second copy of the dev set, so test scores merely repeated validation scores.
test_data_du = tt.Dataset(examples_test_du, fields_test_du)

#build vocabs so that they are shared between splits
train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
dev_du.vocab = train_du.vocab
test_du.vocab = train_du.vocab
tagged_train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
tagged_dev_du.vocab = tagged_train_du.vocab
tagged_test_du.vocab = tagged_train_du.vocab

In [None]:
# Sanity check: show the text Field built for the Afrikaans training split.
print(train_afr)

In [None]:
# `train_data` is never defined in this notebook (it fails with NameError on a
# fresh kernel); inspect the Afrikaans training dataset that the model uses.
print(train_data_afr)

In [None]:
# Number of training sentences. `train_data` is never defined in this
# notebook, so use the Afrikaans training dataset that the model uses.
len(train_data_afr)

In [None]:
# from https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
#model
batch_size=128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#needs to be tuple of dataset objects
# The generic names train_data / valid_data / test_data are never defined in
# this notebook. The model below is sized from the Afrikaans vocabularies, so
# the iterators are built over the Afrikaans datasets.
train_iterator, valid_iterator, test_iterator = tt.BucketIterator.splits(
    (train_data_afr, valid_data_afr, test_data_afr),
    batch_size = batch_size,
    device = device, sort=False)

In [None]:
# try without dropout first
class BiLSTMTagger(nn.Module):
    """Embedding -> (Bi)LSTM -> linear layer producing per-token tag scores.

    Adapted from
    https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of scores produced per token (tag vocabulary size).
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        pad_idx: index passed to the embedding as ``padding_idx``.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, pad_idx):
        super().__init__()
        # A bidirectional LSTM concatenates forward and backward states, so
        # the classifier sees twice the hidden size in that case.
        lstm_out_dim = 2 * hidden_dim if bidirectional else hidden_dim

        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
        )
        # Fully connected layer mapping LSTM states to tag scores.
        self.fc = nn.Linear(lstm_out_dim, output_dim)

    def forward(self, text):
        """Return unnormalised tag scores for every position of ``text``.

        ``text`` is a tensor of token indices. The LSTM is created without
        ``batch_first``, so the first axis is treated as the sequence axis.
        The output keeps the leading axes of ``text`` and appends an axis of
        size ``output_dim``.
        """
        token_vectors = self.embedding(text)
        lstm_states, _ = self.lstm(token_vectors)
        return self.fc(lstm_states)

In [None]:
# Architecture hyperparameters.
emb_dim = 100
hid_dim = 128
n_layers = 1
bidirectional = True

# Input / output sizes come from the shared Afrikaans word and tag vocabularies.
in_dim = len(train_afr.vocab)
out_dim = len(tagged_train_afr.vocab)

# Vocabulary indices of the padding token: one for the embedding layer,
# one for masking padded positions in the loss and accuracy.
pad_index = train_afr.vocab.stoi[train_afr.pad_token]
tag_pad_idx = tagged_train_afr.vocab.stoi[tagged_train_afr.pad_token]

In [None]:
# The iterators place every batch on `device`, so the model has to live there
# too; otherwise the first forward pass fails on a CUDA machine.
model = BiLSTMTagger(in_dim, emb_dim, hid_dim, out_dim, n_layers, bidirectional, pad_index).to(device)
# Padded tag positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)
optimizer = optim.Adam(model.parameters())

In [None]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """Accuracy over the non-padding positions of a flattened batch.

    Args:
        preds: score tensor whose dim 1 indexes the candidate tags; the
            callers in this notebook pass it flattened to ``[N, n_tags]``.
        y: gold tag indices, one per row of ``preds``.
        tag_pad_idx: tag index that marks padding; those positions are ignored.

    Returns:
        A float tensor of shape ``(1,)`` holding ``correct / non_pad_count``,
        on the same device as the inputs. The result is NaN when every
        position is padding.
    """
    max_preds = preds.argmax(dim = 1)  # index of the highest-scoring tag per row
    non_pad_mask = y != tag_pad_idx
    correct = max_preds[non_pad_mask].eq(y[non_pad_mask])
    # Build the denominator from the mask so it stays on the inputs' device;
    # a CPU-side FloatTensor here breaks the division when running on CUDA.
    total = non_pad_mask.sum().float().reshape(1)
    return correct.sum().float() / total

In [None]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called as ``model(batch.text)``.
        iterator: sized iterable of batches exposing ``.text`` and ``.udtags``.
        optimizer: optimiser stepped once per batch.
        criterion: loss called on the flattened predictions and tags.
        tag_pad_idx: padding tag index forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of
        batches (``len(iterator)``).
    """
    model.train()
    batch_losses = []
    batch_accs = []

    for batch in iterator:
        optimizer.zero_grad()

        scores = model(batch.text)
        # Collapse all leading axes so each row is one token position.
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_tags = batch.udtags.view(-1)

        loss = criterion(flat_scores, flat_tags)
        acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: module called as ``model(batch.text)``; switched to eval mode.
        iterator: sized iterable of batches exposing ``.text`` and ``.udtags``.
        criterion: loss called on the flattened predictions and tags.
        tag_pad_idx: padding tag index forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of
        batches (``len(iterator)``).
    """
    model.eval()
    total_loss, total_acc = 0, 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.text)
            flat_scores = scores.view(-1, scores.shape[-1])
            gold = batch.udtags.view(-1)

            batch_loss = criterion(flat_scores, gold)
            batch_acc = categorical_accuracy(flat_scores, gold, tag_pad_idx)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers; fractional seconds are truncated.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [None]:
N_EPOCHS = 10

best_valid_loss = float('inf')
# In-memory copy of the weights from the epoch with the lowest validation
# loss. Without it, tracking `best_valid_loss` has no effect and any later
# evaluation silently uses the last-epoch weights.
best_state = None

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, tag_pad_idx)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # state_dict() tensors share storage with the live parameters, so
        # clone them to freeze this epoch's values.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# Leave the model at its best-validation checkpoint (if any epoch produced a
# finite improvement) so downstream evaluation uses those weights.
if best_state is not None:
    model.load_state_dict(best_state)

In [None]:
# Score the model once on the test iterator and report its loss and accuracy
# (accuracy is printed as a percentage).
test_loss, test_acc = evaluate(model, test_iterator, criterion, tag_pad_idx)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')