In [None]:
# !pip install transformers

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer


import torchtext
from torchtext.legacy import data
from torchtext.legacy import datasets

from collections import defaultdict, Counter

from transformers import BertTokenizer, BertModel

import math
from typing import Tuple
import numpy as np

import time
import random
import functools

import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina' 
plt.style.use('seaborn')

In [None]:
# Fix the seed of every random number generator used in this notebook
# (Python, NumPy, PyTorch) so that repeated runs produce the same results.
SEED = 1234

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Request deterministic cuDNN algorithms as well.
torch.backends.cudnn.deterministic = True

In [None]:
import pandas as pd

# Location of the tab-separated PWNGC export (the file has no header row).
pwngc_path = "../data/test_transformer/pwngc4torchtext.csv"

# Load the raw table, then attach the column names by hand.
pwngc_df = pd.read_csv(pwngc_path, sep="\t", header=None)
pwngc_df.columns = ["token", "stemm", "pos", "annotation", "synset", "tag"]

# Preview the first 20 rows.
pwngc_df.head(20)

In [None]:
import numpy as np

# A row counts as annotated when its tag differs from the outside tag 'O'.
annotated_mask = pwngc_df["tag"] != 'O'
len(np.flatnonzero(annotated_mask))
# 532.821 annotated tokens

In [None]:
# Paths of the training, validation and test splits.
# All three live in the same directory as the other data files used in this
# notebook ("../data/test_transformer/"); `validate` and `test` previously
# pointed one directory level too high ("../../data/...").
train = "../data/test_transformer/train.csv"
validate = "../data/test_transformer/validate.csv"
test = "../data/test_transformer/test.csv"

In [None]:
# # Although the following code was originally written for the Universal Dependencies
# # English Treebank POS data (UDPOS), I will try to adapt it to my sequence tagging task.
# # Goal: create training/validation/testing data with torchtext
# # source: https://github.com/arthurdjn/udpos
# class PWNGC(datasets.sequence_tagging.SequenceTaggingDataset):
#     # urls = ['https://github.com/arthurdjn/udpos/raw/master/data/fr-gsd-ud-15032020.zip'] # change to the dataset of your choice
#     dirname = 'pwngc-wsd'  # don't forget to change me too !
#     name = 'udpos'         # not obligatory to change here
#
#     @classmethod
#     def splits(cls, fields, root=".data",
#                train="pwngc-wsd-dev.csv",
#                validation="pwngc-wsd-dev.csv",
#                test="pwngc-wsd-dev.csv", **kwargs):
#         """Downloads and loads the Universal Dependencies Version 2 POS Tagged
#         data.
#         """
#
#         return super(PWNGC, cls).splits(
#             fields=fields, root=root, train=train, validation=validation,
#             test=test, **kwargs)

In [None]:
# from torchtext import data
# import PWNGC
#
# TEXT = data.Field(lower = True)
# LEMMATIZED = data.Field(unk_token = None)
# UD_TAGS = data.Field(unk_token = None)
# fields = (("text", TEXT), ("lemmatized", LEMMATIZED), ("udtags", UD_TAGS))
#
# # Load the UD french dataset
# train_data, eval_data, test_data = PWNGC.splits(fields)


In [None]:
def read_data(corpus_file, datafields):
    """
    Read a column-formatted corpus file and turn it into a torchtext dataset.

    Every non-blank line describes one token as whitespace-separated columns:
    the second column is used as the word (the stem) and the last column as
    its tag. Blank lines separate sentences; each sentence becomes one example.

    A final sentence that is not followed by a blank line is kept, and runs of
    blank lines do not produce empty examples.

    :param corpus_file: path of the corpus file (read as UTF-8)
    :param datafields: list of (name, field) pairs, passed unchanged to
        ``data.Example.fromlist`` and ``data.Dataset``
    :return: ``data.Dataset`` holding one example per sentence
    """
    examples = []
    words = []
    labels = []

    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                columns = line.split()
                words.append(columns[1])
                labels.append(columns[-1])
            elif words:
                # Sentence boundary: emit the sentence collected so far.
                examples.append(data.Example.fromlist([words, labels], datafields))
                words = []
                labels = []

    # The file may end without a trailing blank line; do not lose that sentence.
    if words:
        examples.append(data.Example.fromlist([words, labels], datafields))

    return data.Dataset(examples, datafields)

In [None]:
# Show the installed torchtext version. This notebook imports `torchtext.legacy`,
# which is not part of every torchtext release
# (NOTE(review): to my knowledge it shipped in torchtext 0.9-0.11 only -- confirm).
print(torchtext.__version__)

In [None]:
# # Compares gold and predicted annotations (dicts mapping start -> (label, end)) and accumulates the counts in `stats` for later aggregation.
# def compare(gold, pred, stats):
#     for start, (lbl, end) in gold.items():
#         stats['total']['gold'] += 1
#         stats[lbl]['gold'] += 1
#     for start, (lbl, end) in pred.items():
#         stats['total']['pred'] += 1
#         stats[lbl]['pred'] += 1
#     for start, (glbl, gend) in gold.items():
#         if start in pred:
#             plbl, pend = pred[start]
#             if glbl == plbl and gend == pend:
#                 stats['total']['corr'] += 1
#                 stats[glbl]['corr'] += 1


In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    The encoding table is precomputed once for ``max_len`` positions and stored
    as a (non-trainable) buffer of shape ``[max_len, 1, d_model]``. Even feature
    columns hold sines, odd feature columns hold cosines.

    Args:
        d_model: size of the embedding dimension (even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: largest sequence length the module can encode.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the frequency vector has to be trimmed for the cosine part.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            RuntimeError: if ``seq_len`` exceeds the ``max_len`` given at construction.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise RuntimeError(
                f"sequence length {seq_len} exceeds the maximum of {self.pe.size(0)} "
                "positions this PositionalEncoding was built for")
        x = x + self.pe[:seq_len]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):
    """Token embedding -> positional encoding -> Transformer encoder stack -> linear projection.

    Args:
        ntoken: vocabulary size; also the size of the output projection.
        d_model: embedding / hidden dimension of the encoder.
        nhead: number of attention heads in each encoder layer.
        d_hid: dimension of the feed-forward sub-layer in each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability used by the positional encoding and the encoder layers.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        # Multi-head attention mechanism included already in TransformerEncoderLayer
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Initialise embedding and projection weights uniformly in [-0.1, 0.1]; zero the bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: optional Tensor, shape [seq_len, seq_len]. When omitted
                (``None``) every position may attend to every other position;
                pass a causal mask to restrict attention to earlier positions.

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        # Scale the embeddings by sqrt(d_model) before adding the positional encoding.
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build an (sz, sz) matrix with -inf strictly above the diagonal and 0 elsewhere."""
    blocked = torch.full((sz, sz), float('-inf'))
    # Keep only the part strictly above the main diagonal; everything else becomes 0.
    return blocked.triu(diagonal=1)

In [None]:
class RNNTagger(nn.Module):
    """Sequence tagger: embedding layer -> one-layer bidirectional GRU -> linear layer over labels.

    Args:
        text_field: field of the input words. Its ``vocab`` provides the vocabulary
            size, optional pre-trained ``vectors`` and the id of ``pad_token``.
        label_field: either the number of labels (an ``int``) or a label field whose
            ``vocab`` has been built. When a field with a padding token is given,
            padded input positions are forced to predict the padding label.
        emb_dim: dimension of the word embeddings.
        rnn_size: hidden size of the GRU (per direction).
        update_pretrained: whether pre-trained embeddings are fine-tuned.

    Raises:
        ValueError: if ``label_field`` is neither an int nor a field with a vocabulary.
    """

    def __init__(self, text_field, label_field, emb_dim, rnn_size, update_pretrained=False):
        super().__init__()

        voc_size = len(text_field.vocab)

        # Resolve the number of output labels. A plain integer is accepted as-is;
        # otherwise the size is read from the label vocabulary.
        self.pad_label_id = None
        if isinstance(label_field, int):
            self.n_labels = label_field
        else:
            label_vocab = getattr(label_field, 'vocab', None)
            if label_vocab is None:
                raise ValueError(
                    'label_field must be the number of labels (int) or a field whose '
                    'vocabulary has been built with build_vocab()')
            self.n_labels = len(label_vocab)
            # Remember the dummy output tag that belongs to padded positions, if any.
            pad_label = getattr(label_field, 'pad_token', None)
            if pad_label is not None and pad_label in label_vocab.stoi:
                self.pad_label_id = label_vocab.stoi[pad_label]

        # Embedding layer. If we're using pre-trained embeddings, copy them
        # into our embedding module.
        self.embedding = nn.Embedding(voc_size, emb_dim)
        if text_field.vocab.vectors is not None:
            self.embedding.weight = torch.nn.Parameter(text_field.vocab.vectors,
                                                       requires_grad=update_pretrained)

        # RNN layer. We're using a bidirectional GRU with one layer.
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=rnn_size,
                          bidirectional=True, num_layers=1)

        # Output layer. The input is two times the RNN size because the GRU
        # is bidirectional.
        self.top_layer = nn.Linear(2*rnn_size, self.n_labels)

        # To deal with the padding positions later, we need to know the
        # encoding of the padding dummy word.
        self.pad_word_id = text_field.vocab.stoi[text_field.pad_token]

        # Loss function that we will use during training (summed over all positions).
        self.loss = torch.nn.CrossEntropyLoss(reduction='sum')

    def compute_outputs(self, sentences):
        """Return label scores of shape (max_len, n_sentences, n_labels).

        ``sentences`` holds integer-encoded words with shape (max_len, n_sentences).
        """
        # Look up the embeddings: (max_len, n_sentences, emb_dim).
        embedded = self.embedding(sentences)

        # Apply the RNN: (max_len, n_sentences, 2*rnn_size).
        rnn_out, _ = self.rnn(embedded)

        # Apply the linear output layer: (max_len, n_sentences, n_labels).
        out = self.top_layer(rnn_out)

        if self.pad_label_id is not None:
            # Find the positions where the token is a dummy padding token and add
            # a large number to the score of the dummy padding label there, so that
            # padded positions are always tagged as padding.
            pad_mask = (sentences == self.pad_word_id).to(out.dtype)
            pad_bonus = out.new_zeros(self.n_labels)
            pad_bonus[self.pad_label_id] = 10000.0
            out = out + pad_mask.unsqueeze(-1) * pad_bonus

        return out

    def forward(self, sentences, labels):
        """Compute the summed cross-entropy loss of the predictions against ``labels``.

        ``labels`` must have the same (max_len, n_sentences) shape as ``sentences``.
        """
        # Compute the outputs. The shape is (max_len, n_sentences, n_labels).
        scores = self.compute_outputs(sentences)

        # Flatten the outputs and the gold-standard labels, to compute the loss.
        # The loss expects one 2-dimensional and one 1-dimensional tensor.
        scores = scores.reshape(-1, self.n_labels)
        labels = labels.reshape(-1)
        return self.loss(scores, labels)

    def predict(self, sentences):
        """Return the top-scoring label ids as a NumPy matrix of shape (n_sentences, max_len)."""
        # Compute the outputs from the linear units.
        scores = self.compute_outputs(sentences)

        # Select the top-scoring labels. The shape is now (max_len, n_sentences).
        predicted = scores.argmax(dim=2)

        # Transpose to (n_sentences, max_len) and convert to a NumPy matrix.
        return predicted.t().cpu().numpy()


In [None]:
import torch
class Tagger:
    """Bundles the torchtext fields, the training loop and inference for ``RNNTagger``.

    Args:
        lower: whether the text field lower-cases its input.
        device: device on which batches and the model are placed (default ``'cpu'``).
    """

    def __init__(self, lower, device='cpu'):
        self.TEXT = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, lower=lower)
        # read_data yields one label per token, so the labels form a sequence too.
        # They get the same <bos>/<eos> markers as the text so that the label tensor
        # has exactly the same shape as the text tensor.
        self.LABEL = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True,
                                unk_token=None, is_target=True)
        self.fields = [('text', self.TEXT), ('label', self.LABEL)]
        self.device = device
        # Set by train(); tag() refuses to run before that.
        self.model = None
        # Per-epoch training statistics of the most recent train() call.
        self.history = None

    def tag(self, sentences):
        """Apply the trained model to a list of tokenised sentences.

        Args:
            sentences: list of sentences, each a list of token strings.

        Returns:
            One list of tag strings per sentence, aligned with the sentence's tokens.

        Raises:
            RuntimeError: if called before ``train()``.
        """
        if self.model is None:
            raise RuntimeError('Tagger.tag() called before Tagger.train()')

        sentences = list(sentences)
        if not sentences:
            return []

        # Create a torchtext Dataset containing only the text to tag. No label
        # column is needed for prediction.
        text_fields = [('text', self.TEXT)]
        examples = [data.Example.fromlist([sen], text_fields) for sen in sentences]
        dataset = data.Dataset(examples, text_fields)

        iterator = data.Iterator(
            dataset,
            device=self.device,
            batch_size=64,
            repeat=False,
            train=False,
            sort=False)

        # Apply the trained model to all batches.
        out = []
        offset = 0  # index in `sentences` of the first sentence of the current batch
        label_names = self.LABEL.vocab.itos
        self.model.eval()
        with torch.no_grad():
            for batch in iterator:
                # predict() returns a NumPy matrix (n_sentences, max_len) with the
                # integer-encoded tags for each sentence.
                predicted = self.model.predict(batch.text)

                # Convert the integer-encoded tags to tag strings. Column 0 is the
                # <bos> position, and zip() cuts off <eos> and padding.
                batch_sentences = sentences[offset:offset + len(predicted)]
                for tokens, pred_sen in zip(batch_sentences, predicted):
                    out.append([label_names[pred_id] for _, pred_id in zip(tokens, pred_sen[1:])])
                offset += len(predicted)
        return out

    def train(self,
              train_path="../data/test_transformer/train.csv",
              valid_path="../data/test_transformer/validate.csv",
              n_epochs=3, batch_size=5, use_pretrained=True):
        """Train an ``RNNTagger`` and report loss and validation accuracy per epoch.

        Args:
            train_path: corpus file used for training (format as read by ``read_data``).
            valid_path: corpus file used for validation.
            n_epochs: number of passes over the training data.
            batch_size: number of sentences per training batch.
            use_pretrained: load the "glove.840B.300d" vectors for the text vocabulary
                instead of training the embeddings from scratch.

        The trained model is stored in ``self.model`` and the per-epoch statistics
        (``train_loss``, ``val_acc``) in ``self.history``.

        Raises:
            ValueError: if the training file contains no sentences.
        """
        # Read training and validation data according to the predefined split.
        train_examples = read_data(train_path, self.fields)
        valid_examples = read_data(valid_path, self.fields)

        # Count the number of words and sentences (+2 per sentence for <bos>/<eos>).
        n_sentences_train = len(train_examples)
        if n_sentences_train == 0:
            raise ValueError(f'no training sentences found in {train_path!r}')
        n_tokens_train = sum(len(ex.text) + 2 for ex in train_examples)

        # Build the vocabularies.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            self.TEXT.build_vocab(train_examples, vectors="glove.840B.300d")
        else:
            print('We are training word embeddings from scratch.')
            self.TEXT.build_vocab(train_examples, max_size=5000)
        # The label field has no <unk>, so every tag seen during validation must be
        # in the vocabulary: collect the tag inventory from both splits.
        self.LABEL.build_vocab(train_examples, valid_examples)

        # Create the model. RNNTagger stores its `label_field` argument as the number
        # of output labels, so the label count is passed.
        self.model = RNNTagger(self.TEXT, len(self.LABEL.vocab), emb_dim=300, rnn_size=128,
                               update_pretrained=False)
        self.model.to(self.device)

        n_batches = np.ceil(n_sentences_train / batch_size)
        # Used to scale the summed batch loss to a per-token magnitude.
        mean_n_tokens = n_tokens_train / n_batches

        train_iterator = data.BucketIterator(
            train_examples,
            device=self.device,
            batch_size=batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            train=True,
            sort=True)

        valid_iterator = data.BucketIterator(
            valid_examples,
            device=self.device,
            batch_size=64,
            sort_key=lambda x: len(x.text),
            repeat=False,
            train=False,
            sort=True)

        # Materialise the batches once so they are not rebuilt in every epoch.
        train_batches = list(train_iterator)
        valid_batches = list(valid_iterator)

        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01, weight_decay=1e-5)

        # Positions holding these label ids are structural (<pad>, <bos>, <eos>)
        # and are excluded from the validation accuracy.
        label_stoi = self.LABEL.vocab.stoi
        ignored_ids = [label_stoi[tok] for tok in
                       (self.LABEL.pad_token, self.LABEL.init_token, self.LABEL.eos_token)]

        history = defaultdict(list)

        for i in range(1, n_epochs + 1):

            t0 = time.time()

            loss_sum = 0

            self.model.train()
            for batch in train_batches:

                # Compute the output and loss.
                loss = self.model(batch.text, batch.label) / mean_n_tokens

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                loss_sum += loss.item()

            train_loss = loss_sum / n_batches
            history['train_loss'].append(train_loss)

            # Evaluate on the validation set: token-level accuracy over real tokens.
            n_correct = 0
            n_scored = 0
            self.model.eval()
            with torch.no_grad():
                for batch in valid_batches:
                    # Both matrices have shape (n_sentences, max_len).
                    predicted = self.model.predict(batch.text)
                    gold = batch.label.t().cpu().numpy()
                    scored = ~np.isin(gold, ignored_ids)
                    n_correct += int(((predicted == gold) & scored).sum())
                    n_scored += int(scored.sum())

            val_acc = n_correct / n_scored if n_scored else 0.0
            history['val_acc'].append(val_acc)

            t1 = time.time()
            print(f'Epoch {i}: train loss = {train_loss:.4f}, val acc = {val_acc:.4f}, time = {t1-t0:.4f}')

        self.history = history
# Create a tagger that keeps the original casing of the words (lower=False)
# and start training right away.
tagger = Tagger(lower=False)
tagger.train()

In [None]:
# tagger.train()