In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

## Create a custom dataset from a MEDDOCAN corpus instance

In [11]:
from dataclasses import asdict, dataclass
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import DefaultDict, Dict, List

from datasets import ClassLabel, Dataset, DatasetDict, Features, Sequence, Value

from meddocan.data import ArchiveFolder
from meddocan.data.docs_iterators import GsDocs


@dataclass
class Row:
    input_ids: str
    tokens: List[str]
    ner_tags: List[int]


archives = (ArchiveFolder.train, ArchiveFolder.dev, ArchiveFolder.test)
dataset_dict: Dict[str, Dataset] = {}

with TemporaryDirectory() as td:

    tags = DefaultDict(int)

    # 1. Write corpus in bio format as accepted by Flair
    for archive in archives:
        brat_docs = GsDocs(archive)
        tempfile = Path(td, f"{archive.value}.csv")
        brat_docs.to_connl03(file=tempfile, write_sentences=True, document_separator_token=None)

        # 2. Collect labels for Dataset creation
        with tempfile.open() as fp:
            for line in fp:
                if line.strip():
                    _, tag = line.split(" ")
                    tags[tag.strip()] += 1

    label_to_id = {k: i for i, k in enumerate(tags)}

    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=list(label_to_id.keys()))),
            "input_ids": Value("int32"),
        }
    )

    for archive in archives:
        brat_docs = GsDocs(archive)
        tempfile = Path(td, f"{archive.value}.csv")

        # 3. Collect all rows
        rows: List[Row] = []

        idx = 0
        tokens: List[str] = []
        ner_tags: List[int] = []

        with tempfile.open() as fp:
            for line in fp:
                if not line.strip():
                    row = Row(str(id), tokens, ner_tags)
                    rows.append(row)
                    idx += 1
                    tokens = []
                    ner_tags = []
                else:
                    token, ner_tag = line.split(" ")
                    tokens.append(token)
                    ner_tags.append(label_to_id[ner_tag.strip()])

        # 4. Create a Dataset from Rows
        ld = list(map(asdict, rows))
        dl = {k: [dic[k] for dic in LD] for k in ld[0]}
        dataset_dict[archive.value] = Dataset.from_dict(dl, features, split=archive.value)

# 5. Contruct the dataset
dataset = DatasetDict(dataset_dict)

## Use the dataset to train a NER model with transformers

Downsample the train split

In [12]:
downsample = 0.1
for split in dataset.keys():
    dataset[split] = dataset[split].shuffle(seed=0).select(range(int(dataset[split].num_rows * downsample)))

In [13]:
ds = dataset

In [15]:
import pandas as pd

pd.DataFrame({k: v.num_rows for k, v in ds.items()}, index=["number of example per split"])

Unnamed: 0,train,dev,test
number of example per split,515,515,515


Look at an example extracted from the dataset

In [16]:
pd.DataFrame(ds["train"][11]).T

Unnamed: 0,0,1,2,3
input_ids,2710,2710,2710,2710
tokens,CP,:,28030,.
ner_tags,0,0,8,0


In our ``Dataset`` objects, the keys of our example correspond to the column names of an Arrow table, while the values denote the entries in each column. In particular, we see that the ``ner_tags`` column corresponds to the mapping of each entity to a class ID. This is a bit cryptic to the human eye, so let’s create a new column with the familiar tags. To do this, the first thing to notice is that our ``Dataset`` object has a features attribute that specifies the underlying data types associated with each column:

In [17]:
for k, v in ds["train"].features.items():
    print(f"{k}: {v}")

input_ids: Value(dtype='int32', id=None)
tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(num_classes=45, names=['O', 'B-NOMBRE_SUJETO_ASISTENCIA', 'I-NOMBRE_SUJETO_ASISTENCIA', 'B-ID_SUJETO_ASISTENCIA', 'B-ID_ASEGURAMIENTO', 'I-ID_ASEGURAMIENTO', 'B-CALLE', 'I-CALLE', 'B-TERRITORIO', 'B-FECHAS', 'I-FECHAS', 'B-PAIS', 'B-EDAD_SUJETO_ASISTENCIA', 'I-EDAD_SUJETO_ASISTENCIA', 'B-SEXO_SUJETO_ASISTENCIA', 'B-NOMBRE_PERSONAL_SANITARIO', 'I-NOMBRE_PERSONAL_SANITARIO', 'B-ID_TITULACION_PERSONAL_SANITARIO', 'I-ID_TITULACION_PERSONAL_SANITARIO', 'B-CORREO_ELECTRONICO', 'I-CORREO_ELECTRONICO', 'B-HOSPITAL', 'I-HOSPITAL', 'B-FAMILIARES_SUJETO_ASISTENCIA', 'I-FAMILIARES_SUJETO_ASISTENCIA', 'I-TERRITORIO', 'B-OTROS_SUJETO_ASISTENCIA', 'B-INSTITUCION', 'I-INSTITUCION', 'I-PAIS', 'B-NUMERO_TELEFONO', 'I-NUMERO_TELEFONO', 'B-ID_CONTACTO_ASISTENCIAL', 'B-NUMERO_FAX', 'I-NUMERO_FAX', 'B-CENTRO_SALUD', 'I-CENTRO_SALUD', 'I-ID_SUJETO_ASISTEN

The ``Sequence`` class specifies that the field contains a list of features, which in the case of ``ner_tags`` corresponds to a list of ``ClassLabel`` features. Let’s pick out this feature from the training set as follows:

In [19]:
tags = ds["train"].features["ner_tags"].feature
print(tags)

ClassLabel(num_classes=45, names=['O', 'B-NOMBRE_SUJETO_ASISTENCIA', 'I-NOMBRE_SUJETO_ASISTENCIA', 'B-ID_SUJETO_ASISTENCIA', 'B-ID_ASEGURAMIENTO', 'I-ID_ASEGURAMIENTO', 'B-CALLE', 'I-CALLE', 'B-TERRITORIO', 'B-FECHAS', 'I-FECHAS', 'B-PAIS', 'B-EDAD_SUJETO_ASISTENCIA', 'I-EDAD_SUJETO_ASISTENCIA', 'B-SEXO_SUJETO_ASISTENCIA', 'B-NOMBRE_PERSONAL_SANITARIO', 'I-NOMBRE_PERSONAL_SANITARIO', 'B-ID_TITULACION_PERSONAL_SANITARIO', 'I-ID_TITULACION_PERSONAL_SANITARIO', 'B-CORREO_ELECTRONICO', 'I-CORREO_ELECTRONICO', 'B-HOSPITAL', 'I-HOSPITAL', 'B-FAMILIARES_SUJETO_ASISTENCIA', 'I-FAMILIARES_SUJETO_ASISTENCIA', 'I-TERRITORIO', 'B-OTROS_SUJETO_ASISTENCIA', 'B-INSTITUCION', 'I-INSTITUCION', 'I-PAIS', 'B-NUMERO_TELEFONO', 'I-NUMERO_TELEFONO', 'B-ID_CONTACTO_ASISTENCIAL', 'B-NUMERO_FAX', 'I-NUMERO_FAX', 'B-CENTRO_SALUD', 'I-CENTRO_SALUD', 'I-ID_SUJETO_ASISTENCIA', 'I-OTROS_SUJETO_ASISTENCIA', 'B-PROFESION', 'I-PROFESION', 'I-SEXO_SUJETO_ASISTENCIA', 'B-ID_EMPLEO_PERSONAL_SANITARIO', 'I-ID_EMPLEO_PERSO

We can use the ``ClassLabel.int2str()`` method to create a new column in our training set with class names for each tag. We’ll use the ``map()`` method to return a ``dict`` with the key corresponding to the new column name and the value as a ``list`` of class names:

In [20]:
def create_tag_names(batch):
    return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}

for k in ds:
    ds[k] = ds[k].map(create_tag_names)

  0%|          | 0/515 [00:00<?, ?ex/s]

  0%|          | 0/515 [00:00<?, ?ex/s]

  0%|          | 0/515 [00:00<?, ?ex/s]

Now that we have our tags in human-readable format, let’s see how the tokens and tags align for the first example in the training set:

In [21]:
ds_example = ds["train"][12]
pd.DataFrame([ds_example["tokens"], ds_example["ner_tags_str"]], ["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4
Tokens,Nombre,:,Luis,Miguel,.
Tags,O,O,B-NOMBRE_SUJETO_ASISTENCIA,I-NOMBRE_SUJETO_ASISTENCIA,O


As a quick check that we don’t have any unusual imbalance in the tags, let’s calculate the frequencies of each entity across each split:

In [22]:
from collections import Counter

split2freqs = DefaultDict(Counter)
for split, dataset in ds.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            if tag.startswith("B"):
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,ID_ASEGURAMIENTO,NOMBRE_PERSONAL_SANITARIO,ID_TITULACION_PERSONAL_SANITARIO,EDAD_SUJETO_ASISTENCIA,SEXO_SUJETO_ASISTENCIA,NOMBRE_SUJETO_ASISTENCIA,INSTITUCION,TERRITORIO,CALLE,CORREO_ELECTRONICO,HOSPITAL,FECHAS,PAIS,ID_SUJETO_ASISTENCIA,ID_CONTACTO_ASISTENCIAL,NUMERO_TELEFONO,NUMERO_FAX,FAMILIARES_SUJETO_ASISTENCIA,PROFESION
train,19,52,26,59,57,54,3,91,42,24,12,55,35,30,4,2,2,4,1
dev,19,52,26,59,57,54,3,91,42,24,12,55,35,30,4,2,2,4,1
test,19,52,26,59,57,54,3,91,42,24,12,55,35,30,4,2,2,4,1


This looks good—the distributions of our tags frequencies are roughly the same for each split, so the validation and test sets should provide a good measure of our NER tagger’s ability to generalize. Next, let’s look at a few popular multilingual transformers and how they can be adapted to tackle our NER task.

## Multilingual Transformers

Multilingual transformers involve similar architectures and training procedures as their monolingual counterparts, except that the corpus used for pretraining consists of documents in many languages. A remarkable feature of this approach is that despite receiving no explicit information to differentiate among the languages, the resulting linguistic representations are able to generalize well across languages for a variety of downstream tasks. In some cases, this ability to perform cross-lingual transfer can produce results that are competitive with those of monolingual models, which circumvents the need to train one model per language!

To measure the progress of cross-lingual transfer for NER, the ``CoNLL-2002`` and ``CoNLL-2003`` datasets are often used as a benchmark for English, Dutch, Spanish, and German. This benchmark consists of news articles annotated with categories ``LOC``, ``PER``, and ``ORG`` that are differents from the MEDDOCAN categories. Multilingual transformer models are usually evaluated in three different ways:

``en``  
    Fine-tune on the English training data and then evaluate on each language’s test set.

``each``  
    Fine-tune and evaluate on monolingual test data to measure per-language performance.

``all``  
    Fine-tune on all the training data to evaluate on all on each language’s test set.

We will adopt a similar evaluation strategy for our NER task, but first we need to select a model to evaluate. One of the first multilingual transformers was mBERT, which uses the same architecture and pretraining objective as BERT but adds Wikipedia articles from many languages to the pre-training corpus. Since then, mBERT has been superseded by XLM-RoBERTa (or XLM-R for short), so that’s the model we’ll consider in this chapter.

XLM-R uses only MLM (Masked Language Model) as a pre-training objective for 100 languages, but is distinguished by the huge size of its pre-training corpus compared to its predecessors: Wikipedia dumps for each language and 2.5 terabytes of Common Crawl data from the web. This corpus is several orders of magnitude larger than the ones used in earlier models and provides a significant boost in signal for low-resource languages like Burmese and Swahili, where only a small number of Wikipedia articles exist.

The RoBERTa part of the model’s name refers to the fact that the pre-training approach is the same as for the monolingual RoBERTa models. RoBERTa’s developers improved on several aspects of BERT, in particular by removing the next sentence prediction task altogether.3 XLM-R also drops the language embeddings used in XLM and uses SentencePiece to tokenize the raw texts directly.4 Besides its multilingual nature, a notable difference between XLM-R and RoBERTa is the size of the respective vocabularies: 250,000 tokens versus 55,000!

XLM-R is a great choice for multilingual NLU tasks. In the next section, we’ll explore how it can efficiently tokenize across many languages.

## A Closer Look at Tokenization

Instead of using a WordPiece tokenizer, XLM-R uses a tokenizer called SentencePiece that is trained on the raw text of all one hundred languages. To get a feel for how SentencePiece compares to WordPiece, let’s load the BERT and XLM-R tokenizers in the usual way with hugginface Transformers:

In [24]:
from transformers import AutoTokenizer

bert_model_name = "dccuchile/bert-base-spanish-wwm-cased"
xlmr_model_name = "xlm-roberta-large"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

By encoding a small sequence of text we can also retrieve the special tokens that each model used during pre-training:

In [25]:
text = "Jack Sparrow ama Nueva York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()
pd.DataFrame([bert_tokens, xlmr_tokens], ["BERT", "XLM-R"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
BERT,[CLS],J,##ac,##k,Spar,##ro,##w,ama,Nueva,Yor,##k,!,[SEP]
XLM-R,<s>,▁Jack,▁Spar,row,▁ama,▁Nueva,▁York,!,</s>,,,,


Here we see that instead of the ``[CLS]`` and ``[SEP]`` tokens that BERT uses for sentence classification tasks, XLM-R uses ``<s>`` and ``<\s>`` to denote the start and end of a sequence. These tokens are added in the final stage of tokenization, as we’ll see next.

## The Tokenizer Pipeline

So far we have treated tokenization as a single operation that transforms strings to integers we can pass through the model. This is not entirely accurate, and if we take a closer look we can see that it is actually a full processing pipeline that usually consists of four steps, as shown in Figure 4-1.
.....

## The Anatomy of the Transformers Model Class

Transformers is organized around dedicated classes for each architecture and task. The model classes associated with different tasks are named according to a ``<ModelName>For<Task>`` convention, or ``AutoModelFor<Task>`` when using the AutoModel classes.

...

### Creating a Custom Model for Token Classification

Let’s go through the exercise of building a custom token classification head for XLM-R. Since XLM-R uses the same model architecture as RoBERTa, we will use RoBERTa as the base model, but augmented with settings specific to XLM-R. Note that this is an educational exercise to show you how to build a custom model for your own task. For token classification, an ``XLMRobertaForTokenClassification`` class already exists that you can import from huggingface Transformers. If you want, you can skip to the next section and simply use that one.

To get started, we need a data structure that will represent our XLM-R NER tagger. As a first guess, we’ll need a configuration object to initialize the model and a ``forward()`` function to generate the outputs. Let’s go ahead and build our XLM-R class for token classification:

In [26]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load model body
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        # Use model body to get encoder representations
        outputs = self.roberta(input_ids, attention_mask=attention_mask,
                               token_type_ids=token_type_ids, **kwargs)
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits,
                                     hidden_states=outputs.hidden_states,
                                     attentions=outputs.attentions)

The ``config_class`` ensures that the standard XLM-R settings are used when we initialize a new model. If you want to change the default parameters, you can do this by overwriting the default settings in the configuration. With the ``super()`` method we call the initialization function of the ``RobertaPreTrainedModel`` class. This abstract class handles the initialization or loading of pre-trained weights. Then we load our model body, which is RobertaModel, and extend it with our own classification head consisting of a dropout and a standard feed-forward layer. Note that we set ``add_​pool⁠ing_layer=False`` to ensure all hidden states are returned and not only the one associated with the [CLS] token. Finally, we initialize all the weights by calling the ``init_weights()`` method we inherit from ``RobertaPreTrainedModel``, which will load the pre-trained weights for the model body and randomly initialize the weights of our token classification head.

The only thing left to do is to define what the model should do in a forward pass with a ``forward()`` method. During the forward pass, the data is first fed through the model body. There are a number of input variables, but the only ones we need for now are ``input_ids`` and ``attention_mask``. The hidden state, which is part of the model body output, is then fed through the dropout and classification layers. If we also provide labels in the forward pass, we can directly calculate the loss. If there is an attention mask we need to do a little bit more work to make sure we only calculate the loss of the unmasked tokens. Finally, we wrap all the outputs in a ``TokenClassifierOutput`` object that allows us to access elements in a the familiar named tuple from previous chapters.

By just implementing two functions of a simple class, we can build our own custom transformer model. And since we inherit from a ``PreTrainedModel``, we instantly get access to all the useful huggingface Transformer utilities, such as ``from_pretrained()``! Let’s have a look how we can load pretrained weights into our custom model.

### Loading a Custom Model

Now we are ready to load our token classification model. We’ll need to provide some additional information beyond the model name, including the tags that we will use to label each entity and the mapping of each tag to an ID and vice versa. All of this information can be derived from our tags variable, which as a ``ClassLabel`` object has a names attribute that we can use to derive the mapping:

In [27]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

We’ll store these mappings and the ``tags.num_classes`` attribute in the ``AutoConfig`` object. Passing keyword arguments to the ``from_pretrained()`` method overrides the default values:

In [28]:
from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels=tags.num_classes,
                                         id2label=index2tag, label2id=tag2index)

The AutoConfig class contains the blueprint of a model’s architecture. When we load a model with ``AutoModel.from_pretrained(model_ckpt)``, the configuration file associated with that model is downloaded automatically. However, if we want to modify something like the number of classes or label names, then we can load the configuration first with the parameters we would like to customize.

Now, we can load the model weights as usual with the ``from_pretrained()`` function with the additional config argument. Note that we did not implement loading pretrained weights in our custom model class; we get this for free by inheriting from ``RobertaPreTrainedModel``:

In [29]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification
              .from_pretrained(xlmr_model_name, config=xlmr_config)
              .to(device))

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.weight', 'rober

As a quick check that we have initialized the tokenizer and model correctly, let’s test the predictions on our small sequence of known entities:

In [30]:
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁Jack,▁Spar,row,▁ama,▁Nueva,▁York,!,</s>
Input IDs,0,21763,37456,15555,2527,111191,5753,38,2


As you can see here, the start ``<s>`` and end ``</s>`` tokens are given the IDs 0 and 2, respectively.

Finally, we need to pass the inputs to the model and extract the predictions by taking the argmax to get the most likely class per token:

In [31]:
outputs = xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=1)
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of outputs: {outputs.shape}")

Number of tokens in sequence: 9
Shape of outputs: torch.Size([1, 9, 45])


Here we see that the logits have the shape ``[batch_size, num_tokens, num_tags]``, with each token given a logit among the seven possible NER tags. By enumerating over the sequence, we can quickly see what the pre-trained model predicts:

In [32]:
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
Tokens,<s>,▁Jack,▁Spar,row,▁ama,▁Nueva,▁York,!,</s>,,...,,,,,,,,,,
Tags,I-ID_ASEGURAMIENTO,I-ID_ASEGURAMIENTO,B-TERRITORIO,I-CALLE,O,B-ID_ASEGURAMIENTO,O,I-ID_ASEGURAMIENTO,B-CALLE,I-ID_ASEGURAMIENTO,...,B-ID_SUJETO_ASISTENCIA,I-ID_ASEGURAMIENTO,O,B-ID_SUJETO_ASISTENCIA,I-ID_ASEGURAMIENTO,I-CALLE,B-TERRITORIO,B-ID_SUJETO_ASISTENCIA,I-ID_ASEGURAMIENTO,B-ID_ASEGURAMIENTO


Unsurprisingly, our token classification layer with random weights leaves a lot to be desired; let’s fine-tune on some labeled data to make it better! Before doing so, let’s wrap the preceding steps into a helper function for later use:

In [33]:
def tag_text(tet, tags, model, tokenizer):
    # Get tokens with speecial characters
    tokens = tokenizer(text).tokens()
    # Encode the sequence into IDs
    input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)
    # Get predictions as distribution over 7 possible classes
    outputs = model(input_ids)[0]
    # Take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

Before we can train the model, we also need to tokenize the inputs and prepare the labels. We’ll do that next.

### Tokenizing Texts for NER

Now that we’ve established that the tokenizer and model can encode a single example, our next step is to tokenize the whole dataset so that we can pass it to the XLM-R model for fine-tuning. HuggingFace Datasets provides a fast way to tokenize a ``Dataset`` object with the ``map()`` operation. To achieve this, recall that we first need to define a function with the minimal signature:

```python
function(examples: Dict[str, List]) -> Dict[str, List]
```

where ``examples`` is equivalent to a slice of a ``Dataset``, e.g., ``ds['train'][:10]``. Since the XLM-R tokenizer returns the input IDs for the model’s inputs, we just need to augment this information with the attention mask and the label IDs that encode the information about which token is associated with each NER tag.

Following the approach taken in the HuggingFace **Transformers documentation**, let’s look at how this works with our single Meddocan example by first collecting the words and tags as ordinary lists:

In [34]:
words, label = ds_example["tokens"], ds_example["ner_tags"]

Next, we tokenize each word and use the ``is_split_into_words`` argument to tell the tokenizer that our input sequence has already been split into words:

In [35]:
tokenized_input = xlmr_tokenizer(words, is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁N,ombre,▁:,▁Luis,▁Miguel,▁,.,</s>


In this example we can see that the tokenizer has split “Fecha“ into two subwords, “▁Fe” and “cha”. Since we’re following the convention that only “▁Fe” should be associated with the ``B-FECHAS`` label, we need a way to mask the subword representations after the first subword. Fortunately, ``tokenized_input`` is a class that contains a ``word_ids()`` function that can help us achieve this:


In [36]:
words_id = tokenized_input.word_ids()
pd.DataFrame([tokens, words_id], index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁N,ombre,▁:,▁Luis,▁Miguel,▁,.,</s>
Word IDs,,0,0,1,2,3,4,4,


Here we can see that ``word_ids`` has mapped each subword to the corresponding index in the words sequence, so the first subword, “▁Fe”, is assigned the index 0, while “cha” is assigned the index 1 (since “cha” is the second word in words). We can also see that special tokens like ``<s>`` and ``<\s>`` are mapped to ``None``. Let’s set –100 as the label for these special tokens and the subwords we wish to mask during training:

In [37]:
previous_word_idx = None
label_ids = []

for word_idx in words_id:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(word_idx)
    previous_word_idx = word_idx

labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Token", "Word IDs", "Label IDs", "Labels"]
pd.DataFrame([tokens, words_id, label_ids, labels], index)

Unnamed: 0,0,1,2,3,4,5,6,7,8
Token,<s>,▁N,ombre,▁:,▁Luis,▁Miguel,▁,.,</s>
Word IDs,,0,0,1,2,3,4,4,
Label IDs,-100,0,-100,1,2,3,4,-100,-100
Labels,IGN,O,IGN,B-NOMBRE_SUJETO_ASISTENCIA,I-NOMBRE_SUJETO_ASISTENCIA,B-ID_SUJETO_ASISTENCIA,B-ID_ASEGURAMIENTO,IGN,IGN


**Note**  
Why did we choose –100 as the ID to mask subword representations? The reason is that in PyTorch the cross-entropy loss class ``torch.nn.CrossEntropyLoss`` has an attribute called ``ignore_index`` whose value is –100. This index is ignored during training, so we can use it to ignore the tokens associated with consecutive subwords.
```

And that’s it! We can clearly see how the label IDs align with the tokens, so let’s scale this out to the whole dataset by defining a single function that wraps all the logic:

In [38]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True,
                                      is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

We now have all the ingredients we need to encode each split, so let’s write a function we can iterate over:

In [39]:
def encode_meddocan_dataset(corpus):
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=['ner_tags', 'tokens'])

By applying this function to a ``DatasetDict`` object, we get an encoded ``Dataset`` object per split. Let’s use this to encode our Meddocan corpus:

In [40]:
meddocan_encoded = encode_meddocan_dataset(ds)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Now that we have a model and a dataset, we need to define a performance metric.

### Performance Measures

Evaluating a NER model is similar to evaluating a text classification model, and it is common to report results for precision, recall, and F1-score. The only subtlety is that all words of an entity need to be predicted correctly in order for a prediction to be counted as correct. Fortunately, there is a nifty library called *seqeval* that is designed for these kinds of tasks. For example, given some placeholder NER tags and model predictions, we can compute the metrics via seqeval’s ``classification_report()`` function:

In [41]:
from seqeval.metrics import classification_report

y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



As we can see, *seqeval* expects the predictions and labels as lists of lists, with each list corresponding to a single example in our validation or test sets. To integrate these metrics during training, we need a function that can take the outputs of the model and convert them into the lists that seqeval expects. The following does the trick by ensuring we ignore the label IDs associated with subsequent subwords:

In [42]:
import numpy as np

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

Equipped with a performance metric, we can move on to actually training the model.

### Fine-Tuning XLM-Roberta

We now have all the ingredients to fine-tune our model!

In [43]:
from transformers import TrainingArguments

num_epochs = 3
batch_size = 4
logging_steps = len(ds["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-meddocan"
training_args = TrainingArguments(
    output_dir=model_name, log_level="error", num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size, learning_rate=5e-6,
    warmup_ratio=0.05,
    per_device_eval_batch_size=batch_size, evaluation_strategy="epoch",
    save_steps=1e6, weight_decay=0.01, disable_tqdm=False,
    logging_steps=logging_steps, push_to_hub=False, remove_unused_columns=True)

Here we evaluate the model’s predictions on the validation set at the end of every epoch, tweak the weight decay, and set ``save_steps`` to a large number to disable checkpointing and thus speed up training.

We also need to tell the ``Trainer`` how to compute metrics on the validation set, so here we can use the ``align_predictions()`` function that we defined earlier to extract the predictions and labels in the format needed by *seqeval* to calculate the F1-score:

In [44]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions,
                                       eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

The final step is to define a *data collator* so we can pad each input sequence to the largest sequence length in a batch. HuggingFace Transformers provides a dedicated data collator for token classification that will pad the labels along with the inputs:

In [45]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

Padding the labels is necessary because, unlike in a text classification task, the labels are also sequences. One important detail here is that the label sequences are padded with the value –100, which, as we’ve seen, is ignored by PyTorch loss functions.

We will train several models in the course of this chapter, so we’ll avoid initializing a new model for every Trainer by creating a ``model_init()`` method. This method loads an untrained model and is called at the beginning of the ``train()`` call:

In [46]:
def model_init():
    model = (XLMRobertaForTokenClassification
            .from_pretrained(xlmr_model_name, config=xlmr_config)
            .to(device))
    model.model_parallel = False
    return model

We can now pass all this information together with the encoded datasets to the Trainer:

In [47]:
from transformers import Trainer

trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=meddocan_encoded["train"],
                  eval_dataset=meddocan_encoded["dev"],
                  tokenizer=xlmr_tokenizer)

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'amp' from 'apex' (unknown location)

and then run the training loop as follows:

In [None]:
torch.cuda.empty_cache()

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

Save the final model in order to reuse it if needed

In [None]:
trainer.save_model()