In [None]:
!pip install datasets
!pip install transformers --upgrade
!pip install accelerate --upgrade
!pip install transformers[torch]
!pip install accelerate -U

!apt-get -qq install -y transformers-dev && pip install -U transformers
!apt-get -qq install -y accelerate-dev && pip install -U accelerate

`XTREME` is a benchmark dataset designed to evaluate the performance of multilingual models across multiple NLP tasks in various languages. It covers a wide range of tasks, including machine translation, named entity recognition, part-of-speech tagging, and more. The dataset aims to assess the ability of models to generalize across languages and tasks.

In [None]:
from datasets import get_dataset_config_names
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [None]:
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

`load_dataset` abstracts away the process of manually downloading, processing, and preparing a dataset for use in your code. If you don't use `load_dataset`, you would need to manually obtain the dataset, possibly download it from a source, preprocess the data, organize it into a suitable format (like a Python dictionary or DataFrame), and then handle data loading and manipulation on your own. This process can be time-consuming and error-prone, especially for large and complex datasets.

In [None]:
from datasets import load_dataset
ds = load_dataset("xtreme", name=f"PAN-X.de")
for split in ds:
  print(split)

train
validation
test


In [None]:
from collections import defaultdict
from datasets import DatasetDict
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059] # the numbers follows the priority of each language
#Creating a defaultdict where the default value for a missing key is a DatasetDict
#This structure allows for organizing datasets for different languages
panx_ch = defaultdict(DatasetDict)
#Looping through the language codes and their respective fractions
for lang, frac in zip(langs, fracs):
  #Inside the loop, for each language Load monolingual corpus using Hugging Face's load_dataset function from the "xtreme" dataset collection
  ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
  #shuffles the data and selects a subset according to the specified fraction for the current split in the current language.
  for split in ds:
    panx_ch[lang][split] = (  # 'de' train, validation, test || 'fr' train, validation, test, ||  .....
      ds[split]
      .shuffle(seed=0)
      .select(range(int(frac * ds[split].num_rows))))

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
print(panx_ch)

defaultdict(<class 'datasets.dataset_dict.DatasetDict'>, {'de': DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 6290
    })
}), 'fr': DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 4580
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 2290
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 2290
    })
}), 'it': DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1680
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 840
    })
    test: Dataset({
        features: ['token

In [None]:
import pandas as pd
pd.DataFrame({lang: [panx_ch[lang]["train"].num_rows] for lang in langs}
,index=["Number of training examples"] )

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [None]:
print(panx_ch["de"]["train"]) # 3 columns with 12500 rows

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 12580
})


In [None]:
print(panx_ch["de"]["train"].features.items())

dict_items([('tokens', Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)), ('ner_tags', Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)), ('langs', Sequence(feature=Value(dtype='string', id=None), length=-1, id=None))])


we have more examples in German than all other languages combined, so
we’ll use it as a starting point from which to perform `zero-shot` cross-lingual transfer
to French, Italian, and English

In [None]:
element = panx_ch["de"]["train"]
for key, value in element.features.items():
 print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [None]:
tags = panx_ch["de"]["train"].features["ner_tags"].feature
tags

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)

The `int2str` method of `ClassLabel` takes an integer index as an argument and returns the corresponding string label associated with that index based on the predefined mapping.

For example, if `idx` is 0, `tags.int2str(0)` would return `O` because the list of label names is ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'] and the index 0 corresponds to the 'O' label in this list.

In [None]:
def create_tag_names(batch):
  return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}
# we added a new column "ner_tags_str"
panx_de = panx_ch["de"].map(create_tag_names)
print(panx_de)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags_str': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


In [None]:
de_example = panx_de["train"][0]
pd.DataFrame([de_example["tokens"], de_example["ner_tags_str"]],['Tokens', 'Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [None]:
# Quick check that we don’t have any unusual imbalance in the tags

print(panx_de.items())

from collections import Counter
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
  for row in dataset["ner_tags_str"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

dict_items([('train', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 12580
})), ('validation', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 6290
})), ('test', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 6290
}))])


Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


In [None]:
!pip install transformers



XLM-R uses a tokenizer called `SentencePiece`
that is trained on the raw text of all one hundred languages, which we will use.

The `SentencePiece` tokenizer is based on a type of subword segmentation called
Unigram and encodes each input text as a sequence of Unicode characters. This last
feature is especially useful for multilingual corpora since it allows SentencePiece to be
agnostic about accents, punctuation, and the fact that many languages, like Japanese,
do not have whitespace characters.

In [None]:
from transformers import AutoTokenizer
bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

# Notice the diffirence
print(bert_tokens)
print(xlmr_tokens)

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']


About `XLMRobertaForTokenClassification` class :

`outputs` will include elements like `outputs[0]` (hidden states), `outputs[1]` (pooled output), and possibly `outputs[2]` (attention weights), depending on the model and configuration.
`[0]` index in `outputs[0]` is used to access the first (topmost) layer's output, which represents the final contextualized representations of the input tokens ***contains the most refined and contextually rich representations of the input tokens***

---

The `config_class` ensures that the standard XLM-R settings are used when we initialize a new model.

---
Note that we set `add_pooling_layer=False` to ensure all hidden states are returned and not only the one associated with the [CLS] token

------------------------------------------------------------------------
let’s see how we can `encode` our
simple example in a form suitable for NER.
#### The first thing to do is load the pretrained model with a token classification head. But instead of loading this head directly fromTransformers,

we will build it ourselves! By diving deeper into the Transformers API, we can do this with just a few steps.

------------------------------------------------------------------------


In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  config_class = XLMRobertaConfig
  def __init__(self, config):
    # If you want to change the default parameters, super() method
    super().__init__(config)
    self.num_labels = config.num_labels

    # Load model body
    self.roberta = RobertaModel(config, add_pooling_layer=False)

    # Set up token classification head
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Load and initialize pretrained weights for the model body
    # and randomly initialize the weights of our token classification head.
    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,labels=None, **kwargs):
    # Use model body to get encoder representations
    outputs = self.roberta(input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids, **kwargs)

    # If there is an attention mask, little bit more work, make sure we only calculate the loss of the unmasked tokens.

    # Apply classifier to encoder representation
    sequence_output = self.dropout(outputs[0])
    logits = self.classifier(sequence_output)

    # Calculate losses
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    # Return model output object
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

In [None]:
from transformers import AutoConfig

index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

The `AutoConfig` class contains the blueprint of a model’s architecture. When we load a model with AutoModel.from_pretrained(model_ckpt)

`Note` that we did not implement loading pretrained weights in our custom model class; we get this for free by inheriting from RobertaPreTrainedModel:

When we load
a model with `AutoModel.from_pretrained(model_ckpt)`, the configuration file associated
with that model is downloaded automatically. However, if we want to modify
something like the number of classes or label names, then we can load the configuration
first with the parameters we would like to customize.

In [None]:
#recieves name, num_labels, id2label, label2id
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag, label2id=tag2index)

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification
    .from_pretrained(xlmr_model_name, config=xlmr_config)
    .to(device))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# quick checkinput_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

When you use `dim=-1`, it's equivalent to specifying `dim=2` if the outputs tensor has shape (batch_size, sequence_length, num_labels).

Pass the inputs to the model and extract the predictions by taking the `argmax` to get the most likely class per token:

In [None]:
outputs = xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)

# [batch_size, num_tokens, num_tags],
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of outputs: {outputs.shape}")

Number of tokens in sequence: 10
Shape of outputs: torch.Size([1, 10, 7])


In [None]:
# see what the pretrained model predicts
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,I-ORG,B-PER,B-PER,I-ORG,I-ORG,I-ORG,B-PER,B-PER,B-PER,I-ORG


Unsurprisingly, our token classification layer with random weights leaves a lot to be desired; let’s
### fine-tune on some labeled data to make it better!
Before doing so, let’s wrap the preceding steps into a `helper` function for later use:

`Note`
`torch.argmax` Function: This function is used to find the index (class label) with the highest predicted probability for each token along a specific dimension. In this case, dim=2 means that we are taking the maximum along the third dimension, which corresponds to the class label dimension.
  
  ` outputs -> (batch_size, sequence_length, num_labels) `

`(predictions)`: After applying torch.argmax, you get a tensor of shape (batch_size, sequence_length)

`Summary`
the step `predictions = torch.argmax(outputs, dim=2)` is a common post-processing step to convert the model's output (probability distributions over class labels) into a sequence of predicted class labels for each token in the input sequence. It allows you to make decisions about which class label is most likely for each token in a sequence, which is essential for tasks like Named Entity Recognition, part-of-speech tagging, sentiment analysis, and more.

In [None]:
# wrap the preceding steps into a helper function for later use:
def tag_text(text, tags, model, tokenizer):
  # Get tokens with special characters
  tokens = tokenizer(text).tokens()

  # Encode the sequence into IDs
  input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)

  # Get predictions as distribution over 7 possible classes
  outputs = model(input_ids)[0]

  # Take argmax to get most likely class per token
  predictions = torch.argmax(outputs, dim=2)

  # Convert to DataFrame
  preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
  return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [None]:
# is_split_into_words argument to tell the tokenizer that our input sequence has already been split into words:
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


An example for the previous code:

```
# Input sequence (pre-tokenized)
input_sequence = ["This", "is", "an", "example", "sequence", "."]

Tokenized Input:
{'input_ids': [0, 573, 30, 36, 88, 7642, 4], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

Tokens:
['<s>', '▁This', '▁is', '▁an', '▁example', '▁sequence', '.', '</s>']

```





we need a way to mask the subword representations after the first subword. Fortunately, tokenized_input is a class that contains a `word_ids()` function
that can help us achieve this:

لو الكلمة مقسومه نصين النصين يكون ليهم نفس الأي دي


In [None]:
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


Why did we choose –100 as the ID to mask subword representations?

The reason is that in PyTorch the cross-entropy loss class
torch.nn.CrossEntropyLoss has an attribute called `ignore_index`
whose value is –100. This index is ignored during training, so we
can use it to ignore the tokens associated with consecutive
subwords.

In [None]:
previous_word_idx = None
label_ids = []
words, labels = de_example["tokens"], de_example["ner_tags"]

for word_idx in word_ids:
  if word_idx is None or word_idx == previous_word_idx:
    label_ids.append(-100)
  elif word_idx != previous_word_idx:
    label_ids.append(labels[word_idx])
  previous_word_idx = word_idx

labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [None]:
# Scale the previous steps out to the whole dataset
def tokenize_and_align_labels(examples):
  tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
  labels = []

  for idx, label in enumerate(examples["ner_tags"]):
    word_ids = tokenized_inputs.word_ids(batch_index=idx)
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
      if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
      else:
        label_ids.append(label[word_idx])
        previous_word_idx = word_idx

    labels.append(label_ids)

  tokenized_inputs["labels"] = labels
  return tokenized_inputs

In [None]:
# so let’s write a function we can iterate over:
def encode_panx_dataset(corpus):
  return corpus.map(tokenize_and_align_labels, batched=True, remove_columns=['langs', 'ner_tags', 'tokens'])

In [None]:
panx_de_encoded = encode_panx_dataset(panx_ch["de"])
print(panx_de_encoded)

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6290
    })
})


###Now that we have a model and a dataset, we need to define a performance metric.

##Performance Measures

it is common to report results for precision, recall, and F1-score. The only subtlety is that all words of an entity need to be predicted correctly in order for a prediction to be counted as correct.

Fortunately, there is a nifty library called `seqeval` that is designed for these kinds of tasks. For example, given some placeholder NER tags and model predictions, we can compute the metrics via seqeval’s `classification_report()` function:

In [None]:
! pip install seqeval



In [None]:
from seqeval.metrics import classification_report
y_true = [
    ["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"]
    ]
y_pred = [
    ["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"]
    ]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



1. Formatting for seqeval: seqeval expects predictions and labels as lists of lists, where each inner list corresponds to a single example in your validation or test dataset. Each inner list contains the predicted or true labels for the tokens in that example.

2. Handling Subwords: Transformer-based models often use subword tokenization, which means that a word can be split into multiple subword tokens (e.g., "running" might be tokenized into ["run", "\##ning"]). This can lead to label IDs associated with subwords, and you typically want to ignore these and consider the label for the first subword as the label for the entire word.

We should do :

Input: Model's predictions for a sequence of subword tokens: [PER, PER, O, LOC, LOC, O, O]

Output: seqeval-formatted list: [[PER, PER], [O, LOC, LOC], [O]]

In [None]:
import numpy as np
def align_predictions(predictions, label_ids):
  preds = np.argmax(predictions, axis=2)
  batch_size, seq_len = preds.shape
  labels_list, preds_list = [], []

  for batch_idx in range(batch_size):
    example_labels, example_preds = [], []

    for seq_idx in range(seq_len):
      # Ignore label IDs = -100
      if label_ids[batch_idx, seq_idx] != -100:
        example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
        example_preds.append(index2tag[preds[batch_idx][seq_idx]])

    labels_list.append(example_labels)
    preds_list.append(example_preds)
  return preds_list, labels_list

## Fine-Tuning XLM-RoBERTa
We now have all the ingredients to fine-tune our model! Our first strategy will be to fine-tune our base model on the `German subset` of PAN-X and then evaluate its `zeroshot cross-lingual` performance on French, Italian, and English. As usual, we’ll use the Transformers `Trainer` to handle our training loop, so first we need to define the training attributes using the `TrainingArguments class`:

In [None]:
from transformers import TrainingArguments

In [None]:
num_epochs = 3
batch_size = 24
logging_steps = len(panx_de_encoded["train"])
model_name = f"{xlmr_model_name}-finetuned-panx-de"
training_args = TrainingArguments(
    output_dir=model_name, log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_steps=1e6, weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False)

We also need to tell the Trainer how to compute metrics on the validation set, so here we can use the `align_predictions()` function that we defined earlier to extract the predictions and labels in the format needed by seqeval to calculate the F1-score:

In [None]:
from seqeval.metrics import f1_score
def compute_metrics(eval_pred):
  y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
  return {"f1": f1_score(y_true, y_pred)}

The final step is to define a `data_collator` so we can pad each input sequence to the largest sequence length in a batch. Transformers provides a dedicated data collator for token classification that will pad the labels along with the inputs:

In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)
# data_collator -> input IDs, attention masks, and labels

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### We will train several models in the course of this chapter, so we’ll avoid initializing a new model for every Trainer by creating a model_init() method. This method loads an untrained model and is called at the beginning of the train() call:

In [None]:
def model_init():
  return (XLMRobertaForTokenClassification
  .from_pretrained(xlmr_model_name, config=xlmr_config)
  .to(device))

from transformers import Trainer
trainer = Trainer(model_init=model_init, args=training_args,
  data_collator=data_collator, compute_metrics=compute_metrics,
  train_dataset=panx_de_encoded["train"],
  eval_dataset=panx_de_encoded["validation"],
  tokenizer=xlmr_tokenizer)

trainer.train()
# trainer.push_to_hub(commit_message="Training completed!")

Epoch,Training Loss,Validation Loss,F1
1,No log,0.153999,0.830184
2,No log,0.149287,0.846788
3,No log,0.136146,0.864681


TrainOutput(global_step=1575, training_loss=0.15531324598524304, metrics={'train_runtime': 521.0526, 'train_samples_per_second': 72.43, 'train_steps_per_second': 3.023, 'total_flos': 864249509940432.0, 'train_loss': 0.15531324598524304, 'epoch': 3.0})

In [None]:
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)

##Error Analysis

we’ll now calculate a loss per token in the sample sequence, a method that we can apply to the validation set

In [None]:
def forward_pass_with_label(batch):
  # Convert dict of lists to list of dicts suitable for data collator
  features = [dict(zip(batch, t)) for t in zip(*batch.values())]
  # Pad inputs and labels and put all tensors on device
  batch = data_collator(features)
  input_ids = batch["input_ids"].to(device)
  attention_mask = batch["attention_mask"].to(device)
  labels = batch["labels"].to(device)

  with torch.no_grad():
    # Pass data through model
    output = trainer.model(input_ids, attention_mask)
    # logit.size: [batch_size, sequence_length, classes]
    # Predict class with largest logit value on classes axis
    predicted_label = torch.argmax(output.logits, axis=-1).cpu().numpy()

  # Calculate loss per token after flattening batch
  loss = cross_entropy(output.logits.view(-1, 7), labels.view(-1), reduction="none")
  # Unflatten batch dimension and convert to numpy array
  loss = loss.view(len(input_ids), -1).cpu().numpy()
  return {"loss":loss, "predicted_label": predicted_label}

```
# This is formatted as code
batch = {
    "input_ids": [101, 102, 103],
    "attention_mask": [1, 1, 1],
    "labels": [0, 1, 0]
}
# to this
{
    "input_ids": 101,
    "attention_mask": 1,
    "labels": 0
}


In [None]:
valid_set = panx_de_encoded["validation"]
valid_set = valid_set.map(forward_pass_with_label, batched=True, batch_size=32)
df = valid_set.to_pandas()

NameError: ignored

The tokens and the labels are still encoded with their IDs, so let’s map the tokens and
labels back to strings to make it easier to read the results. For the padding tokens with
label –100 we assign a special label, IGN, so we can filter them later. We also get rid of
all the padding in the loss and predicted_label fields by truncating them to the
length of the inputs:

In [None]:
index2tag[-100] = "IGN"
df["input_tokens"] = df["input_ids"].apply(lambda x: xlmr_tokenizer.convert_ids_to_tokens(x))
df["predicted_label"] = df["predicted_label"].apply(lambda x: [index2tag[i] for i in x])
df["labels"] = df["labels"].apply(lambda x: [index2tag[i] for i in x])
df['loss'] = df.apply(lambda x: x['loss'][:len(x['input_ids'])], axis=1)
df['predicted_label'] = df.apply(lambda x: x['predicted_label'][:len(x['input_ids'])], axis=1)
df.head(1)

In [None]:
# in one line by creating a row for each element in the original rows list.
df_tokens = df.apply(pd.Series.explode)
df_tokens = df_tokens.query("labels != 'IGN'")
df_tokens["loss"] = df_tokens["loss"].astype(float).round(2)
df_tokens.head(7)

In [None]:
# Group it by the input tokens and aggregate the losses for each token with the count, mean, and sum.
(df_tokens.groupby("input_tokens")[["loss"]]
  .agg(["count", "mean", "sum"])
  .droplevel(level=0, axis=1) # Get rid of multi-level columns
  110 | Chapter 4: Multilingual Named Entity Recognition
  .sort_values(by="sum", ascending=False)
  .reset_index()
  .round(2)
  .head(10)
  .T
)

In [None]:
# Group the label IDs and look at the losses for each class:
(df_tokens.groupby("labels")[["loss"]]
  .agg(["count", "mean", "sum"])
  .droplevel(level=0, axis=1)
  .sort_values(by="mean", ascending=False)
  .reset_index()
  .round(2)
  .T
)

We see that `B-ORG` has the highest average loss, which means that determining the
beginning of an organization poses a challenge to our model.

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
def plot_confusion_matrix(y_preds, y_true, labels):
  cm = confusion_matrix(y_true, y_preds, normalize="true")
  fig, ax = plt.subplots(figsize=(6, 6))
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
  disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
  plt.title("Normalized confusion matrix")
  plt.show()
plot_confusion_matrix(df_tokens["labels"], df_tokens["predicted_label"], tags.names)

In [None]:
def get_samples(df):
  for _, row in df.iterrows():
    labels, preds, tokens, losses = [], [], [], []
    for i, mask in enumerate(row["attention_mask"]):
      if i not in {0, len(row["attention_mask"])}:
        labels.append(row["labels"][i])
        preds.append(row["predicted_label"][i])
        tokens.append(row["input_tokens"][i])
        losses.append(f"{row['loss'][i]:.2f}")
    df_tmp = pd.DataFrame({"tokens": tokens, "labels": labels, "preds": preds, "losses": losses}).T
    yield df_tmp
df["total_loss"] = df["loss"].apply(sum)
df_tmp = df.sort_values(by="total_loss", ascending=False).head(3)
for sample in get_samples(df_tmp):
  display(sample)

In [None]:
df_tmp = df.loc[df["input_tokens"].apply(lambda x: u"\u2581(" in x)].head(2)
for sample in get_samples(df_tmp):
  display(sample)

##Cross-Lingual Transfer
we have fine-tuned XLM-R on German, we can evaluate its ability to transfer
to other languages via the predict() method of the Trainer

In [1]:
def get_f1_score(trainer, dataset):
  return trainer.predict(dataset).metrics["test_f1"]

In [None]:
f1_scores = defaultdict(dict)
f1_scores["de"]["de"] = get_f1_score(trainer, panx_de_encoded["test"])
print(f"F1-score of [de] model on [de] dataset: {f1_scores['de']['de']:.3f}")

In [None]:
text_fr = "Jeff Dean est informaticien chez Google en Californie"
tag_text(text_fr, tags, trainer.model, xlmr_tokenizer)

let’s quantify how well our German model fares on the whole French test set by writing a simple function that encodes a dataset and generates the classification report on it


In [None]:
def evaluate_lang_performance(lang, trainer):
  panx_ds = encode_panx_dataset(panx_ch[lang])
  return get_f1_score(trainer, panx_ds["test"])

In [None]:
f1_scores["de"]["fr"] = evaluate_lang_performance("fr", trainer)
print(f"F1-score of [de] model on [fr] dataset: {f1_scores['de']['fr']:.3f}")
# Remember that our model has not seen a single labeled French example!

In [None]:
f1_scores["de"]["it"] = evaluate_lang_performance("it", trainer)
print(f"F1-score of [de] model on [it] dataset: {f1_scores['de']['it']:.3f}")

In [None]:
f1_scores["de"]["en"] = evaluate_lang_performance("en", trainer)
print(f"F1-score of [de] model on [en] dataset: {f1_scores['de']['en']:.3f}")

we’ll tweak the logging_steps argument of Training
Arguments to account for the changing training set sizes, monolingual corpus, downsamples it by num_samples, and fine-tunes XLM-R on that
sample to return the metrics from the best epoch:

In [None]:
def train_on_subset(dataset, num_samples):
  train_ds = dataset["train"].shuffle(seed=42).select(range(num_samples))
  valid_ds = dataset["validation"]
  test_ds = dataset["test"]

  training_args.logging_steps = len(train_ds) // batch_size

  trainer = Trainer(
      model_init=model_init,
      args=training_args,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
      train_dataset=train_ds,
      eval_dataset=valid_ds,
      tokenizer=xlmr_tokenizer)

  trainer.train()

  if training_args.push_to_hub:
    trainer.push_to_hub(commit_message="Training completed!")
    f1_score = get_f1_score(trainer, test_ds)

  return pd.DataFrame.from_dict(
      {"num_samples": [len(train_ds)], "f1_score": [f1_score]})

In [None]:
# encode the French corpus into input IDs, attention masks, and label IDs
panx_fr_encoded = encode_panx_dataset(panx_ch["fr"])

training_args.push_to_hub = False
metrics_df = train_on_subset(panx_fr_encoded, 250)
metrics_df

250 examples, fine-tuning on French underperforms the
zero-shot transfer from German by a large margin. Let’s now increase our training set
sizes to 500, 1,000, 2,000, and 4,000 examples to get an idea of how the performance
increases:

In [None]:
for num_samples in [500, 1000, 2000, 4000]:
  metrics_df = metrics_df.append(
      train_on_subset(panx_fr_encoded, num_samples), ignore_index=True)

In [None]:
fig, ax = plt.subplots()
ax.axhline(f1_scores["de"]["fr"], ls="--", color="r")
metrics_df.set_index("num_samples").plot(ax=ax)

plt.legend(["Zero-shot from de", "Fine-tuned on fr"], loc="lower right")
plt.ylim((0, 1))
plt.xlabel("Number of Training Samples")
plt.ylabel("F1 Score")
plt.show()

###There is one final technique we can try to evaluate multilingual learning: fine-tuning on multiple languages at once

In [None]:
from datasets import concatenate_datasets

def concatenate_splits(corpora):
  multi_corpus = DatasetDict()
  for split in corpora[0].keys():
    multi_corpus[split] = concatenate_datasets(
        [corpus[split] for corpus in corpora]).shuffle(seed=42)
  return multi_corpus

panx_de_fr_encoded = concatenate_splits([panx_de_encoded, panx_fr_encoded])

In [None]:
training_args.logging_steps = len(panx_de_fr_encoded["train"]) // batch_size
training_args.push_to_hub = True
training_args.output_dir = "xlm-roberta-base-finetuned-panx-de-fr"

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=xlmr_tokenizer,
    train_dataset=panx_de_fr_encoded["train"],
    eval_dataset=panx_de_fr_encoded["validation"])

trainer.train()
trainer.push_to_hub(commit_message="Training completed!")

In [None]:
for lang in langs:
  f1 = evaluate_lang_performance(lang, trainer)
  print(f"F1-score of [de-fr] model on [{lang}] dataset: {f1:.3f}")

###fine-tuning on each language separately against multilingual learning on all the corpora

In [None]:
corpora = [panx_de_encoded]
# Exclude German from iteration
for lang in langs[1:]:
  training_args.output_dir = f"xlm-roberta-base-finetuned-panx-{lang}"
  # Fine-tune on monolingual corpus
  ds_encoded = encode_panx_dataset(panx_ch[lang])
  metrics = train_on_subset(ds_encoded, ds_encoded["train"].num_rows)
  # Collect F1-scores in common dict
  f1_scores[lang][lang] = metrics["f1_score"][0]
  # Add monolingual corpus to list of corpora to concatenate
  corpora.append(ds_encoded)

# concatenate all the splits together to create a multilingual corpus of all four languages
corpora_encoded = concatenate_splits(corpora)

In [None]:
training_args.logging_steps = len(corpora_encoded["train"]) // batch_size
training_args.output_dir = "xlm-roberta-base-finetuned-panx-all"

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=xlmr_tokenizer,
    train_dataset=corpora_encoded["train"],
    eval_dataset=corpora_encoded["validation"])

trainer.train()
trainer.push_to_hub(commit_message="Training completed!")

In [None]:
for idx, lang in enumerate(langs):
  f1_scores["all"][lang] = get_f1_score(trainer, corpora[idx]["test"])
  scores_data = {
      "de": f1_scores["de"],
      "each": {lang: f1_scores[lang][lang] for lang in langs},
      "all": f1_scores["all"]}
  f1_scores_df = pd.DataFrame(scores_data).T.round(4)
  f1_scores_df.rename_axis(
      index="Fine-tune on",
      columns="Evaluated on",
      inplace=True)

f1_scores_df