In [1]:
import pandas as pd
import collections

**Loading and Preparing Data**

In [2]:
from collections import defaultdict
from datasets import DatasetDict
from datasets import load_dataset

langs = ["es"]
#langs = ["en", "es", "zh", "ar", "ru"]

def get_multilingual_dataset(langs: list) -> collections.defaultdict:
    """
    Get multilingual NER Xtreme PAN-X dataset from huggingface

    Input:
      - langs: List of language codes to get data for

    Output: 
      - panx_ch: Complete PAN-X dataset
    """
    panx_ch = defaultdict(DatasetDict)
    for lang in langs:
      ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
      panx_ch[lang] = ds

    return panx_ch

panx_ch = get_multilingual_dataset(langs)

def extract_tag_names(dataset: collections.defaultdict, lang: str) -> collections.defaultdict:
  """
  Get string representation of NER tags from Xtreme PAN-X dataset

  Input:
    - dataset: Complete PAN-X dataset
    - lang: Language code

  Output:
    - dataset: PAN-X dataset for specified language with NER tag names
  """
  tags = dataset[lang]["train"].features["ner_tags"].feature

  def create_tag_names(batch):
      return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}

  dataset = dataset[lang].map(create_tag_names)
  return dataset

In [3]:
from collections import Counter

def check_tag_imbalance(lang_dataset: collections.defaultdict) -> pd.DataFrame:
    """
    Extract information about tags and their frequency among different splits in the data

    Input:
        - lang_dataset: Dataset for specific language with ner_tags_str feature

    Output:
        - dataset: Dataframe containing the frequency of all tags across splits
    """
    split2freqs = defaultdict(Counter)
    for split, dataset in lang_dataset.items():
        for row in dataset["ner_tags_str"]:
            for tag in row:
                if tag.startswith("B"):
                    tag_type = tag.split("-")[1]
                    split2freqs[split][tag_type] += 1

    return pd.DataFrame.from_dict(split2freqs, orient="index")

def sample_dataset(dataset: collections.defaultdict, langs: list) -> None:

    display(pd.DataFrame({lang: [dataset[lang]["train"].num_rows] for lang in langs},
                index=["Number of training examples"]))

    for lang in langs:
        print(f"Sampling: {lang}")
        element = dataset[lang]["train"][0]
        for key, value in element.items():
            print(f"{key}: {value}")

        lang_dataset = extract_tag_names(dataset, lang=lang)
        frequency_dataset = check_tag_imbalance(lang_dataset)

        display(frequency_dataset)
        print("-"*60)

**Tag Functions**

In [4]:
tags = panx_ch["es"]["train"].features["ner_tags"].feature

index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

**Tokenization**

In [5]:
from transformers import AutoTokenizer

xlmr_model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

test_text = """Four score and seven years ago our fathers brought forth on this continent, a new nation, 
conceived in Liberty, and dedicated to the proposition that all men are created equal."""

xlmr_tokens = xlmr_tokenizer(test_text).tokens()

In [6]:
es_data = extract_tag_names(panx_ch, lang='es')
pd.DataFrame(es_data['train'])

Unnamed: 0,tokens,ner_tags,langs,ner_tags_str
0,"[REDIRECCIÓN, Algarrobo, (, Chile, )]","[0, 5, 6, 6, 6]","[es, es, es, es, es]","[O, B-LOC, I-LOC, I-LOC, I-LOC]"
1,"[W., G., Sebald]","[1, 2, 2]","[es, es, es]","[B-PER, I-PER, I-PER]"
2,"[', '', Entrenador, /, a, '', Tamás, Faragó]","[0, 0, 0, 0, 0, 0, 1, 2]","[es, es, es, es, es, es, es, es]","[O, O, O, O, O, O, B-PER, I-PER]"
3,"[REDIRECCIÓN, Società, Sportiva, Virtus, Lanci...","[0, 3, 4, 4, 4, 4]","[es, es, es, es, es, es]","[O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG]"
4,"[Houses, of, the, Holy]","[3, 4, 4, 4]","[es, es, es, es]","[B-ORG, I-ORG, I-ORG, I-ORG]"
...,...,...,...,...
19995,"[Ciudad, de, México, (, )]","[5, 6, 6, 0, 0]","[es, es, es, es, es]","[B-LOC, I-LOC, I-LOC, O, O]"
19996,"[Tesuque, (, Nuevo, México, )]","[5, 6, 6, 6, 6]","[es, es, es, es, es]","[B-LOC, I-LOC, I-LOC, I-LOC, I-LOC]"
19997,"[REDIRECCIÓN, Catedral, de, Szeged]","[0, 3, 4, 4]","[es, es, es, es]","[O, B-ORG, I-ORG, I-ORG]"
19998,"[El, videoclip, también, fue, grabado, en, Nue...","[0, 0, 0, 0, 0, 0, 5, 6, 0]","[es, es, es, es, es, es, es, es, es]","[O, O, O, O, O, O, B-LOC, I-LOC, O]"


In [7]:
def tokenize_and_align_labels(dataset: collections.defaultdict) -> collections.defaultdict:
    """
    Encode and tokenize text and align tokens with NER labels

    Input:
        - dataset: Language subset of complete PAN-X NER corpus
    
    Output:
        - tokenized_inputs: Tokens aligned with NER labels
    """
    tokenized_inputs = xlmr_tokenizer(dataset['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for idx, label in enumerate(dataset["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def encode_panx_dataset(dataset: collections.defaultdict) -> collections.defaultdict:
    """
    Apply tokenization and aligning function to complete PAN-X corpus

    Input: 
        - dataset: Language subset of complete PAN-X NER corpus

    Output:
        - dataset: PAN-X language subset with tokenized inputs and NER labels
    """
    return dataset.map(tokenize_and_align_labels, batched=True, 
                       remove_columns=['langs', 'ner_tags', 'tokens'])

panx_es_encoded = encode_panx_dataset(panx_ch["es"])

**Model Setup and Training**

In [8]:
from transformers import DataCollatorForTokenClassification

"""
The data collator pads and truncates the text sequence to make them 
all the same length.
"""

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [9]:
#from huggingface_hub import notebook_login
#notebook_login()
#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

In [10]:
from transformers import TrainingArguments

"""
Set training arguments for XLM Roberta token classification model
"""

num_epochs = 3
batch_size = 24
#logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"
training_args = TrainingArguments(
    output_dir=model_name, 
    log_level="error", 
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, 
    evaluation_strategy="epoch",
    save_steps=1e6, 
    weight_decay=0.01, 
    disable_tqdm=False,
    #logging_steps=logging_steps, 
    push_to_hub=True)

In [11]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

"""
Define XMLRoberta token classification model
"""

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """
    RobertaPreTrainedModel provides the pretrained weights
    useful for initialising the custom model, where as,

    RobertaModel used in the constructor provides the hidden states weights
    by loading the model body
    """
    config_class = XLMRobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load model body
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights,
        # init_weights is a method in the base class RobertaPreTrainedModel
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        # Use model body to get encoder representations
        outputs = self.roberta(input_ids, attention_mask=attention_mask,
                               token_type_ids=token_type_ids, **kwargs)
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits,
                                     hidden_states=outputs.hidden_states,
                                     attentions=outputs.attentions)

In [12]:
from seqeval.metrics import f1_score
import numpy as np

def align_predictions(predictions, label_ids):
    """
    Align predictions and labels via padding/truncation
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

def compute_metrics(eval_pred):
    """
    Use seqeval libarary to calculate f1 score of NER sequence predictions
    """
    y_pred, y_true = align_predictions(eval_pred.predictions,
                                       eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

In [13]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.cuda.memory_reserved(device=device), torch.cuda.max_memory_reserved(device=device)

(0, 0)

In [14]:
from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels=tags.num_classes,
                                         id2label=index2tag, label2id=tag2index)

def model_init():
    return (XLMRobertaForTokenClassification
            .from_pretrained(xlmr_model_name, config=xlmr_config)
            .to(device))

from transformers import Trainer

trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator,compute_metrics=compute_metrics,
                  train_dataset=panx_es_encoded["train"],
                  eval_dataset=panx_es_encoded["validation"],
                  tokenizer=xlmr_tokenizer)
trainer.train()

  0%|          | 0/2502 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 6.41 GB, other allocations: 2.15 GB, max allowed: 9.07 GB). Tried to allocate 732.43 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).