In [5]:
import torch
import datasets

In [None]:
# Load the "de_gsd" configuration of the Universal Dependencies dataset, show its
# splits, and keep a direct handle on the training split for the cells below.
dataset = datasets.load_dataset(
    path="universal_dependencies",
    name="de_gsd",
    trust_remote_code=True,
)
print(dataset)
train_dataset = dataset["train"]

In [35]:
# Inspect the data: print the available splits, then the first 10 training examples.
print(dataset)

# Slice the rows first and select columns afterwards, so only 10 examples are
# materialized instead of entire columns of the training split.
first_examples = train_dataset[:10]
print(first_examples["text"])
print(first_examples["tokens"])
print(first_examples["upos"])

# Label feature behind the "upos" column; it maps tag ids to tag names.
upos_mapping = train_dataset.features["upos"].feature

# store the possible pos tags
pos_list = upos_mapping.names
print(pos_list)

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 13814
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 799
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 977
    })
})
['Sehr gute Beratung, schnelle Behebung der Probleme, so stelle ich mir Kundenservice vor.', 'Die Kosten sind definitiv auch im Rahmen.', 'Nette Gespräche, klasse Ergebnis', 'Ich bin seit längerer Zeit zur Behandlung verschiedenster "Leiden" in der Physiotherapieraxis "Gaby Montag" im Vital Center und kann ausschließlich Positives berichten!', 'Ob bei der Terminvergabe, den Behandlungsräumen oder den individuell zugeschnittenen Trainingsplänen sind alle Mitarbeiter äußerst kompete

In [38]:
def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=False, skip_index=-100):
    """Tokenize pre-split sentences and spread their word-level tags over the sub-tokens.

    Args:
        examples: Batch with a "tokens" entry (one list of words per sentence) and a
            "upos" entry (one list of tag ids per sentence, parallel to the words).
        tokenizer: Callable invoked with the word lists plus ``truncation=True``,
            ``is_split_into_words=True`` and ``padding=True``. Its result must offer
            ``word_ids(batch_index=...)`` and accept item assignment.
        label_all_tokens: When true, every sub-token of a word carries the word's tag;
            otherwise only the first sub-token does and the rest get ``skip_index``.
        skip_index: Label given to positions without a word id (special/padding
            tokens) and, unless ``label_all_tokens`` is set, to continuation sub-tokens.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of label
        ids per sentence, aligned position by position with the token sequence.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True)

    aligned_labels = []
    for sentence_idx, word_tags in enumerate(examples["upos"]):
        sentence_labels = []
        last_word = None
        for word in encoding.word_ids(batch_index=sentence_idx):
            if word is None:
                # No originating word (special or padding token): mark as skipped.
                sentence_labels.append(skip_index)
            elif word != last_word or label_all_tokens:
                # First sub-token of a word, or a continuation piece that should be labelled too.
                sentence_labels.append(word_tags[word])
            else:
                # Continuation piece of the previous word that is left unlabelled.
                sentence_labels.append(skip_index)
            last_word = word
        aligned_labels.append(sentence_labels)

    encoding["labels"] = aligned_labels
    return encoding

In [41]:
# convert numerical labels to string labels
def upos_id_to_label(upos_mapping, i):
    """Return the string name that `upos_mapping` assigns to the integer tag id `i`.

    `upos_mapping` must provide an `int2str` method; the lookup is delegated to it.
    """
    label_name = upos_mapping.int2str(i)
    return label_name

In [40]:
from transformers import AutoTokenizer

# Instantiate the pretrained tokenizer for the "FacebookAI/xlm-roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# Smoke test: tokenize the first five training examples and align their labels.
tokenized_inputs = tokenize_and_align_labels(train_dataset[:5], tokenizer)
# Second name bound to the same result object.
tokenized_dataset = tokenized_inputs
print(tokenized_inputs)

{'input_ids': [[0, 93404, 25989, 58860, 6, 4, 17230, 13, 873, 195285, 122, 39344, 6, 4, 221, 91151, 654, 2296, 14829, 22584, 1248, 6, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 622, 30882, 1276, 44836, 921, 566, 23, 745, 36070, 6, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 43268, 13, 64225, 13, 6, 4, 38411, 86909, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2484, 2394, 10743, 60915, 56, 5502, 2957, 404, 122, 68006, 186567, 1515, 44, 185113, 44, 23, 122, 13000, 53, 9619, 88562, 219, 33102, 44, 53186, 53, 53757, 44, 566, 23, 745, 116393, 11588, 165, 1876, 100320, 156060, 90, 120833, 711, 2], [0, 3545, 1079, 122, 27366, 814, 18018, 6, 4, 168, 68006, 7, 161960, 1367, 168, 85675, 404, 429, 103262, 33, 33, 44284, 7008, 42144, 33, 1276, 747, 28735, 196486, 56036, 165, 119997, 6, 5, 2, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [47]:
# Check that tokenization and alignment worked: for every tokenized example, print the
# decoded text, the tag names recovered from the aligned labels (dropping the -100
# skip index), and the tag names of the original example. With the default
# label_all_tokens=False the last two lines should match.
for i, label_ids in enumerate(tokenized_inputs["labels"]):
    print("text", i)
    print(tokenizer.decode(tokenized_inputs["input_ids"][i]))
    print([upos_id_to_label(upos_mapping, x) for x in label_ids if x != -100])
    # Index the row first, then the column: this reads a single example instead of
    # pulling the whole "upos" column of the training split on every iteration.
    print([upos_id_to_label(upos_mapping, x) for x in train_dataset[i]["upos"]])

text 0
<s> Sehr gute Beratung, schnelle Behebung der Probleme, so stelle ich mir Kundenservice vor.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
['ADV', 'ADJ', 'NOUN', 'PUNCT', 'ADJ', 'NOUN', 'DET', 'NOUN', 'PUNCT', 'ADV', 'VERB', 'PRON', 'PRON', 'NOUN', 'ADP', 'PUNCT']
['ADV', 'ADJ', 'NOUN', 'PUNCT', 'ADJ', 'NOUN', 'DET', 'NOUN', 'PUNCT', 'ADV', 'VERB', 'PRON', 'PRON', 'NOUN', 'ADP', 'PUNCT']
text 1
<s> Die Kosten sind definitiv auch im in dem Rahmen.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
['DET', 'NOUN', 'VERB', 'ADV', 'ADV', '_', 'ADP', 'DET', 'NOUN', 'PUNCT']
['DET', 'NOUN', 'VERB', 'ADV', 'ADV', '_', 'ADP', 'DET', 'NOUN', 'PUNCT']
text 2
<s> Nette Gespräche, klasse Ergebnis</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>