In [2]:
import torch
import sys
import datasets
from transformers import AutoTokenizer, XLMRobertaModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "en_ewt" configuration of the Universal Dependencies dataset.
# NOTE: trust_remote_code=True allows `datasets` to execute the loading script that ships with
# the dataset repository; only keep it for repositories you trust.
dataset = datasets.load_dataset(path="universal_dependencies", name="en_ewt", trust_remote_code=True)
print(dataset)

# Bind each split to its own name.
train_dataset = dataset["train"]
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Peek at the first ten raw sentences. Rows are sliced before the column is selected so that
# only ten rows are read instead of materialising the whole "text" column first.
print(train_dataset[:10]["text"])

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 12543
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 2002
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 2077
    })
})
['Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.', '[This killing of a respected cleric will be causing us trouble for years to come.]', 'DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.', 'Two of them were being run by 2 officials of the Ministry of the Interior!', 'The MoI in Iraq is equivalent to the US FBI, so this would be like having J. Edgar H

In [4]:
# Default UD dependency relations according to https://universaldependencies.org/u/dep/
# (one relation family per row; the order of the entries defines the IDs below).
all_deprels = [
    "acl", "acl:relcl",
    "advcl", "advcl:relcl",
    "advmod", "advmod:emph", "advmod:lmod",
    "amod",
    "appos",
    "aux", "aux:pass",
    "case",
    "cc", "cc:preconj",
    "ccomp",
    "clf",
    "compound", "compound:lvc", "compound:prt", "compound:redup", "compound:svc",
    "conj",
    "cop",
    "csubj", "csubj:outer", "csubj:pass",
    "dep",
    "det", "det:numgov", "det:nummod", "det:poss",
    "discourse",
    "dislocated",
    "expl", "expl:impers", "expl:pass", "expl:pv",
    "fixed",
    "flat", "flat:foreign", "flat:name",
    "goeswith",
    "iobj",
    "list",
    "mark",
    "nmod", "nmod:poss", "nmod:tmod",
    "nsubj", "nsubj:outer", "nsubj:pass",
    "nummod", "nummod:gov",
    "obj",
    "obl", "obl:agent", "obl:arg", "obl:lmod", "obl:tmod",
    "orphan",
    "parataxis",
    "punct",
    "reparandum",
    "root",
    "vocative",
    "xcomp",
]

# A few more relations are needed for en_ewt; they are appended after the default ones.
all_deprels.extend(["det:predet", "obl:npmod", "nmod:npmod"])

# Deprel label -> integer ID (the label's position in all_deprels).
deprel_to_id = dict(zip(all_deprels, range(len(all_deprels))))

In [6]:
# Code for the assignment in https://github.com/coli-saar/cl/wiki/Assignment:-Dependency-parsing
# Alexander Koller, December 2023

def strip_none_heads(examples, i):
    """
    Return tokens, heads and deprels of sentence ``i`` without the positions whose head is "None".

    :param examples: batch mapping with the keys "tokens", "head" and "deprel"; each value is
        indexed by sentence and holds one entry per token
    :param i: index of the sentence inside the batch
    :return: a 3-tuple ``(tokens, heads, deprels)`` of equally long tuples; three empty tuples
        if no position survives the filter
    """
    kept = [
        (token, head, deprel)
        for token, head, deprel in zip(examples["tokens"][i], examples["head"][i], examples["deprel"][i])
        if head != "None"  # heads are compared against the literal string "None"
    ]

    if not kept:
        # zip(*[]) yields nothing at all, so `tokens, heads, deprels = strip_none_heads(...)`
        # would fail with "not enough values to unpack". Hand back three empty columns instead.
        return (), (), ()

    # Transpose the list of (token, head, deprel) rows into three parallel columns.
    return tuple(zip(*kept))


def map_first_occurrence(nums):
    """
    Build a lookup from every distinct number in ``nums`` to the index where it first appears.

    Entries that are ``None`` are skipped and never become keys.

    Example:
    > map_first_occurrence([0,1,2,3,3,3,4])
    {0: 0, 1: 1, 2: 2, 3: 3, 4: 6}

    :param nums: iterable of hashable values; ``None`` entries are ignored
    :return: dict mapping each non-None value to the position of its first occurrence
    """
    first_position = {}
    for position, value in enumerate(nums):
        if value is None:
            continue
        # setdefault stores the position only the first time a value shows up
        first_position.setdefault(value, position)
    return first_position


def pad_to_same_size(lists, padding_symbol):
    """
    Right-pad every sequence in ``lists`` with ``padding_symbol`` up to the length of the longest one.

    :param lists: iterable of tuples or lists (may be empty, may be a one-shot iterator)
    :param padding_symbol: value appended to the shorter sequences
    :return: list of new sequences of equal length; tuples stay tuples, lists stay lists.
        An empty input gives an empty list.
    """
    sequences = list(lists)  # the input is traversed twice, so materialise it once
    maxlen = max((len(seq) for seq in sequences), default=0)  # default avoids max() on an empty input

    padded = []
    for seq in sequences:
        filler = (padding_symbol,) * (maxlen - len(seq))
        # tuple + tuple is the original behaviour; list rows get a list filler so `+` does not raise
        padded.append(seq + (list(filler) if isinstance(seq, list) else filler))
    return padded


def tokenize_and_align_labels(examples, deprel_to_id, tokenizer, skip_index=-100):
    """
    Tokenize a batch of pre-split sentences and align the head / deprel annotations with the
    resulting sub-word tokens.

    Words whose head is "None" are removed first (via strip_none_heads). A label is attached to
    the first token of every word; tokens without a word id (e.g. BOS/EOS) and the remaining
    tokens of a multi-token word receive ``skip_index``.

    :param examples: batch with the keys "tokens", "head" and "deprel" (one entry per sentence)
    :param deprel_to_id: mapping from deprel label to integer ID
    :param tokenizer: tokenizer that is called on the list of word sequences; its result must
        offer ``word_ids(batch_index=...)`` and item assignment
    :param skip_index: label written at token positions that carry no annotation
    :return: the tokenizer output, extended with these per-sentence entries:
        "head" - per token, the position of the first token of the head word (0, i.e. the
        first token / BOS, when HEAD is 0) or ``skip_index``;
        "deprel_ids" - per token, the deprel ID or ``skip_index``;
        "tokens_representing_words" - 0 followed by the position of the first token of each
        word, padded with -1 to the longest such list in the batch;
        "num_words" - length of that list before padding (so it counts the leading 0 entry);
        "tokenid_to_wordid" - per token, the word id reported by the tokenizer
    :raises ValueError: if a "None" head is still present after filtering
    """
    # Delete tokens with "None" head together with their annotations.
    examples_tokens, examples_heads, examples_deprels = [], [], []
    for sentence_id in range(len(examples["tokens"])):
        tt, hh, dd = strip_none_heads(examples, sentence_id)
        examples_tokens.append(tt)
        examples_heads.append(hh)
        examples_deprels.append(dd)

    # tokenized_inputs is a dictionary with keys input_ids and attention_mask;
    # each is a list (per sentence) of lists (per token).
    tokenized_inputs = tokenizer(examples_tokens, truncation=True, is_split_into_words=True, padding=True)

    remapped_heads = []  # these will be lists (per sentence) of lists (per token)
    deprel_ids = []
    tokens_representing_words = []
    tokenid_to_wordid = []  # map token ID to word ID, collected while aligning
    num_words: list[int] = []

    for sentence_id, annotated_heads in enumerate(examples_heads):
        deprels = examples_deprels[sentence_id]
        word_ids = tokenized_inputs.word_ids(batch_index=sentence_id)
        tokenid_to_wordid.append(word_ids)

        # word-pos to first token-pos; both start at 0 for first word (actual) / first token (BOS)
        word_pos_to_token_pos = map_first_occurrence(word_ids)

        previous_word_idx = None
        heads_here: list[int] = []
        deprel_ids_here: list[int] = []

        # list of token positions that map to words (first token of each word)
        # token 0 -> word 0 (BOS)
        tokens_representing_word_here: list[int] = [0]

        for sentence_position, word_idx in enumerate(word_ids):
            # Special tokens (BOS, EOS) have a word id that is None, and the non-initial tokens of
            # a word repeat the previous word id. Both get skip_index so that they are ignored
            # in the loss function.
            if word_idx is None or word_idx == previous_word_idx:
                heads_here.append(skip_index)
                deprel_ids_here.append(skip_index)
            else:
                # First token of a new word: this is the position that carries the label.
                if annotated_heads[word_idx] == "None":
                    # Filtering above should make this impossible; fail loudly instead of
                    # terminating the interpreter with a "success" exit code.
                    raise ValueError(
                        f"A 'None' head survived! (sentence {sentence_id}, word {word_idx})"
                    )

                # Map HEAD annotation to position of first token of head word.
                # HEAD = 0 => map it to first token (BOS)
                # Otherwise, look up first token for HEAD-1 (HEAD is 1-based, word positions are 0-based)
                # NOTE(review): with truncation=True a head word could have been cut off, in which
                # case this lookup raises KeyError - confirm inputs stay below the length limit.
                head_word_pos = int(annotated_heads[word_idx])
                head_token_pos = 0 if head_word_pos == 0 else word_pos_to_token_pos[head_word_pos - 1]

                heads_here.append(head_token_pos)
                deprel_ids_here.append(deprel_to_id[deprels[word_idx]])

                tokens_representing_word_here.append(sentence_position)  # first word is index 1; index 0 is BOS

            previous_word_idx = word_idx

        remapped_heads.append(heads_here)
        deprel_ids.append(deprel_ids_here)
        tokens_representing_words.append(tokens_representing_word_here)
        num_words.append(len(tokens_representing_word_here))

    # pad t2w lists to same length (num_words keeps the unpadded lengths)
    maxlen_t2w = max(num_words, default=0)
    for t2w in tokens_representing_words:
        t2w.extend([-1] * (maxlen_t2w - len(t2w)))

    tokenized_inputs["head"] = remapped_heads
    tokenized_inputs["deprel_ids"] = deprel_ids
    tokenized_inputs["tokens_representing_words"] = tokens_representing_words
    tokenized_inputs["num_words"] = num_words
    tokenized_inputs["tokenid_to_wordid"] = tokenid_to_wordid

    return tokenized_inputs

In [21]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# Test tokenization: run the alignment on a few training sentences and print, for every
# sub-word token, its head label, its deprel label and the word it was mapped to.
NUM_PREVIEW_SENTENCES = 10
SKIP_INDEX = -100  # label marking token positions without annotation

tokenized_inputs = tokenize_and_align_labels(
    train_dataset[:NUM_PREVIEW_SENTENCES], deprel_to_id, tokenizer, skip_index=SKIP_INDEX
)

# Iterate over what was actually returned instead of a fixed count, so a smaller batch
# does not raise IndexError.
for i, input_ids in enumerate(tokenized_inputs["input_ids"]):  # i is the sentence index
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    word_ids = tokenized_inputs["tokenid_to_wordid"][i]
    heads = tokenized_inputs["head"][i]
    deprel_ids = tokenized_inputs["deprel_ids"][i]

    print(f"Example {i + 1}")
    print(f"{'Token':<15}{'Head':<10}{'Deprel':<15}{'Word Mapping':<15}")
    for j, token in enumerate(tokens):
        if token == tokenizer.pad_token:
            break  # stop at the first padding token

        head = heads[j]
        deprel_id = deprel_ids[j]
        word_mapping = word_ids[j]

        # Positions without annotation are shown as the text "None".
        token_str = token or "None"
        head_str = "None" if head == SKIP_INDEX else str(head)
        deprel_str = "None" if deprel_id == SKIP_INDEX else (all_deprels[deprel_id] or "None")
        word_mapping_str = str(word_mapping)  # str(None) already renders as "None"

        print(f"{token_str:<15}{head_str:<10}{deprel_str:<15}{word_mapping_str:<15}")



Example 1
Token          Head      Deprel         Word Mapping   
<s>            None      None           None           
▁Al            0         root           0              
▁-             1         punct          1              
▁Zaman         1         flat           2              
▁:             1         punct          3              
▁American      6         amod           4              
▁forces        7         nsubj          5              
▁killed        1         parataxis      6              
▁Sha           7         obj            7              
ikh            None      None           7              
▁Abdullah      8         flat           8              
▁al            8         flat           9              
▁-             8         punct          10             
▁Ani           8         flat           11             
▁              8         punct          12             
,              None      None           12             
▁the           17        det          