In [28]:
from datasets import ClassLabel
from transformers import AutoTokenizer
from data.data_utils import get_data


# Pretrained checkpoint whose tokenizer is used to encode the sentences.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Training split. Later cells index it as data[0][i] (word list of sentence i)
# and data[1][i] (tag list of sentence i).
data = get_data("data/norne-nb-in5550-train.conllu.gz")

# NER tag inventory: all B- tags, then all I- tags (same entity order), then
# the outside tag "O". The order is significant: the integer ids printed by
# c2l.str2int in later cells correspond to positions in this list.
class_labels = [
    f"{prefix}-{entity}"
    for prefix in ("B", "I")
    for entity in (
        "DRV",
        "EVT",
        "GPE_LOC",
        "GPE_ORG",
        "LOC",
        "ORG",
        "PER",
        "PROD",
    )
] + ["O"]

In [None]:
# Converter between tag strings and integer ids, built from class_labels
# (used below via c2l.str2int).
c2l = ClassLabel(names=class_labels, num_classes=len(class_labels))

In [4]:
def encode(txt):
    """Tokenize pre-split text with the module-level ``tokenizer``.

    Args:
        txt: Input that is already split into words (the tokenizer is called
            with ``is_split_into_words=True``); the cells below pass a list
            of word strings for one sentence.

    Returns:
        Whatever ``tokenizer(...)`` returns; later cells call ``.word_ids()``
        on it.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "is_split_into_words": True,
        "max_length": 194,
    }
    return tokenizer(txt, **tokenizer_options)

In [38]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label id per tokenizer token.

    Args:
        labels: Per-word labels for one sentence, indexable by word index.
            Each entry is either a tag string, which is converted with the
            module-level ``c2l`` (``c2l.str2int``), or an already-encoded
            ``int`` id, which is passed through unchanged. Both call styles
            appear in this notebook.
        word_ids: One entry per token: the index of the word the token was
            produced from, or ``None`` for tokens that belong to no word
            (the first and last entries in the ``word_ids()`` output shown
            in this notebook).

    Returns:
        A list with the same length as ``word_ids``. Tokens whose word id is
        ``None`` get ``-100``; every other token gets the id of its word's
        label, so all sub-word pieces of a word share that word's label.

    Raises:
        IndexError: If a word id is out of range for ``labels``.
    """
    new_labels = []

    for word_id in word_ids:
        if word_id is None:
            # Sentinel for tokens without a source word; presumably chosen so
            # the training loss skips them (-100 is PyTorch's default
            # ignore_index) -- confirm against the training code.
            new_labels.append(-100)
            continue

        label = labels[word_id]
        # c2l.str2int rejects a bare int, so only convert labels that are
        # not already integer ids.
        if not isinstance(label, int):
            label = c2l.str2int(label)
        new_labels.append(label)

    return new_labels

In [26]:
# Inspect the word list of the sentence at index 1.
data[0][1]

['I',
 'tillegg',
 'skal',
 'daglig',
 'leder',
 'ved',
 'Miljøtransport',
 'AS',
 ',',
 'som',
 'har',
 'oppdrag',
 'med',
 'henting',
 'av',
 'smittefarlig',
 'avfall',
 'for',
 'klinikken',
 ',',
 'ha',
 'blitt',
 'ilagt',
 'munnkurv',
 'av',
 'klinikken',
 '.']

In [27]:
# Inspect the tag strings of the sentence at index 1 (one tag per word).
data[1][1]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [29]:
# Convert the whole tag list to integer ids in one call.
c2l.str2int(data[1][1])

[16,
 16,
 16,
 16,
 16,
 16,
 5,
 13,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16]

In [31]:
# For each produced token, show the index of the word it came from
# (None marks tokens that belong to no word).
encode(data[0][1]).word_ids()

[None,
 0,
 1,
 1,
 1,
 2,
 2,
 3,
 3,
 3,
 4,
 4,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 8,
 9,
 9,
 10,
 10,
 11,
 11,
 11,
 11,
 12,
 12,
 13,
 13,
 13,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 17,
 18,
 18,
 18,
 18,
 19,
 20,
 21,
 21,
 21,
 22,
 22,
 22,
 23,
 23,
 23,
 23,
 23,
 24,
 24,
 25,
 25,
 25,
 25,
 26,
 None]

In [32]:
# Align labels that were first converted to integer ids.
# NOTE(review): the output below was recorded at execution count 32, before
# align_labels_with_tokens was last defined (count 38), so it may not reflect
# the current definition -- re-run after Restart & Run All to refresh.
align_labels_with_tokens(c2l.str2int(data[1][1]), encode(data[0][1]).word_ids())

[-100,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 13,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 -100]

In [39]:
# Align the raw tag strings; every token of a word receives that word's id.
align_labels_with_tokens(data[1][1], encode(data[0][1]).word_ids())

[-100,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 13,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 -100]

In [35]:
# NOTE(review): repeats the word_ids() inspection from an earlier cell with
# identical output; likely safe to delete.
encode(data[0][1]).word_ids()

[None,
 0,
 1,
 1,
 1,
 2,
 2,
 3,
 3,
 3,
 4,
 4,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 8,
 9,
 9,
 10,
 10,
 11,
 11,
 11,
 11,
 12,
 12,
 13,
 13,
 13,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 17,
 18,
 18,
 18,
 18,
 19,
 20,
 21,
 21,
 21,
 22,
 22,
 22,
 23,
 23,
 23,
 23,
 23,
 24,
 24,
 25,
 25,
 25,
 25,
 26,
 None]