In [14]:
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer
import datasets
from typing import Any, Tuple


In [26]:
path: str = "/home/philko/Documents/Uni/WiSe2223/UnsupervisedLearning/udl-negation/data/processed/wn_neg_processed/debug.txt"

In [2]:
model_str: str = 'prajjwal1/bert-small'

In [28]:
tok = AutoTokenizer.from_pretrained('roberta-base')

KeyboardInterrupt: 

In [3]:
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(
            model_str,
            use_fast=True,
            max_len=512)

In [None]:
tokenizer.vocab_size

30522

In [4]:
tokenizer.add_tokens(["[REF-BEG]", "[REF-END]"])

2

In [None]:
def tokenize_dataset(elem: dict) -> dict:
    """Tokenize Dataset.

    Atomic function applied to each instance of the dataset.

    :param elem: Element of the dataset.
    :returns: Dictionary including 'input_ids', 'attention_mask' and
        'labels'.
    """
    attention_mask: list = []
    input_ids: list = []
    labels: list = []
    for elem_masked, elem_unmasked in zip(elem['x'], elem['y']):
        masked: torch.Tensor = tokenizer(elem_masked, return_tensors='pt')
        unmasked: torch.Tensor = tokenizer(elem_unmasked, return_tensors='pt')
        unm: torch.Tensor = unmasked['input_ids']
        msk: torch.Tensor = masked['input_ids']
        att: torch.Tensor = masked['attention_mask']
        if masked['input_ids'].shape != unmasked['input_ids'].shape:
            msk, att = equalize_data(unm, msk, tokenizer.mask_token_id)
        unm[(msk == unm)] = -100
        attention_mask.append(att.squeeze())
        input_ids.append(msk.squeeze().long().tolist())
        labels.append(unm.squeeze().long().tolist())
    result: dict = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
    return result

In [None]:
def equalize_data(
        unmasked: torch.Tensor,
        masked: torch.Tensor,
        mask_token_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Add Mask Token to Shorter List.

    Sometimes it happens, that a masked word consists of two tokens (e.g.
    'foul ball' -> '<mask>'). To have equal length for masking, it is necessary,
    to have the same length. Thus, more masked tokens are added to to the
    masked sequence to account for the split (e.g. '<mask>' -> '<mask> <mask>').
    This is done in index space ([..., 50296, ...]-> [..., 50296, 50296, ...]).

    :param unmasked: Unmasked sequence (the longer sequence).
    :param masked: Masked sequence (the shorter sequence.)
    :param mask_token_id: Id of masked token from tokenizer.
    :returns: Equalized dataset.
    """
    ind: int = (masked == tokenizer.masked_token_id).nonzero(as_tuple=True)[1].item()
    diff: int = unmasked.shape[1] - masked.shape[1] + 1
    return (
        torch.cat(
            (
                masked[0][0:ind],
                torch.tensor([mask_token_id] * diff),
                masked[0][(ind + 1)::]
            ), 0).unsqueeze(0),
            torch.Tensor([1] * unmasked.shape[1]).to(torch.int8))

In [None]:
tokenizer.mask_token_id

103

In [None]:
len(tokenizer.)

In [None]:
ds = pd.read_csv(path, header=None)
ds.columns = ['text']
ds

Unnamed: 0,text
0,This is a natural object and not an artifact[M...
1,This is a natural object[MASK] and not an arti...
2,This is an artifact and not a natural object[M...
3,This is an artifact[MASK] and not a natural ob...
4,This is an overachievement and not an underach...
...,...
78,This is a deceleration and not an acceleration...
79,This is a deceleration[MASK] and not an accele...
80,This is an opening and not a closing[MASK].[RE...
81,This is an opening[MASK] and not a closing.[RE...


In [None]:
end_index: int = 30523
begin_index: int = 30522

tokenized = tokenizer(ds.iloc[74].text, return_tensors='pt')
tokenized = {k : v.squeeze() for k, v in tokenized.items()}

if (end_index in tokenized['input_ids']) and (begin_index in tokenized['input_ids']):
    bool_vector = torch.ones(len(tokenized['input_ids']), dtype=torch.bool)
    begin, end = ((tokenized['input_ids'].squeeze() == begin_index).nonzero(as_tuple=True)[0])[0].item(), ((tokenized['input_ids'].squeeze() == end_index).nonzero(as_tuple=True)[0])[0].item()
    rm = torch.zeros(end + 1 - begin, dtype=torch.bool)
    bool_vector[torch.Tensor(range(begin, end + 1)).long()] = rm
    orig = torch.masked_select(tokenized['input_ids'], bool_vector)
    label = torch.masked_select(tokenized['input_ids'], ~bool_vector)
    label = label[1::]
    label = label[:-1]
    ind: int = (orig == tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
    label_f = (torch.zeros(ind) - 100, label, torch.zeros(len(orig[(ind + 1):: ])) - 100)
    elem = (orig[0:ind], torch.tensor([tokenizer.mask_token_id] * len(label)), orig[(ind + 1):: ])
    elem, label_f = torch.concat(elem).unsqueeze(0), torch.concat(label_f).unsqueeze(0)
    attention_mask: torch.Tensor = torch.ones(elem.shape[1]).unsqueeze(0)
elem, label_f, attention_mask

(tensor([[ 101, 2023, 2003, 2019, 2248, 3462, 1998, 2025, 1037, 4968, 3462,  103,
           103, 1012,  102]]),
 tensor([[-100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
          -100., 4968., 3462., -100., -100.]]),
 tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]))

In [None]:
tokenized

{'input_ids': tensor([  101,  2023,  2003,  2019,  2248,  3462,  1998,  2025,  1037,  4968,
          3462,   103,  1012, 30522,  4968,  3462, 30523,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [None]:
def mask(inputs):
    labels = inputs.clone()
    # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
    probability_matrix = torch.full(labels.shape, 0.15)
    print(labels.tolist())
    special_tokens_mask = [
            tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
    special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)

    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [None]:
mask(tokenizer('amelia was a bitch , i knew that quite well .', return_tensors='pt')['input_ids'])

[[101, 11556, 2001, 1037, 7743, 1010, 1045, 2354, 2008, 3243, 2092, 1012, 102]]


(tensor([[  101, 11556,  2001,  1037, 17824,   103,  1045,  2354,  2008,  3243,
           2092,  1012,   102]]),
 tensor([[-100, -100, -100, -100, 7743, 1010, -100, -100, -100, -100, -100, -100,
          -100]]))

# Final Function

In [43]:
def process_dataset(inputs: dict, end_index: int = 30523, begin_index: int = 30522, mlm_probabilty: float =0.15):
    """"""
    tokenized: dict = tokenizer(inputs['text'], return_tensors='pt')
    if (end_index in tokenized['input_ids']) and (begin_index in tokenized['input_ids']):
        tokenized = {k : v.squeeze() for k, v in tokenized.items()}

        # Find masked word in reference sequence
        bool_vector: torch.Tensor = torch.ones(len(tokenized['input_ids']), dtype=torch.bool)
        begin: int = ((tokenized['input_ids'].squeeze() == begin_index).nonzero(as_tuple=True)[0])[0].item()
        end: int = ((tokenized['input_ids'].squeeze() == end_index).nonzero(as_tuple=True)[0])[0].item()
        rm: torch.Tensor = torch.zeros(end + 1 - begin, dtype=torch.bool)
        bool_vector[torch.Tensor(range(begin, end + 1)).long()] = rm

        # Get sentence (masked) and label (known word)
        orig: torch.Tensor = torch.masked_select(tokenized['input_ids'], bool_vector)
        label: torch.Tensor = torch.masked_select(tokenized['input_ids'], ~bool_vector)
        # Remove special tokens around label
        label = label[1::]
        label = label[:-1]

        # Get split sequences 
        ind: int = (orig == tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
        label_f: tuple = (torch.zeros(ind, dtype=torch.long) - 100, label, torch.zeros(len(orig[(ind + 1):: ]), dtype=torch.long) - 100)
        elem: tuple = (orig[0:ind], torch.tensor([tokenizer.mask_token_id] * len(label)), orig[(ind + 1):: ])

        # Concatenate splitted sequences and prepare for return
        elem: torch.Tensor = torch.concat(elem).unsqueeze(0)
        label_f: torch.Tensor = torch.concat(label_f).unsqueeze(0)
        attention_mask: torch.Tensor = torch.ones(elem.shape[1], dtype=torch.long).unsqueeze(0)
        return_dict: dict = {
            'input_ids': elem,
            'labels': label_f,
            'attention_mask': attention_mask}
        return return_dict
    else:
        # MLM-Masking, according to Devlin et al. 2018.
        input_ids = tokenized['input_ids']
        labels = input_ids.clone()
        # Sample mlm_probability% of all appropriate tokens.
        probability_matrix = torch.full(labels.shape, mlm_probabilty)
        special_tokens_mask = [
                tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100

        # 80 % will be masked
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        input_ids[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

        # 15 % will be randomly filled.
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        input_ids[indices_random] = random_words[indices_random]

        return_dict: dict = {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': torch.ones(input_ids.shape[1], dtype=torch.long).unsqueeze(0)}
        return return_dict

In [34]:
path: str = "/home/philko/Documents/Uni/WiSe2223/UnsupervisedLearning/udl-negation/data/processed/mixed_experiment/data.txt"

In [10]:
data = datasets.load_dataset(
    "text", data_files=path, sample_by="line")

Found cached dataset text (/home/philko/.cache/huggingface/datasets/text/default-f2076ceb9adf1711/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
# tokenize_function  = lambda examples: tokenizer(examples["text"])
ds = data.map(
    process_dataset)

Map:   0%|          | 0/1510400 [00:00<?, ? examples/s]

torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torch.int64
torch.int64 torc

KeyboardInterrupt: 

In [13]:
ds = pd.read_csv(path, header=None, delimiter='NODELIMITERUSEDHEREJUSTREADLINEBYLINE')
ds.columns = ['text']
data: datasets.Dataset = datasets.Dataset.from_pandas(ds)


ds = data.map(
    process_dataset,
    new_fingerprint='asdasdasd')

  ds = pd.read_csv(path, header=None, delimiter='NODELIMITERUSEDHEREJUSTREADLINEBYLINE')


Map:   0%|          | 0/1510400 [00:00<?, ? examples/s]

KeyError: 'input_ids'