In [1]:
from transformers import AutoTokenizer
from datasets import DatasetDict,Dataset,concatenate_datasets
import random

In [2]:
# Checkpoint name used to instantiate the tokenizer.
pretrained_model = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Load the DatasetDict stored on disk and display its test split.
inputid_dataset = DatasetDict.load_from_disk(
    "./../_data/datasets/inputid/bert-mult-uncased/"
)
inputid_dataset['test']



Dataset({
    features: ['sentences', 'tokens', 'labels', 'input_ids'],
    num_rows: 42419
})

In [3]:

# Define the error injection function
def inject_label_errors(dataset, error_rates, seed=None):
    """
    Inject errors into the labels of the dataset.

    Every label of every example is replaced, with probability
    ``error_rates[label]``, by a different class drawn uniformly from the
    other keys of ``error_rates``.

    :param dataset: The input dataset. It must provide ``map(fn, batched=False)``
                    and each example must hold a list of labels under ``"labels"``.
    :param error_rates: A dictionary with error rates for each class.
                        Example: {0: 0.1, 1: 0.05, 2: 0.2}
                        The keys define the set of classes; they do not have to
                        be the contiguous integers ``0..n-1``.
    :param seed: Optional seed. When given, a private random generator is used
                 so the injected noise is reproducible; when ``None`` the
                 module-level ``random`` state is used.
    :return: A new dataset with injected label errors.
    """
    # A private generator keeps a seeded run independent of the global RNG state.
    rng = random.Random(seed) if seed is not None else random
    # Replacement candidates are the declared classes themselves, not range(n):
    # range(n) yields invalid labels whenever the keys are not exactly 0..n-1.
    classes = list(error_rates)

    def apply_error(label):
        # Labels without a configured rate (e.g. an ignore index such as -100)
        # are never corrupted.
        rate = error_rates.get(label, 0.0)
        if rng.random() < rate:
            alternatives = [c for c in classes if c != label]
            # With a single class there is nothing to flip to; keep the label.
            if alternatives:
                return rng.choice(alternatives)
        return label

    def inject_errors_in_labels(examples):
        examples["labels"] = [apply_error(label) for label in examples["labels"]]
        return examples

    # Apply error injection one example at a time.
    new_dataset = dataset.map(inject_errors_in_labels, batched=False)
    return new_dataset

In [4]:

# Per-class probability that a label is replaced by a different class.
error_rates = {0: 0.3, 1: 0.1, 2: 0.2}

# Fix the RNG state so a fresh "Restart & Run All" injects the same noise.
random.seed(42)

# Inject errors into the dataset
injected_test = inject_label_errors(inputid_dataset['test'], error_rates)

In [5]:
# Seed the accumulator with one placeholder row so that the "test" split
# already has the three columns that later get chunks appended to them.
empty_data = dict(
    input_ids=[[1]],       # placeholder token ids
    labels=[[0]],          # placeholder labels
    attention_mask=[[1]],  # placeholder attention mask
)
batched_dataset = DatasetDict()
batched_dataset["test"] = Dataset.from_dict(empty_data)

In [6]:
from _utils.dataset_util import flatten_2d_list
def chunk_pad_map(sents_batch, dataset_name, chunk_size=512, tok_pad=0,
                  label_pad=-100, attention_pad=0):
    """Cut one batch into fixed-size padded chunks and append them to ``batched_dataset``.

    The ``"input_ids"`` and ``"labels"`` columns of the batch are flattened with
    ``flatten_2d_list`` (presumably one flat list per column -- confirm against
    the helper), sliced into consecutive windows of ``chunk_size`` items, and the
    last window is right-padded. An attention mask is built with 1 for real
    positions and ``attention_pad`` for padding. The resulting rows are
    concatenated onto the global ``batched_dataset[dataset_name]``.

    :param sents_batch: Batch dict holding at least ``"labels"`` and ``"input_ids"``.
    :param dataset_name: Key of the split in the global ``batched_dataset`` to extend.
    :param chunk_size: Number of positions per chunk (default 512).
    :param tok_pad: Value used to pad ``input_ids`` (default 0).
    :param label_pad: Value used to pad ``labels`` (default -100).
    :param attention_pad: Value used to pad ``attention_mask`` (default 0).
    :return: ``sents_batch`` unchanged; the useful result is the side effect on
             ``batched_dataset``.
    :raises ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    f_labels = flatten_2d_list(sents_batch["labels"])
    f_input_ids = flatten_2d_list(sents_batch["input_ids"])

    # An empty batch produces no chunks; building an empty, untyped Dataset and
    # casting the accumulator to its features would be wrong, so stop here.
    if len(f_input_ids) == 0:
        return sents_batch

    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    # Walk the flattened batch in windows of chunk_size positions.
    for start in range(0, len(f_input_ids), chunk_size):
        chunked_tokens = f_input_ids[start:start + chunk_size]
        chunk_label_ids = f_labels[start:start + chunk_size]
        chunk_attention_mask = [1] * len(chunked_tokens)

        # Only the final window can be short; pad all three columns by the
        # same number of positions.
        missing = chunk_size - len(chunked_tokens)
        if missing > 0:
            chunked_tokens.extend([tok_pad] * missing)
            chunk_attention_mask.extend([attention_pad] * missing)
            chunk_label_ids.extend([label_pad] * missing)

        input_ids_list.append(chunked_tokens)
        attention_mask_list.append(chunk_attention_mask)
        labels_list.append(chunk_label_ids)

    temp_ds = Dataset.from_dict({"input_ids": input_ids_list, "labels": labels_list, "attention_mask": attention_mask_list})
    # Align the accumulator's features with the new rows before concatenating.
    batched_dataset[dataset_name] = batched_dataset[dataset_name].cast(temp_ds.features)
    batched_dataset[dataset_name] = concatenate_datasets([batched_dataset[dataset_name], temp_ds])
    return sents_batch


In [7]:

# Chunk the label-noised split (injected_test), not the clean one: the result
# is saved under ".../injected/" below, so it has to carry the injected errors.
# Starting from injected_test also keeps this cell re-runnable, because the
# 'sentences' column is no longer removed from the cell's own output.
inputid_dataset['test'] = injected_test.remove_columns('sentences')
# chunk_pad_map appends to the global batched_dataset as a side effect; the
# value returned by map() is not needed. Re-create batched_dataset before
# re-running this cell, otherwise the chunks are appended a second time.
inputid_dataset['test'].map(chunk_pad_map, batched=True, load_from_cache_file=False, fn_kwargs={"dataset_name": "test"})
# Row 0 is the placeholder row batched_dataset was created with; drop it.
batched_test = batched_dataset['test'].select(range(1, len(batched_dataset['test'])))
batched_test

Map:   0%|          | 0/42419 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/224 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/452 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/675 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/899 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1123 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1346 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1578 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1810 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2038 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2259 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2479 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2920 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3143 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3375 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3596 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3815 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4034 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4261 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4487 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4710 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4936 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5155 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5375 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5597 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5822 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6054 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6281 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6502 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6727 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6965 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7197 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7419 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7639 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7868 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8102 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8327 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8560 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8780 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9007 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9228 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9453 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels', 'attention_mask'],
    num_rows: 9545
})

In [8]:
# Display the chunked test split again (same object as built in the previous cell).
batched_test


Dataset({
    features: ['input_ids', 'labels', 'attention_mask'],
    num_rows: 9545
})

In [9]:
# Persist the chunked test split to disk.
# NOTE(review): the target directory is named "injected" -- confirm that
# batched_test was built from the label-noised split before relying on it.
batched_test.save_to_disk("./../_data/datasets/batched/bert-base-multilingual-uncased/injected/")

Saving the dataset (0/1 shards):   0%|          | 0/9545 [00:00<?, ? examples/s]