In [8]:
from transformers import AutoTokenizer
from datasets import DatasetDict,Dataset,concatenate_datasets
import random

In [9]:
inputid_dataset= DatasetDict.load_from_disk("./../_data/datasets/inputid/bert-mult-uncased/")

In [10]:
inputid_test = inputid_dataset['test']

In [11]:
inputid_test

Dataset({
    features: ['sentences', 'tokens', 'labels', 'input_ids'],
    num_rows: 42419
})

In [12]:

# Define the error injection function
def inject_label_errors(dataset, error_rates):
    """
    Inject errors into the labels of the dataset.

    :param dataset: The input dataset.
    :param error_rates: A dictionary with error rates for each class.
                        Example: {0: 0.1, 1: 0.05, 2: 0.2}
    :return: A new dataset with injected label errors.
    """
    def apply_error(label):
        if random.random() < error_rates[label]:
            new_label = random.choice([l for l in range(len(error_rates)) if l != label])
            return new_label
        return label

    def inject_errors_in_labels(examples):
        examples["labels"] = [apply_error(label) for label in examples["labels"]]
        return examples

    # Apply error injection
    new_dataset = dataset.map(inject_errors_in_labels, batched=False)
    return new_dataset

error_rates = {0: 0.3, 1: 0.1, 2: 0.2}

# Inject errors into the dataset
injected_test= inject_label_errors(inputid_test, error_rates)

In [5]:
labeled_dataset = DatasetDict.load_from_disk("./../_data/datasets/labeled/all_sliced")
pretrained_model = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)



In [6]:
def tokenize_function(sent_tok):
    # Tokenize the input text
    tokens = sent_tok["tokens"]
    labels =  sent_tok["labels"]
    labels.insert(0, 0)  # Adding CLS token label at the beginning of each sequence
    labels.append(0)
    try:
        tokenized_sentence = [101] + [tokenizer.encode(tok)[1] for tok in tokens] + [102]
        sent_tok["input_ids"] = tokenized_sentence
        sent_tok["labels"] = labels
    except:
        print(len(tokens))
    return sent_tok
inputid_dataset = labeled_dataset.map(tokenize_function, load_from_cache_file=False) 
inputid_dataset

Map:   0%|          | 0/339344 [00:00<?, ? examples/s]

Map:   0%|          | 0/42418 [00:00<?, ? examples/s]

Map:   0%|          | 0/42419 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentences', 'tokens', 'labels', 'input_ids'],
        num_rows: 339344
    })
    validation: Dataset({
        features: ['sentences', 'tokens', 'labels', 'input_ids'],
        num_rows: 42418
    })
    test: Dataset({
        features: ['sentences', 'tokens', 'labels', 'input_ids'],
        num_rows: 42419
    })
})

In [7]:
inputid_dataset.save_to_disk("./../_data/datasets/inputid/bert-mult-uncased")

Saving the dataset (0/2 shards):   0%|          | 0/339344 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42418 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42419 [00:00<?, ? examples/s]

In [8]:
# Create an empty dictionary with specified columns
empty_data = { 
    "input_ids": [[1]],  # Replace with your actual input IDs
    "labels": [[0]],    # Replace with your actual labels
    "attention_mask": [[1]],  # Replace with your actual attention mask
}
bached_dataset = DatasetDict({
    "train": Dataset.from_dict(empty_data),
    "validation": Dataset.from_dict(empty_data),
    "test": Dataset.from_dict(empty_data)
})


In [9]:
from _utils.dataset_util import flatten_2d_list
def chunck_pad_map(sents_batch,dataset_name):
    
    f_tokens = flatten_2d_list(sents_batch["tokens"])
    f_labels = flatten_2d_list(sents_batch["labels"])
    f_input_ids = flatten_2d_list(sents_batch["input_ids"])
    # print(f_tokens)
    # print(len(f_input_ids))
    # print(len(f_labels))
    # # print(sents_batch["tokens"])
    # # print(len(sents_batch["tokens"]))
    # raise EOFError
    
    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    chunk_size =512
    tok_pad=0 
    label_pad=-100
    attention_pad=0
    # Create chunks
    # print(len(f_input_ids)// chunk_size)
    for i in range(0, len(f_input_ids), chunk_size):  # We subtract 2 to account for special tokens
        chunked_tokens = f_input_ids[i:i + chunk_size]
        chunk_label_ids = f_labels[i:i + chunk_size]
        chunk_attention_mask = [1] * len(chunked_tokens)
        
        # print(len(chunked_tokens))
        
        while True and len(chunked_tokens) < chunk_size:
            chunked_tokens.append(tok_pad)
            chunk_attention_mask.append(attention_pad)
            chunk_label_ids.append(label_pad)
        input_ids_list.append(chunked_tokens)
        attention_mask_list.append(chunk_attention_mask)
        labels_list.append(chunk_label_ids)

    
    temp_ds = Dataset.from_dict({"input_ids": input_ids_list, "labels": labels_list, "attention_mask": attention_mask_list})
    bached_dataset[dataset_name] = bached_dataset[dataset_name].cast(temp_ds.features)
    bached_dataset[dataset_name]=concatenate_datasets([bached_dataset[dataset_name], temp_ds])
    return sents_batch


In [10]:

inputid_dataset["train"] = inputid_dataset["train"].remove_columns('sentences')
inputid_dataset["validation"] = inputid_dataset["validation"].remove_columns('sentences')
inputid_dataset["test"] = inputid_dataset["test"].remove_columns('sentences')

inputid_dataset["train"].map(chunck_pad_map,batched=True, load_from_cache_file=False,fn_kwargs={"dataset_name": "train"})
inputid_dataset["validation"].map(chunck_pad_map,batched=True, load_from_cache_file=False,fn_kwargs={"dataset_name": "validation"})
inputid_dataset["test"].map(chunck_pad_map,batched=True, load_from_cache_file=False,fn_kwargs={"dataset_name": "test"})

Map:   0%|          | 0/339344 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/220 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/451 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/667 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/895 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1121 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1352 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1575 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1801 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2036 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2261 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2481 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2720 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2952 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3178 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3407 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3632 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3849 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4071 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4294 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4511 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4735 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4972 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5194 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5419 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5640 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5871 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6095 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6323 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6558 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6786 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7004 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7233 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7458 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7681 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7909 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8127 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8348 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8579 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8813 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9035 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9257 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9485 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9716 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9951 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10175 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10399 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10626 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10854 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/11083 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/11315 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/11546 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/11772 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/11999 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12221 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12453 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12677 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12902 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13126 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13352 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13574 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13800 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14026 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14255 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14477 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14704 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14922 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/15150 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/15376 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/15606 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/15829 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/16047 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/16268 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/16487 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/16717 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/16947 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/17178 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/17409 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/17638 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/17861 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18097 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18327 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18564 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18796 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19015 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19242 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19467 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19704 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19927 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20156 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20383 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20610 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20837 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21062 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21284 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21503 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21732 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21962 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/22192 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/22420 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/22647 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/22873 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/23099 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/23324 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/23558 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/23785 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/24006 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/24234 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/24462 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/24691 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/24920 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/25156 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/25375 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/25602 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/25830 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/26060 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/26287 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/26524 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/26749 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/26977 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/27215 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/27439 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/27661 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/27881 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/28106 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/28328 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/28562 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/28795 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/29018 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/29243 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/29459 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/29691 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/29909 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/30130 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/30352 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/30580 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/30812 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/31036 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/31251 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/31477 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/31703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/31920 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/32144 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/32376 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/32597 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/32822 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/33042 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/33267 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/33491 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/33712 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/33939 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/34171 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/34394 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/34622 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/34848 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35076 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35297 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35525 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35745 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35964 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/36192 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/36423 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/36638 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/36865 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/37085 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/37306 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/37536 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/37758 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/37991 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/38217 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/38439 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/38673 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/38903 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/39131 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/39363 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/39588 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/39812 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/40055 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/40290 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/40508 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/40731 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/40948 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41170 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41387 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41618 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41848 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/42071 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/42299 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/42529 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/42755 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/42983 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/43214 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/43436 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/43653 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/43882 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/44100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/44329 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/44554 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/44778 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/44997 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/45222 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/45452 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/45682 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/45908 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/46128 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/46354 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/46582 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/46802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/47029 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/47254 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/47478 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/47702 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/47928 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/48159 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/48392 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/48618 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/48851 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/49075 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/49296 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/49526 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/49747 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/49972 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/50202 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/50429 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/50654 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/50886 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/51119 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/51346 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/51566 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/51793 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/52016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/52247 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/52470 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/52700 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/52921 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/53153 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/53378 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/53605 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/53829 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/54055 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/54274 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/54506 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/54738 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/54972 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/55199 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/55424 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/55643 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/55873 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/56094 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/56318 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/56542 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/56773 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/57004 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/57229 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/57460 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/57689 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/57913 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/58135 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/58360 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/58593 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/58816 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/59042 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/59266 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/59481 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/59709 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/59934 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/60166 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/60402 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/60631 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/60862 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/61083 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/61309 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/61533 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/61755 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/61986 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/62208 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/62437 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/62667 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/62891 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/63119 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/63342 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/63561 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/63786 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64013 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64236 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64460 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64676 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64897 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/65119 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/65336 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/65566 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/65802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/66032 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/66262 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/66487 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/66710 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/66940 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/67167 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/67394 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/67621 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/67847 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/68070 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/68295 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/68528 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/68754 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/68978 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/69202 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/69428 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/69654 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/69889 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/70117 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/70346 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/70573 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/70803 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/71023 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/71251 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/71476 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/71707 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/71933 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/72155 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/72379 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/72613 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/72847 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/73081 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/73307 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/73541 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/73771 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/73996 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/74223 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/74457 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/74683 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/74913 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/75139 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/75362 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/75581 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/75813 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/76043 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/76272 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/76489 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/76716 [00:00<?, ? examples/s]

Map:   0%|          | 0/42418 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/221 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/671 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/888 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1116 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1341 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1567 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1792 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2022 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2253 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2487 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2713 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2941 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3158 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3387 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3608 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3833 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4051 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4275 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4506 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4745 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4969 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5193 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5412 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5626 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5852 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6084 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6309 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6537 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6761 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6986 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7218 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7445 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7680 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7906 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8126 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8351 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8576 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9030 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9247 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9471 [00:00<?, ? examples/s]

Map:   0%|          | 0/42419 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/224 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/452 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/675 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/899 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1123 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1346 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1578 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1810 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2038 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2259 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2479 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2920 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3143 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3375 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3596 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3815 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4034 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4261 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4487 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4710 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4936 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5155 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5375 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5597 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5822 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6054 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6281 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6502 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6727 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6965 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7197 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7419 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7639 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7868 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8102 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8327 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8560 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8780 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9007 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9228 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9453 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'labels', 'input_ids'],
    num_rows: 42419
})

In [11]:
print(inputid_dataset)
print(bached_dataset)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids'],
        num_rows: 339344
    })
    validation: Dataset({
        features: ['tokens', 'labels', 'input_ids'],
        num_rows: 42418
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids'],
        num_rows: 42419
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 76796
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9565
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9546
    })
})


In [12]:
bached_dataset['train'] = bached_dataset['train'].select(range(1,len(bached_dataset['train'])) )
bached_dataset['validation'] = bached_dataset['validation'].select(range(1,len(bached_dataset['validation'])) )
bached_dataset['test'] = bached_dataset['test'].select(range(1,len(bached_dataset['test'])) )
bached_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 76795
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9564
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9545
    })
})

In [13]:
# inputid_dataset.save_to_disk("./../_data/datasets/batched/bert-mult-uncased")

In [14]:
bached_dataset.save_to_disk("./../_data/datasets/batched/bert-base-multilingual-uncased")

Saving the dataset (0/2 shards):   0%|          | 0/76795 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9564 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9545 [00:00<?, ? examples/s]