In [1]:
from transformers import AutoTokenizer,BertTokenizer
from datasets import DatasetDict,Dataset,concatenate_datasets


In [2]:
labeled_dataset = DatasetDict.load_from_disk("./../_data/datasets/labeled/all_sliced")
pretrained_model = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)



In [3]:
def tokenize_function(sent_tok):
    # Tokenize the input text
    tokens = sent_tok["tokens"]
    labels =  sent_tok["labels"]
    labels.insert(0, 0)  # Adding CLS token label at the beginning of each sequence
    labels.append(0)
    try:
        tokenized_sentence = [101] + [tokenizer.encode(tok)[1] for tok in tokens] + [102]
        sent_tok["input_ids"] = tokenized_sentence
        sent_tok["labels"] = labels
    except:
        print(len(tokens))
    sent_tok["input_ids"] = tokenized_sentence
    sent_tok["labels"] = labels
    return sent_tok
inputid_dataset = labeled_dataset.map(tokenize_function, load_from_cache_file=False) 
inputid_dataset

Map:   0%|          | 0/339344 [00:00<?, ? examples/s]

Map:   0%|          | 0/42418 [00:00<?, ? examples/s]

Map:   0%|          | 0/42419 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentences', 'tokens', 'labels', 'input_ids'],
        num_rows: 339344
    })
    validation: Dataset({
        features: ['sentences', 'tokens', 'labels', 'input_ids'],
        num_rows: 42418
    })
    test: Dataset({
        features: ['sentences', 'tokens', 'labels', 'input_ids'],
        num_rows: 42419
    })
})

In [4]:
inputid_dataset.save_to_disk("./../_data/datasets/inputid/bert-mult-uncased")

Saving the dataset (0/2 shards):   0%|          | 0/339344 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42418 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42419 [00:00<?, ? examples/s]

In [5]:
from util.dataset_util import flatten_2d_list
def chunck_pad_map(sents_batch,dataset_name):
    
    f_tokens = flatten_2d_list(sents_batch["tokens"])
    f_labels = flatten_2d_list(sents_batch["labels"])
    f_input_ids = flatten_2d_list(sents_batch["input_ids"])
    # print(f_tokens)
    # print(len(f_input_ids))
    # print(len(f_labels))
    # # print(sents_batch["tokens"])
    # # print(len(sents_batch["tokens"]))
    # raise EOFError
    
    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    chunk_size =512
    tok_pad=0 
    label_pad=-100
    attention_pad=0
    # Create chunks
    # print(len(f_input_ids)// chunk_size)
    for i in range(0, len(f_input_ids), chunk_size):  # We subtract 2 to account for special tokens
        chunked_tokens = f_input_ids[i:i + chunk_size]
        chunk_label_ids = f_labels[i:i + chunk_size]
        chunk_attention_mask = [1] * len(chunked_tokens)
        
        # print(len(chunked_tokens))
        
        while True and len(chunked_tokens) < chunk_size:
            chunked_tokens.append(tok_pad)
            chunk_attention_mask.append(attention_pad)
            chunk_label_ids.append(label_pad)
        input_ids_list.append(chunked_tokens)
        attention_mask_list.append(chunk_attention_mask)
        labels_list.append(chunk_label_ids)

    
    temp_ds = Dataset.from_dict({"input_ids": input_ids_list, "labels": labels_list, "attention_mask": attention_mask_list})
    inputid_dataset[dataset_name] = inputid_dataset[dataset_name].cast(temp_ds.features)
    inputid_dataset[dataset_name]=concatenate_datasets([inputid_dataset[dataset_name],temp_ds])
    return sents_batch


In [6]:

inputid_dataset["train"] = inputid_dataset["train"].remove_columns('sentences')
inputid_dataset["validation"] = inputid_dataset["validation"].remove_columns('sentences')
inputid_dataset["test"] = inputid_dataset["test"].remove_columns('sentences')

inputid_dataset["train"].map(chunck_pad_map,batched=True, load_from_cache_file=False,fn_kwargs={"dataset_name": "train"})
inputid_dataset["validation"].map(chunck_pad_map,batched=True, load_from_cache_file=False,fn_kwargs={"dataset_name": "validation"})
inputid_dataset["test"].map(chunck_pad_map,batched=True, load_from_cache_file=False,fn_kwargs={"dataset_name": "test"})

Map:   0%|          | 0/339344 [00:00<?, ? examples/s]

ValueError: The columns in features (['input_ids', 'labels', 'attention_mask']) must be identical as the columns in the dataset: ['tokens', 'labels', 'input_ids']

In [7]:
inputid_dataset.save_to_disk("./../_data/datasets/batched/bert-mult-uncased")

Saving the dataset (0/2 shards):   0%|          | 0/339344 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42418 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42419 [00:00<?, ? examples/s]