In [4]:
from transformers import AutoTokenizer
from datasets import DatasetDict,Dataset,concatenate_datasets


In [5]:
# Checkpoint name; the tokenizer (and therefore the vocabulary ids produced
# below) is taken from this pretrained model.
pretrained_model = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Load the labeled DatasetDict that was previously written to disk.
labeled_dataset = DatasetDict.load_from_disk(
    "./../_data/datasets/labeled/all_sliced"
)

In [6]:
def tokenize_function(sent_tok):
    """Add ``input_ids`` and special-token-aligned ``labels`` to one example.

    Every entry of ``sent_tok["tokens"]`` is mapped to exactly one vocabulary
    id (the first word piece the module-level ``tokenizer`` produces for it),
    so the ids stay aligned one-to-one with ``sent_tok["labels"]``. The id
    sequence is wrapped in the tokenizer's [CLS] / [SEP] ids and the labels
    receive a matching ``0`` at both ends.

    Args:
        sent_tok: Mapping with a ``"tokens"`` sequence of strings and a
            ``"labels"`` sequence of the same length.

    Returns:
        ``sent_tok`` itself, with ``"input_ids"`` set and ``"labels"`` replaced
        by the extended label list. If tokenization fails, a diagnostic is
        printed and ``sent_tok`` is returned untouched (no ``"input_ids"`` key
        is added and the labels are left as they were).
    """
    tokens = sent_tok["tokens"]
    # Build a new list instead of insert()/append() on the stored one: the
    # caller's labels must not change if tokenization fails or if the function
    # is applied to the same object twice.
    labels = [0] + list(sent_tok["labels"]) + [0]

    try:
        input_ids = [tokenizer.cls_token_id]
        for tok in tokens:
            pieces = tokenizer.encode(tok, add_special_tokens=False)
            # Keep only the first word piece so there is one id per label.
            # A token that produces no word piece at all (e.g. an empty
            # string) is mapped to [UNK]; indexing the special-token-wrapped
            # encoding at position 1 would have returned the [SEP] id here.
            input_ids.append(pieces[0] if pieces else tokenizer.unk_token_id)
        input_ids.append(tokenizer.sep_token_id)
    except Exception as err:  # best effort: report and leave the example as is
        print(f"tokenize_function failed for an example with {len(tokens)} tokens: {err!r}")
        return sent_tok

    sent_tok["input_ids"] = input_ids
    sent_tok["labels"] = labels
    return sent_tok
# Run tokenize_function over the labeled DatasetDict. The map cache is
# bypassed so the function is always re-executed.
inputid_dataset = labeled_dataset.map(
    tokenize_function,
    load_from_cache_file=False,
)
inputid_dataset

In [7]:
# Persist the tokenized (not yet chunked) dataset.
inputid_output_dir = "./../_data/datasets/inputid/bert-mult-uncased"
inputid_dataset.save_to_disk(inputid_output_dir)

In [8]:
# One-row placeholder used to seed every split of `batched_dataset`.
# chunck_pad_map appends the real chunks to these datasets; the placeholder
# row is removed again once all chunks have been collected.
empty_data = {
    "input_ids": [[1]],       # dummy row, not a real id sequence
    "labels": [[0]],          # dummy row
    "attention_mask": [[1]],  # dummy row
}
batched_dataset = DatasetDict(
    {
        split_name: Dataset.from_dict(empty_data)
        for split_name in ("train", "validation", "test")
    }
)


In [9]:
from _utils.dataset_util import flatten_2d_list
def chunck_pad_map(sents_batch, dataset_name, chunk_size=512, tok_pad=0,
                   label_pad=-100, attention_pad=0):
    """Cut one batch of examples into fixed-size chunks and collect them.

    The ``input_ids`` and ``labels`` of all examples in the batch are flattened
    into one long sequence each (via ``flatten_2d_list``), sliced into
    consecutive windows of ``chunk_size`` and the last, shorter window is
    padded. Because the examples are concatenated before slicing, a chunk can
    start or end in the middle of an example.

    The resulting chunks are appended to the module-level
    ``batched_dataset[dataset_name]``; the function is meant to be passed to
    ``Dataset.map(..., batched=True)`` purely for that side effect.
    NOTE(review): since the result lives in a module-level object, this
    presumably does not work with ``map(num_proc>1)`` - confirm before
    parallelising.

    Args:
        sents_batch: Batch mapping with ``"input_ids"`` and ``"labels"``
            columns, each a list of per-example lists.
        dataset_name: Key of the split in ``batched_dataset`` to append to.
        chunk_size: Length of every emitted chunk.
        tok_pad: Value used to pad ``input_ids``.
        label_pad: Value used to pad ``labels``.
        attention_pad: Value used to pad ``attention_mask``.

    Returns:
        ``sents_batch`` unchanged.

    Raises:
        ValueError: If ``chunk_size`` is not positive or the flattened
            ``input_ids`` and ``labels`` differ in length (which would
            silently misalign tokens and labels).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    f_input_ids = flatten_2d_list(sents_batch["input_ids"])
    f_labels = flatten_2d_list(sents_batch["labels"])
    if len(f_input_ids) != len(f_labels):
        raise ValueError(
            f"input_ids/labels length mismatch: {len(f_input_ids)} vs {len(f_labels)}"
        )

    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    for start in range(0, len(f_input_ids), chunk_size):
        chunk_ids = list(f_input_ids[start:start + chunk_size])
        chunk_labels = list(f_labels[start:start + chunk_size])
        n_real = len(chunk_ids)
        n_pad = chunk_size - n_real  # > 0 only for the final chunk of the batch

        input_ids_list.append(chunk_ids + [tok_pad] * n_pad)
        attention_mask_list.append([1] * n_real + [attention_pad] * n_pad)
        labels_list.append(chunk_labels + [label_pad] * n_pad)

    # Nothing to add: avoid building an empty Dataset whose features would be
    # used as the cast target for the accumulated split.
    if not input_ids_list:
        return sents_batch

    temp_ds = Dataset.from_dict(
        {
            "input_ids": input_ids_list,
            "labels": labels_list,
            "attention_mask": attention_mask_list,
        }
    )
    # Align the accumulated split's features with the new chunks, then append.
    batched_dataset[dataset_name] = batched_dataset[dataset_name].cast(temp_ds.features)
    batched_dataset[dataset_name] = concatenate_datasets([batched_dataset[dataset_name], temp_ds])
    return sents_batch


In [10]:

split_names = ("train", "validation", "test")

# Drop the 'sentences' column, which chunck_pad_map does not read. The
# membership check lets this cell be re-run: remove_columns raises when the
# column is already gone.
for split_name in split_names:
    if "sentences" in inputid_dataset[split_name].column_names:
        inputid_dataset[split_name] = inputid_dataset[split_name].remove_columns("sentences")

# map() is used only for chunck_pad_map's side effect of appending chunks to
# batched_dataset[split_name]; the returned dataset is intentionally discarded.
# NOTE: running this loop again appends the same chunks a second time, so
# re-create batched_dataset first when re-running.
for split_name in split_names:
    inputid_dataset[split_name].map(
        chunck_pad_map,
        batched=True,
        load_from_cache_file=False,
        fn_kwargs={"dataset_name": split_name},
    )

In [11]:
# Print the per-example dataset followed by the chunked dataset.
for dataset_view in (inputid_dataset, batched_dataset):
    print(dataset_view)

In [12]:
# Remove the one-row placeholder that seeded each split of batched_dataset.
# The first row is dropped only if it still equals the placeholder, so
# re-running this cell does not discard a real chunk each time.
placeholder_row = {column: rows[0] for column, rows in empty_data.items()}
for split_name in ("train", "validation", "test"):
    split = batched_dataset[split_name]
    if len(split) > 0 and split[0] == placeholder_row:
        batched_dataset[split_name] = split.select(range(1, len(split)))
batched_dataset

In [13]:
# inputid_dataset.save_to_disk("./../_data/datasets/batched/bert-mult-uncased")

In [14]:
# Persist the chunked and padded splits.
batched_output_dir = "./../_data/datasets/batched/bert-base-multilingual-uncased"
batched_dataset.save_to_disk(batched_output_dir)