In [1]:
import datasets
from collections import defaultdict
from transformers import PreTrainedTokenizerFast

# Load the raw dataset previously saved to disk; later cells index it by the
# "train" and "test" split names.
dataset = datasets.load_from_disk("../data/raw")
# Load the fast tokenizer stored in the local MalBERTa directory.
tokenizer = PreTrainedTokenizerFast.from_pretrained("../MalBERTa")


def handle_sample(sample, **kwargs):
    """Tokenize a batch of texts into fixed-length chunks, one output row per chunk.

    Each text is passed to the tokenizer on its own with
    ``padding="max_length"``, ``truncation=True`` and
    ``return_overflowing_tokens=True``. Every entry of the returned
    ``"input_ids"`` list is treated as one chunk and becomes one output row
    that carries the label of the text it came from.

    Args:
        sample: Batch mapping with parallel sequences under ``"text"`` and
            ``"label"``.
        **kwargs:
            tokenizer: Required. Callable invoked once per text; its result
                must be a mapping whose values can be indexed per chunk.
            max_length: Optional chunk length forwarded to the tokenizer.
                Defaults to 32.

    Returns:
        dict: column name -> list with one entry per chunk. Contains every
        key the tokenizer returned plus ``"label"``.

    Raises:
        ValueError: If no tokenizer was supplied (``ValueError`` is a subclass
            of ``Exception``, so existing ``except Exception`` handlers still
            catch it).
    """
    tokenizer = kwargs.get("tokenizer")

    if tokenizer is None:
        raise ValueError("Missing tokenizer")

    # Chunk length used for both padding and truncation; configurable through
    # fn_kwargs, with the historical value as the default.
    max_length = kwargs.get("max_length", 32)

    texts = sample["text"]
    labels = sample["label"]

    flattened = defaultdict(list)

    for text, label in zip(texts, labels):
        tokenized = tokenizer(
            text,
            padding="max_length",
            max_length=max_length,
            return_overflowing_tokens=True,
            truncation=True,
        )

        # Flatten the per-text chunks into rows, copying every tokenizer
        # field for chunk i and repeating the text's label for it.
        for i in range(len(tokenized["input_ids"])):
            for k in tokenized:
                flattened[k].append(tokenized[k][i])
            flattened["label"].append(label)

    # Return a plain dict rather than the defaultdict used for accumulation.
    return dict(flattened)


# Run handle_sample over the loaded dataset in batches. The original columns
# (names taken from the "test" split) are removed because handle_sample emits
# one row per chunk rather than one row per input example.
processed_dataset = dataset.map(
    handle_sample,
    batched=True,
    batch_size=64,
    num_proc=8,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=dataset["test"].column_names,
)

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=8): 100%|██████████| 5552/5552 [04:44<00:00, 19.54 examples/s]
Map (num_proc=8): 100%|██████████| 1388/1388 [01:17<00:00, 17.92 examples/s]


In [2]:
from tqdm import tqdm

# Collect the distinct token-id sequences of each split. Each sequence is
# converted to a tuple so it can be stored in a set.
unique_train_seqs = {
    tuple(row["input_ids"])
    for row in tqdm(processed_dataset["train"], leave=False)
}
unique_test_seqs = {
    tuple(row["input_ids"])
    for row in tqdm(processed_dataset["test"], leave=False)
}

# Report the number of distinct sequences and their share of all rows per split.
print(
    f"{len(unique_train_seqs)} Unique sequences in train set = {(len(unique_train_seqs) / len(processed_dataset['train'])) * 100:.2f}%"
)
print(
    f"{len(unique_test_seqs)} Unique sequences in test set = {(len(unique_test_seqs) / len(processed_dataset['test'])) * 100:.2f}%"
)


                                                               

In [3]:
# Display the number of distinct input_ids sequences found in the test split.
len(unique_test_seqs)

4436483