In [18]:
import itertools
import multiprocessing
from itertools import chain

import torch
from torch.utils.data import IterableDataset, get_worker_info
from transformers import AutoTokenizer, default_data_collator
from datasets import Dataset


def tokenize_and_chunk(
    tokenizer: AutoTokenizer,
    dataset: Dataset,
    text_field: str,
    sequence_length: int,
    num_cpu: int = multiprocessing.cpu_count(),
):
    """
    Build data loaders for training.

    This function performs the following steps:
    1. Load the tokenizer from the pretrained "EleutherAI/gpt-neox-20b" model.
    2. Load the "openwebtext" dataset.
    3. Tokenize the dataset, adding the end-of-sentence token to each text.
    4. Process the tokenized dataset into chunks of a specified block size.

    Returns:
        Dataset: The processed dataset ready for training.
    """
    extra_map_kwargs = {"num_proc": num_cpu}
    if isinstance(dataset, IterableDataset):
        extra_map_kwargs = {}

    current_columns = dataset.column_names
    tokenized_dataset = dataset.map(
        lambda example: tokenizer([t + tokenizer.eos_token for t in example[text_field]]),
        batched=True,
        remove_columns=current_columns,
        **extra_map_kwargs,
    )

    block_size = sequence_length

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        return result

    train_dataset = tokenized_dataset.map(
        group_texts,
        batched=True,
        **extra_map_kwargs,
    )

    return train_dataset


In [19]:
import torch.utils.data
import datasets
max_length = 2048

data = datasets.load_dataset("c4", "en", split="train", streaming=True)
tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=max_length)

In [20]:
dataset = tokenize_and_chunk(tokenizer, data, text_field="text", sequence_length=max_length, num_cpu=None)

In [24]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=2, collate_fn=default_data_collator)

In [25]:
batch = next(iter(dataloader))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
import torch

torch.tensor(batch["input_ids"])

  torch.tensor(batch["input_ids"])


tensor([[12847,   277, 15068,  ...,     8,   414,    13],
        [  336,   471,     5,  ...,    28,    46,  1287],
        [    6,  9445,  8424,  ...,    45,     8,   814],
        ...,
        [   21,     8,   471,  ...,   979,    16,   112],
        [23659,   774,     5,  ...,    19,    92,    46],
        [  256, 11577,   412,  ...,   112,   372,    28]])