# Dataloading experiments

This notebook is for experimenting with different dataloading implementations and strategies.

Some requirements:

- working with very large datasets of text (100s of GBs) split across multiple files
- need to be able to load data in distributed training across multiple GPUs (i.e. each GPU sees non-overlapping portion of the data)
- need to be able to load batches of data (i.e. [batch_size, seq_len])
- need to be able to load data in a streaming fashion, so that the whole dataset doesn't need to fit into memory at once
- needs to include tokenization

Desired properties:

- fast
- memory efficient
- minimizes time spent loading data, so that the GPU is never waiting for the data to load


## Huggingface Dataset with Pytorch IterableDataset and DataLoader

Since we alread use huggingface to get our datasets, let's try using huggingface's Dataset in combination with Pytorch's DataLoader.


In [1]:
from gollem.data.common import DATA_CACHE_DIR
from gollem.tokenizer import get_tokenizer

tinystories_dataset_id = "roneneldan/Tinystories"
tinystories_ds_path = DATA_CACHE_DIR / "tinystories" / "TinyStories_all_data"

tokenizer = get_tokenizer("gpt2")

In [2]:
from datasets import load_dataset_builder
from datasets import IterableDataset

ds_builder = load_dataset_builder(path=str(tinystories_ds_path))
for k, v in ds_builder.info.__dict__.items():
    print(k, v)


Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]

description 
citation 
homepage 
license 
features {'story': Value(dtype='string', id=None), 'instruction': {'features': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'prompt:': Value(dtype='string', id=None), 'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}, 'summary': Value(dtype='string', id=None), 'source': Value(dtype='string', id=None)}
post_processed None
supervised_keys None
builder_name json
dataset_name tiny_stories_all_data
config_name default
version 0.0.0
splits {'train': SplitInfo(name='train', num_bytes=6519810584, num_examples=4967871, shard_lengths=[400000, 400000, 400000, 400000, 400000, 400000, 400000, 400000, 400000, 400000, 400000, 400000, 167871], dataset_name='tiny_stories_all_data')}
download_checksums {'/Users/jonathon/code/gollem/gollem/data/datasets/tinystories/TinyStories_all_data/data00.json': {'num_bytes': 140424235, 'checksum': None}, '/Users/jonathon/code/gollem/gollem/data/datasets/tinystories/TinySt

In [3]:
from datasets import get_dataset_split_names

print(get_dataset_split_names(path=str(tinystories_ds_path)))

Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]

['train']


In [4]:
from datasets import load_dataset


ds = load_dataset(path=str(tinystories_ds_path), split="train")

Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]

In [5]:
ds

Dataset({
    features: ['story', 'instruction', 'summary', 'source'],
    num_rows: 4967871
})

In [6]:
len(ds)

4967871

In [7]:
from typing import cast


iter_ds = load_dataset(path=str(tinystories_ds_path), split="train", streaming=True)
iter_ds = cast(IterableDataset, iter_ds)
iter_ds.column_names

Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]

['story', 'instruction', 'summary', 'source']

In [10]:
from typing import Generator

import numpy as np

iter_ds = cast(IterableDataset, iter_ds)
# Strategy
# 1. Use hugginface IterableDataset to load batches of examples
# 2. tokenize each batch of examples on the fly
# 3. concatenate tokens to get batch of seq_len tokens with no padding (collecting next batch of examples as necessary)
# 4. repeat

# Questions:
# - how to make this work with distributed training?
#    - can we customize the number of shards even if they are different from number of files?
tokenizer = get_tokenizer("gpt2")

data_column = "story"
# columns_to_remove = [c for c in iter_ds.column_names if c != data_column]


def tokenize_batch(examples):
    return {"text": tokenizer.encode_batch(examples[data_column])}


# get batch from iter_ds
processed_ds = iter_ds.map(
    tokenize_batch, batched=True, batch_size=32, remove_columns=iter_ds.column_names
)
processed_ds = cast(IterableDataset, processed_ds)
print(processed_ds)

# get batch from batched_ds
batched_ds = processed_ds.batch(batch_size=1)


batch_size = 32
seq_len = 1024


def get_batch() -> Generator[np.ndarray, None, None]:
    buffer = [[] * batch_size]
    current_batch = np.full((batch_size, seq_len), 0, dtype=np.long)
    current_idxs = np.zeros(batch_size, dtype=np.int32)

    for batch in batched_ds:
        batch_lens = np.array([len(t) for t in batch["text"]])
        batch_idxs = seq_len - batch_lens
        end_idxs = current_idxs

        # current_batch[current_idxs:end_idxs] =


for i, batch in enumerate(batched_ds):
    print(
        i,
        len(batch["text"]),
        min(len(t) for t in batch["text"]),
        max(len(t) for t in batch["text"]),
    )
    if i == 33:
        break

# get batch from batched_ds
print(batch)


IterableDataset({
    features: Unknown,
    num_shards: 50
})
0 1 308 308
1 1 200 200
2 1 130 130
3 1 159 159
4 1 167 167
5 1 199 199
6 1 131 131
7 1 170 170
8 1 195 195
9 1 192 192
10 1 209 209
11 1 167 167
12 1 135 135
13 1 189 189
14 1 216 216
15 1 212 212
16 1 160 160
17 1 167 167
18 1 135 135
19 1 162 162
20 1 136 136
21 1 138 138
22 1 113 113
23 1 139 139
24 1 184 184
25 1 414 414
26 1 147 147
27 1 208 208
28 1 138 138
29 1 191 191
30 1 152 152
31 1 199 199
32 1 197 197
33 1 143 143
{'text': [[7454, 2402, 257, 640, 11, 612, 373, 257, 1310, 2576, 3706, 20037, 13, 1375, 2227, 257, 2495, 6576, 284, 5806, 284, 607, 1545, 338, 10955, 2151, 13, 2332, 1995, 1820, 1718, 607, 284, 262, 3650, 284, 1064, 530, 13, 220, 198, 43, 813, 2497, 257, 10283, 286, 20239, 27309, 10938, 319, 262, 3355, 13, 1375, 6235, 284, 257, 11398, 530, 290, 531, 11, 366, 40, 765, 326, 530, 2474, 220, 198, 29252, 1820, 13541, 290, 4193, 20037, 1949, 319, 262, 2495, 6576, 13, 632, 4197, 7138, 290, 20037, 2936, 588, 