## Download and tokenize the OpenWebText dataset

In [1]:
# Essentially Andrej Karpathy's nanoGPT data-preparation script, with minimal changes:
# https://github.com/karpathy/nanoGPT/blob/master/data/openwebtext/prepare.py

In [2]:
#!pip install datasets==3.6.0
#important


!pip install ipywidgets jupyter datasets==3.6.0 transformers numpy tqdm wandb tiktoken

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting wandb
  Downloading wandb-0.22.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting tiktoken
  Downloading tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=15.0.0 (from datasets==3.6.0)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.6.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets==3.6.0)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting xxhash (from datasets==3.6.0)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x

In [3]:
import os

# Output directory for the prepared data, relative to the current working directory.
folder_path = 'data/openwebtext'

# Report whether the directory is already there; otherwise create it
# (os.makedirs also creates the intermediate 'data' directory).
if os.path.exists(folder_path):
    print(f'Folder already exists at: {folder_path}')
else:
    os.makedirs(folder_path)
    print(f'Folder created at: {folder_path}')

Folder created at: data/openwebtext


In [4]:
# saves the openwebtext dataset to a binary file for training. following was helpful:
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py

import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

# number of workers in .map() call
# good number to use is ~order number of cpu cores // 2
# os.cpu_count() returns None when the number of cores cannot be determined;
# fall back to 1 so the integer division below cannot raise a TypeError.
num_cores = os.cpu_count() or 1
num_proc = max(1, 1+num_cores//2)

# number of workers in load_dataset() call
# best number might be different from num_proc above as it also depends on network speed.
# it is better than 1 usually though
num_proc_load_dataset = num_proc

# GPT-2 BPE encoder from tiktoken; used below to turn raw text into token ids.
enc = tiktoken.get_encoding("gpt2")

# The original author reports that the download takes 54GB in the huggingface
# .cache dir and holds about 8M documents (8,013,769).
print("Loading dataset")
dataset = load_dataset(
    "Skylion007/openwebtext",
    num_proc=num_proc_load_dataset,
    trust_remote_code=True,  # the dataset is loaded through its own script (openwebtext.py)
    # cache_dir=...,  # uncomment to select a different place for the dataset cache
)
print("Dataset loading done")

# OpenWebText only ships a 'train' split, so carve a small held-out split off it
# (test_size=0.0005 with a fixed seed) and store it under the name 'val'.
split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
held_out = split_dataset.pop('test')
split_dataset['val'] = held_out

# this results in:
# >>> split_dataset
# DatasetDict({
#     train: Dataset({
#         features: ['text'],
#         num_rows: 8009762
#     })
#     val: Dataset({
#         features: ['text'],
#         num_rows: 4007
#     })
# })

# Encoding function (gpt2 bpe) applied to every document when tokenizing the dataset.
def process(example, enc = enc):
    """Tokenize a single dataset example.

    Args:
        example: mapping with a 'text' entry holding the document to encode.
        enc: encoder providing `encode_ordinary` and `eot_token`; defaults to the
            module-level `enc`, bound when this function is defined.

    Returns:
        dict with 'ids' (the token ids followed by the end-of-text token) and
        'len' (the number of ids, end-of-text token included).
    """
    # encode_ordinary ignores any special tokens
    token_ids = enc.encode_ordinary(example['text'])
    # terminate the document with the end of text token, e.g. 50256 for gpt2 bpe
    # (the original author wondered whether it should be prepended instead of appended)
    token_ids.append(enc.eot_token)
    return {'ids': token_ids, 'len': len(token_ids)}

# Apply `process` to every example of every split using `num_proc` workers.
# The raw 'text' column is removed, leaving the 'ids' and 'len' produced by `process`.
print('Tokenization started')
tokenized = split_dataset.map(
    process,
    num_proc=num_proc,
    desc="tokenizing the splits",
    remove_columns=['text'],
)

# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in tokenized.items():
    # total number of tokens in the split; this fixes the size of the output file
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = os.path.join(folder_path,f'{split}.bin')
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Use at most 1024 shards, but never more shards than rows: with fewer rows
    # than shards some shards would be empty and np.concatenate would raise on them.
    total_batches = min(1024, len(dset))

    idx = 0  # write position (in tokens) inside the memmap
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write into mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()
    # every slot of the file must have been filled, otherwise it ends in stray zeros
    assert idx == arr_len, f'wrote {idx} tokens to {filename}, expected {arr_len}'

print('Data preparation finished')
# train.bin is ~17GB, val.bin ~8.5MB
# train has ~9B tokens (9,035,582,198)
# val has ~4M tokens (4,434,897)

# to read the bin files later, e.g. with numpy (they are written into folder_path):
# m = np.memmap(os.path.join(folder_path, 'train.bin'), dtype=np.uint16, mode='r')

Loading dataset


README.md: 0.00B [00:00, ?B/s]

openwebtext.py: 0.00B [00:00, ?B/s]

Setting num_proc from 65 to 21 for the train split as it only contains 21 shards.


Generating train split:   0%|          | 0/8013769 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/83 [00:00<?, ?it/s]

Dataset loading done
Tokenization started


tokenizing the splits (num_proc=65):   0%|          | 0/8009762 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=65):   0%|          | 0/4007 [00:00<?, ? examples/s]

writing data/openwebtext/train.bin: 100%|██████████| 1024/1024 [01:08<00:00, 14.91it/s]
writing data/openwebtext/val.bin: 100%|██████████| 1024/1024 [00:01<00:00, 762.23it/s]

Data preparation finished



