In [1]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets
import glob

# Number of worker processes passed to the .map() tokenization calls.
# A good value is on the order of (number of CPU cores) // 2.
num_proc = 8

# Number of workers passed to the load_dataset() calls.
# The best value may differ from num_proc above because it also depends on
# network speed; it is usually better than 1, though.
num_proc_load_dataset = num_proc

# GPT-2 BPE encoding from tiktoken; used by the tokenization functions below.
enc = tiktoken.get_encoding("gpt2")

# Variant tag interpolated into the dataset config names
# (f"data_{model}" and f"qa_{model}").
model = "4o"
# Fraction of the QA examples held out for validation (passed as test_size).
qa_r = 0.2

In [2]:
# Both the biography corpus and the QA pairs are configs of the same Hub repo.
hub_repo = "KrisMinchev/LLMCollapseBios"

# Biography corpus for the selected `model` variant.
dataset = load_dataset(
    hub_repo,
    f"data_{model}",
    num_proc=num_proc_load_dataset,
)

# Question/answer pairs for the same variant.
qa = load_dataset(
    hub_repo,
    f"qa_{model}",
    num_proc=num_proc_load_dataset,
)

# Hold out a fraction `qa_r` of the QA pairs with a fixed seed, then rename
# the held-out split from "test" to "val".
split_qa = qa["train"].train_test_split(
    test_size=qa_r,
    shuffle=True,
    seed=2357,
)
split_qa["val"] = split_qa.pop("test")
split_qa

train-00000-of-00001.parquet:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/600000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 480000
    })
    val: Dataset({
        features: ['question', 'answer'],
        num_rows: 120000
    })
})

In [7]:
# Show the first validation QA pair as a quick sanity check.
first_val_example = split_qa["val"][0]
for field in ('question', 'answer'):
    print(first_val_example[field])

What is the birth date of Lupita Troy Mendoza?
Answer: August 11, 1936.


In [25]:
def process_bio(example):
    """Tokenize one biography record.

    Encodes ``example['output']`` with the module-level ``enc`` encoder
    (ignoring special tokens) and appends the end-of-text token.

    Returns a dict with ``'ids'`` (the token id list) and ``'len'`` (its length).
    """
    token_ids = enc.encode_ordinary(example['output']) + [enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}

# Tokenize every split of the biography corpus in parallel; the raw 'output'
# text column is dropped, leaving the 'ids' and 'len' produced by process_bio.
tokenized_bio = dataset.map(
    function=process_bio,
    num_proc=num_proc,
    desc="tokenizing the splits",
    remove_columns=['output'],
)

def process_qa(example):
    """Tokenize one question/answer record.

    Concatenates ``example['question']`` and ``example['answer']``, encodes the
    result with the module-level ``enc`` encoder (ignoring special tokens) and
    appends the end-of-text token.

    Returns a dict with ``'ids'`` (the token id list) and ``'len'`` (its length).
    """
    # NOTE(review): the two fields are joined with no separator in between --
    # confirm this matches the prompt format expected downstream.
    qa_text = example['question'] + example['answer']
    token_ids = enc.encode_ordinary(qa_text) + [enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}

# Tokenize the train/val QA splits in parallel. No remove_columns is passed
# here, so 'ids' and 'len' are added alongside the existing columns.
tokenized_qa = split_qa.map(
    function=process_qa,
    num_proc=num_proc,
    desc="tokenizing the splits",
)
print(tokenized_qa)

DatasetDict({
    train: Dataset({
        features: ['name', 'college', 'degree', 'birthplace', 'location', 'company', 'birthday', 'prompt', 'ids', 'len'],
        num_rows: 80000
    })
    val: Dataset({
        features: ['name', 'college', 'degree', 'birthplace', 'location', 'company', 'birthday', 'prompt', 'ids', 'len'],
        num_rows: 20000
    })
})


In [22]:
def write_token_bin(dset, filename, dtype=np.uint16, total_batches=1024):
    """Write every row's ``ids`` of ``dset`` back to back into one binary file.

    Args:
        dset: tokenized split exposing an ``'ids'`` column (token id
            sequences) and a ``'len'`` column (length of each sequence).
        filename: path of the output file; it is created or overwritten.
        dtype: on-disk element type. uint16 is enough here because
            enc.max_token_value == 50256 is < 2**16.
        total_batches: upper bound on the number of contiguous shards used to
            batch the writes.

    Returns:
        The total number of tokens written.
    """
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    if arr_len == 0:
        # Nothing to write: leave an empty file instead of mapping zero bytes,
        # which some NumPy versions reject.
        with open(filename, 'wb'):
            pass
        return 0

    # Never request more shards than rows: an empty shard would hand
    # np.concatenate nothing to join and make it raise.
    n_batches = max(1, min(total_batches, len(dset)))
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

    idx = 0
    for batch_idx in tqdm(range(n_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=n_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write into mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)

    # The 'len' column and the actual 'ids' payload must agree, otherwise the
    # file would end in unwritten (zero) tokens.
    assert idx == arr_len, f'wrote {idx} tokens to {filename}, expected {arr_len}'
    arr.flush()
    # Drop the mapping so the file is no longer held open when later cells
    # rename or delete it.
    del arr
    return arr_len


# Biography splits -> ./<split>_bio.bin
for split, dset in tokenized_bio.items():
    write_token_bin(dset, f'./{split}_bio.bin')

# QA splits -> ./<split>_qa.bin
for split, dset in tokenized_qa.items():
    write_token_bin(dset, f'./{split}_qa.bin')

writing ./train_bio.bin: 100%|██████████| 1024/1024 [00:01<00:00, 833.77it/s]
writing ./train_qa.bin: 100%|██████████| 1024/1024 [00:01<00:00, 904.37it/s]
writing ./val_qa.bin: 100%|██████████| 1024/1024 [00:01<00:00, 1022.05it/s]


In [23]:
# Merge the biography and QA training tokens into a single train.bin, in the
# order listed here.
files = ['train_bio.bin', 'train_qa.bin']

# Stream each source file in fixed-size chunks so the whole training set is
# never held in memory at once.
chunk_size = 1 << 24  # 16 MiB per read
with open('train.bin', 'wb') as out_fp:
    for fn in files:
        with open(fn, 'rb') as in_fp:
            for chunk in iter(lambda: in_fp.read(chunk_size), b''):
                out_fp.write(chunk)

# os.replace overwrites an existing val.bin on every platform, so a val.bin
# left over from a previous run does not make this step fail.
os.replace('val_qa.bin', 'val.bin')

# Remove only the intermediate files that were merged above.
for fn in files:
    os.remove(fn)