In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
import numpy as np
import psutil
import functools
import operator
import argparse
import torch
import json

def main():
    config = json.load(open('configs/preprocess_baby_lm_100M.json', 'rb'))
    ds_name = config["dataset_name"]
    ds_config = config["dataset_config"]
    hf_tokenizer = config["hf_tokenizer"]
    max_seq_len = config["max_seq_len"]
    output = config["output"]

    NUM_CPU = psutil.cpu_count()
    print(f"Using {NUM_CPU} CPUs...")

    tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer)

    def tokenize_and_pack(batch, max_seq_len=max_seq_len):
        tokenized_batch = tokenizer(batch["text"]).input_ids
        tokenized_batch_flat = functools.reduce(operator.iconcat, tokenized_batch, [])
        packed_batch = np.reshape(tokenized_batch_flat[:-(len(tokenized_batch_flat)%max_seq_len)], (-1, max_seq_len))
        return packed_batch.tolist()

    print("Loading dataset...")
    ds = load_dataset(ds_name,ds_config,trust_remote_code = True)



    print("Packing dataset...")
    ds = ds.map(lambda x: {"packed":tokenize_and_pack(x)}, remove_columns=ds['train'].column_names, batched=True, batch_size=100000, num_proc=NUM_CPU)

    ds = ds.shuffle()

    print("Saving dataset...")
    ds.save_to_disk(output, num_proc=NUM_CPU)


if __name__=='__main__':
    main()

Using 12 CPUs...
Loading dataset...
Packing dataset...


Map (num_proc=12):   0%|          | 0/3334512 [00:27<?, ? examples/s]


NameError: name 'functools' is not defined

In [None]:
from huggingface_hub import HfApi
import os
api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path="./dataset_storage/baby-lm-strict.hf",
    repo_id="InoWouw/BabyLM-strict",
    repo_type="dataset",
)



data-00000-of-00012.arrow:   0%|          | 0.00/9.23M [00:00<?, ?B/s]

[A[A



[A[A[A[A


[A[A[A
data-00000-of-00012.arrow:   1%|▏         | 123k/9.23M [00:00<00:08, 1.10MB/s]

[A[A



data-00000-of-00012.arrow:   3%|▎         | 262k/9.23M [00:00<00:07, 1.25MB/s]
[A

[A[A



[A[A[A[A
data-00000-of-00012.arrow:   4%|▍         | 393k/9.23M [00:00<00:07, 1.12MB/s]



[A[A[A[A
data-00000-of-00012.arrow:   6%|▋         | 582k/9.23M [00:00<00:06, 1.33MB/s]

[A[A



[A[A[A[A
data-00000-of-00012.arrow:   8%|▊         | 721k/9.23M [00:00<00:07, 1.18MB/s]

[A[A
[A



data-00000-of-00012.arrow:   9%|▉         | 844k/9.23M [00:00<00:08, 1.04MB/s]

[A[A
data-00000-of-00012.arrow:  10%|█         | 967k/9.23M [00:00<00:08, 963kB/s] 



[A[A[A[A

[A[A
[A



data-00000-of-00012.arrow:  12%|█▏        | 1.10M/9.23M [00:01<00:08, 925kB/s]

[A[A
[A



data-00000-of-00012.arrow:  13%|█▎        | 1.23M/9.23M [00:01<00:08, 893kB/s]

[A[A
[A



data-00000-of-0001

CommitInfo(commit_url='https://huggingface.co/datasets/InoWouw/BabyLM-strict-small/commit/4c83f40d5f07ef7e74b76a2bdb66b2372c68e93d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4c83f40d5f07ef7e74b76a2bdb66b2372c68e93d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/InoWouw/BabyLM-strict-small', endpoint='https://huggingface.co', repo_type='dataset', repo_id='InoWouw/BabyLM-strict-small'), pr_revision=None, pr_num=None)