In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset 
import pyarrow as pa 
import os 
from tqdm import tqdm 
import concurrent.futures

# Tokenize and process

1. Loop through each files in the dataset and load it into memory
2. Multi-process with all threads, then having an open stream to write into
3. Save the table down 

In [2]:
base_folder = "../data_all/data"

In [3]:
tokenizer = AutoTokenizer.from_pretrained("./tokenizer/tokenizer-50k")
tokenizer.model_max_length = 10000000

In [4]:
import time 
from hashlib import sha256

for i in range(10): 
    hashed = sha256(str(time.time()).encode("utf-8"))
    print(hashed.hexdigest())

8d8a8f1e828ac8b3df9e3e0c7a935ca11dfff38900f4d69b37ba17d3826924f1
8e896b46abd66d965883b5713eeb86732ef809d2f00056ff42008fb4414f7580
baf1ac4fd46bc83e54d689349a1b3ddd6cc7ef18b3ccaf08a827428078ad6f54
ee840529a3b85637222004772ab2f02a346821d70653fa0433241583a5f18ab3
99af78acd141352a8264e51fc18c117f46e62e5348245cbfdc1ec9ef5b9c5afb
ed3163e5bd9d45f63fe9b5ab1c0725af9a015e0f3b3f6d8438cd7f5a2ab9fe6d
30ffdc7c4b728d3bc4250ed0f9b5bfad718a61924981986a3724a0cd0f86df9c
c65ab14a3afdcb11be5cc93a14e0f95469fc9f749f5b51dfa9ae70ddb43ae45d
63f2f2117a55ac4e647ef7a02510c2c1b925f2dbeebbe9bd85f8a0cb8a9f6e4e
2602de87101aa4b4f8a93e56df0155219c0def1c0586a90a330267285783c31e


In [17]:

def process_and_join(batch): 
    content = []
    hashed = sha256(str(time.time()).encode("utf-8")).hexdigest()
    stream =  pa.CompressedOutputStream(f"./temp/{hashed}.arrow", compression="gzip")

    outputs = tokenizer.batch_encode_plus(batch["text"])["input_ids"]

    for row in outputs: 
        # remove [SEP] token 
        row = row[:-1]

        if len(row) <= 128: 
            content.append(row)
        else: 
            for start_idx in range(0, len(row), 90): 
                content.append(row[start_idx:start_idx+128]) 

    table = pa.Table.from_arrays(
        [pa.array(content)], names=["input_ids"])

    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table)

    return batch

def convert_to_tokens(batch, tokenizer): 
    outputs = tokenizer.batch_encode_plus(batch["text"])["input_ids"]
    return {"input_ids": outputs}

In [18]:
for i, file in tqdm(enumerate(sorted(os.listdir(base_folder)))): 
    if i < 3: 
        continue

    dataset = load_dataset("arrow", data_files=[os.path.join(base_folder, file)], split="train", streaming=False, num_proc=16)
    # dataset = dataset.map(convert_to_tokens, batched=True, fn_kwargs={"tokenizer": tokenizer}, remove_columns=["text"], num_proc=14)
    dataset = dataset.map(process_and_join, batched=True, num_proc=14, batch_size=10000, remove_columns=["text"])


    break 

0it [00:00, ?it/s]

Map (num_proc=14):   0%|          | 0/1000000 [00:00<?, ? examples/s]

3it [06:02, 120.85s/it]


In [19]:
import pyarrow
dataset = load_dataset("./temp", num_proc=16, streaming=False, split="train")

Resolving data files:   0%|          | 0/112 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/32 [00:00<?, ?it/s]

In [20]:
dataset

Dataset({
    features: ['input_ids'],
    num_rows: 15545460
})

In [21]:
dataset.to_parquet("./final/final_0000.parquet", batch_size=10000)

Creating parquet from Arrow format:   0%|          | 0/1555 [00:00<?, ?ba/s]

15017696336