In [1]:
import os
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained(
    "/home/user/models/rugpt",
    use_fast=True
)

In [3]:
num_proc = 8

In [4]:
ds = load_dataset('freQuensy23/ru-alpaca-cleaned', num_proc=num_proc, trust_remote_code=True)
ds = ds['train'].train_test_split(test_size=0.05, seed=2357, shuffle=True)
ds['val'] = ds.pop('test')

In [5]:
def process(example):
    text = f"### Инструкция: {example['instruction']}\n### Запрос: {example['input']}\n### Ответ: {example['output']}"
    tokens = tokenizer.encode(text, add_special_tokens=True)  # Автоматическое добавление [BOS]/[EOS]
    return {"ids": tokens, "len": len(tokens)}

In [6]:
tokenized = ds.map(
    process,
    remove_columns=['instruction', 'input', 'output'],
    desc="Tokenizing",
    num_proc=num_proc,
)

Tokenizing (num_proc=8):   0%|          | 0/25631 [00:00<?, ? examples/s]

Tokenizing (num_proc=8):   0%|          | 0/1350 [00:00<?, ? examples/s]

In [7]:
for split, dset in tokenized.items():
    arr_len = np.sum(dset['len'], dtype=np.uint64)
    print(arr_len)
    filename = os.path.join('/home/user/data', f'{split}.bin')
    dtype = np.uint16
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    
    idx = 0
    total_batches = 1024
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
        
    arr.flush()

1515127


writing /home/user/data/train.bin: 100%|██████████| 1024/1024 [00:01<00:00, 534.97it/s]


78975


writing /home/user/data/val.bin: 100%|██████████| 1024/1024 [00:01<00:00, 610.99it/s]
