In [None]:
import os
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

In [None]:
# Create the output directory for the tokenized .bin files in the current
# working directory. os.makedirs(..., exist_ok=True) is idempotent (no error
# when the directory already exists on a re-run) and, unlike `!mkdir`, does not
# depend on a shell being available.
os.makedirs("data", exist_ok=True)

In [None]:
# Base directory for the paths built in later cells (the tokenizer checkpoint
# and the output `data` folder); taken from the kernel's current working directory.
main_path = os.getcwd()

In [None]:
# Load the fast tokenizer from the local checkpoint under <main_path>/models/rugpt.
tokenizer_dir = os.path.join(main_path, "models/rugpt")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Number of worker processes used for dataset loading and for tokenization.
num_proc = 8

# Load the instruction dataset by its Hugging Face Hub identifier.
# NOTE(review): trust_remote_code=True allows the dataset repository to run its
# own loading script on this machine — confirm the source is trusted.
raw_ds = load_dataset(
    'Den4ikAI/russian_instructions_2',
    num_proc=num_proc,
    trust_remote_code=True,
)

# Hold out 0.5% of the train split (shuffled with a fixed seed) and expose the
# held-out part under the key 'val' instead of the default 'test'.
ds = raw_ds['train'].train_test_split(test_size=0.005, seed=2357, shuffle=True)
ds['val'] = ds.pop('test')

In [None]:
def process(example):
    """Turn one question/answer record into token ids.

    The record is rendered with the question/answer prompt template, wrapped in
    the tokenizer's BOS and EOS token strings, and encoded without automatic
    special tokens and without truncation.

    Args:
        example: mapping with 'question' and 'answer' entries.

    Returns:
        dict with 'ids' (the encoded 'input_ids') and 'len' (their count).
    """
    question = example['question']
    answer = example['answer']
    prompt = f"Вопрос: {question}\n### Ответ: {answer}"

    # BOS/EOS are spliced in as text here, so the tokenizer is told not to add
    # special tokens of its own.
    wrapped = "".join((tokenizer.bos_token, prompt, tokenizer.eos_token))
    encoded = tokenizer(wrapped, add_special_tokens=False, truncation=False)

    ids = encoded['input_ids']
    return {"ids": ids, "len": len(ids)}

In [None]:
# Run `process` over every split in parallel, dropping the raw text columns so
# that only the fields returned by `process` remain. Sequences are kept at full
# length here; truncation is deferred to a later stage.
text_columns = ['question', 'answer']
tokenized = ds.map(
    process,
    num_proc=num_proc,
    desc="Tokenizing",
    remove_columns=text_columns,
)

In [None]:
# Dump every split into one flat binary file of token ids: data/<split>.bin.
cur_path = os.path.join(main_path, 'data')
os.makedirs(cur_path, exist_ok=True)  # do not rely on an earlier cell having created it
dtype = np.uint16  # storage type of the output file; ids are range-checked below
max_token_id = np.iinfo(dtype).max
max_batches = 1024  # upper bound on the number of shards written per split

for split, dset in tokenized.items():
    # Total number of tokens in the split == length of the output array.
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    print(arr_len)
    if arr_len == 0:
        # A zero-length memmap cannot be created reliably; fail with a clear message.
        raise ValueError(f"split {split!r} contains no tokens; nothing to write")

    filename = os.path.join(cur_path, f'{split}.bin')
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

    # A contiguous shard of a split with fewer rows than shards is empty, and
    # np.concatenate on an empty sequence raises ValueError, so never request
    # more shards than there are rows.
    total_batches = min(max_batches, len(dset))

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Assigning out-of-range ids into the uint16 array would wrap silently
        # and corrupt the file, so refuse to write them.
        if arr_batch.size and arr_batch.max() > max_token_id:
            raise ValueError(
                f"token id {int(arr_batch.max())} in split {split!r} "
                f"does not fit in {np.dtype(dtype).name}"
            )
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)

    # Every token counted up front must have been written exactly once.
    assert idx == arr_len, f"wrote {idx} tokens for split {split!r}, expected {arr_len}"
    arr.flush()