## Load data

In [21]:
import os
from tqdm import tqdm
import numpy as np
from datasets import load_dataset # huggingface datasets
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

num_proc = 8
subset = 'nov_dec_50M'
data_folder = f'/home/jan/projects/LMStartup/data/cc_czech/{subset}/'
trainfile = f'{data_folder}train.txt'
testfile = f"{data_folder}test.txt"
# this will be saved in HuggingFace's cache folder
dataset = load_dataset("text", data_files={
    "train": trainfile,
    "test": testfile})
dataset['val'] = dataset.pop('test') # rename the test split to val

Using custom data configuration default-466bc83e493ad028
Found cached dataset text (/home/jan/.cache/huggingface/datasets/text/default-466bc83e493ad028/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)
100%|██████████| 2/2 [00:00<00:00, 94.75it/s]


## Train tokenizer

In [23]:
sample_size = 10000
def get_training_corpus():
    for i in range(0, len(dataset['train']),sample_size):
        yield dataset['train'][i:i+sample_size]["text"]

tokenization_type = 'BPE'
special_tokens = ["<unk>", "<|endoftext|>"]

if tokenization_type == 'WORD':
    # word level tokenizer
    tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))
    # loqwercase and strip accents
    tokenizer.normalizer = normalizers.Sequence(
        [normalizers.Lowercase(), normalizers.StripAccents()]
    )
    # split on whitespace and punctuation
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
    )
    trainer = trainers.WordLevelTrainer( vocab_size=500000, special_tokens=special_tokens)

elif tokenization_type == 'BPE':
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.normalizer = normalizers.Sequence(
        [normalizers.Lowercase(), normalizers.StripAccents()]
    )
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(vocab_size=50048, special_tokens=special_tokens)


tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
print("Slovník má velikost: ", len(tokenizer.get_vocab()))




Slovník má velikost:  50000


In [38]:
# ukázka tokenizace
snt = tokenizer.encode('petr pavel bude následující prezident.')
print(snt.tokens)
tokenizer.decode(snt.ids)


['Ġ', '<unk>', 'e', 'tr', 'Ġ', '<unk>', 'av', 'el', 'Ġbude', 'ĠnÃ¡sledujÃŃcÃŃ', 'Ġprezident', '<unk>']


' etr avel bude následující prezident'

## Save tokenizer 

In [31]:
# create a directory if it doesn't exist
save_folder = f'../temp/cc/{subset}'
os.makedirs(save_folder, exist_ok=True)
save_path = f'{save_folder}/tokenizer{tokenization_type.upper()}.json'
tokenizer.save(save_path)

## Tokenize

In [41]:
tokenizer = Tokenizer.from_file(save_path)

def process(example):
    enc = tokenizer.encode(example['text']+"<|endoftext|>") 
    out = {'ids': enc.ids, 'len': len(enc.ids)}
    return out

# tokenize the dataset
tokenized = dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=num_proc,
    load_from_cache_file=False,
)


for split, dset in tokenized.items():
    arr_len = np.sum(dset['len'])
    filename = f'{save_folder}/{split}.bin'
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

    print(f"writing {filename}...")
    idx = 0
    for example in tqdm(dset):
        arr[idx : idx + example['len']] = example['ids']
        idx += example['len']
    arr.flush()

tokenizing the splits #0:   0%|          | 0/146911 [00:00<?, ?ex/s]



[A[A[A[A


[A[A[A
[A

[A[A




[A[A[A[A[A





tokenizing the splits #1:   0%|          | 295/146911 [00:00<00:49, 2940.18ex/s]



[A[A[A[A


[A[A[A
[A

[A[A




[A[A[A[A[A





tokenizing the splits #0:   1%|          | 847/146911 [00:00<00:34, 4292.53ex/s]



[A[A[A[A


[A[A[A
[A

[A[A




[A[A[A[A[A





[A[A[A[A[A[A


tokenizing the splits #0:   1%|          | 1277/146911 [00:00<00:38, 3804.39ex/s]
[A



[A[A[A[A

[A[A




[A[A[A[A[A





[A[A[A[A[A[A


tokenizing the splits #0:   1%|          | 1670/146911 [00:00<00:37, 3848.64ex/s]



[A[A[A[A
[A

[A[A




[A[A[A[A[A





tokenizing the splits #0:   1%|▏         | 2059/146911 [00:00<00:37, 3855.44ex/s]


[A[A[A



[A[A[A[A
[A




[A[A[A[A[A

[A[A





tokenizing the splits #0:   2%|▏         | 2448/146911 [00:00<00:39, 3651.46ex/s]


[A[A[A



[A[A[A[A






writing ../temp/cc/nov_dec_50M/train.bin...


100%|██████████| 1175284/1175284 [01:00<00:00, 19536.79it/s]


writing ../temp/cc/nov_dec_50M/val.bin...


100%|██████████| 11872/11872 [00:00<00:00, 19234.42it/s]
