### Installing and Importing Dependencies

In [None]:
%pip install tokenizers

In [4]:
from datasets import load_dataset
from tqdm import tqdm

from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFD, Lowercase, StripAccents, Sequence

### Importing Dataset

In [None]:
dataset = load_dataset("YourAccountName/PreTraining", split="train", streaming=True)

### Training Tokenizer

In [None]:
tokenizer = Tokenizer(BPE())

tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
tokenizer.normalizer = Sequence([NFD(), StripAccents()])

special_tokens=["<|endoftext|>", "<|system|>", "<|user|>", "<|assistant|>", "<|im_start|>", "<|im_sep|>", "<|im_end|>", "<think>", "</think>", "<pad>", "<unk>"]

tokenizer.decoder = decoders.ByteLevel()

trainer = BpeTrainer(
    vocab_size=32000,
    min_frequency= 2,
    special_tokens = special_tokens
)

def text_iterator(dataset, text_key="text"):
    for sample in dataset:
        if text_key in sample and sample[text_key]:
            yield sample[text_key]

iterator = text_iterator(dataset) 

print("Starting Tokenizer Training")
tokenizer.train_from_iterator(tqdm(iterator, desc="Training Tokenizer", mininterval=5.0), trainer=trainer)
print("Tokenizer Training Completed!")

### Save and Load Tokenizer

In [None]:
tokenizer.save("LumenTokenizer.json")

In [5]:
tokenizer = Tokenizer.from_file("LumenTokenizer.json")

### Testing Tokenizer

In [6]:
test_text = "<|im_start|>user\nWhat are the three primary colors?<|im_end|>\n<|im_start|>assistant\nThe three primary colors are red, blue, and yellow.<|im_end|>"

print("Original Text")
print(test_text)

encoded = tokenizer.encode(test_text)
print("\nEncoded Text")
print("IDs:", encoded.ids)
print("Tokens:", encoded.tokens)

decoded = tokenizer.decode(encoded.ids, skip_special_tokens=False)
print("\nDecoded Text")
print(decoded)


Original Text
<|im_start|>user
What are the three primary colors?<|im_end|>
<|im_start|>assistant
The three primary colors are red, blue, and yellow.<|im_end|>

Encoded Text
IDs: [4, 3396, 192, 3340, 406, 256, 874, 4180, 10688, 41, 6, 192, 4, 558, 8508, 192, 578, 874, 4180, 10688, 406, 2987, 22, 5091, 22, 283, 6870, 24, 6]
Tokens: ['<|im_start|>', 'user', 'Ċ', 'What', 'Ġare', 'Ġthe', 'Ġthree', 'Ġprimary', 'Ġcolors', '?', '<|im_end|>', 'Ċ', '<|im_start|>', 'ass', 'istant', 'Ċ', 'The', 'Ġthree', 'Ġprimary', 'Ġcolors', 'Ġare', 'Ġred', ',', 'Ġblue', ',', 'Ġand', 'Ġyellow', '.', '<|im_end|>']

Decoded Text
<|im_start|>user
What are the three primary colors?<|im_end|>
<|im_start|>assistant
The three primary colors are red, blue, and yellow.<|im_end|>
