### Installing and Importing Dependencies

In [None]:
%pip install tokenizers

In [None]:
from datasets import load_dataset
from tqdm import tqdm

from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFD, Lowercase, StripAccents, Sequence

### Importing Dataset

In [None]:
# Open the "train" split in streaming mode so samples are iterated lazily
# instead of the whole dataset being downloaded first.
# NOTE(review): "YourAccountName/PreTraining" looks like a placeholder repo id —
# replace it with the real dataset repository before running.
dataset = load_dataset("YourAccountName/PreTraining", split="train", streaming=True)

### Training Tokenizer

In [None]:
# Byte-level BPE tokenizer: text is first mapped to byte-level symbols and the
# BPE merges are learned on top of those symbols.
tokenizer = Tokenizer(BPE())

# Byte-level pre-tokenization; no artificial leading space is added to inputs.
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
# NOTE: NFD followed by StripAccents removes diacritics before tokenization,
# so accented characters cannot be recovered when decoding (lossy by design).
tokenizer.normalizer = Sequence([NFD(), StripAccents()])

# Control tokens reserved in the vocabulary; they are passed to the trainer in
# this exact order.
special_tokens = [
    "<|endoftext|>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|im_start|>",
    "<|im_sep|>",
    "<|im_end|>",
    "<think>",
    "</think>",
    "<pad>",
    "<unk>",
]

# Inverse of the byte-level pre-tokenizer: turns byte-level symbols back into text.
tokenizer.decoder = decoders.ByteLevel()

trainer = BpeTrainer(
    vocab_size=32000,
    min_frequency=2,
    special_tokens=special_tokens,
    # Seed the vocabulary with every byte-level symbol. Without this, bytes
    # that never occur in the training data are missing from the vocabulary
    # and, because the BPE model has no unk token, text containing them
    # (e.g. rare emoji or scripts) is silently dropped at encode time.
    initial_alphabet=ByteLevel.alphabet(),
)

def text_iterator(dataset, text_key="text"):
    """Yield the non-empty text field of each sample in *dataset*.

    Samples that lack ``text_key`` or whose value is falsy (empty string,
    ``None``, ...) are skipped.
    """
    for record in dataset:
        if text_key not in record:
            continue
        text = record[text_key]
        if text:
            yield text

# Lazy stream of non-empty texts drawn from the dataset.
iterator = text_iterator(dataset)

print("Starting Tokenizer Training")
# Wrap the stream in a progress bar that refreshes at most every 5 seconds.
progress = tqdm(iterator, desc="Training Tokenizer", mininterval=5.0)
tokenizer.train_from_iterator(progress, trainer=trainer)
print("Tokenizer Training Completed!")

### Save and Load Tokenizer

In [None]:
# Persist the tokenizer to a JSON file; the relative path resolves against
# the current working directory.
tokenizer.save("LumenTokenizer.json")

In [None]:
# Reload the tokenizer from the saved JSON file, rebinding `tokenizer` to the
# loaded instance so the following cells use what was actually written to disk.
tokenizer = Tokenizer.from_file("LumenTokenizer.json")

### Testing Tokenizer

In [None]:
# Chat-formatted sample that exercises special tokens, an emoji, curly quotes,
# tabs and blank lines.
test_text = "".join(
    [
        "<|user|>Hello there! 👋\n",
        "Can you summarize the following passage?\n\n",
        "“Artificial intelligence refers to the ability of machines to perform tasks that normally require human intelligence, such as learning and problem-solving.”\n\n",
        "Here are some points:\n",
        "\t1. It involves learning algorithms and neural networks.\n",
        "\t2. It can process language, images, and patterns.\n",
        "\t3. It aims to make systems more adaptive and intelligent.\n\n",
        "<|assistant|>Sure! In short, artificial intelligence is about enabling machines to learn and think like humans.\n",
        "It focuses on reasoning, understanding, and decision-making.\n",
        "<|endoftext|>",
    ]
)

# Show the raw input.
print("Original Text", test_text, sep="\n")

# Encode and display both the ids and their string tokens.
encoded = tokenizer.encode(test_text)
print("\nEncoded Text")
print("IDs:", encoded.ids)
print("Tokens:", encoded.tokens)

# Decode with special tokens kept so the result can be compared with the input.
decoded = tokenizer.decode(encoded.ids, skip_special_tokens=False)
print("\nDecoded Text", decoded, sep="\n")
