### Installing and Importing Dependencies

In [1]:
%pip install tokenizers

Note: you may need to restart the kernel to use updated packages.


In [1]:
from datasets import load_dataset
from tqdm import tqdm

from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFD, Lowercase, StripAccents, Sequence

### Importing Dataset

In [3]:
dataset = load_dataset("HariomJangra/Lumen-PreTraining", split="train", streaming=True)

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

### Training Tokenizer

In [None]:
tokenizer = Tokenizer(BPE())

tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
tokenizer.normalizer = Sequence([NFD(), StripAccents()])

special_tokens=["<|endoftext|>", "<|system|>", "<|user|>", "<|assistant|>", "<|im_start|>", "<|im_sep|>", "<|im_end|>", "<think>", "</think>", "<pad>", "<unk>"]

tokenizer.decoder = decoders.ByteLevel()

trainer = BpeTrainer(
    vocab_size=32000,
    min_frequency= 2,
    special_tokens = special_tokens
)

def text_iterator(dataset, text_key="text"):
    for sample in dataset:
        if text_key in sample and sample[text_key]:
            yield sample[text_key]

iterator = text_iterator(dataset) 

print("Starting Tokenizer Training")
tokenizer.train_from_iterator(tqdm(iterator, desc="Training Tokenizer", mininterval=5.0), trainer=trainer)
print("Tokenizer Training Completed!")

Starting Tokenizer Training


Training Tokenizer: 8976815it [38:20, 3902.62it/s]





Tokenizer Training Completed!


### Save and Load Tokenizer

In [None]:
tokenizer.save("Lumen-Tokenizer.json")

In [13]:
tokenizer = Tokenizer.from_file("LumenTokenizer.json")

### Testing Tokenizer

In [18]:
test_text = (
    "<|user|>Hello AI! 👋\n"
    "Can you summarize the following text?\n\n"
    "“Artificial Intelligence (AI) is the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions.”\n\n"
    "Here are some points:\n"
    "\t1. AI includes machine learning, neural networks, and deep learning.\n"
    "\t2. AI can process natural language, images, and more.\n"
    "\t3. Check https://openai.com/ for reference.\n\n"
    "<|assistant|>Sure! In short, AI is about making machines think and act like humans.\n"
    "It involves learning, reasoning, and understanding complex inputs.\n"
    "<|endoftext|>"
)

print("Original Text")
print(test_text)

encoded = tokenizer.encode(test_text)
print("\nEncoded Text")
print("IDs:", encoded.ids)
print("Tokens:", encoded.tokens)

decoded = tokenizer.decode(encoded.ids, skip_special_tokens=False)
print("\nDecoded Text (broken)")
print(decoded)

Original Text
<|user|>Hello AI! 👋
Can you summarize the following text?

“Artificial Intelligence (AI) is the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions.”

Here are some points:
	1. AI includes machine learning, neural networks, and deep learning.
	2. AI can process natural language, images, and more.
	3. Check https://openai.com/ for reference.

<|assistant|>Sure! In short, AI is about making machines think and act like humans.
It involves learning, reasoning, and understanding complex inputs.
<|endoftext|>

Encoded Text
IDs: [2, 15982, 19843, 11, 20537, 232, 226, 192, 8716, 621, 27696, 256, 1113, 3505, 41, 192, 192, 5057, 14349, 11781, 16004, 325, 30703, 19, 306, 256, 14506, 272, 3564, 10208, 274, 12491, 365, 406, 1176, 1418, 285, 3309, 1247, 9486, 283, 24229, 286, 579, 2568, 6158, 192, 192, 4588, 406, 792, 1497, 36, 192, 191, 27, 24, 19843, 2694, 6716, 5196, 22, 27897, 11584, 22, 283, 5742, 5196, 24, 192, 191, 28, 24