In [52]:
# Tokenization

# Tokenization is at the heart of much weirdness of LLMs. Do not brush it off.

# * Why can't LLM spell words? Tokenization.
# * Why can't LLM do super simple string processing tasks like reversing a string? Tokenization.
# * Why is LLM worse at non-English languages (e.g. Japanese)? Tokenization.
# * Why is LLM bad at simple arithmetic? Tokenization.
# * Why did GPT-2 have more than necessary trouble coding in Python? Tokenization.
# * Why did my LLM abruptly halt when it sees the string "<|endoftext|>"? Tokenization.
# * What is this weird warning I get about a "trailing whitespace"? Tokenization.
# * Why does the LLM break if I ask it about "SolidGoldMagikarp"? Tokenization.
# * Why should I prefer to use YAML over JSON with LLMs? Tokenization.
# * Why is LLM not actually end-to-end language modeling? Tokenization.
# * What is the real root of suffering? Tokenization.


In [53]:
[ord(x) for x in "ÏïàÎÖïÌïòÏÑ∏Ïöî üëã (hello in Korean!)"]

[50504,
 45397,
 54616,
 49464,
 50836,
 32,
 128075,
 32,
 40,
 104,
 101,
 108,
 108,
 111,
 32,
 105,
 110,
 32,
 75,
 111,
 114,
 101,
 97,
 110,
 33,
 41]

In [54]:
ord("Ïïà")

50504

In [55]:
list("ÏïàÎÖïÌïòÏÑ∏Ïöî üëã (hello in Korean!)".encode('utf-8'))
# UTF-8 turns the text into a stream of bytes, so each element can take only 2^8 = 256 possible values

[236,
 149,
 136,
 235,
 133,
 149,
 237,
 149,
 152,
 236,
 132,
 184,
 236,
 154,
 148,
 32,
 240,
 159,
 145,
 139,
 32,
 40,
 104,
 101,
 108,
 108,
 111,
 32,
 105,
 110,
 32,
 75,
 111,
 114,
 101,
 97,
 110,
 33,
 41]

# BPE 

In [56]:
# text from https://www.reedbeta.com/blog/programmers-intro-to-unicode/

text = (
    "Unicode! üòà UNICODE? üÜÑüÖΩüÖ∏üÖ≤üÖæüÖ≥üÖ¥! üòÅ "
    "The very name strikes fear and awe into the hearts of programmers worldwide. "
    "We all know we ought to ‚Äúsupport Unicode‚Äù in our software (whatever that means‚Äî"
    "like using wchar_t for all the strings, right?). But Unicode can be abstruse, "
    "and diving into the thousand-page Unicode Standard plus its dozens of supplementary "
    "annexes, reports, and notes can be more than a little intimidating. I don‚Äôt blame "
    "programmers for still finding the whole thing mysterious, even 30 years after "
    "Unicode‚Äôs inception."
)

# UTF-8 encode the text and keep the result as a plain list of ints (0-255);
# iterating a bytes object already yields integers.
tokens = list(text.encode("utf-8"))

# Show the text with its length in code points, then the byte-level tokens
# with their length in bytes.
print("___", text, sep="\n")
print("length:", len(text))
print("___", tokens, sep="\n")
print("length:", len(tokens))

___
Unicode! üòà UNICODE? üÜÑüÖΩüÖ∏üÖ≤üÖæüÖ≥üÖ¥! üòÅ The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to ‚Äúsupport Unicode‚Äù in our software (whatever that means‚Äîlike using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don‚Äôt blame programmers for still finding the whole thing mysterious, even 30 years after Unicode‚Äôs inception.
length: 529
___
[85, 110, 105, 99, 111, 100, 101, 33, 32, 240, 159, 152, 136, 32, 85, 78, 73, 67, 79, 68, 69, 63, 32, 240, 159, 134, 132, 240, 159, 133, 189, 240, 159, 133, 184, 240, 159, 133, 178, 240, 159, 133, 190, 240, 159, 133, 179, 240, 159, 133, 180, 33, 32, 240, 159, 152, 129, 32, 84, 104, 101, 32, 118, 101, 114, 121, 32, 110, 97, 109, 101, 32, 115, 116, 114, 105, 107, 101, 115, 32, 102, 101, 97, 114, 32, 97, 110, 100

In [57]:
def get_stats(ids):
    """Count how often each pair of adjacent tokens occurs.

    Args:
        ids: sequence of token ids. It is sliced (ids[1:]), so it must
            support slicing (e.g. a list).

    Returns:
        dict mapping (left, right) tuples to their number of occurrences,
        with keys in order of first appearance. Empty when ids has fewer
        than two elements.
    """
    pair_counts = {}
    # zipping against the one-shifted sequence walks consecutive elements
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts

stats = get_stats(tokens)
# print(stats)
print(sorted(((v, k) for k,v in stats.items()),reverse=True))

[(20, (101, 32)), (12, (105, 110)), (10, (115, 32)), (10, (97, 110)), (10, (32, 97)), (9, (240, 159)), (9, (32, 116)), (8, (116, 104)), (7, (97, 114)), (6, (159, 133)), (6, (116, 32)), (6, (114, 32)), (6, (111, 114)), (6, (110, 103)), (6, (110, 100)), (6, (109, 101)), (6, (104, 101)), (6, (101, 114)), (6, (100, 101)), (6, (32, 105)), (5, (226, 128)), (5, (117, 115)), (5, (115, 116)), (5, (111, 100)), (5, (110, 105)), (5, (110, 32)), (5, (105, 99)), (5, (99, 111)), (5, (85, 110)), (5, (44, 32)), (5, (32, 115)), (5, (32, 85)), (4, (116, 105)), (4, (116, 101)), (4, (115, 44)), (4, (114, 105)), (4, (111, 117)), (4, (110, 116)), (4, (104, 97)), (4, (103, 32)), (4, (101, 97)), (4, (100, 32)), (4, (97, 109)), (4, (32, 119)), (4, (32, 111)), (4, (32, 102)), (3, (118, 101)), (3, (116, 115)), (3, (116, 114)), (3, (116, 111)), (3, (114, 116)), (3, (114, 115)), (3, (114, 101)), (3, (111, 102)), (3, (111, 32)), (3, (108, 108)), (3, (108, 101)), (3, (108, 32)), (3, (101, 115)), (3, (101, 110)), (3, 

In [58]:
chr(101) , chr(32)

('e', ' ')

In [59]:
top_pair = max(stats, key=stats.get)
top_pair

(101, 32)

In [60]:
def merge(ids, pair, idx):
    """Replace every consecutive occurrence of `pair` in `ids` with `idx`.

    The scan goes left to right and matches do not overlap: after a match
    both elements are consumed, so [1, 1, 1] with pair (1, 1) gives [idx, 1].

    Args:
        ids: list of ints (token ids); not modified.
        pair: the two adjacent token ids to look for, as (first, second).
        idx: the token id that replaces each matched pair.

    Returns:
        A new list with the replacements applied.
    """
    out = []
    n = len(ids)
    pos = 0
    while pos < n:
        # a match needs a right-hand neighbour, so the last position never matches
        is_match = pos + 1 < n and ids[pos] == pair[0] and ids[pos + 1] == pair[1]
        out.append(idx if is_match else ids[pos])
        pos += 2 if is_match else 1
    return out

print(merge([5,6,6,7,9,1],(6,7),99))

[5, 6, 99, 9, 1]


In [61]:
tokens2 = merge(tokens, top_pair, 256)
print(tokens2)
print('length', len(tokens2))

[85, 110, 105, 99, 111, 100, 101, 33, 32, 240, 159, 152, 136, 32, 85, 78, 73, 67, 79, 68, 69, 63, 32, 240, 159, 134, 132, 240, 159, 133, 189, 240, 159, 133, 184, 240, 159, 133, 178, 240, 159, 133, 190, 240, 159, 133, 179, 240, 159, 133, 180, 33, 32, 240, 159, 152, 129, 32, 84, 104, 256, 118, 101, 114, 121, 32, 110, 97, 109, 256, 115, 116, 114, 105, 107, 101, 115, 32, 102, 101, 97, 114, 32, 97, 110, 100, 32, 97, 119, 256, 105, 110, 116, 111, 32, 116, 104, 256, 104, 101, 97, 114, 116, 115, 32, 111, 102, 32, 112, 114, 111, 103, 114, 97, 109, 109, 101, 114, 115, 32, 119, 111, 114, 108, 100, 119, 105, 100, 101, 46, 32, 87, 256, 97, 108, 108, 32, 107, 110, 111, 119, 32, 119, 256, 111, 117, 103, 104, 116, 32, 116, 111, 32, 226, 128, 156, 115, 117, 112, 112, 111, 114, 116, 32, 85, 110, 105, 99, 111, 100, 101, 226, 128, 157, 32, 105, 110, 32, 111, 117, 114, 32, 115, 111, 102, 116, 119, 97, 114, 256, 40, 119, 104, 97, 116, 101, 118, 101, 114, 32, 116, 104, 97, 116, 32, 109, 101, 97, 110, 115, 22

In [62]:
# making text longer and taking whole blog as text
# https://www.reedbeta.com/blog/programmers-intro-to-unicode/
text = "A Programmer‚Äôs Introduction to Unicode\nMarch 3, 2017 ¬∑ Coding ¬∑ 25 Comments\n\nÔºµÔΩéÔΩâÔΩÉÔΩèÔΩÑÔΩÖ! üÖ§üÖùüÖòüÖíüÖûüÖìüÖî‚ÄΩ üá∫‚Äåüá≥‚ÄåüáÆ‚Äåüá®‚Äåüá¥‚Äåüá©‚Äåüá™! üòÑ The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to ‚Äúsupport Unicode‚Äù in our software (whatever that means‚Äîlike using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don‚Äôt blame programmers for still finding the whole thing mysterious, even 30 years after Unicode‚Äôs inception.\n\nA few months ago, I got interested in Unicode and decided to spend some time learning more about it in detail. In this article, I‚Äôll give an introduction to it from a programmer‚Äôs point of view.\n\nI‚Äôm going to focus on the character set and what‚Äôs involved in working with strings and files of Unicode text. However, in this article I‚Äôm not going to talk about fonts, text layout/shaping/rendering, or localization in detail‚Äîthose are separate issues, beyond my scope (and knowledge) here.\n\nDiversity and Inherent Complexity\nThe Unicode Codespace\nCodespace Allocation\nScripts\nUsage Frequency\nEncodings\nUTF-8\nUTF-16\nCombining Marks\nCanonical Equivalence\nNormalization Forms\nGrapheme Clusters\nAnd More‚Ä¶\n\nDiversity and Inherent Complexity\nAs soon as you start to study Unicode, it becomes clear that it represents a large jump in complexity over character sets like ASCII that you may be more familiar with. It‚Äôs not just that Unicode contains a much larger number of characters, although that‚Äôs part of it. 
Unicode also has a great deal of internal structure, features, and special cases, making it much more than what one might expect a mere ‚Äúcharacter set‚Äù to be.\n\nWhen confronting all this complexity, especially as an engineer, it‚Äôs hard not to find oneself asking, ‚ÄúWhy do we need all this? Is this really necessary? Couldn‚Äôt it be simplified?‚Äù\n\nHowever, Unicode aims to faithfully represent the entire world‚Äôs writing systems. The Unicode Consortium‚Äôs stated goal is ‚Äúenabling people around the world to use computers in any language‚Äù. And as you might imagine, the diversity of written languages is immense! To date, Unicode supports 135 different scripts, covering some 1100 languages, and there‚Äôs still a long tail of over 100 unsupported scripts, both modern and historical, which people are still working to add.\n\nGiven this enormous diversity, it‚Äôs inevitable that representing it is a complicated project. Unicode embraces that diversity, and accepts the complexity inherent in its mission to include all human writing systems.\n\nThe Unicode Codespace\nLet‚Äôs start with some general orientation. The basic elements of Unicode‚Äîits ‚Äúcharacters‚Äù, although that term isn‚Äôt quite right‚Äîare called code points. Code points are identified by number, customarily written in hexadecimal with the prefix ‚ÄúU+‚Äù, such as U+0041 ‚ÄúA‚Äù or U+03B8 ‚ÄúŒ∏‚Äù.\n\nCanonical Equivalence\nIn Unicode, precomposed characters exist alongside the dynamic composition system. 
A consequence of this is that there are multiple ways to express ‚Äúthe same‚Äù string‚Äîdifferent sequences of code points that result in the same user-perceived characters.\n\nNormalization Forms\nTo address this problem, Unicode defines normalization forms such as NFC and NFD.\n\nGrapheme Clusters\nUnicode formalizes the notion of a grapheme cluster: a sequence of one or more code points that form a single user-perceived character.\n\nUnicode is a fascinating and complex system, but it enables software to work correctly for billions of people across languages and scripts worldwide."
# UTF-8 bytes of the text as a list of integers in range 0..255 for convenience
tokens = list(text.encode("utf-8"))

In [63]:
def get_stats(ids):
    """Count how often each pair of adjacent tokens occurs.

    Repeats the definition from the earlier cell (same behavior) so this
    training cell can be read on its own.

    Args:
        ids: sequence of token ids. It is sliced (ids[1:]), so it must
            support slicing (e.g. a list).

    Returns:
        dict mapping (left, right) tuples to their number of occurrences,
        with keys in order of first appearance. Empty when ids has fewer
        than two elements.
    """
    pair_counts = {}
    # zipping against the one-shifted sequence walks consecutive elements
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts


def merge(ids, pair, idx):
    """Replace every consecutive occurrence of `pair` in `ids` with `idx`.

    Repeats the definition from the earlier cell (same behavior). The scan
    goes left to right and matches do not overlap: after a match both
    elements are consumed.

    Args:
        ids: list of ints (token ids); not modified.
        pair: the two adjacent token ids to look for, as (first, second).
        idx: the token id that replaces each matched pair.

    Returns:
        A new list with the replacements applied.
    """
    out = []
    n = len(ids)
    pos = 0
    while pos < n:
        # a match needs a right-hand neighbour, so the last position never matches
        is_match = pos + 1 < n and ids[pos] == pair[0] and ids[pos + 1] == pair[1]
        out.append(idx if is_match else ids[pos])
        pos += 2 if is_match else 1
    return out

# -----
# Train a tiny BPE tokenizer: repeatedly replace the most frequent adjacent
# pair with a freshly minted token id until the target vocab size is reached.
vocab_size = 276  # desired final vocab size
num_merges = vocab_size - 256  # ids 0..255 are already taken by the raw byte values
ids = list(tokens)  # copy so that we don't destroy the original token list

# (int, int) --> int, filled in the order the merges are learned;
# later merges may refer to ids created by earlier ones
merges = {}
for i in range(num_merges):
    stats = get_stats(ids)
    if not stats:
        # fewer than two tokens left, so there is no pair to merge;
        # max() on an empty dict would raise ValueError
        break
    pair = max(stats, key=stats.get)  # most frequent pair; ties go to the first one seen
    idx = 256 + i
    print(f'merging {pair} into a new token {idx}')
    ids = merge(ids, pair, idx)
    merges[pair] = idx

merging (101, 32) into a new token 256
merging (115, 32) into a new token 257
merging (105, 110) into a new token 258
merging (101, 114) into a new token 259
merging (116, 32) into a new token 260
merging (226, 128) into a new token 261
merging (116, 104) into a new token 262
merging (99, 111) into a new token 263
merging (97, 114) into a new token 264
merging (100, 32) into a new token 265
merging (44, 32) into a new token 266
merging (111, 114) into a new token 267
merging (101, 110) into a new token 268
merging (97, 110) into a new token 269
merging (97, 108) into a new token 270
merging (111, 110) into a new token 271
merging (258, 103) into a new token 272
merging (263, 100) into a new token 273
merging (115, 116) into a new token 274
merging (105, 116) into a new token 275


In [64]:
print('tokens length:', len(tokens))
print('ids length:', len(ids))
print(f'compression ratio: {len(tokens)/len(ids):.2f}X')

tokens length: 3797
ids length: 2947
compression ratio: 1.29X


In [65]:
diagram = """
Raw text (Unicode code point sequence)
              |
              v
         +-----------+
         | Tokenizer |
         +-----------+
              |
              v
        Token sequence
              |
              v
         +-----------+
         |    LLM    |
         +-----------+

Note:
- The tokenizer is a separate, independent module from the LLM.
- It is trained (e.g., using BPE) on raw text.
- It converts raw Unicode text ‚Üî token sequences.
- The LLM only ever sees tokens, never raw text.
"""

# decoding 

In [66]:
# given a sequence of integers in the range [0, vocab_size), what is the text?

# vocab maps token id -> the raw bytes that token stands for.
# Ids 0..255 are the single byte values themselves.
vocab = dict((idx, bytes([idx])) for idx in range(256))
# Walk up the merge tree. `merges` is iterated in insertion order, i.e. the order
# the merges were learned, so both children of a merged token are already in
# vocab by the time the token itself is built.
for (po, p1), idx in merges.items():
    vocab[idx] = b"".join((vocab[po], vocab[p1]))

def decode(ids):
    """Turn a list of token ids back into a Python string.

    Each id is looked up in the module-level `vocab` (id -> bytes), the byte
    chunks are concatenated, and the result is decoded as UTF-8.

    Args:
        ids: iterable of token ids, each of which must be a key of `vocab`.

    Returns:
        The decoded str. Byte sequences that are not valid UTF-8 do not raise;
        they are decoded with errors='replace'.
    """
    # vocab values are bytes objects, so the join must start from b"" (not "")
    raw = b"".join([vocab[token_id] for token_id in ids])
    return raw.decode("utf-8", errors='replace')

In [67]:
print(decode([67]))

C


In [68]:
# Token 128 maps to the single byte 0x80 (0b10000000). In UTF-8 that is a
# continuation byte and is not valid on its own, so a strict decode would raise
# UnicodeDecodeError. decode() passes errors='replace', which substitutes the
# Unicode replacement character (U+FFFD) instead.
print(decode([128])) # not every token sequence is valid UTF-8 (see the UTF-8 wiki page)

ÔøΩ


# Encoding

In [69]:
merges

{(101, 32): 256,
 (115, 32): 257,
 (105, 110): 258,
 (101, 114): 259,
 (116, 32): 260,
 (226, 128): 261,
 (116, 104): 262,
 (99, 111): 263,
 (97, 114): 264,
 (100, 32): 265,
 (44, 32): 266,
 (111, 114): 267,
 (101, 110): 268,
 (97, 110): 269,
 (97, 108): 270,
 (111, 110): 271,
 (258, 103): 272,
 (263, 100): 273,
 (115, 116): 274,
 (105, 116): 275}

In [74]:
# given the string, what are the tokens?

def encode(text):
    """Tokenize a string with the merges learned during training.

    The text is UTF-8 encoded into a list of byte values, then the learned
    merges are replayed in training order: on every pass the adjacent pair
    that was merged earliest during training is merged next, until none of
    the pairs present is in `merges`.

    Args:
        text: the str to tokenize.

    Returns:
        list of ints (token ids). Empty for an empty string.
    """
    def merge_rank(p):
        # pairs that were never merged rank last
        return merges.get(p, float("inf"))

    ids = list(text.encode("utf-8"))
    # at least two tokens are needed to form a pair
    while len(ids) >= 2:
        candidates = get_stats(ids)
        best = min(candidates, key=merge_rank)
        if best not in merges:
            break  # nothing can be merged
        ids = merge(ids, best, merges[best])
    return ids


print(encode('Hello world!'))

[72, 101, 108, 108, 111, 32, 119, 267, 108, 100, 33]


In [73]:
print(encode(""))

[]


In [79]:
print(decode(encode('hello')))

hello


In [78]:
text2 = decode(encode(text))
print(text2 == text)

True


# forced splits using regex patterns (gpt-2)

In [None]:
import regex as re

# GPT-2 style regex for forced token splits
gpt2pat = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
# The text is first cut into chunks with this pattern; merges are then found
# independently inside each chunk, which forces some merges to never happen
# (nothing is merged across a chunk boundary).
#
# The alternatives are tried left to right:
#
# 's|'t|'re|'ve|'m|'ll|'d -> common contraction suffixes become their own chunk
#                            (lowercase only, e.g. 've in how've)
#
#  ?\p{L}+ -> an optional single space followed by one or more letters (of any language)
#
#  ?\p{N}+ -> an optional single space followed by one or more numeric characters,
#             so letters and numbers are always separated
#
#  ?[^\s\p{L}\p{N}]+ -> an optional single space followed by a run of characters that
#                       are neither whitespace, letters nor numbers (punctuation, symbols)
#
# \s+(?!\S) -> a run of whitespace that is not followed by a non-whitespace character;
#              in front of a word this leaves the last space for the word's own chunk,
#              so extra whitespace becomes its own element
#
# \s+ -> any remaining whitespace

print(re.findall(gpt2pat, "Hello world how've are.     you!!!?   "))

['Hello', ' world', ' how', "'ve", ' are', '.', '    ', ' you', '!!!?', '   ']


In [None]:
import tiktoken

# GPT-2 (does NOT merge runs of spaces)
enc = tiktoken.get_encoding("gpt2")
print(enc.encode("  hello world!!!"))

# GPT-4 (MERGES SPACES)
enc = tiktoken.get_encoding("cl100k_base")
print(enc.encode("  hello world!!!"))


# see regexes in https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

[220, 23748, 995, 10185]
[220, 24748, 1917, 12340]


In [None]:
# GPT-4 style regex for forced token splits
gpt4pat = re.compile(
    r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s"""
)

# '(?i:[sdmt]|ll|ve|re) -> contraction suffixes ('s, 'd, 'm, 't, 'll, 've, 're),
#                          matched case-insensitively because of the (?i: ...) group
# \p{N}{1,3} -> numbers are split into chunks of at most 3 numeric characters
# ?+ / ++ / *+ -> possessive quantifiers: match greedily and never backtrack


In [None]:
# https://github.com/openai/gpt-2/blob/master/src/encoder.py 
# encoder and decoder of gpt2 
# very similar to ours 

In [89]:
# to download these two files:
# !wget https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/vocab.bpe
# !wget https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/encoder.json

import os
import json

# load encoder (token string -> id mapping)
# plays the role of our vocab, but in the opposite direction
# Pin the encoding so the result does not depend on the platform's default locale,
# and to match how vocab.bpe is opened below.
with open("encoder.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

# load BPE merge rules
# equivalent to our merges
with open("vocab.bpe", "r", encoding="utf-8") as f:
    bpe_data = f.read()

# each line after the first is a merge rule: "token1 token2"
# [1:-1] skips the first line (presumably a header - TODO confirm) and the last
# element of the split (the empty string if the file ends with a newline)
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]

In [None]:
len(encoder) # 256 raw bytes token, 50,000 merges. 1 special token 

50257

In [None]:
# special token 
encoder['<|endoftext|>']
# these are added outside of bpe 

50256

In [100]:
list(encoder)[:10]

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

## adding new special tokens 

In [None]:
# import tiktoken

# Get the base encoding
# cl100k_base = tiktoken.get_encoding("cl100k_base")

# Create a new Encoding object by extending it
# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings

# enc = tiktoken.Encoding(
#     # If you're changing the set of special tokens, make sure to use a different name
#     # It should be clear from the name what behaviour to expect.
#     name="cl100k_im",
#     pat_str=cl100k_base._pat_str,
#     mergeable_ranks=cl100k_base._mergeable_ranks,
#     special_tokens={
#         **cl100k_base._special_tokens,
#         "<|im_start|>": 100264,
#         "<|im_end|>": 100265,
#     },
# )

In [None]:
# different tokenizers use different special tokens
# see openai_public.py in the tiktoken repo
# when adding a special token you also need to extend the embedding layer and the final projection layer

# sentencepiece

## sentencepiece

SentencePiece is commonly used because (unlike `tiktoken`) it can efficiently **both train and run inference** for BPE tokenizers. It is used in **LLaMA** and **Mistral** model families.

GitHub: https://github.com/google/sentencepiece

---

### The big difference

**SentencePiece runs BPE directly on Unicode code points**, not on UTF-8 bytes.

It provides two important options:

- **`character_coverage`**  
  Fraction of the characters occurring in the training data that get their own entry in the vocabulary.  
  Very rare characters (falling outside this coverage threshold) are treated specially.

- **`byte_fallback`**  
  If enabled, rare Unicode code points are:
  1. UTF-8 encoded
  2. Then tokenized as raw bytes  
  instead of being mapped to an `<unk>` token.

---

### TL;DR

- **tiktoken**
  - Converts text → UTF-8 bytes
  - Runs BPE on bytes

- **sentencepiece**
  - Runs BPE on Unicode code points
  - Optionally falls back to UTF-8 bytes for rare characters
  - Rarity is controlled by `character_coverage`

*(Personal note: the byte-level approach used by tiktoken is often considered cleaner and more predictable.)*


In [101]:
import sentencepiece as spm

# write a toy.txt file with some random text
with open("toy.txt", "w", encoding="utf-8") as f:
    f.write(
        "SentencePiece is an unsupervised text tokenizer and detokenizer "
        "mainly for Neural Network-based text generation systems.\n"
        "It supports subword units such as BPE and unigram language models."
    )


In [102]:
# train a sentencepiece model on it
# the settings here are (best effort) those used for training LLaMA 2

import os
import sentencepiece as spm

options = dict(
    # input spec
    input="toy.txt",
    input_format="text",
    # output spec
    model_prefix="tok400",  # output filename prefix
    # algorithm spec
    # BPE alg
    model_type="bpe",
    vocab_size=400,
    # normalization
    normalization_rule_name="identity",  # ew, turn off normalization
    remove_extra_whitespaces=False,
    input_sentence_size=200000000,  # max number of training sentences
    max_sentence_length=4192,  # max number of bytes per sentence
    seed_sentencepiece_size=1000000,
    shuffle_input_sentence=True,
    # rare word treatment
    character_coverage=0.99995,
    byte_fallback=True,
    # merge rules
    split_digits=True,
    split_by_unicode_script=True,
    split_by_whitespace=True,
    split_by_number=True,
    max_sentencepiece_length=16,
    add_dummy_prefix=True,
    allow_whitespace_only_pieces=True,
    # special tokens
    unk_id=0,  # the UNK token MUST exist
    bos_id=1,  # the others are optional, set to -1 to turn off
    eos_id=2,
    pad_id=-1,
    # systems
    num_threads=os.cpu_count(),  # use ~all system resources
)

# train the model
spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: toy.txt
  input_format: text
  model_prefix: tok400
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ‚Åá 
  enable_differential_privacy: 0
  dif

In [None]:
# Load the model written by the training cell (model_prefix "tok400") and list
# every piece together with its id.
sp = spm.SentencePieceProcessor()
sp.load('tok400.model')
# NOTE(review): this rebinds `vocab`, which the earlier BPE cell defined as the
# id -> bytes dict that decode() reads; decode() will not work as before once
# this cell has run. Consider a separate name here.
vocab = [[sp.id_to_piece(idx), idx] for idx in range(sp.get_piece_size())]
vocab
# starts with special tokens, then the byte tokens, then merges and then individual tokens

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5

In [104]:
ids = sp.encode("hello ÏïàÎÖïÌïòÏÑ∏Ïöî")
print(ids)

[367, 398, 368, 379, 379, 376, 367, 239, 152, 139, 238, 136, 152, 240, 152, 155, 239, 135, 187, 239, 157, 151]


In [None]:
print([sp.id_to_piece(idx) for idx in ids])
# The tokenizer never saw the Korean characters during training (toy.txt is
# English only), so on their own they would be UNK (unknown) tokens.
# But since byte_fallback is True, sentencepiece falls back to bytes:
# it encodes those characters as UTF-8 and emits one byte token (<0xNN>) per byte.

# If byte fallback were turned OFF:
# - there would be no byte tokens, leaving more room for merges within vocab_size: 400
# - the final output of encoding would be ['▁', 'h', 'e', 'l', 'l', 'o', '▁', '<unk>']
#   i.e. [367, 398, 368, 379, 379, 376, 367, 0] -> <unk> is 0
# - all of the Korean text would be encoded as <unk>

# sentencepiece also converts spaces to '▁' (U+2581).
# The output starts with '▁' (['▁', 'h', 'e', 'l', 'l' ...) because of the dummy
# prefix (add_dummy_prefix=True): a space is added in front of the text, so a word
# at the very start, e.g.
#   world
# is treated like the same word following a space, as in
#   hello world
# hence the leading '▁'.

['‚ñÅ', 'h', 'e', 'l', 'l', 'o', '‚ñÅ', '<0xEC>', '<0x95>', '<0x88>', '<0xEB>', '<0x85>', '<0x95>', '<0xED>', '<0x95>', '<0x98>', '<0xEC>', '<0x84>', '<0xB8>', '<0xEC>', '<0x9A>', '<0x94>']


In [None]:
# This proto fully defines LLaMA-2 tokenization.
# If any of these values differ, your tokenizer is not LLaMA-2 compatible.

# normalizer_spec {
#   name: "identity"
#   precompiled_charsmap: ""
#   add_dummy_prefix: true
#   remove_extra_whitespaces: false
#   normalization_rule_tsv: ""
# }

# trainer_spec {
#   input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
#   model_prefix: "spm_model_32k_200M_charcov099995_allowWS0_v2"
#   model_type: BPE
#   vocab_size: 32000
#   self_test_sample_size: 0
#   input_format: "text"

#   character_coverage: 0.99995
#   input_sentence_size: 200000000
#   seed_sentencepiece_size: 1000000
#   shrinking_factor: 0.75

#   num_threads: 80
#   num_sub_iterations: 2

#   max_sentence_length: 4192
#   shuffle_input_sentence: true

#   max_sentencepiece_length: 16
#   split_by_unicode_script: true
#   split_by_whitespace: true
#   split_by_number: true
#   treat_whitespace_as_suffix: false
#   split_digits: true
#   allow_whitespace_only_pieces: true

#   vocabulary_output_piece_score: true
#   hard_vocab_limit: true
#   use_all_vocab: false

#   byte_fallback: true
#   required_chars: ""

#   unk_id: 0
#   bos_id: 1
#   eos_id: 2
#   pad_id: -1

#   unk_surface: " \342\201\207 "
#   unk_piece: "<unk>"
#   bos_piece: "<s>"
#   eos_piece: "</s>"
#   pad_piece: "<pad>"

#   train_extremely_large_corpus: false

#   enable_differential_privacy: false
#   differential_privacy_noise_level: 0.0
#   differential_privacy_clipping_threshold: 0
# }


In [None]:
# ===============================================
# GPT-2 Tokenizer vs SentencePiece (ESSENTIAL DIFF)
# ===============================================

# -------------------------
# 1. Core unit of operation
# -------------------------
# GPT-2 tokenizer:
# - Operates on UTF-8 BYTES
# - Text → UTF-8 bytes → BPE merges on bytes
# - Every byte (0–255) is representable
#
# SentencePiece:
# - Operates on UNICODE CODE POINTS
# - Text → Unicode chars → BPE / Unigram on characters
# - Optionally falls back to bytes for rare characters

# -------------------------
# 2. Unknown token behavior
# -------------------------
# GPT-2 tokenizer:
# - NO <UNK> token ever
# - All text is representable via bytes
#
# SentencePiece:
# - Has <unk> token by default
# - Can avoid <unk> only if byte_fallback=True

# -------------------------
# 3. Normalization
# -------------------------
# GPT-2 tokenizer:
# - No normalization (raw bytes)
# - Exact byte preservation
#
# SentencePiece:
# - Normalization ON by default (NFKC etc.)
# - LLaMA explicitly disables it (identity normalization)

# -------------------------
# 4. Training vs inference
# -------------------------
# GPT-2 tokenizer:
# - Training handled externally (OpenAI tools)
# - tiktoken is INFERENCE-ONLY
#
# SentencePiece:
# - Single tool for TRAINING + INFERENCE
# - Widely used in open-source models

# -------------------------
# 5. Multilingual handling
# -------------------------
# GPT-2 tokenizer:
# - Language-agnostic via bytes
# - Works equally for all scripts
#
# SentencePiece:
# - Script-aware (can split by Unicode script)
# - Better semantic segmentation for languages

# -------------------------
# 6. Token stability
# -------------------------
# GPT-2 tokenizer:
# - Stable, deterministic, reversible
# - Same bytes → same tokens always
#
# SentencePiece:
# - Depends on normalization + training config
# - Small changes can alter token boundaries

# -------------------------
# 7. Vocabulary semantics
# -------------------------
# GPT-2 tokenizer:
# - Tokens often represent byte patterns
# - Less human-readable
#
# SentencePiece:
# - Tokens represent characters/subwords
# - More interpretable vocab

# -------------------------
# 8. Used by
# -------------------------
# GPT-2 tokenizer:
# - GPT-2 / GPT-3 / GPT-4 (byte-level BPE)
#
# SentencePiece:
# - LLaMA / LLaMA-2 / Mistral / T5

# -------------------------
# ONE-LINE SUMMARY
# -------------------------
# GPT-2 tokenizer = byte-level BPE (always safe, always reversible)
# SentencePiece   = character-level BPE with optional byte fallback

In [None]:
# ===============================================
# UTF-8 BYTES vs UNICODE CODE POINTS (CORE DIFF)
# ===============================================

# -------------------------
# 1. What a Unicode code point is
# -------------------------
# A Unicode CODE POINT is an abstract number that identifies a character.
#
# Examples:
#   'A'   -> U+0041
#   'é'   -> U+00E9
#   '你'  -> U+4F60
#   '😄'  -> U+1F604
#
# Code points are LANGUAGE-LEVEL concepts.
# They do NOT define how characters are stored in memory.

# -------------------------
# 2. What UTF-8 bytes are
# -------------------------
# UTF-8 BYTES are the concrete binary encoding of a Unicode code point.
#
# UTF-8 represents each code point using 1–4 bytes.
#
# Examples:
#   'A'   -> [0x41]
#   'é'   -> [0xC3, 0xA9]
#   '你'  -> [0xE4, 0xBD, 0xA0]
#   '😄'  -> [0xF0, 0x9F, 0x98, 0x84]
#
# Bytes are MACHINE-LEVEL storage units (0–255).

# -------------------------
# 3. Key difference (mental model)
# -------------------------
# Unicode code point:
# - "What character is this?"
#
# UTF-8 byte:
# - "How is this character stored?"

# -------------------------
# 4. One-to-many relationship
# -------------------------
# One Unicode code point -> MANY UTF-8 bytes
#
# This is why:
# - len("😄")        == 1   (code points)
# - len("😄".encode("utf-8")) == 4   (bytes)

# -------------------------
# 5. Why this matters for tokenizers
# -------------------------
# Tokenizers must choose:
#
# A) Work on CODE POINTS
#    - Cleaner linguistic units
#    - Risk of <UNK> unless handled carefully
#
# B) Work on UTF-8 BYTES
#    - Always reversible
#    - No unknown characters
#    - More low-level

# -------------------------
# 6. GPT vs LLaMA choice
# -------------------------
# GPT-style tokenizers:
# - Tokenize UTF-8 BYTES
#
# LLaMA / SentencePiece:
# - Tokenize Unicode CODE POINTS
# - Fall back to UTF-8 bytes for rare chars

# -------------------------
# ONE-LINE SUMMARY
# -------------------------
# Unicode code points = meaning (characters)
# UTF-8 bytes         = storage (binary encoding)

In [None]:
# ============================================================
# UNSTABLE TOKENS — WHAT THEY ARE AND WHY THEY BREAK LLMS
# ============================================================

# -------------------------
# What are unstable tokens?
# -------------------------
# An unstable token is a token whose boundaries or meaning are
# highly sensitive to:
#   - surrounding whitespace
#   - punctuation
#   - capitalization
#   - context (preceding/following characters)
#
# Small textual changes → completely different token sequences.
#
# This instability propagates into model reasoning failures.

# ============================================================
# WHY THIS CAUSES REAL PROBLEMS (ONE BY ONE)
# ============================================================

# ------------------------------------------------------------
# Why can't LLM spell words?
# ------------------------------------------------------------
# LLMs do NOT see characters.
# They see TOKENS (subwords / byte chunks).
#
# Example:
#   "banana" might be tokenized as:
#   ["ba", "na", "na"]
#
# The model never learns spelling character-by-character.
# Asking it to spell = asking it to reason *across token boundaries*.
#
# => Tokenization destroys character-level continuity.

# ------------------------------------------------------------
# Why can't LLM reverse a string?
# ------------------------------------------------------------
# Reversing a string requires:
#   - exact character order
#
# But tokens are variable-length chunks:
#   "hello" -> ["hel", "lo"]
#
# Reverse tokens != reverse characters.
#
# The model cannot reliably reconstruct the original characters.
#
# => Tokenization hides character-level structure from the model
#    (the text is still recoverable by decoding, but the model never sees it).

# ------------------------------------------------------------
# Why are LLMs worse at non-English languages (e.g., Japanese)?
# ------------------------------------------------------------
# Many non-English languages:
#   - have no spaces
#   - have large Unicode vocabularies
#
# Tokenizers:
#   - fragment text poorly
#   - produce longer token sequences
#
# Longer sequences → more uncertainty → worse performance.
#
# => Tokenization bias favors English.

# ------------------------------------------------------------
# Why is LLM bad at simple arithmetic?
# ------------------------------------------------------------
# Numbers are tokenized as text fragments:
#   "12345" -> ["12", "345"] or worse
#
# Arithmetic requires digit-level reasoning.
# Tokens destroy numeric structure.
#
# The model learns patterns, not math.
#
# => Tokenization breaks numerical compositionality.

# ------------------------------------------------------------
# Why did GPT-2 struggle with Python code?
# ------------------------------------------------------------
# Python syntax is whitespace-sensitive.
#
# GPT-2 tokenizer:
#   - commonly split indentation into one token per space (wasting context)
#   - had unstable whitespace tokens
#   - mixed spaces + tabs + newlines unpredictably
#
# Example:
#   "    " vs "\t" vs " \n"
#
# Same meaning, different tokens.
#
# => Tokenization noise destroys syntactic precision.

# ------------------------------------------------------------
# Why does the LLM halt on "<|endoftext|>"?
# ------------------------------------------------------------
# "<|endoftext|>" is a SPECIAL TOKEN.
#
# If it appears in user input:
#   - tokenizer maps it to a control token
#   - model interprets it as "STOP"
#
# => Accidental control-token injection.
#
# This is NOT language understanding — it's tokenizer control flow.

# ------------------------------------------------------------
# What is the "trailing whitespace" warning?
# ------------------------------------------------------------
# Many tokenizers treat:
#   "word"
#   "word "
#
# as DIFFERENT TOKENS.
#
# Trailing whitespace may:
#   - change token boundaries
#   - alter probabilities
#   - break structured outputs
#
# => Invisible characters ≠ invisible effects.

# ------------------------------------------------------------
# Why does "SolidGoldMagikarp" break models?
# ------------------------------------------------------------
# Some strings are SINGLE TOKENS in the tokenizer's vocabulary but appear
# RARELY (or never) in the model's training data.
#
# These tokens:
#   - get weird, spiky embeddings
#   - dominate model behavior
#
# The model "overreacts" to them.
#
# => Vocabulary artifacts leak into reasoning.

# ------------------------------------------------------------
# Why prefer YAML over JSON with LLMs?
# ------------------------------------------------------------
# JSON:
#   - strict punctuation
#   - commas, quotes, braces
#
# Tokenization errors = invalid JSON.
#
# YAML:
#   - more forgiving
#   - whitespace-tolerant
#
# => YAML is more robust to tokenization noise.

# ------------------------------------------------------------
# Why is LLM not end-to-end language modeling?
# ------------------------------------------------------------
# True end-to-end LM:
#   characters -> model -> characters
#
# Reality:
#   text -> tokenizer -> tokens -> model -> tokens -> detokenizer -> text
#
# Tokenizer is:
#   - hand-engineered
#   - frozen
#   - not learned jointly with the model (trained separately, beforehand)
#
# => The model never sees raw language.

# ------------------------------------------------------------
# What is the real root of suffering?
# ------------------------------------------------------------
# The tokenizer.
#
# It:
#   - discretizes continuous language
#   - introduces irreversible abstractions
#   - creates edge cases everywhere
#
# LLMs are powerful DESPITE tokenization,
# not because of it.

# ============================================================
# ONE-SENTENCE SUMMARY
# ============================================================
# Most LLM "stupidity" is not model failure — it is tokenizer failure.

Interesting read: https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation