In [1]:
import tiktoken

In [2]:
# Load tiktoken's "o200k_base" encoding; the cells below inspect this object.
o200K_base = tiktoken.get_encoding("o200k_base")

In [3]:
# Count the byte sequences returned by token_byte_values().
# NOTE(review): this presumably covers only ordinary tokens, not the special
# tokens — confirm against the tiktoken documentation.
total_tokens = len(o200K_base.token_byte_values())
print(f"Total number of tokens in o200k_base: {total_tokens}")

Total number of tokens in o200k_base: 199998


In [5]:
# Preview both ends of the sequence of byte strings returned by token_byte_values().
# NOTE(review): the recorded output is in lexicographic byte order, so the number
# printed next to each entry looks like a position in that sorted sequence rather
# than the entry's token ID — confirm before reading it as an ID.
tokens = o200K_base.token_byte_values()

print("First 50 tokens:")
for position, token_bytes in enumerate(tokens[:50]):
    print(f"{position}: {token_bytes}")

print("\nLast 50 tokens:")
tail_start = len(tokens) - 50
for position, token_bytes in enumerate(tokens[tail_start:], start=tail_start):
    print(f"{position}: {token_bytes}")

First 50 tokens:
0: b'\x00'
1: b'\x00\x00'
2: b'\x01'
3: b'\x01E'
4: b'\x02'
5: b'\x03'
6: b'\x04'
7: b'\x05'
8: b'\x06'
9: b'\x07'
10: b'\x08'
11: b'\t'
12: b'\t\t'
13: b'\t\t\t'
14: b'\t\t\t\t'
15: b'\t\t\t\t\t'
16: b'\t\t\t\t\t\t'
17: b'\t\t\t\t\t\t\t'
18: b'\t\t\t\t\t\t\t\t'
19: b'\t\t\t\t\t\t\t\t\t'
20: b'\t\t\t\t\t\t\t\t\t\t'
21: b'\t\t\t\t\t\t\t\t\t\t\t'
22: b'\t\t\t\t\t\t\t\t\t\t\t\t'
23: b'\t\t\t\t\t\t\t\t\t\t\t\t\t'
24: b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
25: b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
26: b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
27: b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
28: b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
29: b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
30: b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
31: b'\t\t\t\t\t\t\t\t\t\t\n'
32: b'\t\t\t\t\t\t\t\t\t\n'
33: b'\t\t\t\t\t\t\t\t\t '
34: b'\t\t\t\t\t\t\t\t\n'
35: b'\t\t\t\t\t\t\t\t '
36: b'\t\t\t\t\t\t\t\t  '
37: b'\t\t\t\t\t\t\t\n'
38: b'\t\t\t\t\t\t\t\r\n'
39: b'\t\t\t\t\t\t\t '
40: b'\t\t\t\t\t\t\t  '
41: b'\t\t\t\t\t\

In [8]:
# Show the special-token name -> ID mapping of the base encoding.
# `_special_tokens` is underscore-prefixed (private by convention), so this is
# for inspection only.
print(o200K_base._special_tokens)

{'<|endoftext|>': 199999, '<|endofprompt|>': 200018}


1. 🔍 Pattern String (pat_str)
The pattern string is a Regular Expression that handles the first stage of tokenization: breaking raw, continuous text into initial, unprocessed segments (tokens).

Role: Segmentation. It defines the basic boundaries—where the tokenizer is allowed to split the text.

Input: The raw text string ("Hello world! ").

Output: An initial list of segments (e.g., ["Hello", " world", "!", " "]; note that a leading space stays attached to the word that follows it).

Analogy: The Scissors. The pattern string is the rule set for cutting the raw stream of text into manageable chunks. It ensures that numbers stay with numbers, punctuation is separated from letters, and spaces are handled explicitly.

2. 📚 Mergeable Ranks (_mergeable_ranks)
The mergeable ranks are a large Vocabulary Lookup Table that handles the second stage of tokenization: combining the initial segments into larger, meaningful tokens and assigning them their final integer IDs.

Role: Vocabulary Lookup and Merging. It determines the most efficient way to combine the initial segments based on the BPE rules learned during training, and maps the resulting sequence of bytes to an integer ID.

Input: The initial segments/byte sequences (e.g., the bytes for " Hello").

Output: The final, compressed integer token ID (e.g., 12345).

Analogy: The Dictionary and Glue. The ranks list is the dictionary that says, "I have seen the sequence ' Hello' thousands of times; combine it into token ID 12345." It prioritizes merging common sequences to minimize the final number of tokens.

In [16]:
def get_tokenizer():
    """Build the ``o200k_harmony`` encoding on top of ``o200k_base``.

    The split pattern and the mergeable ranks are reused unchanged from the
    ``o200k_base`` encoding (read from its underscore-prefixed ``_pat_str`` and
    ``_mergeable_ranks`` attributes). Only the special-token table is extended:
    the base encoding's special tokens are kept, the named tokens listed below
    are added, and the remaining IDs in ``[200013, 201088)`` are filled with
    ``<|reserved_N|>`` placeholders.

    Every special-token ID is assigned to exactly one name. An ID that already
    belongs to a named token (including one inherited from ``o200k_base``) does
    not get an additional ``<|reserved_N|>`` alias.

    Returns:
        The object produced by ``tiktoken.Encoding(...)`` with
        ``name="o200k_harmony"``.
    """
    o200K_base = tiktoken.get_encoding("o200k_base")

    # Named special tokens. The inherited entries are unpacked first, so an
    # explicit entry below wins if the same name appears in both.
    special_tokens = {
        **o200K_base._special_tokens,
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
    }

    # Fill the rest of the range with placeholders, skipping IDs that are
    # already claimed. Without the skip, an inherited token whose ID falls in
    # this range would share its ID with a second, reserved name, and decoding
    # that ID would be ambiguous.
    claimed_ids = set(special_tokens.values())
    special_tokens.update(
        (f"<|reserved_{i}|>", i)
        for i in range(200013, 201088)
        if i not in claimed_ids
    )

    tokenizer = tiktoken.Encoding(
        name="o200k_harmony",
        # pat_str is a regular expression (a sequence of characters that defines
        # a search pattern) that the tokenizer uses to decide what constitutes a
        # valid chunk of text that can be converted into a token.
        pat_str=o200K_base._pat_str,
        mergeable_ranks=o200K_base._mergeable_ranks,
        special_tokens=special_tokens,
    )
    return tokenizer