Reference implementation: https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py

In [114]:
import tiktoken
from tiktoken.load import load_tiktoken_bpe
from pathlib import Path
from typing import cast

In [115]:
# Relative path to the tokenizer file; it is read below by load_tiktoken_bpe.
model_path = "Meta-Llama-3-8B-Instruct/tokenizer.model"

# load model

In [116]:
# Read the BPE vocabulary file. The result maps token bytes to integer ranks
# (see the dict printed below).
mergeable_ranks = load_tiktoken_bpe(model_path)
mergeable_ranks

{b'!': 0,
 b'"': 1,
 b'#': 2,
 b'$': 3,
 b'%': 4,
 b'&': 5,
 b"'": 6,
 b'(': 7,
 b')': 8,
 b'*': 9,
 b'+': 10,
 b',': 11,
 b'-': 12,
 b'.': 13,
 b'/': 14,
 b'0': 15,
 b'1': 16,
 b'2': 17,
 b'3': 18,
 b'4': 19,
 b'5': 20,
 b'6': 21,
 b'7': 22,
 b'8': 23,
 b'9': 24,
 b':': 25,
 b';': 26,
 b'<': 27,
 b'=': 28,
 b'>': 29,
 b'?': 30,
 b'@': 31,
 b'A': 32,
 b'B': 33,
 b'C': 34,
 b'D': 35,
 b'E': 36,
 b'F': 37,
 b'G': 38,
 b'H': 39,
 b'I': 40,
 b'J': 41,
 b'K': 42,
 b'L': 43,
 b'M': 44,
 b'N': 45,
 b'O': 46,
 b'P': 47,
 b'Q': 48,
 b'R': 49,
 b'S': 50,
 b'T': 51,
 b'U': 52,
 b'V': 53,
 b'W': 54,
 b'X': 55,
 b'Y': 56,
 b'Z': 57,
 b'[': 58,
 b'\\': 59,
 b']': 60,
 b'^': 61,
 b'_': 62,
 b'`': 63,
 b'a': 64,
 b'b': 65,
 b'c': 66,
 b'd': 67,
 b'e': 68,
 b'f': 69,
 b'g': 70,
 b'h': 71,
 b'i': 72,
 b'j': 73,
 b'k': 74,
 b'l': 75,
 b'm': 76,
 b'n': 77,
 b'o': 78,
 b'p': 79,
 b'q': 80,
 b'r': 81,
 b's': 82,
 b't': 83,
 b'u': 84,
 b'v': 85,
 b'w': 86,
 b'x': 87,
 b'y': 88,
 b'z': 89,
 b'{': 90,
 b'|': 9

In [117]:
# Number of entries in the base (non-special) BPE vocabulary.
num_base_tokens = len(mergeable_ranks)
num_base_tokens

128000

In [118]:
# Total number of special-token slots. Ten are written out explicitly below
# (five of them reserved placeholders 0-4); the rest are generated.
num_reserved_special_tokens = 256

# Order matters: a token's position in this list determines its id later on.
special_tokens = [
    "<|begin_of_text|>",
    "<|end_of_text|>",
    "<|reserved_special_token_0|>",
    "<|reserved_special_token_1|>",
    "<|reserved_special_token_2|>",
    "<|reserved_special_token_3|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|reserved_special_token_4|>",
    "<|eot_id|>",  # end of turn
]
# Placeholders 5..250 fill the remaining slots, bringing the list to 256 entries.
special_tokens.extend(
    f"<|reserved_special_token_{slot}|>"
    for slot in range(5, num_reserved_special_tokens - 5)
)
special_tokens

['<|begin_of_text|>',
 '<|end_of_text|>',
 '<|reserved_special_token_0|>',
 '<|reserved_special_token_1|>',
 '<|reserved_special_token_2|>',
 '<|reserved_special_token_3|>',
 '<|start_header_id|>',
 '<|end_header_id|>',
 '<|reserved_special_token_4|>',
 '<|eot_id|>',
 '<|reserved_special_token_5|>',
 '<|reserved_special_token_6|>',
 '<|reserved_special_token_7|>',
 '<|reserved_special_token_8|>',
 '<|reserved_special_token_9|>',
 '<|reserved_special_token_10|>',
 '<|reserved_special_token_11|>',
 '<|reserved_special_token_12|>',
 '<|reserved_special_token_13|>',
 '<|reserved_special_token_14|>',
 '<|reserved_special_token_15|>',
 '<|reserved_special_token_16|>',
 '<|reserved_special_token_17|>',
 '<|reserved_special_token_18|>',
 '<|reserved_special_token_19|>',
 '<|reserved_special_token_20|>',
 '<|reserved_special_token_21|>',
 '<|reserved_special_token_22|>',
 '<|reserved_special_token_23|>',
 '<|reserved_special_token_24|>',
 '<|reserved_special_token_25|>',
 '<|reserved_special_to

In [119]:
# Turn the ordered list of special-token strings into a {token: id} mapping.
# Ids continue directly after the base vocabulary, so the first special token
# receives id num_base_tokens.
special_tokens = {
    name: token_id
    for token_id, name in enumerate(special_tokens, start=num_base_tokens)
}
special_tokens

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|reserved_special_token_2|>': 128004,
 '<|reserved_special_token_3|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|reserved_special_token_4|>': 128008,
 '<|eot_id|>': 128009,
 '<|reserved_special_token_5|>': 128010,
 '<|reserved_special_token_6|>': 128011,
 '<|reserved_special_token_7|>': 128012,
 '<|reserved_special_token_8|>': 128013,
 '<|reserved_special_token_9|>': 128014,
 '<|reserved_special_token_10|>': 128015,
 '<|reserved_special_token_11|>': 128016,
 '<|reserved_special_token_12|>': 128017,
 '<|reserved_special_token_13|>': 128018,
 '<|reserved_special_token_14|>': 128019,
 '<|reserved_special_token_15|>': 128020,
 '<|reserved_special_token_16|>': 128021,
 '<|reserved_special_token_17|>': 128022,
 '<|reserved_special_token_18|>': 128023,
 '<|reserved_special_token_19|>': 128024,
 '<|reserved_special_token_

In [120]:
# Pre-tokenization regex used to split text into pieces before BPE merging.
# Alternatives, tried in order:
#   1. English contractions ('s, 't, 're, 've, 'm, 'll, 'd), case-insensitive
#   2. a run of letters, optionally preceded by one non-letter/non-digit/non-newline char
#   3. a run of 1 to 3 digits
#   4. an optional space, then a run of non-space/non-letter/non-digit chars, then any newlines
#   5. optional whitespace followed by one or more newlines
#   6. whitespace that is not followed by a non-whitespace character
#   7. any remaining whitespace
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

In [121]:
# Assemble the tiktoken encoder from the split pattern, the base BPE ranks and
# the special-token id mapping. The encoder is named after the file's basename
# ("tokenizer.model", as the repr below shows).
model = tiktoken.Encoding(
    name=Path(model_path).name,
    pat_str=pat_str,
    mergeable_ranks=mergeable_ranks,
    special_tokens=special_tokens,
)
model

<Encoding 'tokenizer.model'>

In [122]:
# Vocabulary size reported by the encoder (128256 in the output below, i.e. the
# 128000 base tokens plus the 256 special tokens).
n_words: int = model.n_vocab
n_words

128256

In [123]:
# BOS / EOS token IDs, looked up in the special-token mapping by their literal strings.
bos_id: int = special_tokens["<|begin_of_text|>"]
eos_id: int = special_tokens["<|end_of_text|>"]
bos_id, eos_id

(128000, 128001)

In [124]:
# NOTE(review): -1 is not a valid token id; presumably it marks "no padding
# token" -- confirm against the consumer of pad_id.
pad_id: int = -1

# Ids of the two tokens collected as stop tokens: end-of-text and end-of-turn.
stop_tokens = {
    special_tokens[name] for name in ("<|end_of_text|>", "<|eot_id|>")
}
stop_tokens

{128001, 128009}

In [125]:
# Sample texts used by every demo below: an English pangram and the Chinese
# numerals zero through ten.
inputs = [
    "The quick brown fox jumps over the lazy dog",
    "零一二三四五六七八九十",
]

# encode

## encode

In [126]:
# Encode each sample on its own; each call returns a flat list of token ids.
model.encode(inputs[0]), model.encode(inputs[1])

([791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679],
 [110260,
  15120,
  41920,
  46091,
  64803,
  76208,
  103070,
  103305,
  102397,
  103178,
  95598])

## encode_batch

In [127]:
# Encode all samples in one call; the result is a list with one id list per
# input, matching the per-sample encode() results above.
ids = model.encode_batch(inputs)
ids

[[791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679],
 [110260,
  15120,
  41920,
  46091,
  64803,
  76208,
  103070,
  103305,
  102397,
  103178,
  95598]]

In [128]:
# Wrap every encoded sequence with the BOS id in front and the EOS id at the end.
# Note: re-running this cell wraps the sequences again, so run it once per
# fresh encode_batch() result.
ids = [[bos_id, *seq, eos_id] for seq in ids]
ids

[[128000, 791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679, 128001],
 [128000,
  110260,
  15120,
  41920,
  46091,
  64803,
  76208,
  103070,
  103305,
  102397,
  103178,
  95598,
  128001]]

## encode_ordinary

In [129]:
# encode_ordinary encodes the text without any special-token handling; for these
# plain-text samples the ids are identical to the encode() results.
model.encode_ordinary(inputs[0]), model.encode_ordinary(inputs[1])

([791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679],
 [110260,
  15120,
  41920,
  46091,
  64803,
  76208,
  103070,
  103305,
  102397,
  103178,
  95598])

## encode_ordinary_batch

In [130]:
# Batched form of encode_ordinary: one id list per input text.
model.encode_ordinary_batch(inputs)

[[791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679],
 [110260,
  15120,
  41920,
  46091,
  64803,
  76208,
  103070,
  103305,
  102397,
  103178,
  95598]]

## encode_single_token()

In [131]:
# Look up the id of a single token given its exact text; works for special tokens.
model.encode_single_token("<|begin_of_text|>")

128000

In [132]:
# Same lookup for the end-of-text special token.
model.encode_single_token("<|end_of_text|>")

128001

## encode_with_unstable

In [133]:
# Returns a pair: the stable token prefix (here every token except the final
# one, 5679) and a list of candidate token sequences for the unstable tail.
model.encode_with_unstable(inputs[0])

([791, 4062, 14198, 39935, 35308, 927, 279, 16053],
 [[656, 67405],
  [294, 51549],
  [656, 97885],
  [294, 67245],
  [656, 67131],
  [656, 50530],
  [294, 12968],
  [656, 92969],
  [656, 54993],
  [656, 52853],
  [656, 53668],
  [656, 45554],
  [656, 72149],
  [656, 70781],
  [656, 60269],
  [656, 96072],
  [656, 92446],
  [294, 32277],
  [656, 89682],
  [294, 5328],
  [656, 48760],
  [656, 456],
  [656, 92524],
  [656, 26717],
  [656, 57984],
  [656, 59736],
  [294, 31756],
  [656, 80266],
  [656, 46171],
  [656, 69973],
  [656, 52539],
  [656, 94290],
  [656, 19209],
  [656, 26345],
  [294, 45245],
  [656, 75985],
  [656, 67873],
  [656, 61012],
  [656, 67370],
  [656, 93694],
  [656, 88904],
  [294, 3257],
  [656, 91261],
  [656, 72610],
  [656, 77545],
  [656, 51118],
  [656, 15985],
  [656, 75789],
  [656, 97572],
  [656, 3522],
  [656, 97472],
  [656, 77637],
  [656, 77695],
  [656, 19006],
  [656, 82832],
  [5679],
  [656, 72403],
  [656, 76757],
  [656, 14197],
  [656, 53991],

# decode

In [134]:
# typing.cast is a no-op at runtime: it only tells a static type checker to
# treat ids as list[list[int]]. The cast is safe because tiktoken does not do
# anything list-specific with the sequences it decodes.
ids = cast(list[list[int]], ids)
ids

[[128000, 791, 4062, 14198, 39935, 35308, 927, 279, 16053, 5679, 128001],
 [128000,
  110260,
  15120,
  41920,
  46091,
  64803,
  76208,
  103070,
  103305,
  102397,
  103178,
  95598,
  128001]]

In [135]:
# Number of sequences in the batch (one per input text).
len(ids)

2

## decode

In [137]:
# Special tokens are not stripped: they appear verbatim in the decoded text.
model.decode(ids[0]), model.decode(ids[1])

('<|begin_of_text|>The quick brown fox jumps over the lazy dog<|end_of_text|>',
 '<|begin_of_text|>零一二三四五六七八九十<|end_of_text|>')

## decode_batch

In [138]:
# Decode all sequences in one call; returns one string per sequence.
model.decode_batch(ids)

['<|begin_of_text|>The quick brown fox jumps over the lazy dog<|end_of_text|>',
 '<|begin_of_text|>零一二三四五六七八九十<|end_of_text|>']

## decode_bytes

In [139]:
# Decode to raw bytes instead of str; the Chinese sample shows up as UTF-8 byte escapes.
model.decode_bytes(ids[0]), model.decode_bytes(ids[1])

(b'<|begin_of_text|>The quick brown fox jumps over the lazy dog<|end_of_text|>',
 b'<|begin_of_text|>\xe9\x9b\xb6\xe4\xb8\x80\xe4\xba\x8c\xe4\xb8\x89\xe5\x9b\x9b\xe4\xba\x94\xe5\x85\xad\xe4\xb8\x83\xe5\x85\xab\xe4\xb9\x9d\xe5\x8d\x81<|end_of_text|>')

## decode_bytes_batch

In [140]:
# Batched form of decode_bytes: one bytes object per sequence.
model.decode_bytes_batch(ids)

[b'<|begin_of_text|>The quick brown fox jumps over the lazy dog<|end_of_text|>',
 b'<|begin_of_text|>\xe9\x9b\xb6\xe4\xb8\x80\xe4\xba\x8c\xe4\xb8\x89\xe5\x9b\x9b\xe4\xba\x94\xe5\x85\xad\xe4\xb8\x83\xe5\x85\xab\xe4\xb9\x9d\xe5\x8d\x81<|end_of_text|>']

## decode_tokens_bytes

In [141]:
# Decode token by token: returns a list holding the bytes of each individual
# token, which makes the token boundaries visible.
model.decode_tokens_bytes(ids[0]), model.decode_tokens_bytes(ids[1])

([b'<|begin_of_text|>',
  b'The',
  b' quick',
  b' brown',
  b' fox',
  b' jumps',
  b' over',
  b' the',
  b' lazy',
  b' dog',
  b'<|end_of_text|>'],
 [b'<|begin_of_text|>',
  b'\xe9\x9b\xb6',
  b'\xe4\xb8\x80',
  b'\xe4\xba\x8c',
  b'\xe4\xb8\x89',
  b'\xe5\x9b\x9b',
  b'\xe4\xba\x94',
  b'\xe5\x85\xad',
  b'\xe4\xb8\x83',
  b'\xe5\x85\xab',
  b'\xe4\xb9\x9d',
  b'\xe5\x8d\x81',
  b'<|end_of_text|>'])

## decode_single_token_bytes

In [142]:
# Map a single token id back to its bytes; shown here for the BOS and EOS ids.
model.decode_single_token_bytes(bos_id), model.decode_single_token_bytes(eos_id)

(b'<|begin_of_text|>', b'<|end_of_text|>')

## decode_with_offsets

In [143]:
# Returns (text, offsets), where offsets[i] is the start position of token i in
# the decoded text. The positions count characters, not bytes: each Chinese
# numeral advances the offset by 1 in the output below.
model.decode_with_offsets(ids[0]), model.decode_with_offsets(ids[1])

(('<|begin_of_text|>The quick brown fox jumps over the lazy dog<|end_of_text|>',
  [0, 17, 20, 26, 32, 36, 42, 47, 51, 56, 60]),
 ('<|begin_of_text|>零一二三四五六七八九十<|end_of_text|>',
  [0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]))