In [22]:
from typing import Dict, List, Tuple
import regex
import tiktoken

In [23]:
def encode_without_merging(string_input: str):
    """Return the UTF-8 byte values of ``string_input`` as a list of ints.

    No merges are applied, so every returned id lies in ``range(256)``.
    """
    raw_bytes = string_input.encode("utf-8")
    return [byte_value for byte_value in raw_bytes]


def encode(
    string_input: str,
    merges: Dict[Tuple[int, int], int],
    verbose: bool = True,
) -> List[int]:
    """Tokenize ``string_input`` by replaying learned byte-pair merges.

    The text is first turned into its UTF-8 byte values. Then, repeatedly, the
    adjacent pair with the lowest merge id is replaced by that id until no
    adjacent pair is present in ``merges``.

    Args:
        string_input: Text to tokenize.
        merges: Mapping ``(left id, right id) -> merged token id``.
        verbose: When true (the default, matching the previous behaviour),
            print every pair and merge id as it is applied. Pass ``False`` to
            encode silently.

    Returns:
        The list of token ids after all applicable merges.
    """
    tokens = list(string_input.encode("utf-8"))
    # Fewer than two tokens means there is no adjacent pair left to merge.
    while len(tokens) >= 2:
        pair_cnts = get_pair_cnt(tokens)
        # Rank the pairs that occur in `tokens` by their merge id; pairs that
        # were never learned rank as infinity and therefore lose to any
        # learned pair.
        pair = min(pair_cnts, key=lambda p: merges.get(p, float("inf")))
        if pair not in merges:
            # The best-ranked pair is unknown, so every pair is unknown.
            break
        idx = merges[pair]
        if verbose:
            print("pair", pair)
            print("index", idx)
        tokens = mint_new_token(tokens, pair, idx)
    return tokens


def build_vocab(merge_dict: Dict[Tuple[int, int], int]) -> Dict[int, bytes]:
    """Build the ``token id -> bytes`` table for the given merges.

    Ids 0-255 map to their single byte. Every merged id maps to the
    concatenation of the byte strings of its two parts.

    Args:
        merge_dict: Mapping ``(left id, right id) -> merged token id``.

    Returns:
        A dict with an entry for each byte value and each merged id.

    Raises:
        KeyError: If a merge refers to an id that is neither a byte value nor
            a merged id smaller than its own.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}
    # Apply merges in ascending id order rather than dict iteration order.
    # This relies on a merge only referring to ids smaller than its own
    # (true when ids are minted sequentially during training), so both parts
    # are in `vocab` by the time they are needed.
    for (p0, p1), idx in sorted(merge_dict.items(), key=lambda item: item[1]):
        vocab[idx] = vocab[p0] + vocab[p1]
    return vocab


def decode(ids: List[int], vocab: Dict[int, bytes]) -> str:
    """Turn a sequence of token ids back into text.

    Args:
        ids: Token ids to decode.
        vocab: Mapping ``token id -> bytes``.

    Returns:
        The decoded string. Byte sequences that are not valid UTF-8 are
        replaced with U+FFFD rather than raising.

    Raises:
        KeyError: If an id is missing from ``vocab``.
    """
    tokens = b"".join(vocab[idx] for idx in ids)
    # errors="replace": a token boundary may cut a multi-byte character, so
    # an arbitrary id sequence is not guaranteed to be valid UTF-8.
    text = tokens.decode("utf-8", errors="replace")
    return text


def get_pair_cnt(ids: List[int]):
    """Count how often each adjacent pair of ids occurs in ``ids``.

    Returns a dict ``(left, right) -> count`` whose keys appear in the order
    the pairs are first seen. Fewer than two ids yields an empty dict.
    """
    tally = {}
    for pos in range(len(ids) - 1):
        neighbours = (ids[pos], ids[pos + 1])
        if neighbours in tally:
            tally[neighbours] += 1
        else:
            tally[neighbours] = 1
    return tally


def get_top_pair(ids: List[int]) -> Tuple[int, ...]:
    """Return the adjacent pair that occurs most often in ``ids``.

    When several pairs share the highest count, the one counted first wins.
    ``max`` raises ``ValueError`` if ``ids`` contains no pair at all.
    """
    frequencies = get_pair_cnt(ids)
    return max(frequencies, key=lambda candidate: frequencies.get(candidate))


def mint_new_token(ids: List[int], pair: Tuple[int, ...], idx: int):
    """Return a copy of ``ids`` with each occurrence of ``pair`` replaced by ``idx``.

    The scan runs left to right and matches do not overlap: once two ids are
    merged, the second one cannot start another match.
    """
    merged = []
    last_pos = len(ids) - 1
    consumed = False  # True when the current id was the right half of a match
    for pos, current in enumerate(ids):
        if consumed:
            consumed = False
            continue
        if pos < last_pos and current == pair[0] and ids[pos + 1] == pair[1]:
            merged.append(idx)
            consumed = True
        else:
            merged.append(current)
    return merged


def get_compression_ratio(tokens: List[int], ids: List[int]):
    """Print both sequence lengths and the ratio ``len(tokens) / len(ids)``.

    Args:
        tokens: The sequence before merging.
        ids: The sequence after merging.

    The ratio is rounded to two decimals. When ``ids`` is empty the ratio is
    reported as undefined instead of raising ``ZeroDivisionError``.
    """
    print("length of the tokens:", len(tokens))
    print("length of the ids:", len(ids))
    if len(ids) == 0:
        # Nothing to divide by (e.g. empty input text).
        print("compression ratio: undefined (no ids)")
        return
    print("compression ratio:", round(len(tokens) / len(ids), 2))

In [24]:
# Short sample mixing ASCII, an emoji and Japanese text; encoded and decoded
# further down.
random_str: str = "hi hello 😆, how are you doing ? お元気ですか "

In [25]:
paragraph_string = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."

In [26]:
# UTF-8 byte values of the paragraph with no merges applied.
paragraph_tokens = encode_without_merging(paragraph_string)

In [27]:
# UTF-8 byte values of the short sample with no merges applied.
random_string_tokens = encode_without_merging(random_str)

In [28]:
# Target vocabulary size: the 256 single-byte ids plus the merges to learn.
vocab_size = 300
# Each merge mints one id past the byte range, so this is the merge count.
n_merges = vocab_size - 256
# Work on a shallow copy so `paragraph_tokens` keeps the unmerged bytes.
ids = paragraph_tokens.copy()

In [29]:
# Learn `n_merges` byte-pair merges from the paragraph.
# Restart from the raw bytes so that re-running this cell reproduces the same
# merge table instead of merging the already-merged `ids` a second time.
ids = list(paragraph_tokens)
merges = {}  # (left id, right id) -> newly minted token id
for i in range(n_merges):
    if len(ids) < 2:
        # No adjacent pair is left, so there is nothing more to merge.
        break
    top_pair = get_top_pair(ids)
    idx = 256 + i  # ids 0-255 are the raw byte values
    print(f"merging {top_pair} into a new token {idx}")
    ids = mint_new_token(ids, top_pair, idx)
    merges[top_pair] = idx

merging (101, 32) into a new token 256
merging (240, 159) into a new token 257
merging (226, 128) into a new token 258
merging (105, 110) into a new token 259
merging (115, 32) into a new token 260
merging (97, 110) into a new token 261
merging (116, 104) into a new token 262
merging (257, 133) into a new token 263
merging (257, 135) into a new token 264
merging (97, 114) into a new token 265
merging (239, 189) into a new token 266
merging (258, 140) into a new token 267
merging (267, 264) into a new token 268
merging (101, 114) into a new token 269
merging (111, 114) into a new token 270
merging (116, 32) into a new token 271
merging (259, 103) into a new token 272
merging (115, 116) into a new token 273
merging (261, 100) into a new token 274
merging (32, 262) into a new token 275
merging (44, 32) into a new token 276
merging (97, 109) into a new token 277
merging (275, 256) into a new token 278
merging (111, 117) into a new token 279
merging (85, 110) into a new token 280
merging (2

In [30]:
# Display the learned merge table: (left id, right id) -> new token id.
merges

{(101, 32): 256,
 (240, 159): 257,
 (226, 128): 258,
 (105, 110): 259,
 (115, 32): 260,
 (97, 110): 261,
 (116, 104): 262,
 (257, 133): 263,
 (257, 135): 264,
 (97, 114): 265,
 (239, 189): 266,
 (258, 140): 267,
 (267, 264): 268,
 (101, 114): 269,
 (111, 114): 270,
 (116, 32): 271,
 (259, 103): 272,
 (115, 116): 273,
 (261, 100): 274,
 (32, 262): 275,
 (44, 32): 276,
 (97, 109): 277,
 (275, 256): 278,
 (111, 117): 279,
 (85, 110): 280,
 (280, 105): 281,
 (281, 99): 282,
 (282, 111): 283,
 (283, 100): 284,
 (115, 276): 285,
 (273, 114): 286,
 (101, 265): 287,
 (274, 32): 288,
 (259, 116): 289,
 (111, 102): 290,
 (46, 32): 291,
 (108, 108): 292,
 (272, 32): 293,
 (261, 32): 294,
 (101, 110): 295,
 (33, 32): 296,
 (118, 269): 297,
 (121, 32): 298,
 (277, 256): 299}

In [31]:
# Compare the raw byte count of the paragraph with its length after merging.
get_compression_ratio(paragraph_tokens, ids)

length of the tokens: 616
length of the ids: 372
compression ratio: 1.66


In [32]:
# Tokenize the short sample with the merges learned from the paragraph.
encoded_random_str = encode(random_str, merges)

pair (101, 32)
index 256
pair (240, 159)
index 257
pair (105, 110)
index 259
pair (97, 114)
index 265
pair (259, 103)
index 272
pair (44, 32)
index 276
pair (111, 117)
index 279
pair (108, 108)
index 292
pair (272, 32)
index 293


In [33]:
# Token id -> bytes table derived from the merges; decode() looks ids up in it.
vocab = build_vocab(merges)

In [34]:
# Display the full token id -> bytes table.
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [35]:
# Round trip: decoding the encoded ids should print the text of `random_str`.
print(decode(encoded_random_str, vocab))

hi hello 😆, how are you doing ? お元気ですか 


In [36]:
# Pattern that splits text into chunks (the name suggests it mirrors GPT-2's
# pre-tokenizer — TODO confirm against the reference implementation).
# Alternatives are tried left to right:
#   's|'t|'re|'ve|'m|'ll|'d   one of the listed apostrophe suffixes
#    ?\p{L}+                  a run of letters, optionally preceded by one space
#    ?\p{N}+                  a run of numeric characters, same optional space
#    ?[^\s\p{L}\p{N}]+        a run of anything else that is not whitespace,
#                             same optional space
#   \s+(?!\S)                 whitespace not followed by a non-whitespace char
#   \s+                       any remaining whitespace
# The \p{...} Unicode property classes are supported by the third-party
# `regex` module, not by the standard-library `re`.
gpt2pat = regex.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

In [37]:
# Show how the pattern chunks a sample with contractions, digits and Japanese.
print(gpt2pat.findall("Hellow how've are you?123 お元 気 ですか"))

['Hellow', ' how', "'ve", ' are', ' you', '?', '123', ' お元', ' 気', ' ですか']


In [38]:
# Load the "cl100k_base" encoding from tiktoken.
enc = tiktoken.get_encoding("cl100k_base")

In [39]:
# Token ids that the cl100k_base encoding assigns to a short sample string.
print(enc.encode("hellow how are u ??"))

[71, 5412, 1268, 527, 577, 9602]
