In [1]:
from typing import Dict, List, Tuple
import regex
import tiktoken
import sentencepiece as sp

In [46]:
bytes([0])

b'\x00'

In [26]:
def encode_without_merging(string_input: str):
    """Return the UTF-8 byte values of ``string_input`` as a list of ints.

    No merge rules are applied, so each element is a single byte value
    (0-255) and the list has one entry per encoded byte.
    """
    raw_bytes = string_input.encode("utf-8")
    return [byte_value for byte_value in raw_bytes]


def encode(string_input: str, merges: Dict[Tuple, int]):
    """Tokenize ``string_input`` by applying learned BPE merges to its UTF-8 bytes.

    Starting from the raw byte values, the adjacent pair with the lowest
    merge id in ``merges`` is replaced by that id, repeatedly, until no
    adjacent pair has a merge rule or fewer than two tokens remain.
    Each applied merge is printed (pair, then index).

    Args:
        string_input: Text to tokenize.
        merges: Maps a pair of token ids to the id of the merged token.

    Returns:
        The list of token ids after all applicable merges.
    """
    token_ids = list(string_input.encode("utf-8"))

    def merge_rank(candidate):
        # Pairs without a merge rule rank last, so they are never preferred
        # over a pair that can actually be merged.
        return merges.get(candidate, float("inf"))

    while len(token_ids) >= 2:
        adjacent_pairs = get_pair_cnt(token_ids)
        best_pair = min(adjacent_pairs, key=merge_rank)
        if best_pair in merges:
            merged_id = merges[best_pair]
            print("pair", best_pair)
            print("index", merged_id)
            token_ids = mint_new_token(token_ids, best_pair, merged_id)
        else:
            # Even the best-ranked pair has no rule: nothing more to merge.
            break
    return token_ids


def build_vocab(merge_dict: Dict) -> Dict[int, bytes]:
    """Build the id -> bytes table for a byte-level BPE merge table.

    Args:
        merge_dict: Maps a pair of token ids ``(p0, p1)`` to the id of the
            token created by merging them.

    Returns:
        A dict mapping each token id to the byte string it stands for:
        the 256 single-byte tokens plus one entry per merge.

    Raises:
        KeyError: If a merge refers to a token id that is neither a byte
            value nor the result of a lower-numbered merge.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}
    # Walk the merges in ascending id order rather than dict order. A merged
    # token can only be expanded once both of its parts are in the table, and
    # when ids are handed out sequentially during training the parts always
    # have smaller ids. This keeps the function correct for merge tables that
    # were reloaded or rebuilt in a different insertion order.
    for (p0, p1), idx in sorted(merge_dict.items(), key=lambda item: item[1]):
        merged_bytes = vocab[p0] + vocab[p1]
        print(merged_bytes)
        vocab[idx] = merged_bytes
    return vocab


def decode(ids: List[int], vocab: Dict[int, bytes]) -> str:
    """Turn a sequence of token ids back into text.

    Args:
        ids: Token ids to decode.
        vocab: Maps each token id to the byte string it stands for.

    Returns:
        The concatenated bytes decoded as UTF-8. Byte sequences that are not
        valid UTF-8 are replaced with U+FFFD instead of raising.

    Raises:
        KeyError: If an id is not present in ``vocab``.
    """
    tokens = b"".join(vocab[idx] for idx in ids)
    # Show the raw byte string before it is decoded.
    print(tokens)
    text = tokens.decode("utf-8", errors="replace")
    return text


def get_pair_cnt(ids: List[int]):
    """Count how often each adjacent pair of ids occurs in ``ids``.

    Returns a dict keyed by ``(left, right)`` tuples, inserted in order of
    first appearance. Sequences with fewer than two elements give an empty
    dict.
    """
    tally = {}
    for pos in range(len(ids) - 1):
        key = (ids[pos], ids[pos + 1])
        if key in tally:
            tally[key] += 1
        else:
            tally[key] = 1
    return tally


def get_top_pair(ids: List[int]) -> Tuple[int, ...]:
    """Return the adjacent pair that occurs most often in ``ids``.

    When several pairs share the highest count, the one that appears first
    in ``ids`` wins. Raises ``ValueError`` if ``ids`` has fewer than two
    elements, because there is then no pair to choose from.
    """
    frequencies = get_pair_cnt(ids)
    return max(frequencies, key=lambda candidate: frequencies[candidate])


def mint_new_token(ids: List[int], pair: Tuple[int, ...], idx: int):
    """Replace every occurrence of ``pair`` in ``ids`` with the single id ``idx``.

    The scan runs left to right and never overlaps matches: once a pair is
    replaced, scanning resumes after both of its elements. The input list is
    not modified; a new list is returned.
    """
    merged = []
    pos = 0
    last = len(ids) - 1
    while pos <= last:
        # Keep the current id as-is when it is the final element or when it
        # does not start an occurrence of the pair.
        if pos == last or ids[pos] != pair[0] or ids[pos + 1] != pair[1]:
            merged.append(ids[pos])
            pos += 1
        else:
            merged.append(idx)
            pos += 2
    return merged


def get_compression_ratio(tokens: List[int], ids: List[int]):
    """Print the lengths of ``tokens`` and ``ids`` and their ratio.

    The ratio is ``len(tokens) / len(ids)`` rounded to two decimals.
    Nothing is returned; the three lines are written to stdout.
    """
    token_count = len(tokens)
    print("length of the tokens:", token_count)
    id_count = len(ids)
    print("length of the ids:", id_count)
    ratio = round(token_count / id_count, 2)
    print("compression ratio:", ratio)

In [45]:
bytes([0])

b'\x00'

In [27]:
random_str: str = "hi hello 😆, how are you doing ? お元気ですか "

In [28]:
paragraph_string = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."

In [29]:
paragraph_tokens = encode_without_merging(paragraph_string)

In [30]:
random_string_tokens = encode_without_merging(random_str)

In [31]:
random_string_tokens

[104,
 105,
 32,
 104,
 101,
 108,
 108,
 111,
 32,
 240,
 159,
 152,
 134,
 44,
 32,
 104,
 111,
 119,
 32,
 97,
 114,
 101,
 32,
 121,
 111,
 117,
 32,
 100,
 111,
 105,
 110,
 103,
 32,
 63,
 32,
 227,
 129,
 138,
 229,
 133,
 131,
 230,
 176,
 151,
 227,
 129,
 167,
 227,
 129,
 153,
 227,
 129,
 139,
 32]

In [32]:
# Target vocabulary size: the 256 single-byte tokens plus the learned merges.
vocab_size = 300
# Number of merges to learn, i.e. every id above the 256 base byte values.
n_merges = vocab_size - 256
# Work on a copy so that merging does not alter paragraph_tokens itself.
ids = list(paragraph_tokens)

In [33]:
# Learn the BPE merge table from the paragraph.
# Restart from the byte-level tokens so that re-running this cell always
# yields the same merges instead of merging ids left over from a previous run.
ids = list(paragraph_tokens)
merges = {}
for i in range(n_merges):
    if len(ids) < 2:
        # No adjacent pair is left to merge; get_top_pair would fail here.
        break
    top_pair = get_top_pair(ids)
    # New token ids continue right after the 256 single-byte ids.
    idx = 256 + i
    print(f"merging {top_pair} into a new token {idx}")
    ids = mint_new_token(ids, top_pair, idx)
    merges[top_pair] = idx

merging (101, 32) into a new token 256
merging (240, 159) into a new token 257
merging (226, 128) into a new token 258
merging (105, 110) into a new token 259
merging (115, 32) into a new token 260
merging (97, 110) into a new token 261
merging (116, 104) into a new token 262
merging (257, 133) into a new token 263
merging (257, 135) into a new token 264
merging (97, 114) into a new token 265
merging (239, 189) into a new token 266
merging (258, 140) into a new token 267
merging (267, 264) into a new token 268
merging (101, 114) into a new token 269
merging (111, 114) into a new token 270
merging (116, 32) into a new token 271
merging (259, 103) into a new token 272
merging (115, 116) into a new token 273
merging (261, 100) into a new token 274
merging (32, 262) into a new token 275
merging (44, 32) into a new token 276
merging (97, 109) into a new token 277
merging (275, 256) into a new token 278
merging (111, 117) into a new token 279
merging (85, 110) into a new token 280
merging (2

In [34]:
merges

{(101, 32): 256,
 (240, 159): 257,
 (226, 128): 258,
 (105, 110): 259,
 (115, 32): 260,
 (97, 110): 261,
 (116, 104): 262,
 (257, 133): 263,
 (257, 135): 264,
 (97, 114): 265,
 (239, 189): 266,
 (258, 140): 267,
 (267, 264): 268,
 (101, 114): 269,
 (111, 114): 270,
 (116, 32): 271,
 (259, 103): 272,
 (115, 116): 273,
 (261, 100): 274,
 (32, 262): 275,
 (44, 32): 276,
 (97, 109): 277,
 (275, 256): 278,
 (111, 117): 279,
 (85, 110): 280,
 (280, 105): 281,
 (281, 99): 282,
 (282, 111): 283,
 (283, 100): 284,
 (115, 276): 285,
 (273, 114): 286,
 (101, 265): 287,
 (274, 32): 288,
 (259, 116): 289,
 (111, 102): 290,
 (46, 32): 291,
 (108, 108): 292,
 (272, 32): 293,
 (261, 32): 294,
 (101, 110): 295,
 (33, 32): 296,
 (118, 269): 297,
 (121, 32): 298,
 (277, 256): 299}

In [35]:
get_compression_ratio(paragraph_tokens, ids)

length of the tokens: 616
length of the ids: 372
compression ratio: 1.66


In [36]:
encoded_random_str = encode(random_str, merges)

pair (101, 32)
index 256
pair (240, 159)
index 257
pair (105, 110)
index 259
pair (97, 114)
index 265
pair (259, 103)
index 272
pair (44, 32)
index 276
pair (111, 117)
index 279
pair (108, 108)
index 292
pair (272, 32)
index 293


In [37]:
vocab = build_vocab(merges)

b'e '
b'\xf0\x9f'
b'\xe2\x80'
b'in'
b's '
b'an'
b'th'
b'\xf0\x9f\x85'
b'\xf0\x9f\x87'
b'ar'
b'\xef\xbd'
b'\xe2\x80\x8c'
b'\xe2\x80\x8c\xf0\x9f\x87'
b'er'
b'or'
b't '
b'ing'
b'st'
b'and'
b' th'
b', '
b'am'
b' the '
b'ou'
b'Un'
b'Uni'
b'Unic'
b'Unico'
b'Unicod'
b's, '
b'str'
b'ear'
b'and '
b'int'
b'of'
b'. '
b'll'
b'ing '
b'an '
b'en'
b'! '
b'ver'
b'y '
b'ame '


In [25]:
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [32]:
print(decode(encoded_random_str, vocab))

b'hi hello \xf0\x9f\x98\x86, how are you doing ? \xe3\x81\x8a\xe5\x85\x83\xe6\xb0\x97\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8b '
hi hello 😆, how are you doing ? お元気ですか 


In [40]:
b'\xe3\x81\x8a\xe5\x85\x83\xe6\xb0\x97\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8b'.decode("utf-8")

'お元気ですか'

In [15]:
# Pre-tokenization pattern that splits text into chunks.
# (Presumably the GPT-2 split regex, going by the name — confirm against upstream.)
# Alternatives, tried left to right at each position:
#   's 't 're 've 'm 'll 'd  -> apostrophe suffixes such as English contractions
#    ?\p{L}+                 -> a run of letters, optionally preceded by one space
#    ?\p{N}+                 -> a run of numeric characters, optionally preceded by one space
#    ?[^\s\p{L}\p{N}]+       -> a run of anything else that is not whitespace
#   \s+(?!\S)                -> whitespace that is not followed by a non-whitespace character
#   \s+                      -> any remaining whitespace
gpt2pat = regex.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

In [16]:
print(regex.findall(gpt2pat, "Hellow how've are you?123 お元 気 ですか"))

['Hellow', ' how', "'ve", ' are', ' you', '?', '123', ' お元', ' 気', ' ですか']


In [17]:
enc = tiktoken.get_encoding("cl100k_base")

In [18]:
print(enc.encode("hellow how are u ??"))

[71, 5412, 1268, 527, 577, 9602]


In [19]:
# write a small test.txt file with sample text to train the SentencePiece model on
with open("test.txt", "w", encoding="utf-8") as f:
  f.write("SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing.")

In [20]:
# Train a SentencePiece BPE model on test.txt.
# The settings here are (best effort) those used for training Llama 2.
import os

options = dict(
  # input spec
  input="test.txt",
  input_format="text",
  # output spec
  model_prefix="tok400", # output filename prefix (the model is later loaded from tok400.model)
  # algorithm spec
  # BPE algorithm
  model_type="bpe",
  vocab_size=400,
  # normalization
  normalization_rule_name="identity", # identity rule, i.e. turn off text normalization
  remove_extra_whitespaces=False,
  input_sentence_size=200000000, # max number of training sentences
  max_sentence_length=4192, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment
  character_coverage=0.99995,
  byte_fallback=True, # text without its own piece is encoded as <0x..> byte pieces
  # merge rules
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=16,
  add_dummy_prefix=True,
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=0, # the UNK token MUST exist
  bos_id=1, # the others are optional, set to -1 to turn off
  eos_id=2,
  pad_id=-1, # -1: no padding token
  # systems
  num_threads=os.cpu_count(), # use ~all system resources
)

# Runs the training; the output files are named after model_prefix.
sp.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: test.txt
  input_format: text
  model_prefix: tok400
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  diff

In [21]:
spm = sp.SentencePieceProcessor()

xE7>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xE8>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xE9>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xEA>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xEB>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xEC>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xED>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xEE>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xEF>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF0>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF1>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF2>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF3>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF4>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF5>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF6>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0xF7>
tra

In [22]:
spm.load('tok400.model')

True

In [23]:
# NOTE: this rebinds `vocab`. Earlier it held the id -> bytes dict returned by
# build_vocab; from here on it is a list of [piece, id] pairs read from the
# loaded SentencePiece model.
vocab = [[spm.id_to_piece(idx), idx] for idx in range(spm.get_piece_size())]

In [24]:
vocab

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5

In [26]:
sp_ids = spm.encode("hello こんにちは")

In [27]:
sp_ids

[362,
 378,
 361,
 372,
 358,
 362,
 230,
 132,
 150,
 230,
 133,
 150,
 230,
 132,
 174,
 230,
 132,
 164,
 230,
 132,
 178]

In [29]:
print([spm.id_to_piece(idx) for idx in sp_ids])

['▁', 'h', 'e', 'l', 'lo', '▁', '<0xE3>', '<0x81>', '<0x93>', '<0xE3>', '<0x82>', '<0x93>', '<0xE3>', '<0x81>', '<0xAB>', '<0xE3>', '<0x81>', '<0xA1>', '<0xE3>', '<0x81>', '<0xAF>']
