# Tokenization - Exercises

Notes on the exercises from the [tokenization video](https://www.youtube.com/watch?v=zduSFxRajkE).<br>
Adapted from [github.com/karpathy/minbpe/exercise.md](https://github.com/karpathy/minbpe/blob/master/exercise.md).

1. Watch the [tokenization video](https://www.youtube.com/watch?v=zduSFxRajkE) on YouTube
2. Come back and solve these exercises to level up :)

**Build your own GPT-4 Tokenizer!**

### Step 1

Write the `BasicTokenizer` class, with the following three core functions:

- `def train(self, text, vocab_size, verbose=False)`
- `def encode(self, text)`
- `def decode(self, ids)`

Train your tokenizer on whatever text you like and visualize the merged tokens.<br>
**Do they look reasonable?**<br><br>One default test you may wish to use is the text file `taylorswift.txt`.

---

In [53]:
import os
import tiktoken
import regex as re
import sentencepiece as spm
from tqdm import tqdm

In [54]:
class BasicTokenizer:
    """Byte-level BPE tokenizer that works on the text as-is (no pre-splitting).

    IDs for merged tokens start right after the highest byte value seen during
    training (not at 256). The tokenizer is therefore only reliable for text
    whose bytes all occurred in the training text: larger byte values are
    missing from ``vocab`` and can coincide with merged-token IDs.
    """

    def __init__(self):
        self.encoding = 'utf-8'

    def train(self, text, vocab_size, verbose=False):
        """Learn merges from ``text`` and build the decoding vocabulary.

        Args:
            text: Training string.
            vocab_size: Controls the number of merges, ``vocab_size // 2``.
            verbose: If True, print progress updates during training.

        Training stops early when no adjacent pair is left to merge, so short
        (or empty) texts no longer raise.
        """
        self.num_merges = vocab_size // 2  # Set number of merges
        if verbose:
            print(f"Starting training, aiming for {self.num_merges} merges")
        # Tokenize using UTF-8 for a starter token set
        tokens = list(text.encode(self.encoding))
        # Store highest token ID present to continue there with merged token IDs.
        # default=-1 covers empty text: no base tokens, merged IDs would start at 0.
        self.max_id = max(tokens, default=-1)
        if verbose:
            print(tokens, "\n")
            print("Length of Token List:", len(tokens))
            print("Max Token ID:", self.max_id, "\n")
        self.ids = list(tokens)  # independent copy, `tokens` stays untouched for the report below
        self.merges = {}  # (pair) -> new token id, keeping track of merges
        for i in range(self.num_merges):
            stats = self._get_stats(self.ids)
            if not stats:
                # Fewer than two tokens left: nothing can be merged anymore
                break
            pair = max(stats, key=stats.get)  # Retrieve the most common bigram
            idx = self.max_id + i + 1  # ID for the new token (+1 for initial loop offset)
            if verbose:
                print(f"Merging {pair}\tinto new token {idx}")
            self.ids = self._merge(self.ids, pair, idx)
            self.merges[pair] = idx
        if verbose:
            print("Old Length of Token List:", len(tokens))
            print("New Length of Token List:", len(self.ids))
            if self.ids:  # avoid dividing by zero for empty training text
                print(f"Compression Ratio: {len(tokens) / len(self.ids):.2f}x")
        # Decoder Preprocessing: Mapping from token-id to bytes-object for that token
        self.vocab = {idx: bytes([idx]) for idx in range(self.max_id + 1)}
        # Relies on dict insertion order (Python >= 3.7): children are always defined before their parent
        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]

    def encode(self, text):
        """Return the list of token IDs for ``text`` using the learned merges."""
        tokens = list(text.encode(self.encoding))  # raw bytes formatted as integer list
        # Repeatedly apply the earliest-learned merge that is still applicable
        while len(tokens) > 1:
            stats = self._get_stats(tokens)  # (int, int) -> occurrence count; only the keys are used here
            # Pair with the lowest merge ID; pairs that were never merged rank as infinity
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break  # Nothing to merge anymore
            tokens = self._merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Return the string for ``ids``; invalid UTF-8 is replaced, unknown IDs raise KeyError."""
        tokens = b"".join(self.vocab[idx] for idx in ids)
        return tokens.decode(self.encoding, errors="replace")

    def _get_stats(self, ids):
        """Count how often each adjacent pair occurs in ``ids``."""
        counts = {}
        for pair in zip(ids, ids[1:]):  # Sliding window of size 2 across tokens
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with every non-overlapping occurrence of ``pair`` replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            # If we are not at the very last position AND the pair matches, replace it
            if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
                newids.append(idx)
                i += 2  # We skip over the now replaced pair
            else:
                newids.append(ids[i])
                i += 1
        return newids

In [55]:
# Testing the implementation with the taylorswift.txt
# Read explicitly as UTF-8 so the result does not depend on the platform's default encoding
with open("./taylorswift.txt", "r", encoding="utf-8") as f:
    text = f.read()
# Number of distinct characters in the text; train() performs vocab_size // 2 merges
vocab_size = len(set(text))
# Initialize
tokenizer = BasicTokenizer()
# Train
tokenizer.train(text, vocab_size=vocab_size, verbose=True)
# Encode
encoded = tokenizer.encode("Yo, Taylor, I’m really happy for you, I’mma let you finish, but Beyoncé had one of the best videos of all time! One of the best videos of all time!")
# Decode
decoded = tokenizer.decode(encoded)

Starting training, aiming for 47 merges
[67, 111, 112, 121, 32, 112, 97, 115, 116, 101, 32, 111, 102, 32, 116, 104, 101, 32, 87, 105, 107, 105, 112, 101, 100, 105, 97, 32, 97, 114, 116, 105, 99, 108, 101, 32, 111, 110, 32, 84, 97, 121, 108, 111, 114, 32, 83, 119, 105, 102, 116, 44, 32, 97, 115, 32, 111, 102, 32, 70, 101, 98, 32, 49, 54, 44, 32, 50, 48, 50, 52, 46, 10, 45, 45, 45, 10, 10, 77, 97, 105, 110, 32, 109, 101, 110, 117, 10, 10, 87, 105, 107, 105, 112, 101, 100, 105, 97, 84, 104, 101, 32, 70, 114, 101, 101, 32, 69, 110, 99, 121, 99, 108, 111, 112, 101, 100, 105, 97, 10, 10, 83, 101, 97, 114, 99, 104, 10, 67, 114, 101, 97, 116, 101, 32, 97, 99, 99, 111, 117, 110, 116, 10, 76, 111, 103, 32, 105, 110, 10, 10, 80, 101, 114, 115, 111, 110, 97, 108, 32, 116, 111, 111, 108, 115, 10, 67, 111, 110, 116, 101, 110, 116, 115, 32, 32, 104, 105, 100, 101, 10, 40, 84, 111, 112, 41, 10, 76, 105, 102, 101, 32, 97, 110, 100, 32, 99, 97, 114, 101, 101, 114, 10, 84, 111, 103, 103, 108, 101, 32, 76

### Step 2

Convert your `BasicTokenizer` into a `RegexTokenizer`, which takes a regex pattern and splits the text exactly as GPT-4 would.<br>
Process the parts separately as before, then concatenate the results.<br>
Retrain your tokenizer and compare the results before and after.<br><br>
You should see that you will now have no tokens that go across categories (numbers, letters, punctuation, more than one whitespace).<br><br><br>Use the GPT-4 pattern:

```
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
```

---

With the `BasicTokenizer`, we took the text as-is and applied basic tokenization.<br>
To apply the `GPT4_SPLIT_PATTERN`, we will first split the text into a list of regex-compliant input text chunks.<br>
**The list entries then are individually processed and their token representations are finally concatenated together.**

In [56]:
# Compile the GPT-4 split pattern once; `re` is the `regex` package here (see imports)
GPT4_SPLIT_PATTERN = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")
# Show the chunks the pattern produces for a sample sentence
print(re.findall(GPT4_SPLIT_PATTERN, "Hello world how are you? I've head you're      4.543 billion years old??!"))

['Hello', ' world', ' how', ' are', ' you', '?', ' I', "'ve", ' head', ' you', "'re", '     ', ' ', '4', '.', '543', ' billion', ' years', ' old', '??!']


In [57]:
class RegexTokenizer:
    """Byte-level BPE tokenizer that pre-splits text with the GPT-4 regex pattern.

    Merges are learned and applied inside each regex chunk only, never across
    chunk boundaries. IDs for merged tokens start right after the highest byte
    value seen during training (not at 256).
    """

    def __init__(self):
        self.encoding = 'utf-8'
        self.GPT4_SPLIT_PATTERN = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")

    def _regex_split_tokenize(self, text):
        """Split ``text`` with the pattern and return one list of byte values per chunk."""
        chunks = re.findall(self.GPT4_SPLIT_PATTERN, text)
        # return [[], [], []] instead of [] to more clearly mark different text segments
        return [list(chunk.encode(self.encoding)) for chunk in chunks]

    def train(self, text, vocab_size, verbose=False):
        """Learn merges from ``text`` and build the decoding vocabulary.

        Args:
            text: Training string.
            vocab_size: Controls the number of merges, ``vocab_size // 2``.
            verbose: If True, print progress updates during training.

        Training stops early when no adjacent pair is left inside any chunk.
        """
        self.num_merges = vocab_size // 2
        if verbose:
            print(f"Starting training, aiming for {self.num_merges} merges")
        # Split with regex, then tokenize created chunks into bytes
        chunked_tokens = self._regex_split_tokenize(text)
        # flatten for max_id calculation only
        flat_tokens = [b for chunk in chunked_tokens for b in chunk]
        # GPT-4 Tokenizer would set this to 255, we don't go for that gap.
        # default=-1 covers text that yields no tokens at all.
        self.max_id = max(flat_tokens, default=-1)

        if verbose:
            print("Initial Tokens:", chunked_tokens)
            print("Length of Token List:", sum(len(c) for c in chunked_tokens))
            print("Max Token ID:", self.max_id, "\n")

        self.ids = [list(chunk) for chunk in chunked_tokens]  # per-chunk copies
        self.merges = {}

        for i in range(self.num_merges):
            # Sum pair counts over all chunks, but never across chunk boundaries.
            # Passing the same dict into every call accumulates the counts;
            # dict.update() would keep only the count of the last chunk seen.
            stats = {}
            for chunk in self.ids:
                self._get_stats(chunk, stats)

            if not stats:
                break

            pair = max(stats, key=stats.get)
            idx = self.max_id + i + 1  # ID for the new token (+1 for initial loop offset)
            if verbose:
                print(f"Merging {pair} into new token {idx}")

            # merge inside each chunk
            self.ids = [self._merge(chunk, pair, idx) for chunk in self.ids]
            self.merges[pair] = idx

        if verbose:
            print("Old Length of Token List:", len(flat_tokens))
            new_len = sum(len(c) for c in self.ids)
            print("New Length of Token List:", new_len)
            if new_len:  # avoid dividing by zero for empty training text
                print(f"Compression Ratio: {len(flat_tokens) / new_len:.2f}x")

        # Build vocab for decoding; relies on dict insertion order so children exist before parents
        self.vocab = {idx: bytes([idx]) for idx in range(self.max_id + 1)}
        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]

    def encode(self, text):
        """Return the token IDs for ``text``: merges applied per chunk, results concatenated."""
        encoded = []
        for chunk in self._regex_split_tokenize(text):
            while len(chunk) > 1:
                stats = self._get_stats(chunk)
                # Pair with the lowest merge ID; pairs that were never merged rank as infinity
                pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
                if pair not in self.merges:
                    break
                chunk = self._merge(chunk, pair, self.merges[pair])
            encoded.extend(chunk)
        return encoded

    def decode(self, ids):
        """Return the string for ``ids``; invalid UTF-8 is replaced, unknown IDs raise KeyError."""
        tokens = b"".join(self.vocab[idx] for idx in ids)
        return tokens.decode(self.encoding, errors="replace")

    def _get_stats(self, ids, counts=None):
        """Count adjacent pairs in ``ids``.

        If ``counts`` is given, the counts are added to it in place, which lets
        callers accumulate statistics over several chunks. Returns the dict.
        """
        counts = {} if counts is None else counts
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with every non-overlapping occurrence of ``pair`` replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

Let's now test this implementation with the `taylorswift.txt` file.

In [58]:
# Read explicitly as UTF-8 so the result does not depend on the platform's default encoding
with open("./taylorswift.txt", "r", encoding="utf-8") as f:
    text = f.read()
# Number of distinct characters in the text; train() performs vocab_size // 2 merges
vocab_size = len(set(text))

# Initialize
tokenizer = RegexTokenizer()
# Train
tokenizer.train(text, vocab_size=vocab_size, verbose=True)
# Encode
encoded = tokenizer.encode("Yo, Taylor, I’m really happy for you, I’mma let you finish, but Beyoncé had one of the best videos of all time! One of the best videos of all time!")
# Decode
decoded = tokenizer.decode(encoded)

Starting training, aiming for 47 merges


Initial Tokens: [[67, 111, 112, 121], [32, 112, 97, 115, 116, 101], [32, 111, 102], [32, 116, 104, 101], [32, 87, 105, 107, 105, 112, 101, 100, 105, 97], [32, 97, 114, 116, 105, 99, 108, 101], [32, 111, 110], [32, 84, 97, 121, 108, 111, 114], [32, 83, 119, 105, 102, 116], [44], [32, 97, 115], [32, 111, 102], [32, 70, 101, 98], [32], [49, 54], [44], [32], [50, 48, 50], [52], [46, 10], [45, 45, 45, 10, 10], [77, 97, 105, 110], [32, 109, 101, 110, 117], [10, 10], [87, 105, 107, 105, 112, 101, 100, 105, 97, 84, 104, 101], [32, 70, 114, 101, 101], [32, 69, 110, 99, 121, 99, 108, 111, 112, 101, 100, 105, 97], [10, 10], [83, 101, 97, 114, 99, 104], [10], [67, 114, 101, 97, 116, 101], [32, 97, 99, 99, 111, 117, 110, 116], [10], [76, 111, 103], [32, 105, 110], [10, 10], [80, 101, 114, 115, 111, 110, 97, 108], [32, 116, 111, 111, 108, 115], [10], [67, 111, 110, 116, 101, 110, 116, 115], [32], [32, 104, 105, 100, 101], [10], [40, 84, 111, 112], [41, 10], [76, 105, 102, 101], [32, 97, 110, 100], [

### Step 3

You're now ready to load the merges from the GPT-4 tokenizer and show that your tokenizer produces the identical results for both `encode` and `decode`, matching [tiktoken](https://github.com/openai/tiktoken).

```
# match this
import tiktoken
enc = tiktoken.get_encoding("cl100k_base") # this is the GPT-4 tokenizer
ids = enc.encode("hello world!!!? (안녕하세요!) lol123 😉")
text = enc.decode(ids) # get the same text back
```

Unfortunately, you will run into two issues:

1. It is not trivial to recover the raw merges from the GPT-4 tokenizer. You can easily recover what we call `vocab` here, and what they call and store under `enc._mergeable_ranks`. Feel free to copy paste the `recover_merges` function in [`minbpe/gpt4.py`](https://github.com/karpathy/minbpe/blob/master/minbpe/gpt4.py), which takes these ranks and returns the raw merges. If you wish to know how this function works, read [this](https://github.com/openai/tiktoken/issues/60) and [this](https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306). Basically, under some conditions it is enough to only store the parent nodes (and their rank) and get rid of the precise details of which children merged up to any parent.
2. Second, the GPT-4 tokenizer for some reason permutes its raw bytes. It stores this permutation in the first $256$ elements of the mergeable ranks, so you can recover this byte shuffle relatively simply as `byte_shuffle = {i: enc._mergeable_ranks[bytes([i])] for i in range(256)}`. In both your encode and decode, you'll have to shuffle bytes around accordingly.

---

**Ok, step by step here.**<br>
First, we will copy the function `recover_merges` from the [minbpe](https://github.com/karpathy/minbpe/blob/master/minbpe/gpt4.py) project.<br>
That function requires the `bpe` helper function to be defined as well.

In [59]:
# ------------------------------------------------------------------------------
# From: https://github.com/karpathy/minbpe/blob/master/minbpe/gpt4.py

def bpe(mergeable_ranks, token, max_rank):
    """Greedily merge the bytes of ``token`` using ``mergeable_ranks``.

    Starting from single bytes, the adjacent pair whose concatenation has the
    lowest rank is fused, over and over. Merging stops when no adjacent pair
    has a rank, or when the best rank is not below ``max_rank`` (``None``
    means no limit). Returns the remaining list of byte strings.
    """
    pieces = [bytes([byte]) for byte in token]
    while True:
        best_pos, best_rank = None, None
        for pos, (left, right) in enumerate(zip(pieces, pieces[1:])):
            candidate = mergeable_ranks.get(left + right)
            if candidate is None:
                continue
            # Strict comparison: on equal ranks the leftmost pair wins
            if best_rank is None or candidate < best_rank:
                best_pos, best_rank = pos, candidate
        if best_rank is None:
            break
        if max_rank is not None and best_rank >= max_rank:
            break
        assert best_pos is not None
        # Fuse the winning pair in place
        pieces[best_pos:best_pos + 2] = [pieces[best_pos] + pieces[best_pos + 1]]
    return pieces

def recover_merges(mergeable_ranks):
    """Rebuild the ``(rank, rank) -> rank`` merge table from ``mergeable_ranks``.

    ``mergeable_ranks`` maps already-merged byte sequences to their rank. For
    each multi-byte sequence, a small BPE run limited to lower ranks leaves
    exactly the two parts that were merged to form it.
    also see https://github.com/openai/tiktoken/issues/60
    also see https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306
    """
    recovered = {}
    for byte_seq, rank in mergeable_ranks.items():
        if len(byte_seq) != 1:  # single raw bytes have no parents
            halves = tuple(bpe(mergeable_ranks, byte_seq, max_rank=rank))
            assert len(halves) == 2
            left, right = halves
            # Key the merge by the integer ranks of the two children
            recovered[(mergeable_ranks[left], mergeable_ranks[right])] = rank
    return recovered

# ------------------------------------------------------------------------------

Now we can apply `recover_merges` to provide the original merge pairs:

In [60]:
# Recover merges from GPT-4 tokenizer (cl100k-base)
enc = tiktoken.get_encoding("cl100k_base")
# `_mergeable_ranks` is what this notebook calls `vocab`
vocab = enc._mergeable_ranks
# (rank, rank) -> rank table rebuilt by recover_merges
merges = recover_merges(vocab)

# Peek at the first five recovered merges
print(list(merges.items())[:5])

[((220, 220), 256), ((256, 256), 257), ((72, 77), 258), ((220, 83), 259), ((257, 257), 260)]


We can now go ahead and modify the `RegexTokenizer` class to incorporate our new merge pairs and that weird byte shuffle.<br>
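I didn't find a need to unshuffle the bytes during decoding: `vocab` is built by inverting `_mergeable_ranks`, so every token ID (including the 256 shuffled single-byte IDs) already maps to its real bytes. The byte shuffle is therefore only relevant for the encoding process, meaning that in my setup `decode()` actually doesn't need to change at all.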

In [61]:
class RegexGPT4Tokenizer:
    """Regex-splitting BPE tokenizer that uses the merges recovered from tiktoken's cl100k_base.

    Call ``load_gpt4_merges()`` before ``encode``/``decode``; it sets
    ``merges``, ``vocab`` and ``byte_shuffle``.
    """

    def __init__(self):
        self.encoding = 'utf-8'
        self.GPT4_SPLIT_PATTERN = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")

    def load_gpt4_merges(self):
        """Load ranks from tiktoken and derive merges, vocab and the byte shuffle."""
        ranks = tiktoken.get_encoding("cl100k_base")._mergeable_ranks
        # (id, id) -> id merge table
        self.merges = recover_merges(ranks)
        # Inverted ranks: token id -> bytes
        self.vocab = {token_id: token_bytes for token_bytes, token_id in ranks.items()}
        # Raw byte value -> id of that single byte in the ranks
        self.byte_shuffle = {byte: ranks[bytes([byte])] for byte in range(256)}
        # GPT-4 Tokenizer's first merge has ID 256, so we set to that too
        self.max_id = 255

    def _regex_split_tokenize(self, text):
        """Return one list of shuffled byte ids per non-empty regex chunk of ``text``."""
        shuffled_chunks = []
        for piece in self.GPT4_SPLIT_PATTERN.findall(text):
            if not piece:
                continue
            raw = piece.encode(self.encoding)
            shuffled_chunks.append([self.byte_shuffle[byte] for byte in raw])
        return shuffled_chunks

    def encode(self, text):
        """Return the token ids for ``text``: merges applied per chunk, results concatenated."""
        output = []
        for chunk in self._regex_split_tokenize(text):
            output.extend(self._apply_merges(chunk))
        return output

    def _apply_merges(self, chunk):
        """Apply known merges to one chunk, lowest merge id first, until none applies."""

        def merge_rank(candidate):
            # Pairs without a merge rank as infinity
            return self.merges.get(candidate, float('inf'))

        while len(chunk) > 1:
            best = min(self._get_stats(chunk), key=merge_rank, default=None)
            if best is None or best not in self.merges:
                break
            chunk = self._merge(chunk, best, self.merges[best])
        return chunk

    def decode(self, ids):
        """Return the string for ``ids``; invalid UTF-8 is replaced."""
        raw = b"".join([self.vocab[token_id] for token_id in ids])
        return raw.decode(self.encoding, errors="replace")

    def _get_stats(self, ids):
        """Count how often each adjacent pair occurs in ``ids``."""
        tally = {}
        for left, right in zip(ids, ids[1:]):
            key = (left, right)
            tally[key] = tally.get(key, 0) + 1
        return tally

    def _merge(self, ids, pair, idx):
        """Return a new list with every non-overlapping occurrence of ``pair`` replaced by ``idx``."""
        merged = []
        pos, last = 0, len(ids) - 1
        while pos <= last:
            if pos < last and (ids[pos], ids[pos + 1]) == pair:
                merged.append(idx)
                pos += 2
                continue
            merged.append(ids[pos])
            pos += 1
        return merged

In [62]:
# Build our tokenizer and load the merges recovered from cl100k_base
tok = RegexGPT4Tokenizer()
tok.load_gpt4_merges()

# Sample string from the exercise text
s = "hello world!!!? (안녕하세요!) lol123 😉"
ids = tok.encode(s)
print("ids:", ids)

decoded = tok.decode(ids)
print("decoded:", decoded, "\n")

# Reference result from tiktoken
enc = tiktoken.get_encoding("cl100k_base")
ids_ref = enc.encode(s)
print("ids_ref:", ids_ref)

decoded_ref = enc.decode(ids_ref)
print("decoded_ref:", decoded_ref)

# Our implementation must match tiktoken for both directions
assert ids == ids_ref
assert decoded == decoded_ref

ids: [15339, 1917, 12340, 30, 320, 31495, 230, 75265, 243, 92245, 16715, 28509, 4513, 57037]
decoded: hello world!!!? (안녕하세요!) lol123 😉 

ids_ref: [15339, 1917, 12340, 30, 320, 31495, 230, 75265, 243, 92245, 16715, 28509, 4513, 57037]
decoded_ref: hello world!!!? (안녕하세요!) lol123 😉


In [63]:
# Inspect the first ten vocab entries and the types of key and value
print(list(tok.vocab.items())[:10])
print(type(list(tok.vocab.items())[0][0]))
print(type(list(tok.vocab.items())[0][1]))

[(0, b'!'), (1, b'"'), (2, b'#'), (3, b'$'), (4, b'%'), (5, b'&'), (6, b"'"), (7, b'('), (8, b')'), (9, b'*')]
<class 'int'>
<class 'bytes'>


### Step 4

*(Optional, irritating, not obviously useful)*<br><br>Add the ability to handle special tokens.<br>
You'll then be able to match the output of tiktoken even when special tokens are present, e.g.:

```
import tiktoken
enc = tiktoken.get_encoding("cl100k_base") # this is the GPT-4 tokenizer
ids = enc.encode("<|endoftext|>hello world", allowed_special="all")
```

Without `allowed_special` tiktoken will error.

---

Ok, what kind of special tokens are we talking about here? More precisely, how many?<br>
We can find an answer in the [minbpe/gpt4.py](https://github.com/karpathy/minbpe/blob/master/minbpe/gpt4.py) file.

```
GPT4_SPECIAL_TOKENS = {
    '<|endoftext|>': 100257,
    '<|fim_prefix|>': 100258,
    '<|fim_middle|>': 100259,
    '<|fim_suffix|>': 100260,
    '<|endofprompt|>': 100276
}
```

Let's use this `GPT4_SPECIAL_TOKENS` dictionary and integrate this into the `RegexGPT4Tokenizer`, forming a new `GPT4Tokenizer` class.<br>
Of course the special tokens have to be announced to the tokenizer, I do that with the `self.GPT4_SPECIAL_TOKENS` attribute.<br><br>
Simply appending to the tokenizer's `vocab` doesn't work. This becomes evident when looking at how our regex pattern treats the special tokens:

In [64]:
# Same GPT-4 split pattern as before
GPT4_SPLIT_PATTERN = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")
# Show how the pattern chunks a special token string
print(re.findall(GPT4_SPLIT_PATTERN, '<|endoftext|>'))

['<|', 'endoftext', '|>']


The regex splitting the special tokens apart makes it impossible to correctly identify and encode them.<br>
We can solve this by putting special tokens into their own chunks inside our `_regex_split_tokenize()`:

In [65]:
class GPT4Tokenizer:
    """Regex-splitting BPE tokenizer with cl100k_base merges and special-token support.

    Special tokens listed in ``GPT4_SPECIAL_TOKENS`` are cut out of the text
    before regex chunking and emitted as single ids. Call
    ``load_gpt4_merges()`` before ``encode``/``decode``.
    """

    def __init__(self):
        self.encoding = 'utf-8'
        self.GPT4_SPLIT_PATTERN = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")
        self.GPT4_SPECIAL_TOKENS = {
            '<|endoftext|>': 100257,
            '<|fim_prefix|>': 100258,
            '<|fim_middle|>': 100259,
            '<|fim_suffix|>': 100260,
            '<|endofprompt|>': 100276
        }

    def load_gpt4_merges(self):
        """Load ranks from tiktoken and derive merges, vocab (incl. specials) and the byte shuffle."""
        ranks = tiktoken.get_encoding("cl100k_base")._mergeable_ranks
        # (id, id) -> id merge table
        self.merges = recover_merges(ranks)
        # Inverted ranks: token id -> bytes
        self.vocab = {token_id: token_bytes for token_bytes, token_id in ranks.items()}
        # Raw byte value -> id of that single byte in the ranks
        self.byte_shuffle = {byte: ranks[bytes([byte])] for byte in range(256)}
        # GPT-4 Tokenizer's first merge has ID 256
        # We set to that too even if our training set doesn't provide
        self.max_id = 255
        # Register the special tokens so decode() can render them: id -> encoded token text
        for special, special_id in self.GPT4_SPECIAL_TOKENS.items():
            self.vocab[special_id] = bytes(special, self.encoding)

    def _chunk_plain(self, span):
        """Regex-chunk special-token-free text into lists of shuffled byte ids."""
        return [
            [self.byte_shuffle[byte] for byte in found.group(0).encode(self.encoding)]
            for found in self.GPT4_SPLIT_PATTERN.finditer(span)
            if found.group(0)
        ]

    def _regex_split_tokenize(self, text):
        """Split ``text`` into chunks, keeping each special token as its own one-id chunk."""
        # Longest specials first, as the ordering used when scanning for the next hit
        specials = sorted(self.GPT4_SPECIAL_TOKENS.keys(), key=len, reverse=True)
        chunks = []
        cursor = 0
        while cursor < len(text):
            # Position of every special token that still occurs at or after the cursor
            hits = []
            for special in specials:
                at = text.find(special, cursor)
                if at != -1:
                    hits.append((at, special))
            if not hits:
                # No more specials: tokenize remainder with regex
                chunks.extend(self._chunk_plain(text[cursor:]))
                break
            # Earliest hit wins; at the same position the longer token wins
            at, special = min(hits, key=lambda hit: (hit[0], -len(hit[1])))
            if at > cursor:
                # Tokenize span before next special token
                chunks.extend(self._chunk_plain(text[cursor:at]))
            # Emit special token as single chunk
            chunks.append([self.GPT4_SPECIAL_TOKENS[special]])
            cursor = at + len(special)
        return chunks

    def encode(self, text):
        """Return the token ids for ``text``: merges applied per chunk, results concatenated."""
        output = []
        for chunk in self._regex_split_tokenize(text):
            output.extend(self._apply_merges(chunk))
        return output

    def _apply_merges(self, chunk):
        """Apply known merges to one chunk, lowest merge id first, until none applies."""

        def merge_rank(candidate):
            # Pairs without a merge rank as infinity
            return self.merges.get(candidate, float('inf'))

        while len(chunk) > 1:
            best = min(self._get_stats(chunk), key=merge_rank, default=None)
            if best is None or best not in self.merges:
                break
            chunk = self._merge(chunk, best, self.merges[best])
        return chunk

    def decode(self, ids):
        """Return the string for ``ids``; invalid UTF-8 is replaced."""
        raw = b"".join([self.vocab[token_id] for token_id in ids])
        return raw.decode(self.encoding, errors="replace")

    def _get_stats(self, ids):
        """Count how often each adjacent pair occurs in ``ids``."""
        tally = {}
        for left, right in zip(ids, ids[1:]):
            key = (left, right)
            tally[key] = tally.get(key, 0) + 1
        return tally

    def _merge(self, ids, pair, idx):
        """Return a new list with every non-overlapping occurrence of ``pair`` replaced by ``idx``."""
        merged = []
        pos, last = 0, len(ids) - 1
        while pos <= last:
            if pos < last and (ids[pos], ids[pos + 1]) == pair:
                merged.append(idx)
                pos += 2
                continue
            merged.append(ids[pos])
            pos += 1
        return merged

In [66]:
# Build the special-token-aware tokenizer and load the cl100k_base merges
tok = GPT4Tokenizer()
tok.load_gpt4_merges()

# Sample with a special token at the start and in the middle of the text
s = "<|endoftext|>hello world, I <|endoftext|> am so glad you're well."
ids = tok.encode(s)
print("ids:", ids)

decoded = tok.decode(ids)
print("decoded:", decoded, "\n")

# Reference result from tiktoken with all special tokens allowed
enc = tiktoken.get_encoding("cl100k_base")
ids_ref = enc.encode(s, allowed_special="all")
print("ids_ref:", ids_ref)

decoded_ref = enc.decode(ids_ref)
print("decoded_ref:", decoded_ref)

# Our implementation must match tiktoken for both directions
assert ids == ids_ref
assert decoded == decoded_ref

ids: [100257, 15339, 1917, 11, 358, 220, 100257, 1097, 779, 16089, 499, 2351, 1664, 13]
decoded: <|endoftext|>hello world, I <|endoftext|> am so glad you're well. 

ids_ref: [100257, 15339, 1917, 11, 358, 220, 100257, 1097, 779, 16089, 499, 2351, 1664, 13]
decoded_ref: <|endoftext|>hello world, I <|endoftext|> am so glad you're well.


The logic behind the revised `_regex_split_tokenize()` function is as follows:<br>
We scan the text linearly, finding the earliest special token in that text at each step.<br>
Yes, essentially we stack two passes here: a plain substring search (`str.find`) for special token identification, and the regex for chunking everything in between. That is awkward.<br>
Anyway, when found, we first tokenize any special-token-preceding text with the standard regex pattern, then append the special token as a standalone chunk.<br>
Special tokens this way can't get torn apart by regex and their position in the list of tokenized chunks remains preserved.

*This is awkward* because it introduces complexity into the tokenization process and can lead to inefficiencies.<br>
However, it is necessary to ensure that special tokens are handled correctly and consistently, and they can boost performance significantly down the line, e.g. for tool use.

### Step 5

**If you've made it this far, you're now a pro at LLM Tokenization!**<br><br>
Sadly, you're not *exactly* done yet because a lot of LLMs outside of OpenAI (e.g. Llama, Mistral) use [sentencepiece](https://github.com/google/sentencepiece) instead.<br>
The primary difference is that sentencepiece runs BPE *directly* on Unicode code points instead of on UTF-8 encoded bytes.<br><br>
Feel free to explore sentencepiece on your own (good luck, it's not too pretty),<br>
and stretch goal if you really experience and suffer from the burden of time,<br>
re-write your BPE to be on Unicode code points and match the Llama 2 tokenizer.

---

In [70]:
# Training configuration for a small sentencepiece BPE model on taylorswift.txt
options = dict(
  # Input
  input="taylorswift.txt",              # Use the taylorswift.txt as before
  input_format="text",                  # The file we provided contains text
  # Output
  model_prefix="taylor400",             # Output filename prefix
  # Algorithm spec
  model_type="bpe",                     # Use BPE algorithm for tokenization
  vocab_size=400,                       # Run BPE until a total of 400 tokens are created
  # Normalization
  normalization_rule_name="identity",   # Turn off normalization (would concern e.g. case folding, space handling)
  remove_extra_whitespaces=False,       # Don't touch existing whitespaces
  input_sentence_size=200000000,        # Max number of training sentences
  max_sentence_length=4192,             # Max number of bytes per sentence
  seed_sentencepiece_size=1000000,      # Initial seed vocabulary size, determines how many candidate subword pieces the algorithm starts with
  shuffle_input_sentence=True,          # Shuffle the input sentences
  # rare word treatment
  character_coverage=0.99995,           # Percentage of all unique characters from training corpus to be included in tokenizer's vocabulary
  byte_fallback=True,                   # Unknown characters get tokenized as bytes
  # merge rules (a different way to approach what tiktoken did through regex)
  split_digits=True,                    # Split digits into separate tokens if needed
  split_by_unicode_script=True,         # Splits text at boundaries where the Unicode script changes (e.g. Latin, then Cyrillic characters)
  split_by_whitespace=True,             # Treats whitespace as a token boundary in hopes of words getting separated more cleanly
  split_by_number=True,                 # Splits digits from alphabetic characters ("abc123" becomes "abc", "123").
  max_sentencepiece_length=16,          # Maximum substring length to be represented by a single token
  add_dummy_prefix=False,               # Adds a special prefix to the start of input
  allow_whitespace_only_pieces=True,    # Allows tokens consisting only of whitespace (useful for formatting-sensitive text).
  # special hard-coded tokens
  unk_id=0,  # UNK token ID, MUST exist as fallback for out-of-vocabulary tokens
  bos_id=1,  # beginning-of-sequence token ID, optional, set to -1 to turn off
  eos_id=2,  # end-of-sequence token ID, again optional, -1 to turn off
  pad_id=-1, # padding token ID, used in batching, -1 means padding disabled
  # systems
  # Use half of the system's cores; os.cpu_count() may return None, and one core must not round down to 0 threads
  num_threads=max(1, (os.cpu_count() or 1) // 2),
)

# Train the model; output files are written using the "taylor400" prefix
spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: taylorswift.txt
  input_format: text
  model_prefix: taylor400
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 2
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy

In [None]:
# Load the model trained above ("taylor400" prefix) and encode a mixed English/Korean sample
sp = spm.SentencePieceProcessor()
sp.load("taylor400.model")
ids = sp.encode("hello 안녕하세요")

# Show the ids and the piece behind each id
print(ids)
print([sp.id_to_piece(idx) for idx in ids]) 

[265, 300, 319, 315, 239, 152, 139, 238, 136, 152, 240, 152, 155, 239, 135, 187, 239, 157, 151]
['he', 'll', 'o', '▁', '<0xEC>', '<0x95>', '<0x88>', '<0xEB>', '<0x85>', '<0x95>', '<0xED>', '<0x95>', '<0x98>', '<0xEC>', '<0x84>', '<0xB8>', '<0xEC>', '<0x9A>', '<0x94>']


Finally, let's look at the [Llama2 tokenizer (1)](https://huggingface.co/docs/transformers/v4.40.1/en/model_doc/llama2) and the [Llama-2 paper (2)](https://arxiv.org/abs/2307.09288):

**(1)**
- "The LLaMA tokenizer is a BPE model based on sentencepiece."
- "The original model uses pad_id = -1 which means that there is *no padding token*."
- "One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. “Banana”), the tokenizer does not prepend the prefix space to the string."
- "`pad_token_id` = None `bos_token_id` = 1 `eos_token_id` = 2"

**(2)**
- "[...] we split all numbers into individual digits and use bytes to decompose unknown UTF-8 characters."
- "The [tokenizer's] total vocabulary size is 32k tokens."

All of this essentially amounts to:

```bash
spm_train \
    --input=taylorswift.txt \
    --model_prefix=llama2_tokenizer \
    --vocab_size=32000 \
    --model_type=bpe \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=0 \
    --pad_id=-1 \
    --split_digits=true \
    --byte_fallback=true
```

Let's look at the reference implementation of the Llama-2 tokenizer from Huggingface:

In [None]:
import os

from huggingface_hub import login
from transformers import AutoTokenizer

# Never hard-code an access token in a notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, `token=None`
# makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [81]:
# Load the reference Llama-2 tokenizer from the Hugging Face Hub.
# `token=True` reuses the stored login credentials; it replaces the
# deprecated `use_auth_token=True` argument.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=True)
tokens = tokenizer.encode("hello 안녕하세요")
print(tokens)
# Decode the IDs again to compare the round trip with the input text
decoded = tokenizer.decode(tokens)
print(decoded)

[1, 22172, 29871, 31734, 238, 136, 152, 30944, 31578, 31527]
<s> hello 안녕하세요


**Now let's try and replicate this behavior.**<br><br>
The Llama2 tokenizer is based on SentencePiece, which runs its BPE merges directly on Unicode code points rather than on UTF-8 bytes.<br>
Only characters that are missing from the learned vocabulary are decomposed into their UTF-8 bytes (byte fallback), which is where the `<0x..>` pieces in the output above come from.<br>
So, in total, the Llama2 tokenizer merges Unicode code points and uses UTF-8 bytes only as a fallback.<br>
Because Llama 2 relies on SentencePiece, it also does not pre-split the text with a regex the way the GPT tokenizers do, so we need no split pattern.

In [99]:
class LLaMA2Tokenizer:
    """Minimal code-point-level BPE tokenizer sketching the Llama 2 scheme.

    Text is split into Unicode code points (``ord``), a BOS token is
    prepended, and the pair merges in ``self.merges`` are applied. No regex
    pre-splitting, whitespace prefixing or byte fallback is performed.

    NOTE: special IDs share the ID space with code points, so the control
    characters U+0001 and U+0002 are rendered as ``<bos>`` / ``<eos>`` by
    ``decode``.
    """

    def __init__(self, merges=None, vocab=None):
        """Create a tokenizer.

        Args:
            merges: optional dict mapping ``(token_id1, token_id2)`` to the
                merged token ID. Defaults to no merges.
            vocab: optional dict mapping token ID to its string. Defaults to
                an empty vocabulary.
        """
        self.BOS_ID = 1
        self.EOS_ID = 2
        self.UNK_ID = 0
        self.vocab = vocab or {}      # token_id -> string
        self.merges = merges or {}    # (token_id1, token_id2) -> merged_id

    def encode(self, text):
        """Encode ``text`` into a list of token IDs that starts with BOS.

        Args:
            text: the string to tokenize.

        Returns:
            ``[BOS_ID]`` followed by the code points of ``text`` with all
            applicable merges applied.
        """
        body = [ord(c) for c in text]
        # Merge only the text tokens: the BOS token is added afterwards so a
        # merge pair that starts with ID 1 can never swallow it.
        while len(body) > 1:
            stats = self._get_stats(body)
            # Pick the pair with the lowest merge rank (its merged ID);
            # pairs without a merge rule rank as infinity.
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break  # nothing left to merge
            body = self._merge(body, pair, self.merges[pair])
        return [self.BOS_ID] + body

    def decode(self, ids):
        """Turn a sequence of token IDs back into a string.

        BOS and EOS are rendered as ``<bos>`` and ``<eos>``, IDs present in
        ``self.vocab`` use their vocabulary string, and everything else is
        interpreted as a Unicode code point.

        Args:
            ids: iterable of token IDs.

        Returns:
            The decoded string; IDs that are not valid code points become
            U+FFFD (the replacement character).
        """
        specials = {self.BOS_ID: '<bos>', self.EOS_ID: '<eos>'}
        pieces = []
        for idx in ids:
            if idx in specials:
                pieces.append(specials[idx])
            elif idx in self.vocab:
                pieces.append(self.vocab[idx])
            else:
                try:
                    pieces.append(chr(idx))
                except (ValueError, OverflowError):
                    # chr() raises ValueError outside 0..0x10FFFF and
                    # OverflowError for ints that do not fit a C int.
                    pieces.append("\ufffd")
        return "".join(pieces)

    def _get_stats(self, ids):
        """Count how often each adjacent pair of IDs occurs in ``ids``.

        Returns:
            dict mapping ``(id1, id2)`` to its number of occurrences.
        """
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with each occurrence of ``pair`` replaced by ``idx``.

        Occurrences are matched left to right and do not overlap.
        """
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
                new_ids.append(idx)
                i += 2  # skip both members of the merged pair
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

In [100]:
# Build a tokenizer without merges or vocabulary, so every character
# is encoded as its own Unicode code point
tokenizer = LLaMA2Tokenizer()

# Sample input mixing ASCII and Korean
text = "hello 안녕하세요"

# Text -> token IDs (BOS followed by the code points)
encoded = tokenizer.encode(text)
print(f"Encoded: {encoded}")

# Token IDs -> text
decoded = tokenizer.decode(encoded)
print(f"Decoded: {decoded}")

Encoded: [1, 104, 101, 108, 108, 111, 32, 50504, 45397, 54616, 49464, 50836]
Decoded: <bos>hello 안녕하세요


*This is by no means perfect*, but it provides a solid foundation to build a functional, Unicode-based LLaMA 2 tokenizer<br>
that can encode and decode text, handle special tokens, and apply BPE merges correctly.