# Tokenization - converting human-readable text into tokens

### Why not Unicode?
- Using one glyph per token shortens the effective context: fewer words fit in the same context size.
- Attention becomes inefficient and expensive over the resulting longer sequences.
- The choice is purely about speed and memory efficiency.

In [150]:
 text = "Lorem 🌀 你好 🎉, мир! こんにちは 🌏"  # sample mixing Latin, Cyrillic, CJK and emoji, so characters take 1 to 4 UTF-8 bytes each

In [151]:
# Encode the text as UTF-8 and expose every raw byte as an integer token.
tokens = list(text.encode('utf-8'))

# Character count vs. byte count: multi-byte characters make the token list longer.
print(
    f"Text {text}",
    f"Len: {len(text)}",
    "---",
    f"Tokens: {tokens}",
    f"Len: {len(tokens)}",
    sep="\n",
)

Text Lorem 🌀 你好 🎉, мир! こんにちは 🌏
Len: 26
---
Tokens: [76, 111, 114, 101, 109, 32, 240, 159, 140, 128, 32, 228, 189, 160, 229, 165, 189, 32, 240, 159, 142, 137, 44, 32, 208, 188, 208, 184, 209, 128, 33, 32, 227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175, 32, 240, 159, 140, 143]
Len: 52


In [182]:
def get_stats(tokens):
    '''Count how often each adjacent pair of tokens occurs.

    Args:
        tokens: Sequence of tokens (e.g. a list of byte values). It must
            support slicing.

    Returns:
        A dict mapping each (left, right) pair to its number of occurrences,
        ordered from most to least frequent. Pairs with equal counts keep the
        order in which they were first seen. Fewer than two tokens yields an
        empty dict.
    '''
    from collections import Counter  # local import keeps this cell self-contained

    # zip(tokens, tokens[1:]) pairs every token with its successor.
    pair_counts = Counter(zip(tokens, tokens[1:]))
    # most_common() sorts by count, descending, and is stable for ties.
    return dict(pair_counts.most_common())

In [181]:
# get_stats returns the pairs sorted by descending count, so the most frequent pair is printed first.
print(f"The number of occurrences of each pair of tokens: {get_stats(tokens)}")

The number of occurrences of each pair of tokens: {(227, 129): 4, (32, 240): 3, (240, 159): 3, (159, 140): 2, (147, 227): 2, (76, 111): 1, (111, 114): 1, (114, 101): 1, (101, 109): 1, (109, 32): 1, (140, 128): 1, (128, 32): 1, (32, 228): 1, (228, 189): 1, (189, 160): 1, (160, 229): 1, (229, 165): 1, (165, 189): 1, (189, 32): 1, (159, 142): 1, (142, 137): 1, (137, 44): 1, (44, 32): 1, (32, 208): 1, (208, 188): 1, (188, 208): 1, (208, 184): 1, (184, 209): 1, (209, 128): 1, (128, 33): 1, (33, 32): 1, (32, 227): 1, (129, 147): 1, (227, 130): 1, (130, 147): 1, (129, 171): 1, (171, 227): 1, (129, 161): 1, (161, 227): 1, (129, 175): 1, (175, 32): 1, (140, 143): 1}


In [None]:
def merge(tokens, pair_to_replace, new_token):
    '''Return a new list in which every occurrence of a token pair is collapsed.

    The input is scanned left to right; whenever the current token and its
    successor equal pair_to_replace, new_token is emitted once and both
    tokens are consumed, so matches never overlap. All other tokens are
    copied through unchanged. The input sequence is not modified.
    '''
    result = []
    total = len(tokens)
    pos = 0
    while pos < total:
        has_successor = pos + 1 < total
        if has_successor and (tokens[pos], tokens[pos + 1]) == pair_to_replace:
            # Matched pair: emit the replacement and skip both members.
            result.append(new_token)
            pos += 2
            continue
        # No match (or last token with no successor): copy it as-is.
        result.append(tokens[pos])
        pos += 1
    return result

In [180]:
# get_stats orders pairs by descending count, so its first key is the most frequent pair.
pair_to_replace = next(iter(get_stats(tokens)))
# Demo id for the merged pair; it lies outside the 0-255 range of the raw byte tokens.
new_token = 999
merged_tokens = merge(tokens, pair_to_replace, new_token)

# Show the sequence before and after the merge, with lengths for comparison.
print(
    f"Pair to replace: {pair_to_replace}",
    f"New token: {new_token}",
    "---",
    f"Tokens: {len(tokens)},  {tokens}",
    f"Merged: {len(merged_tokens)}, {merged_tokens}",
    sep="\n",
)

Pair to replace: (227, 129)
New token: 999
---
Tokens: 52,  [76, 111, 114, 101, 109, 32, 240, 159, 140, 128, 32, 228, 189, 160, 229, 165, 189, 32, 240, 159, 142, 137, 44, 32, 208, 188, 208, 184, 209, 128, 33, 32, 227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175, 32, 240, 159, 140, 143]
Merged: 48, [76, 111, 114, 101, 109, 32, 240, 159, 140, 128, 32, 228, 189, 160, 229, 165, 189, 32, 240, 159, 142, 137, 44, 32, 208, 188, 208, 184, 209, 128, 33, 32, 999, 147, 227, 130, 147, 999, 171, 999, 161, 999, 175, 32, 240, 159, 140, 143]
