# Determine character codepoint

In Python, strings are encoded with unicode, meaning each character is a unicode code point. Code points are identified by number, customarily written in hexadecimal with the prefix “U+”,
which are their index in the codespace. To retrieve the unicode code point of a character in Python, use the `ord` function with the character as its parameter. 

Unicode defines 3 types of encodings: UTF-8, UTF-16 and UTF-32. These encodings allow the standard's abstracted codes for characters to be processed and stored as binary data (refer to [A Programmer's Introduction to Unicode](https://www.reedbeta.com/blog/programmers-intro-to-unicode/)). 

Using the UTF-8 encoding to represent each characters as a 1 - 4 bytes. We can then utilize the byte pair encoding algorithm to shorten the representations of token, allowing our language model to see more tokens in its attention. 

In [67]:
s = "日本からこんにちは (Hello from Japan!)"

In [68]:
unicode_s = [ord(x) for x in s] 

# bytes representation converted to integers representation 
utf_8_s = list(s.encode("utf-8"))

print(utf_8_s)

[230, 151, 165, 230, 156, 172, 227, 129, 139, 227, 130, 137, 227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175, 32, 40, 72, 101, 108, 108, 111, 32, 102, 114, 111, 109, 32, 74, 97, 112, 97, 110, 33, 41]


In [69]:
# encode a long paragraph
text = " UTF-8, each code point is stored using 1 to 4 bytes, based on its index value.  UTF-8 uses a system of binary prefixes, in which the high bits of each byte mark whether it’s a single byte, the beginning of a multi-byte sequence, or a continuation byte; the remaining bits, concatenated, give the code point index."

tokens = text.encode("utf-8")
tokens = list(tokens) 

print(text) 
print(f"length: {len(text)}")
print("----")
print(tokens) 
print(f"length: {len(tokens)}")

 UTF-8, each code point is stored using 1 to 4 bytes, based on its index value.  UTF-8 uses a system of binary prefixes, in which the high bits of each byte mark whether it’s a single byte, the beginning of a multi-byte sequence, or a continuation byte; the remaining bits, concatenated, give the code point index.
length: 314
----
[32, 85, 84, 70, 45, 56, 44, 32, 101, 97, 99, 104, 32, 99, 111, 100, 101, 32, 112, 111, 105, 110, 116, 32, 105, 115, 32, 115, 116, 111, 114, 101, 100, 32, 117, 115, 105, 110, 103, 32, 49, 32, 116, 111, 32, 52, 32, 98, 121, 116, 101, 115, 44, 32, 98, 97, 115, 101, 100, 32, 111, 110, 32, 105, 116, 115, 32, 105, 110, 100, 101, 120, 32, 118, 97, 108, 117, 101, 46, 32, 32, 85, 84, 70, 45, 56, 32, 117, 115, 101, 115, 32, 97, 32, 115, 121, 115, 116, 101, 109, 32, 111, 102, 32, 98, 105, 110, 97, 114, 121, 32, 112, 114, 101, 102, 105, 120, 101, 115, 44, 32, 105, 110, 32, 119, 104, 105, 99, 104, 32, 116, 104, 101, 32, 104, 105, 103, 104, 32, 98, 105, 116, 115, 32, 111, 

In [70]:
from heapq import heapify, heappush, heappop

heap = []
heapify(heap) 

# store as count, pair

count = {
    "c": 3, 
    "b": 8, 
    "a": 9, 
}

heappush(heap, (-count["c"], "c"))
heappush(heap, (-count["b"], "b")) 
heappush(heap, (-count["a"], "a"))

print(f"largest element: {heap[0]}")

index_update = heap.index((-count["c"], "c"))
print(index_update, heap[index_update])
count["c"] += 8
heap[index_update] = (-count["c"], "c")
heapify(heap)

print(f"largest element after changing count: {heap[0]}, count of c: {count['c']}")

largest element: (-9, 'a')
1 (-3, 'c')
largest element after changing count: (-11, 'c'), count of c: 11


In [102]:
# byte pair encoding 
def merge(tokens, byte_pair_max, byte_pair_representation): 
    new_token = [] 

    i = 0 
    while i < len(tokens): 
        if i  <= len(tokens) - 2: 
            a = tokens[i] 
            b = tokens[i + 1]
            byte_pair = str(a) + str(b) 

            if byte_pair == byte_pair_max: 
                new_token.append(byte_pair_representation)
                i += 2 # skip the next byte because they are combined 
                continue 
            else: 
                new_token.append(a)
        else: 
            new_token.append(tokens[i]) 
        i += 1

    return new_token
    
def byte_pair_encoding(tokens, depth=1): 
    counter = 256

    # loop through number of times doing byte pair encoding 
    for _ in range(depth): 
        # loop through all bytes 
        pair_lookup = {} 
        heap = [] 
        heapify(heap) 
        for i in range(len(tokens) - 1): 
            a = tokens[i] 
            b = tokens[i + 1]
            byte_pair = str(a) + str(b) 

            if pair_lookup.get(byte_pair): 
                update_index = heap.index((-pair_lookup[byte_pair], byte_pair))
                pair_lookup[byte_pair] += 1

                heap[update_index] = (-pair_lookup[byte_pair], byte_pair)
                heapify(heap) 
            else: 
                heappush(heap, (-1, byte_pair))
                pair_lookup[byte_pair] = 1 
            
    
        byte_pair_max = heap[0][1] 
        byte_pair_max_frequency = heap[0][0] 

        # update byte pair to new representation 
        byte_pair_representation = counter
        pair_lookup[byte_pair_representation] = byte_pair_max_frequency
        counter += 1

        # update current token to token replaced with new byte pair
        tokens = merge(tokens=tokens, byte_pair_max=byte_pair_max, byte_pair_representation=byte_pair_representation)

        print(f"byte pair with most frequency: {heap[0][1]} | frequency: {-heap[0][0]} | new representation: {counter} | new token length: {len(tokens)}")
        print("--")
        print(tokens) 
        print("---------------------")


In [103]:
byte_pair_encoding(tokens, depth=20) 

byte pair with most frequency: 105110 | frequency: 13 | new representation: 257 | new token length: 303
--
[32, 85, 84, 70, 45, 56, 44, 32, 101, 97, 99, 104, 32, 99, 111, 100, 101, 32, 112, 111, 256, 116, 32, 105, 115, 32, 115, 116, 111, 114, 101, 100, 32, 117, 115, 256, 103, 32, 49, 32, 116, 111, 32, 52, 32, 98, 121, 116, 101, 115, 44, 32, 98, 97, 115, 101, 100, 32, 111, 110, 32, 105, 116, 115, 32, 256, 100, 101, 120, 32, 118, 97, 108, 117, 101, 46, 32, 32, 85, 84, 70, 45, 56, 32, 117, 115, 101, 115, 32, 97, 32, 115, 121, 115, 116, 101, 109, 32, 111, 102, 32, 98, 256, 97, 114, 121, 32, 112, 114, 101, 102, 105, 120, 101, 115, 44, 32, 256, 32, 119, 104, 105, 99, 104, 32, 116, 104, 101, 32, 104, 105, 103, 104, 32, 98, 105, 116, 115, 32, 111, 102, 32, 101, 97, 99, 104, 32, 98, 121, 116, 101, 32, 109, 97, 114, 107, 32, 119, 104, 101, 116, 104, 101, 114, 32, 105, 116, 226, 128, 153, 115, 32, 97, 32, 115, 256, 103, 108, 101, 32, 98, 121, 116, 101, 44, 32, 116, 104, 101, 32, 98, 101, 103, 256