In [1]:
# Look up the integer Unicode code point of a single character.
ord('牛')

29275

In [2]:
# Inverse of ord(): turn an integer code point back into its character.
chr(29275)

'牛'

In [2]:
# Encode a sample sentence into its raw UTF-8 bytes and show the bytes literal.
test_string = "hello! this is is a test"
utf8_encoded = bytes(test_string, encoding="utf-8")
print(utf8_encoded)

b'hello! this is is a test'


In [3]:
# Inspect the encoded value: first its type, then every byte as an integer.
print(type(utf8_encoded))
print([int(byte) for byte in utf8_encoded])

<class 'bytes'>
[104, 101, 108, 108, 111, 33, 32, 116, 104, 105, 115, 32, 105, 115, 32, 105, 115, 32, 97, 32, 116, 101, 115, 116]


In [None]:
from collections import defaultdict

class BPE:
    """Minimal byte-level Byte Pair Encoding (BPE) tokenizer.

    The vocabulary starts as the 256 single-byte tokens. Training repeatedly
    replaces the most frequent adjacent pair of token ids in the training
    text with a new token id (256, 257, ...). ``tokenize`` replays the
    learned merges, in id order, on new text; ``decode`` maps token ids back
    to a string.
    """

    def __init__(self, tokens: str, vocab_size: int) -> None:
        """Store the training text and the target vocabulary size.

        Args:
            tokens: The raw training text (a ``str``, not a list of ids).
            vocab_size: Target vocabulary size. ``train`` attempts
                ``vocab_size - 256`` merges; values of 256 or less result in
                no merges.
        """
        self.tokens: str = tokens
        self.vocab_size: int = vocab_size
        # Token id -> the byte string it stands for; ids 0..255 are raw bytes.
        self.vocab: dict[int, bytes] = {i: bytes([i]) for i in range(256)}
        # Learned merges: new token id -> the (left, right) pair it replaces.
        self.merge_sets: dict[int, tuple] = {}

    def get_tokens(self) -> list[int]:
        """Return the training text as a list of its UTF-8 byte values."""
        return list(self.tokens.encode("utf-8"))

    def get_stats(self, tokens: list[int]) -> dict:
        """Count how often each adjacent pair of token ids occurs.

        Args:
            tokens: Sequence of token ids.

        Returns:
            A ``defaultdict`` mapping ``(left, right)`` to its number of
            occurrences, keyed in order of first appearance. It is empty
            when ``tokens`` has fewer than two elements.
        """
        freq_pair: dict[tuple, int] = defaultdict(int)

        for first_token, second_token in zip(tokens, tokens[1:]):
            freq_pair[(first_token, second_token)] += 1
        return freq_pair

    def get_most_frequent_pair(self, freq_pair: dict[tuple, int]) -> tuple:
        """Return the pair with the highest count.

        Ties are broken in favour of the pair that comes first in the
        mapping's iteration order.

        Args:
            freq_pair: Mapping of pair -> occurrence count.

        Raises:
            ValueError: If ``freq_pair`` is empty.
        """
        if not freq_pair:
            raise ValueError("Cannot select the most frequent pair from empty pair statistics.")

        # max() returns the first maximal key it meets, which keeps the
        # "first pair wins on ties" behaviour of a strict `>` comparison.
        return max(freq_pair, key=freq_pair.get)

    @staticmethod
    def _replace_pair(tokens: list[int], pair: tuple, index: int) -> list[int]:
        """Return a copy of ``tokens`` with each occurrence of ``pair`` replaced by ``index``.

        The scan runs left to right and never overlaps matches: after a
        replacement it resumes behind the consumed pair.
        """
        a, b = pair
        merged: list[int] = []
        last = len(tokens) - 1
        i = 0

        while i < len(tokens):
            if i < last and tokens[i] == a and tokens[i + 1] == b:
                merged.append(index)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1

        return merged

    def merge(self, tokens: list[int], index: int) -> list[int]:
        """Perform one merge step: fuse the most frequent pair into ``index``.

        Records the new token in ``self.vocab`` and ``self.merge_sets``.

        Args:
            tokens: Current sequence of token ids.
            index: Id to assign to the newly created token.

        Returns:
            A new token list in which the most frequent pair is replaced by
            ``index``.

        Raises:
            ValueError: If ``tokens`` contains no adjacent pair.
        """
        freq_pair: dict[tuple, int] = self.get_stats(tokens)

        if not freq_pair:
            raise ValueError("No frequent pairs found — cannot perform merge. The token list may be too short or already fully merged.")

        most_freq_pair = self.get_most_frequent_pair(freq_pair=freq_pair)
        a, b = most_freq_pair

        # The new token stands for the concatenated bytes of its two parts.
        self.vocab[index] = self.vocab[a] + self.vocab[b]
        self.merge_sets[index] = (a, b)

        return self._replace_pair(tokens, most_freq_pair, index)

    def train(self) -> None:
        """Learn up to ``vocab_size - 256`` merges from the training text.

        New tokens receive consecutive ids starting at 256. Training stops
        early once fewer than two tokens remain, because no further pair
        exists to merge.
        """
        tokens: list[int] = self.get_tokens()
        num_merges: int = self.vocab_size - 256

        for i in range(num_merges):
            if len(tokens) < 2:
                # Corpus is exhausted: stop instead of letting merge() raise.
                break
            tokens = self.merge(tokens, 256 + i)

    def decode(self, tokens: list[int]) -> str:
        """Convert token ids back into text.

        The byte strings of all tokens are concatenated and decoded as
        UTF-8 with ``errors="replace"``.

        Raises:
            KeyError: If a token id is not present in ``self.vocab``.
        """
        byte_sequence = b''.join(self.vocab[token] for token in tokens)
        return byte_sequence.decode("utf-8", errors="replace")

    def tokenize(self, text: str) -> list[int]:
        """Encode ``text`` into token ids using the learned merges.

        The text is turned into UTF-8 byte values and every learned merge is
        then applied in ascending order of its token id.
        """
        tokens = list(text.encode("utf-8"))

        for index in sorted(self.merge_sets):
            if len(tokens) < 2:
                # No pair left, so no remaining merge can match.
                break
            tokens = self._replace_pair(tokens, self.merge_sets[index], index)

        return tokens


In [5]:

# Training text for the tokenizer. A plain string literal is used because the
# text contains no placeholders to interpolate.
corpus = "LOUISVILLE, Ky. — A few unflattering reviews are to be expected with any hotel. The lobby of Hotel Louisville Pat McDonogh for Al Jazeera America Every homeless shelter has a NIMBY problem. Try building a new facility or renovating an old one and the neighbors come out of the woodwork to protest each additional bed. But the battle waged against Hotel Louisville was unusual even in the long history of Wayside Christian Mission, founded in 1957. The saga began six years ago, after the group finally raised enough money to replace its worn-out transitional-housing facility for women and kids. Initially, the married couple at Wayside’s helm — Tim Moseley, a bearded, heavyset minister, and his wife, Nina, an attorney with waist-length platinum blonde hair — intended to build on property it already owned along gentrifying Market Street. Real-estate developers with city-hall ties killed the plan, claiming the need for"

# 256 single-byte tokens plus 44 learned merges.
VOCAB_SIZE = 300
bpe = BPE(corpus, VOCAB_SIZE)
bpe.train()

# Text that was not part of the training corpus: encode it, decode it again,
# and check that the round trip reproduces it exactly.
new_text = "Then, in early 2009, the Moseleys heard that the downtown Holiday Inn, nicknamed “Hotel Louisville,” would be sold at a foreclosure auction. The final price tag of $10 million depleted all the funds Wayside had raised through its years-long capital campaign and proceeds from the Market Street sale, but at 187 rooms and 169,400 square feet, the building could house hundreds. Eighty-three homeless women moved into the hotel in November. Shortly thereafter, with utility costs mounting and many floors vacant, the Moseleys saw an opportunity. “People kept coming through and asking for a room,” Nina Moseley recalled. So Wayside opened Hotel Louisville to the public while continuing to provide shelter and substance-abuse recovery services to women in need, free of charge."
encoded = bpe.tokenize(new_text)
decoded = bpe.decode(encoded)
print(f"is the new text the same thing as the decoded? {new_text == decoded}")

is the new text the same thing as the decoded? True
