In [8]:
#HW_2
#source https://lucytalksdata.com/the-modern-tokenization-stack-for-nlp-byte-pair-encoding/
#source https://towardsdatascience.com/byte-pair-encoding-the-dark-horse-of-modern-nlp-eb36c7df4f10


from numpy.lib.function_base import kaiser
import re
from collections import Counter, defaultdict


def process_vocab(corpus: str) -> dict:
    """Count the whitespace-separated words of *corpus* in symbol form.

    Every word is rewritten as its characters separated by single spaces
    and terminated by the end-of-word marker ``</w>``; for example
    ``"low"`` becomes ``"l o w </w>"``.

    Args:
        corpus: Raw text; words are obtained with ``str.split()``.

    Returns:
        A ``Counter`` mapping each space-separated symbol string to the
        number of times the corresponding word occurs in *corpus*.
    """
    vocabulary = Counter()
    for word in corpus.split():
        vocabulary[" ".join(word) + " </w>"] += 1
    return vocabulary


def get_pairs(vocabulary: dict) -> dict:
    """Count adjacent symbol pairs across the vocabulary.

    Args:
        vocabulary: Mapping from a space-separated symbol string
            (e.g. ``"l o w </w>"``) to that word's frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples.
        Each value is the sum of the frequencies of the words in which
        that pair occurs, counted once per occurrence.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocabulary.items():
        tokens = entry.split()
        # Walk each symbol together with its right-hand neighbour.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts


def merge_vocabulary(pair: tuple, initial_vocabulary: dict) -> dict:
    """Merge every occurrence of *pair* into a single symbol.

    Args:
        pair: Two symbols ``(left, right)`` to fuse into ``left + right``.
        initial_vocabulary: Mapping from a space-separated symbol string
            to its frequency.

    Returns:
        A new ``dict`` with the same frequencies, in which each
        whitespace-delimited occurrence of ``"left right"`` in a key has
        been replaced by the concatenated symbol. If several input keys
        collapse to the same merged key, their frequencies are summed.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The look-arounds make sure whole symbols are matched, never a
    # fragment of a longer, previously merged symbol.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    def replace_with_merged(_match):
        # A callable replacement is inserted literally. A plain string
        # would be treated as a template, so a backslash in a symbol
        # would be read as an escape (e.g. ``\n``) or raise ``re.error``.
        return merged

    vocabulary_out = {}
    for word, frequency in initial_vocabulary.items():
        word_out = pattern.sub(replace_with_merged, word)
        # Accumulate instead of overwriting so that no counts are lost
        # when two keys become identical after the merge.
        vocabulary_out[word_out] = vocabulary_out.get(word_out, 0) + frequency
    return vocabulary_out

#corpus="low low low low low lowest lowest newer newer newer newer newer newer wider wider wider new new"
# Demo driver: build the symbol vocabulary for a sample sentence, then
# repeatedly merge the most frequent adjacent symbol pair, printing the
# state at each iteration.
corpus="Baker Betty Lou bought some butter. But, it made her batter bitter. So, Baker Betty Lou bought some better butter to make her bitter batter better."
vocabulary = process_vocab(corpus)  # Step 1: words -> "c h a r s </w>" counts

# Upper bound on the number of merge iterations to perform.
k = 9
for i in range(k):
    print("iteration",i)
    print("--------------")
    pairs = get_pairs(vocabulary)  # Step 2: count adjacent symbol pairs
    # NOTE: despite the "initial vocab" label, this prints the pair counts.
    print("initial vocab",dict(pairs))
    # Stop early once there are no adjacent pairs left to merge.
    if not pairs:
        break
    # Step 3: pick the pair with the highest count; on ties, max() returns
    # the first such pair in iteration (insertion) order.
    best_pair = max(pairs, key=pairs.get)
    print("merge",best_pair)
    # Step 4: fuse that pair everywhere and continue with the new vocabulary.
    vocabulary = merge_vocabulary(best_pair, vocabulary)
    print("after merging corpus is:",vocabulary)
    print("\n\n")

iteration 0
--------------
initial vocab {('B', 'a'): 2, ('a', 'k'): 3, ('k', 'e'): 3, ('e', 'r'): 12, ('r', '</w>'): 9, ('B', 'e'): 2, ('e', 't'): 4, ('t', 't'): 10, ('t', 'y'): 2, ('y', '</w>'): 2, ('L', 'o'): 2, ('o', 'u'): 4, ('u', '</w>'): 2, ('b', 'o'): 2, ('u', 'g'): 2, ('g', 'h'): 2, ('h', 't'): 2, ('t', '</w>'): 3, ('s', 'o'): 2, ('o', 'm'): 2, ('m', 'e'): 2, ('e', '</w>'): 4, ('b', 'u'): 2, ('u', 't'): 3, ('t', 'e'): 8, ('r', '.'): 3, ('.', '</w>'): 3, ('B', 'u'): 1, ('t', ','): 1, (',', '</w>'): 2, ('i', 't'): 3, ('m', 'a'): 2, ('a', 'd'): 1, ('d', 'e'): 1, ('h', 'e'): 2, ('b', 'a'): 2, ('a', 't'): 2, ('b', 'i'): 2, ('S', 'o'): 1, ('o', ','): 1, ('b', 'e'): 2, ('t', 'o'): 1, ('o', '</w>'): 1}
merge ('e', 'r')
after merging corpus is: {'B a k er </w>': 2, 'B e t t y </w>': 2, 'L o u </w>': 2, 'b o u g h t </w>': 2, 's o m e </w>': 2, 'b u t t er . </w>': 1, 'B u t , </w>': 1, 'i t </w>': 1, 'm a d e </w>': 1, 'h er </w>': 2, 'b a t t er </w>': 2, 'b i t t er . </w>': 1, 'S o 