In [None]:
!pip install bpe

In [131]:
from bpe import Encoder
from collections import Counter, defaultdict
import re

In [132]:
# Build the initial BPE vocabulary from the corpus: every whitespace-separated
# word is rewritten as space-separated characters followed by the end-of-word
# marker "</_>", and mapped to the number of times it occurs in the text.
text = "Baker Betty Lou bought some butter But, it made her batter bitter So, Baker Betty Lou bought some better butter to make her bitter batter better"
# Use the `defaultdict` name imported at the top of the notebook; the
# `collections` module itself is only imported in a later cell, so
# `collections.defaultdict` raises NameError on a fresh kernel.
vocab = defaultdict(int)

split_text = [" ".join(word) + " </_>" for word in text.split()]

for spaced_word in split_text:
    # Missing keys default to 0, so a word can be counted without
    # initialising its entry first.
    vocab[spaced_word] += 1

In [133]:
# Display the vocabulary built above: each word, written as space-separated
# characters ending in "</_>", mapped to its frequency in the corpus
vocab

defaultdict(int,
            {'B a k e r </_>': 2,
             'B e t t y </_>': 2,
             'L o u </_>': 2,
             'b o u g h t </_>': 2,
             's o m e </_>': 2,
             'b u t t e r </_>': 2,
             'B u t , </_>': 1,
             'i t </_>': 1,
             'm a d e </_>': 1,
             'h e r </_>': 2,
             'b a t t e r </_>': 2,
             'b i t t e r </_>': 2,
             'S o , </_>': 1,
             'b e t t e r </_>': 2,
             't o </_>': 1,
             'm a k e </_>': 1})

In [134]:
# Tally every space-separated symbol in the vocabulary, weighting each
# occurrence by the frequency of the word it appears in (used to process pairs)
import collections
char_dict = collections.defaultdict(int)
for spaced_word, count in vocab.items():
    for symbol in spaced_word.split():
        char_dict[symbol] += count

char_dict


defaultdict(int,
            {'B': 5,
             'a': 6,
             'k': 3,
             'e': 20,
             'r': 12,
             '</_>': 26,
             't': 25,
             'y': 2,
             'L': 2,
             'o': 8,
             'u': 7,
             'b': 10,
             'g': 2,
             'h': 4,
             's': 2,
             'm': 4,
             ',': 2,
             'i': 3,
             'd': 1,
             'S': 1})

In [135]:
# Count adjacent symbol pairs across the vocabulary
import re

## walk each word's symbols two at a time (symbol, right-hand neighbour),
## adding the word's frequency to that pair's total
pairs = collections.defaultdict(int)
for spaced_word, count in vocab.items():
    symbols = spaced_word.split()
    for left, right in zip(symbols, symbols[1:]):
        pairs[left, right] += count

pairs

defaultdict(int,
            {('B', 'a'): 2,
             ('a', 'k'): 3,
             ('k', 'e'): 3,
             ('e', 'r'): 12,
             ('r', '</_>'): 12,
             ('B', 'e'): 2,
             ('e', 't'): 4,
             ('t', 't'): 10,
             ('t', 'y'): 2,
             ('y', '</_>'): 2,
             ('L', 'o'): 2,
             ('o', 'u'): 4,
             ('u', '</_>'): 2,
             ('b', 'o'): 2,
             ('u', 'g'): 2,
             ('g', 'h'): 2,
             ('h', 't'): 2,
             ('t', '</_>'): 3,
             ('s', 'o'): 2,
             ('o', 'm'): 2,
             ('m', 'e'): 2,
             ('e', '</_>'): 4,
             ('b', 'u'): 2,
             ('u', 't'): 3,
             ('t', 'e'): 8,
             ('B', 'u'): 1,
             ('t', ','): 1,
             (',', '</_>'): 2,
             ('i', 't'): 3,
             ('m', 'a'): 2,
             ('a', 'd'): 1,
             ('d', 'e'): 1,
             ('h', 'e'): 2,
             ('b', 'a'): 2,
          

In [129]:
# Inspect the container type of `pairs`
type(pairs)

collections.defaultdict

In [130]:
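## BPE training: helpers to count symbol pairs, merge the best pair and count
## subword tokens, followed by the merge loop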

def get_pairs(vocab):
    """Count adjacent symbol pairs over a vocabulary.

    Args:
        vocab: Mapping from a word written as space-separated symbols to
            its frequency.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples. Each occurrence of a pair inside a word adds that word's
        frequency to the pair's total.
    """
    pair_counts = collections.defaultdict(int)
    for spaced_word, count in vocab.items():
        symbols = spaced_word.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_byte_pairs(best_pair, vocab):
    """Merge every occurrence of ``best_pair`` in the vocabulary into one symbol.

    Prints ``best_pair`` as a progress trace before merging.

    Args:
        best_pair: Tuple ``(left, right)`` of symbols to join.
        vocab: Mapping from a word written as space-separated symbols to
            its frequency. It is not modified.

    Returns:
        A new dict with the same frequencies, keyed by the words after each
        ``left right`` occurrence has been replaced by ``leftright``. If
        several input words collapse to the same merged word, their
        frequencies are summed.
    """
    print(best_pair)
    merged_symbol = ''.join(best_pair)
    bigram = re.escape(' '.join(best_pair))
    # The lookarounds require whitespace (or a string edge) on both sides, so
    # only whole symbols are matched, never the tail or head of a longer one.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    merged_dict = {}
    for word, freq in vocab.items():
        # A callable replacement is inserted verbatim. A plain string would be
        # parsed as a regex template, so a symbol containing a backslash would
        # be rewritten (e.g. "\n" -> newline) or raise re.error.
        w_out = pattern.sub(lambda _match: merged_symbol, word)
        # Accumulate instead of assigning so that colliding words keep
        # their combined count.
        merged_dict[w_out] = merged_dict.get(w_out, 0) + freq
    return merged_dict

def get_subword_tokens(vocab):
    """Count how often each symbol occurs across a vocabulary.

    Args:
        vocab: Mapping from a word written as space-separated symbols to
            its frequency.

    Returns:
        A ``collections.defaultdict(int)`` mapping each symbol to the sum of
        the frequencies of the words containing it, counted once per
        occurrence.
    """
    token_counts = collections.defaultdict(int)
    for spaced_word, count in vocab.items():
        for token in spaced_word.split():
            token_counts[token] += count
    return token_counts

# Number of merge operations to learn.
NUM_MERGES = 7

# Each iteration merges the most frequent adjacent pair and rebinds `vocab`
# to the merged vocabulary, so re-running this cell continues from the
# already-merged state rather than from the initial vocabulary.
for i in range(NUM_MERGES):
    pairs = get_pairs(vocab)
    if not pairs:
        # Every word is already a single symbol; max() on an empty mapping
        # would raise ValueError, so stop instead.
        print("No more pairs to merge")
        break
    # On ties, max() keeps the first pair encountered in iteration order.
    best_pair = max(pairs, key=pairs.get)
    print(f"Iteration {i}: ")
    vocab = merge_byte_pairs(best_pair, vocab)
    subword_tokens = get_subword_tokens(vocab)
    print(subword_tokens)
    print(len(subword_tokens))
    print("--------")

Iteration 0: 
('e', 'r')
defaultdict(<class 'int'>, {'B': 5, 'a': 6, 'k': 3, 'er': 12, '</_>': 26, 'e': 8, 't': 25, 'y': 2, 'L': 2, 'o': 8, 'u': 7, 'b': 10, 'g': 2, 'h': 4, 's': 2, 'm': 4, ',': 2, 'i': 3, 'd': 1, 'S': 1})
20
--------
Iteration 1: 
('er', '</_>')
defaultdict(<class 'int'>, {'B': 5, 'a': 6, 'k': 3, 'er</_>': 12, 'e': 8, 't': 25, 'y': 2, '</_>': 14, 'L': 2, 'o': 8, 'u': 7, 'b': 10, 'g': 2, 'h': 4, 's': 2, 'm': 4, ',': 2, 'i': 3, 'd': 1, 'S': 1})
20
--------
Iteration 2: 
('t', 't')
defaultdict(<class 'int'>, {'B': 5, 'a': 6, 'k': 3, 'er</_>': 12, 'e': 8, 'tt': 10, 'y': 2, '</_>': 14, 'L': 2, 'o': 8, 'u': 7, 'b': 10, 'g': 2, 'h': 4, 't': 5, 's': 2, 'm': 4, ',': 2, 'i': 3, 'd': 1, 'S': 1})
21
--------
Iteration 3: 
('tt', 'er</_>')
defaultdict(<class 'int'>, {'B': 5, 'a': 6, 'k': 3, 'er</_>': 4, 'e': 8, 'tt': 2, 'y': 2, '</_>': 14, 'L': 2, 'o': 8, 'u': 7, 'b': 10, 'g': 2, 'h': 4, 't': 5, 's': 2, 'm': 4, 'tter</_>': 8, ',': 2, 'i': 3, 'd': 1, 'S': 1})
22
--------
Iteration 4