In [13]:
import numpy as np
import pandas as pd
from collections import defaultdict

# Byte-Pair Encoding

In [19]:
word_freqs = {'low_' : 5, 'lowest_' : 2, 'newer_' : 6, 'wider_' : 3, 'new_' : 2}

vocab = set("".join(word_freqs.keys()))
vocab = list(vocab)

splits = {word: [c for c in word] for word in word_freqs.keys()}

def compute_pair_freqs(splits, word_freqs):
    """Count occurrences of adjacent symbol pairs in coprus."""
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

def most_freq_pair(pair_freqs):
    """Find the most frequent adjacent pair."""
    if not pair_freqs:
        return ('', '')
    
    best_pair = max(pair_freqs, key=pair_freqs.get)
    return best_pair

def merge_pair(a, b, splits, word_freqs):
    """Merge most frequent pair into a single token in all words."""
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        new_split = []
        i = 0
        while i < len(split):
            if i < len(split) - 1 and split[i] == a and split[i + 1] == b:
                new_split.append(a + b)
                i += 2
            else:
                new_split.append(split[i])
                i += 1

        splits[word] = new_split
    return splits

merges = {}

num_merges = 8
for _ in range(num_merges):
    pair_freqs = compute_pair_freqs(splits, word_freqs)
    best_pair = most_freq_pair(pair_freqs)

    if best_pair == ('', ''):
        break

    a, b = best_pair
    splits = merge_pair(a, b, splits, word_freqs)
    merges[best_pair] = a + b
    vocab.append(a + b)

In [20]:
print('Merges:', merges)
print('Final vocabulary:', vocab)

Merges: {('e', 'r'): 'er', ('er', '_'): 'er_', ('n', 'e'): 'ne', ('ne', 'w'): 'new', ('l', 'o'): 'lo', ('lo', 'w'): 'low', ('new', 'er_'): 'newer_', ('low', '_'): 'low_'}
Final vocabulary: ['i', 'l', 'o', '_', 'e', 't', 'w', 's', 'n', 'd', 'r', 'er', 'er_', 'ne', 'new', 'lo', 'low', 'newer_', 'low_']
