# Byte Pair Encoding Tokenization

An algorithm used for tokenization of text, prior to analysis or processing. It is based on the idea of iteratively merging the most frequent pair of tokens.

## Implementation

Naive implementation: no word or sentence boundary detection, no special tokenization for numbers, dates, etc.

In [43]:
def build_tokens_using_byte_pair_encoding(document, vocabulary_size):
    '''Builds a dictionary of tokens using byte pair encodings from the given document.

    document: The document to build tokens from.
    vocabulary_size: The maximum number of tokens to create.
    returns: A dictionary of tokens and their frequencies
    '''

    tokenised_document = list(document)
    tokens = set(tokenised_document)
    
    while len(tokens) < vocabulary_size:
        token_pair = _find_most_frequent_token_pair(tokenised_document)
        tokens.add(token_pair[0] + token_pair[1])
        _merge_pairs(tokenised_document, token_pair)

    return _get_token_frequencies(tokenised_document)

def _tokenise(document, token_frequencies):
    '''Tokenises a document using the given tokens in decreasing order of frequency.'''

    tokenised_document = list(document)
    sorted_tokens = sorted(token_frequencies.keys(), key=lambda token: token_frequencies[token], reverse=True)
    for token in sorted_tokens:
        tokenised_document = _merge_token_occurrences(tokenised_document, token)
    return tokenised_document

def _merge_pairs(tokenised_document, token_pair):
    i = 0
    while (i < len(tokenised_document)-1):
        if (tokenised_document[i], tokenised_document[i+1]) == token_pair:
            tokenised_document[i] = token_pair[0] + token_pair[1]
            tokenised_document.pop(i+1)
        i = i + 1

def _find_most_frequent_token_pair(tokenised_document):
    token_pairs = {}
    last_token = tokenised_document[0]
    for token in tokenised_document[1:]:
        token_pair = (last_token, token)
        token_pairs[token_pair] = token_pairs.get(token_pair, 0) + 1
        last_token = token
    most_frequent_token_pair = max(token_pairs, key=token_pairs.get)
    return most_frequent_token_pair

def _get_token_frequencies(tokenised_document):
    token_frequencies = {}
    for token in tokenised_document:
        if token not in token_frequencies:
            token_frequencies[token] = 0
        token_frequencies[token] += 1
    return token_frequencies

def _merge_token_occurrences(tokenised_document, token):
    token_length = len(token)
    document_index = 0
    while document_index < len(tokenised_document):
        tokens_count_to_merge = 0
        accumulated_token = tokenised_document[document_index]
        while len(accumulated_token) < token_length and document_index+tokens_count_to_merge+1 < len(tokenised_document):
            tokens_count_to_merge += 1
            accumulated_token += tokenised_document[document_index+tokens_count_to_merge]
        if accumulated_token == token:
            tokenised_document = tokenised_document[:document_index] + [token] + tokenised_document[document_index+token_length:]
        document_index += 1
    return tokenised_document

## Application

In [45]:
def calculate_and_print_tokens(filename, vocabulary_size):
    print('Calculating tokens for ' + filename)
    with open(filename, 'r', encoding="mbcs") as file:
        document = file.read()
        tokens = build_tokens_using_byte_pair_encoding(document, vocabulary_size)
        print('Final tokens: ' + str(_sample_dictionary(tokens, 100)))
        with open(filename + '.' + str(vocabulary_size) + '.tokens', 'w') as file:
            file.write(str(tokens))
        print('Tokenised document' + str(_tokenise(document, tokens)[:100]))

def _sample_dictionary(dictionary, sample_size):
    return {key: value for index, (key, value) in enumerate(dictionary.items()) if index < sample_size}
    
calculate_and_print_tokens('./datasets/Alice.txt', vocabulary_size=1000)
calculate_and_print_tokens('./datasets/Shakespeare.txt', vocabulary_size=1000)

Calculating tokens for ../Word2Vec/Alice.txt
Final tokens: {'The ': 72, 'Project Gutenberg': 21, ' ': 610, 'eB': 18, 'ook ': 42, 'of ': 193, 'Alice': 72, 'â€™s ': 96, 'A': 149, 'dv': 21, 'ent': 150, 'ure': 31, 's ': 359, 'in ': 139, 'W': 98, 'onder': 28, 'l': 281, 'and': 75, ', ': 470, 'by ': 30, 'L': 112, 'e': 520, 'w': 354, 'is ': 65, 'C': 104, 'ar': 152, 'ro': 68, 'll': 141, '\n\n': 71, 'T': 142, 'his ': 76, 'for the ': 19, 'use ': 35, 'of': 156, ' any': 24, 'one ': 99, 'an': 171, 'y': 344, 'here ': 28, 'in the ': 85, 'Un': 18, 'ited ': 20, 'St': 27, 'at': 200, 'es': 273, ' and': 64, '\n': 730, 'mo': 38, 'st ': 60, 'other ': 28, 'part': 25, 'of the ': 133, 'wor': 25, 'd': 450, ' at ': 78, 'no ': 54, 'c': 277, 'o': 204, 'st': 199, ' and ': 169, 'with': 85, ' al': 36, 'r': 267, 'est': 45, 'ri': 126, 't': 502, 'ion': 100, 's\n': 33, 'what': 26, 'so': 34, 'ever': 91, '. ': 205, 'You ': 24, 'may ': 21, 'cop': 21, 'y ': 240, 'it': 415, 'g': 294, 'ive ': 40, ' a': 166, 'way ': 41, 'or ': 1

KeyboardInterrupt: 