# BYTE-PAIR-ENCODING del Quijote
--- 
Este es una forma simple de compresión de datos en la que el par más común de bytes consecutivos de datos se reemplaza con un byte que no ocurre dentro de esos datos. Aquí, el objetivo no es la compresión de datos, sino la codificación de texto en un idioma dado como una secuencia de 'tokens', utilizando un vocabulario fijo de diferentes tokens. La mayoría de las palabras se codificarán como un solo token, mientras que las palabras raras se codificarán como una secuencia de unos pocos tokens, donde estos tokens representan partes de palabras significativas.

### Equipo cangrejo      
* Montaño Preciado Alondra Karolina
* Velasquez Hidalgo Luis Juventino
* Navarro Lopez Malcom Hiram
* Juan fez


### Fuentes
- Medium con la informacion: https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0

- Codigo en el cual nos estamos inspirando: https://leimao.github.io/blog/Byte-Pair-Encoding/

In [1]:
import re, collections

Devuelve las palabras que hay en el texto y el conteo de cada una de ellas
- filename: El nombre del archivo que contiene el texto
- return: Un diccionario con el conteo de cada palabra.


In [2]:
def get_vocab(filename):
    '''
    Parte el texto si encuentra espacios
    '''
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab

Aqui se explica la funcion de esta celda


In [3]:
def get_stats(vocab):
    '''
    Devuelve el numero de veces que se repiten las palabras
    '''
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

Aqui se explica la funcion de esta celda

In [4]:
def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

Aqui se explica la funcion de esta celda

In [5]:
def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization

Aqui se explica la funcion de esta celda


In [6]:
def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)

Aqui se explica la funcion de esta celda

In [7]:
def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))

        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
        break
    return string_tokens

Implementacion usando el Quijote
---


In [8]:
vocab = get_vocab('TextoEjemplo.txt')

num_merges = 10000
for i in range(num_merges):
    pairs = get_stats(vocab)
    print("Pares: ", pairs)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
    print('All tokens: {}'.format(tokens_frequencies.keys()))
    print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
    print('==========')

Pares:  defaultdict(<class 'int'>, {('T', 'h'): 21, ('h', 'e'): 2318, ('e', '</w>'): 70499, ('P', 'r'): 176, ('r', 'o'): 12958, ('o', 'j'): 729, ('j', 'e'): 1490, ('e', 'c'): 6028, ('c', 't'): 343, ('t', '</w>'): 315, ('G', 'u'): 143, ('u', 't'): 591, ('t', 'e'): 15389, ('e', 'n'): 33444, ('n', 'b'): 91, ('b', 'e'): 2930, ('e', 'r'): 27411, ('r', 'g'): 591, ('g', '</w>'): 100, ('E', 'B'): 5, ('B', 'o'): 72, ('o', 'o'): 38, ('o', 'k'): 15, ('k', '</w>'): 48, ('o', 'f'): 577, ('f', '</w>'): 129, ('D', 'o'): 326, ('o', 'n'): 18164, ('n', '</w>'): 32483, ('Q', 'u'): 2636, ('u', 'i'): 5991, ('i', 'j'): 4684, ('j', 'o'): 5678, ('o', 't'): 4903, ('e', ','): 5150, (',', '</w>'): 40310, ('b', 'y'): 25, ('y', '</w>'): 17790, ('M', 'i'): 218, ('i', 'g'): 3041, ('g', 'u'): 3775, ('u', 'e'): 36965, ('e', 'l'): 17053, ('l', '</w>'): 16547, ('d', 'e'): 33705, ('C', 'e'): 51, ('r', 'v'): 453, ('v', 'a'): 2375, ('a', 'n'): 21188, ('n', 't'): 15327, ('e', 's'): 32232, ('s', '</w>'): 40995, ('S', 'a'): 2

KeyboardInterrupt: 

In [17]:
'''
print('==========')
print('Tokens Before BPE')
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
print('All tokens: {}'.format(tokens_frequencies.keys()))
print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
print('==========')

num_merges = 2
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
    print('All tokens: {}'.format(tokens_frequencies.keys()))
    print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
    print('==========')

# Let's check how tokenization will be for a known word
word_given_known = 'mountains</w>'
word_given_unknown = 'Ilikeeatingapples!</w>'

sorted_tokens_tuple = sorted(tokens_frequencies.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

print(sorted_tokens)

word_given = word_given_known 

print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenizating of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

word_given = word_given_unknown 

print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenizating of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
'''



# Conclusiones
___
Me diverti mucho