## Train

In [148]:
def tokenize(text, vocabs):
    chars = list(text)
    tokenized = chars[:1]
    for char in chars[1:]:
        print(tokenized[-1], char, tokenized[-1] + char in vocabs)
        if tokenized[-1] + char in vocabs:
            tokenized[-1] = tokenized[-1] + char
        else:
            tokenized.append(char)
    return tokenized

In [132]:
def sliding_window(li, length=2, step=1):
    i, j = 0, length
    while j<=len(li):
        yield li[i:j]
        i+= step
        j+= step

In [133]:
def count_occs(text, vocabs):
    itext = tokenize(text, vocabs)
    occs = {}
    for token1, token2 in sliding_window(itext):
        if (token1,token2) in occs:
            occs[(token1,token2)]+=1
        else:
            occs[(token1,token2)]=1
    return occs

In [134]:
def update_vocabs(text, oldvocabs):
    vocabs = oldvocabs.copy()
    occs = count_occs(corpus, vocabs)
    keys = max(occs.keys(), key=lambda x: occs[x])
    vocabs.append("".join(keys))
    return vocabs

In [149]:
icorpus = tokenize(corpus, vocabs)
icorpus


 t False
t h True
th i False
i s False
s   True
s  i False
i s False
s   True
s  t False
t h True
th e False
e   False
  h False
h u False
u g False
g g False
g i False
i n False
n g False
g   False
  f False
f a False
a c False
c e False
e   False
  c False
c o False
o u False
u r False
r s False
s e False
e , False
,   False
  t False
t h True
th i False
i s False
s   True
s  c False
c h False
h a False
a p False
p t False
t e False
e r False
r   False
  i False
i s False
s   True
s  a False
a b False
b o False
o u False
u t False
t   False
  t False
t o False
o k False
k e False
e n False
n i False
i z False
z a False
a t False
t i False
i o False
o n False
n . False
.   False
  t False
t h True
th i False
i s False
s   True
s  s False
s e False
e c False
c t False
t i False
i o False
o n False
n   False
  s False
s h False
h o False
o w False
w s False
s   True
s  s False
s e False
e v False
v e False
e r False
r a False
a l False
l   False
  t False
t o False
o k False
k e False


['\n',
 'th',
 'i',
 's ',
 'i',
 's ',
 'th',
 'e',
 ' ',
 'h',
 'u',
 'g',
 'g',
 'i',
 'n',
 'g',
 ' ',
 'f',
 'a',
 'c',
 'e',
 ' ',
 'c',
 'o',
 'u',
 'r',
 's',
 'e',
 ',',
 ' ',
 'th',
 'i',
 's ',
 'c',
 'h',
 'a',
 'p',
 't',
 'e',
 'r',
 ' ',
 'i',
 's ',
 'a',
 'b',
 'o',
 'u',
 't',
 ' ',
 't',
 'o',
 'k',
 'e',
 'n',
 'i',
 'z',
 'a',
 't',
 'i',
 'o',
 'n',
 '.',
 ' ',
 'th',
 'i',
 's ',
 's',
 'e',
 'c',
 't',
 'i',
 'o',
 'n',
 ' ',
 's',
 'h',
 'o',
 'w',
 's ',
 's',
 'e',
 'v',
 'e',
 'r',
 'a',
 'l',
 ' ',
 't',
 'o',
 'k',
 'e',
 'n',
 'i',
 'z',
 'e',
 'r',
 ' ',
 'a',
 'l',
 'g',
 'o',
 'r',
 'i',
 'th',
 'm',
 's',
 '.',
 '\n']

In [145]:
occs = count_occs(corpus, vocabs)
print(max(occs.keys(), key=lambda x: occs[x]))
occs

('i', 's ')


{('\n', 'th'): 1,
 ('th', 'i'): 3,
 ('i', 's '): 5,
 ('s ', 'i'): 1,
 ('s ', 'th'): 1,
 ('th', 'e'): 1,
 ('e', ' '): 2,
 (' ', 'h'): 1,
 ('h', 'u'): 1,
 ('u', 'g'): 1,
 ('g', 'g'): 1,
 ('g', 'i'): 1,
 ('i', 'n'): 1,
 ('n', 'g'): 1,
 ('g', ' '): 1,
 (' ', 'f'): 1,
 ('f', 'a'): 1,
 ('a', 'c'): 1,
 ('c', 'e'): 1,
 (' ', 'c'): 1,
 ('c', 'o'): 1,
 ('o', 'u'): 2,
 ('u', 'r'): 1,
 ('r', 's'): 1,
 ('s', 'e'): 3,
 ('e', ','): 1,
 (',', ' '): 1,
 (' ', 'th'): 2,
 ('s ', 'c'): 1,
 ('c', 'h'): 1,
 ('h', 'a'): 1,
 ('a', 'p'): 1,
 ('p', 't'): 1,
 ('t', 'e'): 1,
 ('e', 'r'): 3,
 ('r', ' '): 2,
 (' ', 'i'): 1,
 ('s ', 'a'): 1,
 ('a', 'b'): 1,
 ('b', 'o'): 1,
 ('u', 't'): 1,
 ('t', ' '): 1,
 (' ', 't'): 2,
 ('t', 'o'): 2,
 ('o', 'k'): 2,
 ('k', 'e'): 2,
 ('e', 'n'): 2,
 ('n', 'i'): 2,
 ('i', 'z'): 2,
 ('z', 'a'): 1,
 ('a', 't'): 1,
 ('t', 'i'): 2,
 ('i', 'o'): 2,
 ('o', 'n'): 2,
 ('n', '.'): 1,
 ('.', ' '): 1,
 ('s ', 's'): 2,
 ('e', 'c'): 1,
 ('c', 't'): 1,
 ('n', ' '): 1,
 (' ', 's'): 1,
 ('s', 'h'):

In [146]:
vocabs = update_vocabs(corpus, vocabs)
vocabs

['o',
 'm',
 'u',
 'w',
 'l',
 'e',
 '\n',
 'i',
 'v',
 'k',
 ',',
 'f',
 't',
 'h',
 ' ',
 'n',
 '.',
 'c',
 'g',
 'p',
 'r',
 'a',
 'z',
 'b',
 's',
 's ',
 'th',
 'is ',
 'is ']

## Initial Variables

In [196]:
corpus = """
this is the hugging face course, this chapter is about tokenization. this section shows several tokenizer algorithms.
huggingface is learning new astonishing and brilliant techniques that are superpellant. I'm happyly joining the world of ai
as it is moving forward. I joyfolly and greatfully trying to get this done.
"""

In [287]:
corpus = """
huggingface hug face hugging hugger learning learner learners learn
"""

## Pretokenize

In [288]:
#corpus = corpus.replace(".", "").replace(",", "").replace("\n", " ")
words = list(set(corpus.split(sep=" ")))
words

['hugger',
 'face',
 'learn\n',
 'hugging',
 'learning',
 'learner',
 'learners',
 '\nhuggingface',
 'hug']

## New Method

In [289]:
def encode(text, vocabs, merges):
    split = list(text)
    for pair in merges:
        i = 0
        while i < len(split):
            if split[i]==pair[0] and split[i+1]==pair[1]:
                split = split[:i] + [pair[0]+pair[1]] + split[i+2:]
            else:
                i+=1
    return split

In [290]:
def count_freqs(text, vocabs, merges):
    itext = encode(text, vocabs, merges)
    occs = {}
    for token1, token2 in sliding_window(itext):
        if (token1,token2) in occs:
            occs[(token1,token2)]+=1
        else:
            occs[(token1,token2)]=1
    return occs

In [291]:
def update_vocabs(text, oldvocabs, oldmerges):
    vocabs = oldvocabs.copy()
    merges = oldmerges.copy()
    occs = count_freqs(text, vocabs, merges)
    keys = max(occs.keys(), key=lambda x: occs[x])
    merges.append(keys)
    vocabs.append("".join(keys))
    return vocabs, merges

In [292]:
vocabs = list(set(list(" ".join(words))))
merges = []

In [293]:
"' , '".join(vocabs)

"r' , 'n' , 'u' , 'a' , 'l' , 'e' , 'i' , 'f' , 'c' , 'g' , '\n' , 's' , 'h' , ' "

In [294]:
occs = count_freqs(corpus, vocabs, merges)
print(max(occs.keys(), key=lambda x: occs[x]))
{key:occs[key] for key in sorted(occs.keys(), key=lambda x: occs[x], reverse=True)}

('h', 'u')


{('h', 'u'): 4,
 ('u', 'g'): 4,
 (' ', 'l'): 4,
 ('l', 'e'): 4,
 ('e', 'a'): 4,
 ('a', 'r'): 4,
 ('r', 'n'): 4,
 ('g', 'g'): 3,
 ('i', 'n'): 3,
 ('n', 'g'): 3,
 (' ', 'h'): 3,
 ('g', ' '): 3,
 ('e', 'r'): 3,
 ('g', 'i'): 2,
 ('f', 'a'): 2,
 ('a', 'c'): 2,
 ('c', 'e'): 2,
 ('e', ' '): 2,
 ('r', ' '): 2,
 ('n', 'e'): 2,
 ('\n', 'h'): 1,
 ('g', 'f'): 1,
 (' ', 'f'): 1,
 ('g', 'e'): 1,
 ('n', 'i'): 1,
 ('r', 's'): 1,
 ('s', ' '): 1,
 ('n', '\n'): 1}

In [295]:
import re
maxvocab = 100
pattern = r"[a-zA-Z0-9]+\s[a-zA-Z0-9]+"
while len(vocabs)<maxvocab:
    newvocabs, newmerges = update_vocabs(corpus, vocabs, merges)
    if re.match(pattern, newmerges[-1][0]+newmerges[-1][1]):
        display("Terminalized", newmerges[-1])
        break
    vocabs, merges = newvocabs, newmerges
print("' , '".join(vocabs))
merges

'Terminalized'

('er', ' learn')

r' , 'n' , 'u' , 'a' , 'l' , 'e' , 'i' , 'f' , 'c' , 'g' , '
' , 's' , 'h' , ' ' , 'hu' , 'hug' , ' l' , ' le' , ' lea' , ' lear' , ' learn' , 'hugg' , 'in' , 'ing' , 'er' , 'hugging' , 'fa' , 'fac' , 'face' , 'face 


[('h', 'u'),
 ('hu', 'g'),
 (' ', 'l'),
 (' l', 'e'),
 (' le', 'a'),
 (' lea', 'r'),
 (' lear', 'n'),
 ('hug', 'g'),
 ('i', 'n'),
 ('in', 'g'),
 ('e', 'r'),
 ('hugg', 'ing'),
 ('f', 'a'),
 ('fa', 'c'),
 ('fac', 'e'),
 ('face', ' ')]

## Time to show some results

In [296]:
from IPython.display import Markdown

In [297]:
colors=["#362A59", "#3D6D46", "#75592B", "#732E31", "#235C73"]

In [299]:
text = corpus
tokenized = encode(text, vocabs, merges)
mark = ""
for i, token in enumerate(tokenized):
    mark+=f'''<span style="background: {colors[i%len(colors)]}">{token}</span>'''
Markdown(f'''<div style="color: white; background: #202123; font-weight: bold; padding: 20px 20px 20px 20px; font-family: 'Courier New', monospace;">
{mark}
</div>
''')

<div style="color: white; background: #202123; font-weight: bold; padding: 20px 20px 20px 20px; font-family: 'Courier New', monospace;">
<span style="background: #362A59">
</span><span style="background: #3D6D46">hugging</span><span style="background: #75592B">face </span><span style="background: #732E31">hug</span><span style="background: #235C73"> </span><span style="background: #362A59">face </span><span style="background: #3D6D46">hugging</span><span style="background: #75592B"> </span><span style="background: #732E31">hugg</span><span style="background: #235C73">er</span><span style="background: #362A59"> learn</span><span style="background: #3D6D46">ing</span><span style="background: #75592B"> learn</span><span style="background: #732E31">er</span><span style="background: #235C73"> learn</span><span style="background: #362A59">er</span><span style="background: #3D6D46">s</span><span style="background: #75592B"> learn</span><span style="background: #732E31">
</span>
</div>
