<a href="https://colab.research.google.com/github/HanSong19/Hugging-Face/blob/main/6.4%20BPE%20tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets evaluate transformers[sentencepiece]



In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(type(tokenizer.backend_tokenizer))

<class 'tokenizers.Tokenizer'>


In [3]:
print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [4]:
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are you?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('how', (7, 10)),
 ('are', (11, 14)),
 ('you', (15, 18)),
 ('?', (18, 19))]

In [5]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('Ġhow', (6, 10)),
 ('Ġare', (10, 14)),
 ('Ġ', (14, 15)),
 ('Ġyou', (15, 19))]

In [6]:
tokenizer=AutoTokenizer.from_pretrained("t5-small")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you")

[('▁Hello,', (0, 6)),
 ('▁how', (7, 10)),
 ('▁are', (11, 14)),
 ('▁you', (16, 19))]

## Byte-Pair Encoding tokenizer (BPE)

In [49]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [50]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [51]:
from collections import defaultdict

word_freqs = defaultdict(int)

for text in corpus:
  words_with_offsets= tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)

  new_words = [word for word, offset in words_with_offsets]
  for word in new_words:
    word_freqs[word] += 1

print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})


In [52]:
alphabet = []

for word in word_freqs.keys():
  for letter in word:
    if letter not in alphabet:
      alphabet.append(letter)
alphabet.sort()

print(alphabet)




[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [53]:
vocab = ["<|endoftext|>"] + alphabet.copy()
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [54]:
splits = {word: [c for c in word] for word in word_freqs.keys()}
print(splits)

{'This': ['T', 'h', 'i', 's'], 'Ġis': ['Ġ', 'i', 's'], 'Ġthe': ['Ġ', 't', 'h', 'e'], 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'], 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'], 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'], '.': ['.'], 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'], 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'], 'Ġtokenization': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'], 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'], 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'], 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'], 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'], 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'], 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'], ',': [','], 'Ġyou': ['Ġ', 'y', 'o', 'u'], 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'], 'Ġbe': ['Ġ', 'b', 'e'], 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'], 'Ġto': ['Ġ', 't', 'o'], 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'], 'Ġh

In [55]:
print(word_freqs.items())

dict_items([('This', 3), ('Ġis', 2), ('Ġthe', 1), ('ĠHugging', 1), ('ĠFace', 1), ('ĠCourse', 1), ('.', 4), ('Ġchapter', 1), ('Ġabout', 1), ('Ġtokenization', 1), ('Ġsection', 1), ('Ġshows', 1), ('Ġseveral', 1), ('Ġtokenizer', 1), ('Ġalgorithms', 1), ('Hopefully', 1), (',', 1), ('Ġyou', 1), ('Ġwill', 1), ('Ġbe', 1), ('Ġable', 1), ('Ġto', 1), ('Ġunderstand', 1), ('Ġhow', 1), ('Ġthey', 1), ('Ġare', 1), ('Ġtrained', 1), ('Ġand', 1), ('Ġgenerate', 1), ('Ġtokens', 1)])


In [56]:
def compute_pair_freqs(splits):
  pair_freqs = defaultdict(int)
  for w, f in word_freqs.items():
    split=splits[w]
    if len(split) == 1:
      continue
    for i in range(len(split)-1):
      pair = (split[i], split[i+1])
      pair_freqs[pair] += f
  return pair_freqs



In [57]:
pair_freqs = compute_pair_freqs(splits)
print(pair_freqs.keys())
print(pair_freqs)




dict_keys([('T', 'h'), ('h', 'i'), ('i', 's'), ('Ġ', 'i'), ('Ġ', 't'), ('t', 'h'), ('h', 'e'), ('Ġ', 'H'), ('H', 'u'), ('u', 'g'), ('g', 'g'), ('g', 'i'), ('i', 'n'), ('n', 'g'), ('Ġ', 'F'), ('F', 'a'), ('a', 'c'), ('c', 'e'), ('Ġ', 'C'), ('C', 'o'), ('o', 'u'), ('u', 'r'), ('r', 's'), ('s', 'e'), ('Ġ', 'c'), ('c', 'h'), ('h', 'a'), ('a', 'p'), ('p', 't'), ('t', 'e'), ('e', 'r'), ('Ġ', 'a'), ('a', 'b'), ('b', 'o'), ('u', 't'), ('t', 'o'), ('o', 'k'), ('k', 'e'), ('e', 'n'), ('n', 'i'), ('i', 'z'), ('z', 'a'), ('a', 't'), ('t', 'i'), ('i', 'o'), ('o', 'n'), ('Ġ', 's'), ('e', 'c'), ('c', 't'), ('s', 'h'), ('h', 'o'), ('o', 'w'), ('w', 's'), ('e', 'v'), ('v', 'e'), ('r', 'a'), ('a', 'l'), ('z', 'e'), ('l', 'g'), ('g', 'o'), ('o', 'r'), ('r', 'i'), ('i', 't'), ('h', 'm'), ('m', 's'), ('H', 'o'), ('o', 'p'), ('p', 'e'), ('e', 'f'), ('f', 'u'), ('u', 'l'), ('l', 'l'), ('l', 'y'), ('Ġ', 'y'), ('y', 'o'), ('Ġ', 'w'), ('w', 'i'), ('i', 'l'), ('Ġ', 'b'), ('b', 'e'), ('b', 'l'), ('l', 'e'), ('Ġ',

In [58]:
for i, key in enumerate(pair_freqs.keys()):
  print(f"{key}: {pair_freqs[key]}")
  if i>=5:
    break

('T', 'h'): 3
('h', 'i'): 3
('i', 's'): 5
('Ġ', 'i'): 2
('Ġ', 't'): 7
('t', 'h'): 3


In [17]:
print(pair_freqs)

defaultdict(<class 'int'>, {('T', 'h'): 3, ('h', 'i'): 3, ('i', 's'): 5, ('Ġ', 'i'): 2, ('Ġ', 't'): 7, ('t', 'h'): 3, ('h', 'e'): 2, ('Ġ', 'H'): 1, ('H', 'u'): 1, ('u', 'g'): 1, ('g', 'g'): 1, ('g', 'i'): 1, ('i', 'n'): 2, ('n', 'g'): 1, ('Ġ', 'F'): 1, ('F', 'a'): 1, ('a', 'c'): 1, ('c', 'e'): 1, ('Ġ', 'C'): 1, ('C', 'o'): 1, ('o', 'u'): 3, ('u', 'r'): 1, ('r', 's'): 2, ('s', 'e'): 3, ('Ġ', 'c'): 1, ('c', 'h'): 1, ('h', 'a'): 1, ('a', 'p'): 1, ('p', 't'): 1, ('t', 'e'): 2, ('e', 'r'): 5, ('Ġ', 'a'): 5, ('a', 'b'): 2, ('b', 'o'): 1, ('u', 't'): 1, ('t', 'o'): 4, ('o', 'k'): 3, ('k', 'e'): 3, ('e', 'n'): 4, ('n', 'i'): 2, ('i', 'z'): 2, ('z', 'a'): 1, ('a', 't'): 2, ('t', 'i'): 2, ('i', 'o'): 2, ('o', 'n'): 2, ('Ġ', 's'): 3, ('e', 'c'): 1, ('c', 't'): 1, ('s', 'h'): 1, ('h', 'o'): 2, ('o', 'w'): 2, ('w', 's'): 1, ('e', 'v'): 1, ('v', 'e'): 1, ('r', 'a'): 3, ('a', 'l'): 2, ('z', 'e'): 1, ('l', 'g'): 1, ('g', 'o'): 1, ('o', 'r'): 1, ('r', 'i'): 1, ('i', 't'): 1, ('h', 'm'): 1, ('m', 's'): 

In [59]:
best_pair = ""
max_freq = None

for p, f in pair_freqs.items():
  if max_freq is None or max_freq <f:
    best_pair = p
    max_freq = f

print(best_pair, max_freq)

('Ġ', 't') 7


In [19]:
merges = {('Ġ', 't'):"Ġt" }
vocab.append("Ġt")

In [20]:
for word in word_freqs:
  split = splits[word]
  print(split)



['T', 'h', 'i', 's']
['Ġ', 'i', 's']
['Ġ', 't', 'h', 'e']
['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g']
['Ġ', 'F', 'a', 'c', 'e']
['Ġ', 'C', 'o', 'u', 'r', 's', 'e']
['.']
['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r']
['Ġ', 'a', 'b', 'o', 'u', 't']
['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n']
['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n']
['Ġ', 's', 'h', 'o', 'w', 's']
['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l']
['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']
['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's']
['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y']
[',']
['Ġ', 'y', 'o', 'u']
['Ġ', 'w', 'i', 'l', 'l']
['Ġ', 'b', 'e']
['Ġ', 'a', 'b', 'l', 'e']
['Ġ', 't', 'o']
['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd']
['Ġ', 'h', 'o', 'w']
['Ġ', 't', 'h', 'e', 'y']
['Ġ', 'a', 'r', 'e']
['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd']
['Ġ', 'a', 'n', 'd']
['Ġ', 'g', 'e', 'n', 'e', 'r', 'a', 't', 'e']
['Ġ', 't', 'o', 'k', 'e', 'n', 's']


In [60]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [22]:
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"] )

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [68]:
vocab_size = 50

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [69]:
print(merges)
print(vocab)

{('Ġ', 't'): 'Ġt', ('n', 's'): 'ns', ('e', 'ns'): 'ens', ('k', 'ens'): 'kens', ('o', 'kens'): 'okens', ('Ġt', 'okens'): 'Ġtokens', ('g', 'e'): 'ge', ('ge', 'n'): 'gen', ('gen', 'e'): 'gene', ('gene', 'r'): 'gener', ('gener', 'a'): 'genera', ('genera', 't'): 'generat', ('generat', 'e'): 'generate', ('Ġ', 'generate'): 'Ġgenerate', ('e', 'd'): 'ed', ('n', 'ed'): 'ned', ('i', 'ned'): 'ined', ('a', 'ined'): 'ained', ('r', 'ained'): 'rained', ('Ġt', 'rained'): 'Ġtrained', ('r', 'e'): 're', ('a', 're'): 'are', ('Ġ', 'are'): 'Ġare', ('e', 'y'): 'ey', ('h', 'ey'): 'hey', ('Ġt', 'hey'): 'Ġthey', ('Ġ', 'h'): 'Ġh', ('Ġh', 'o'): 'Ġho', ('Ġho', 'w'): 'Ġhow', ('a', 'n'): 'an', ('Ġ', 'an'): 'Ġan', ('Ġan', 'd'): 'Ġand', ('an', 'd'): 'and', ('t', 'and'): 'tand', ('s', 'tand'): 'stand', ('r', 'stand'): 'rstand', ('e', 'rstand'): 'erstand', ('d', 'erstand'): 'derstand', ('n', 'derstand'): 'nderstand', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en', ('T', 'h'): '

In [70]:
def tokenize(text):
  pre_tokenize_results = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
  pre_tokenized_text = [word for word, offset in pre_tokenize_result]
  splits = [[l for l in word] for word in pre_tokenized_text]
  for pair, merge in merges.items():
    for idx, split in enumerate(splits):
      i=0
      while i < len(split) - 1:
        if split[i] == pair[0] and split[i + 1] == pair[1]:
            split = split[:i] + [merge] + split[i + 2 :]
        else:
            i += 1
      splits[idx] = split
  return sum(splits, [])



In [72]:
tokenize("This is not a token")

['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']

In [41]:
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'ns', 'ens', 'kens', 'okens', 'Ġtokens', 'ge', 'gen', 'gene', 'gener', 'genera', 'generat', 'generate', 'Ġgenerate', 'ed', 'ned', 'ined', 'ained', 'rained', 'Ġtrained', 're', 'are', 'Ġare', 'ey', 'hey', 'Ġthey', 'Ġh', 'Ġho', 'Ġhow', 'an', 'Ġan', 'Ġand', 'and', 'tand', 'stand', 'rstand', 'erstand', 'derstand', 'nderstand']
