In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
corpus = ["This is the Hugging Face course",
          "This chapter is about tokenization."]

corpus = [c.lower() for c in corpus]

In [4]:
from collections import defaultdict

word_freqs = defaultdict(int)

for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

In [5]:
alphabet = []

for word in word_freqs.keys():
  for letter in word:
    if letter not in alphabet:
      alphabet.append(letter)
alphabet.sort()


In [6]:
vocab = ["<|endoftext|>"] + alphabet.copy()

In [7]:
vocab

['<|endoftext|>',
 '.',
 'a',
 'b',
 'c',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'z',
 'Ġ']

In [8]:
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [9]:
print(splits)

{'this': ['t', 'h', 'i', 's'], 'Ġis': ['Ġ', 'i', 's'], 'Ġthe': ['Ġ', 't', 'h', 'e'], 'Ġhugging': ['Ġ', 'h', 'u', 'g', 'g', 'i', 'n', 'g'], 'Ġface': ['Ġ', 'f', 'a', 'c', 'e'], 'Ġcourse': ['Ġ', 'c', 'o', 'u', 'r', 's', 'e'], 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'], 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'], 'Ġtokenization': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'], '.': ['.']}


In [10]:
def compute_pair_freqs(splits):
  pair_freqs = defaultdict(int)
  for word, freq in word_freqs.items():
    split = splits[word]
    if len(split) ==1:
      continue
    for i in range(len(split) - 1):
      pair = (split[i], split[i+1])
      pair_freqs[pair] += freq
  return pair_freqs


pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
  if i<5:
    print(key, pair_freqs[key])
  else:
    break


('t', 'h') 3
('h', 'i') 2
('i', 's') 4
('Ġ', 'i') 2
('Ġ', 't') 2


In [11]:
best_pair = ''
max_freq = None
for pair, freq in pair_freqs.items():
  if max_freq is None or max_freq < freq:
    best_pair = pair
    max_freq = freq

In [12]:
print(best_pair, max_freq)

('i', 's') 4


In [13]:
def merge_pair(a, b, splits):
  for word in word_freqs:
    split = splits[word]
    if len(split) == 1:
      continue

    i=0
    while i < len(split) - 1:
      if split[i] == a and split[i+1] == b:
        split = split[:i] + [a+b] + split[i+2:]
      else:
        i += 1
    splits[word] = split
  return splits

splits = merge_pair('Ġ', 't', splits)

In [23]:
vocab_size = 20
merges = {}
while len(vocab) < vocab_size:
  pair_freqs = compute_pair_freqs(splits)
  best_pair = ''
  max_freq = None
  for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
      best_pair = pair
      max_freq = freq

  splits = merge_pair(*best_pair, splits)
  merges[best_pair] = best_pair[0] + best_pair[1]
  vocab.append(merges[best_pair])

In [24]:
def tokenize(text):
  pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text.lower())
  pre_tokenized_text = [word for word, offset in pre_tokenize_result]
  splits = [[l for l in word] for word in pre_tokenized_text]
  for pair, merge in merges.items():
    for idx, split in enumerate(splits):
      i = 0
      while i < len(split) - 1:
        if split[i] == pair[0] and split[i+1] == pair[1]:
          split = split[:i] + [merge] + split[i+2:]
        else:
          i += 1
      splits[idx] = split

  return sum(splits, [])

In [25]:
print(tokenize("This is the Hugging Face course"))

['t', 'h', 'i', 's', 'Ġ', 'i', 's', 'Ġ', 't', 'h', 'e', 'Ġ', 'h', 'u', 'g', 'g', 'i', 'n', 'g', 'Ġ', 'f', 'a', 'c', 'e', 'Ġ', 'c', 'o', 'u', 'r', 's', 'e']
