<a href="https://colab.research.google.com/github/JYP0824/Personal-Project/blob/main/WordPiece_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WordPiece Tokenizer

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [15]:
#BPE Tokenizer에서 사용한 예제 코퍼스를 여기서도 그대로 사용한다.
corpus = [
    "This is the Hugging Face course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

#설명예제
#("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5)
test = [
    "hug hug hug hug hug hug hug hug hug hug",
    "pug pug pug pug pug bun bun bun bun",
    "hugs hugs hugs hugs hugs",
    "pun pun pun pun pun pun pun pun pun pun pun pun"
]

In [18]:
from collections import defaultdict

word_freq = defaultdict(int)
for text in corpus:
  word_with_offset = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text.lower())
  ws = [word for word, offset in word_with_offset]
  for word in ws:
    word_freq[word] += 1

print(word_freq)

defaultdict(<class 'int'>, {'this': 3, 'is': 2, 'the': 1, 'hugging': 1, 'face': 1, 'course': 1, '.': 4, 'chapter': 1, 'about': 1, 'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'hopefully': 1, ',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})


In [36]:
alphabet = []
for word in word_freq.keys():
  if word[0] not in alphabet:
    alphabet.append(word[0])
  for letter in word[1:]:
    if f'##{letter}' not in alphabet:
      alphabet.append(f'##{letter}')

alphabet.sort()

vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

In [None]:
splits = {word: [c if i == 0 else f'##{c}' for i, c in enumerate(word)] for word in word_freq.keys()}

In [66]:
def compute_pair(splits):
  letter_freq = defaultdict(int)
  pair_freq = defaultdict(int)
  for word, freq in word_freq.items():
    split = splits[word]
    if len(split) == 1:
      letter_freq[split[0]] += freq
      continue
    for i in range(len(split)-1):
      pair = (split[i], split[i + 1])
      letter_freq[split[i]] += freq
      pair_freq[pair] += freq
    letter_freq[split[-1]] += freq

    score = {
        pair: freq / (letter_freq[pair[0]] * letter_freq[pair[1]])
        for pair, freq in pair_freq.items()
        if pair[0] in letter_freq and pair[1] in letter_freq
    }
    return score

In [67]:
pair_scores = compute_pair(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 4:
        break

('t', '##h'): 0.3333333333333333
('##h', '##i'): 0.3333333333333333
('##i', '##s'): 0.3333333333333333


In [72]:
def merge_pair(a, b, splits):
    for word in word_freq:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [73]:
vocab_size = 100
while len(vocab) < vocab_size:
    scores = compute_pair(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    #print(max_score)
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)

In [74]:
def encode_word(word):
    word = word.lower()
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

In [75]:
print(encode_word("Hugging"))
print(encode_word("HXgging"))

['hugging']
['[UNK]']


In [76]:
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text.lower())
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])

print(tokenize("This is the Hugging Face course!"))

['this', 'is', 'the', 'hugging', 'face', 'course', '[UNK]']
