<a href="https://colab.research.google.com/github/Kim-Jeong-Ju/AI_Modeling_NLP/blob/main/Subword_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Packages and Modules Importation**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
import time
import re
import csv
import collections
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown, Latex

!pip install sentencepiece
import sentencepiece as spm

!pip install tokenizers
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceBPETokenizer

import tensorflow as tf
import tensorflow_datasets as tfds

Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 5.2 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.2


# **Subword Tokenizer, 서브워드 토크나이저**

## **BPE(Byte Pair Encoding) in NLP, 자연어 처리에서의 BPE**

In [2]:
num_merges = 10         # BPE를 수행할 횟수
dictionary = {"l o w </w>" : 5, "l o w e r </w>" : 2, "n e w e s t </w>" : 6, "w i d e s t </w>" : 3}

def get_stats(dictionary):
    pairs = collections.defaultdict(int)
    for word, freq in dictionary.items():
        symbols = word.split()                                      # symbols = ["l", "o", "w", "</w>"], ...
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i+1]] += freq                 # pairs = {("l", "o") : 7, ("o", "w") : 7, ...}
    
    print("현재 pair들의 빈도수 =", dict(pairs))
    return pairs
print()

def merge_dict(pair, v_in):                 # pair는 best, v_in은 dictionary
    print(f"pair = {pair}")
    print(f"v_in = {v_in}")

    v_out = {}
    bigram = re.escape(" ".join(pair))      # bigram = "문자\ 문자"의 형태, re.escape는 입력된 문자열에 대해 특수문자들을 escape 처리함
    print(f"bigram = {bigram}")             # re.escape = https://greeksharifa.github.io/%EC%A0%95%EA%B7%9C%ED%91%9C%ED%98%84%EC%8B%9D(re)/2018/08/24/regex-usage-09-other-functions/
    new_pair = re.compile(r"(?<!\$)" + bigram + r"(?!\$)")

    for word in v_in:
        print(f"word = {word}")
        w_out = new_pair.sub("".join(pair), word)       # word에 있는 문자열 중에 p=re.compile(r'(?<!\S)' + bigram + r'(?!\S)')와 match 되는 문자열은 ''.join(pair)로 바꿈
        print(f"word_out = {w_out}")
        v_out[w_out] = v_in[word]

    return v_out
print()

bpe_codes = {}
bpe_codes_reverse = {}
for i in range(num_merges):
    display(Markdown("### Iteration {}".format(i + 1)))
    pairs = get_stats(dictionary)
    best = max(pairs, key=pairs.get)                    # get() => dictionary에서 key로부터 value를 받아오는 함수
    dictionary = merge_dict(best, dictionary)

    bpe_codes[best] = i
    bpe_codes_reverse[best[0] + best[1]] = best

    print("New Merge = {}".format(best))
    print("Updated Dictionary = {}".format(dictionary))
print()

print(bpe_codes)
print(bpe_codes_reverse)





### Iteration 1

현재 pair들의 빈도수 = {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 8, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('e', 's'): 9, ('s', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3}
pair = ('e', 's')
v_in = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
bigram = e\ s
word = l o w </w>
word_out = l o w </w>
word = l o w e r </w>
word_out = l o w e r </w>
word = n e w e s t </w>
word_out = n e w es t </w>
word = w i d e s t </w>
word_out = w i d es t </w>
New Merge = ('e', 's')
Updated Dictionary = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}


### Iteration 2

현재 pair들의 빈도수 = {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'es'): 6, ('es', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'es'): 3}
pair = ('es', 't')
v_in = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
bigram = es\ t
word = l o w </w>
word_out = l o w </w>
word = l o w e r </w>
word_out = l o w e r </w>
word = n e w es t </w>
word_out = n e w est </w>
word = w i d es t </w>
word_out = w i d est </w>
New Merge = ('es', 't')
Updated Dictionary = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}


### Iteration 3

현재 pair들의 빈도수 = {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est'): 6, ('est', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est'): 3}
pair = ('est', '</w>')
v_in = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
bigram = est\ </w>
word = l o w </w>
word_out = l o w </w>
word = l o w e r </w>
word_out = l o w e r </w>
word = n e w est </w>
word_out = n e w est</w>
word = w i d est </w>
word_out = w i d est</w>
New Merge = ('est', '</w>')
Updated Dictionary = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}


### Iteration 4

현재 pair들의 빈도수 = {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
pair = ('l', 'o')
v_in = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
bigram = l\ o
word = l o w </w>
word_out = lo w </w>
word = l o w e r </w>
word_out = lo w e r </w>
word = n e w est</w>
word_out = n e w est</w>
word = w i d est</w>
word_out = w i d est</w>
New Merge = ('l', 'o')
Updated Dictionary = {'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}


### Iteration 5

현재 pair들의 빈도수 = {('lo', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
pair = ('lo', 'w')
v_in = {'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
bigram = lo\ w
word = lo w </w>
word_out = low </w>
word = lo w e r </w>
word_out = low e r </w>
word = n e w est</w>
word_out = n e w est</w>
word = w i d est</w>
word_out = w i d est</w>
New Merge = ('lo', 'w')
Updated Dictionary = {'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}


### Iteration 6

현재 pair들의 빈도수 = {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
pair = ('n', 'e')
v_in = {'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
bigram = n\ e
word = low </w>
word_out = low </w>
word = low e r </w>
word_out = low e r </w>
word = n e w est</w>
word_out = ne w est</w>
word = w i d est</w>
word_out = w i d est</w>
New Merge = ('n', 'e')
Updated Dictionary = {'low </w>': 5, 'low e r </w>': 2, 'ne w est</w>': 6, 'w i d est</w>': 3}


### Iteration 7

현재 pair들의 빈도수 = {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('ne', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
pair = ('ne', 'w')
v_in = {'low </w>': 5, 'low e r </w>': 2, 'ne w est</w>': 6, 'w i d est</w>': 3}
bigram = ne\ w
word = low </w>
word_out = low </w>
word = low e r </w>
word_out = low e r </w>
word = ne w est</w>
word_out = new est</w>
word = w i d est</w>
word_out = w i d est</w>
New Merge = ('ne', 'w')
Updated Dictionary = {'low </w>': 5, 'low e r </w>': 2, 'new est</w>': 6, 'w i d est</w>': 3}


### Iteration 8

현재 pair들의 빈도수 = {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('new', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
pair = ('new', 'est</w>')
v_in = {'low </w>': 5, 'low e r </w>': 2, 'new est</w>': 6, 'w i d est</w>': 3}
bigram = new\ est</w>
word = low </w>
word_out = low </w>
word = low e r </w>
word_out = low e r </w>
word = new est</w>
word_out = newest</w>
word = w i d est</w>
word_out = w i d est</w>
New Merge = ('new', 'est</w>')
Updated Dictionary = {'low </w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}


### Iteration 9

현재 pair들의 빈도수 = {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
pair = ('low', '</w>')
v_in = {'low </w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
bigram = low\ </w>
word = low </w>
word_out = low</w>
word = low e r </w>
word_out = low e r </w>
word = newest</w>
word_out = newest</w>
word = w i d est</w>
word_out = w i d est</w>
New Merge = ('low', '</w>')
Updated Dictionary = {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}


### Iteration 10

현재 pair들의 빈도수 = {('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
pair = ('w', 'i')
v_in = {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
bigram = w\ i
word = low</w>
word_out = low</w>
word = low e r </w>
word_out = low e r </w>
word = newest</w>
word_out = newest</w>
word = w i d est</w>
word_out = wi d est</w>
New Merge = ('w', 'i')
Updated Dictionary = {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'wi d est</w>': 3}

{('e', 's'): 0, ('es', 't'): 1, ('est', '</w>'): 2, ('l', 'o'): 3, ('lo', 'w'): 4, ('n', 'e'): 5, ('ne', 'w'): 6, ('new', 'est</w>'): 7, ('low', '</w>'): 8, ('w', 'i'): 9}
{'es': ('e', 's'), 'est': ('es', 't'), 'est</w>': ('est', '</w>'), 'lo': ('l', 'o'), 'low': ('lo', 'w'), 'ne': ('n', 'e'), 'new': ('ne', 'w'), 'newest</w>': ('new', 'est</w>'), 'low</w>': ('low', '</w>'), 'wi': ('w', 'i')}


In [5]:
def get_pairs(word):                        # 단어의 symbol pair 집합 반환, 단어는 변수 길이 문자열인 symbol의 tuple 형식
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    
    return pairs

def encode(origin):
    word = tuple(origin) + ("</w>",)
    display(Markdown("__word split into characters :__ <tt>{}</tt>".format(word)))

    pairs = get_pairs(word)

    if not pairs:
        return origin
    
    iter = 0
    while True:
        iter += 1
        display(Markdown("__Iteration {} :__".format(iter)))

        print(f"bigrams in the word = {pairs}")
        bigram = min(pairs, key=lambda pair: bpe_codes.get(pair, float("inf")))
        print(f"candidates for merge = {bigram}")

        if bigram not in bpe_codes:
            display(Markdown("__Candidate is not in BPE merges, algorithm stops.__"))
            break
        
        first, second = bigram
        new_word = []
        a_iter = 0
        while a_iter < len(word):
            try:
                b_iter = word.index(first, a_iter)                          # tuple.index(element, start_index, end_index) => https://www.programiz.com/python-programming/methods/tuple/index
                new_word.extend(word[a_iter:b_iter])                        # start_index (optional) - start scanning the element from the start_index
                print(f"word = {word}, new_word = {new_word}")              # end_index (optional) - stop scanning the element at the end_index
                a_iter = b_iter
            except:
                new_word.extend(word[a_iter:])
                print(f"EXCEPTION   word = {word}, new_word = {new_word}")
                break
            
            if word[a_iter] == first and a_iter < len(word) - 1 and word[a_iter+1] == second:
                new_word.append(first + second)
                print(f"Merge {first} and {second}   word = {word}, new_word = {new_word}")
                a_iter += 2
            else:
                new_word.append(word[a_iter])
                print(f"Not Merge   word = {word}, new_word = {new_word}")
                a_iter += 1
        
        new_word = tuple(new_word)
        word = new_word
        print(f"Word After Merging = {word}")

        if len(word) == 1:
            print("Word Length is 1, BREAK")
            break
        else:
            pairs = get_pairs(word)
    
    if word[-1] == "</w>":
        word = word[:-1]
    elif word[-1].endswith("</w>"):
        word = word[:-1] + (word[-1].replace("</w>", ""),)
    
    return word

In [6]:
encode("loki")

__word split into characters :__ <tt>('l', 'o', 'k', 'i', '</w>')</tt>

__Iteration 1 :__

bigrams in the word = {('o', 'k'), ('i', '</w>'), ('l', 'o'), ('k', 'i')}
candidates for merge = ('l', 'o')
word = ('l', 'o', 'k', 'i', '</w>'), new_word = []
Merge l and o   word = ('l', 'o', 'k', 'i', '</w>'), new_word = ['lo']
EXCEPTION   word = ('l', 'o', 'k', 'i', '</w>'), new_word = ['lo', 'k', 'i', '</w>']
Word After Merging = ('lo', 'k', 'i', '</w>')


__Iteration 2 :__

bigrams in the word = {('lo', 'k'), ('i', '</w>'), ('k', 'i')}
candidates for merge = ('lo', 'k')


__Candidate is not in BPE merges, algorithm stops.__

('lo', 'k', 'i')

In [7]:
encode("lowest")

__word split into characters :__ <tt>('l', 'o', 'w', 'e', 's', 't', '</w>')</tt>

__Iteration 1 :__

bigrams in the word = {('t', '</w>'), ('e', 's'), ('w', 'e'), ('o', 'w'), ('s', 't'), ('l', 'o')}
candidates for merge = ('e', 's')
word = ('l', 'o', 'w', 'e', 's', 't', '</w>'), new_word = ['l', 'o', 'w']
Merge e and s   word = ('l', 'o', 'w', 'e', 's', 't', '</w>'), new_word = ['l', 'o', 'w', 'es']
EXCEPTION   word = ('l', 'o', 'w', 'e', 's', 't', '</w>'), new_word = ['l', 'o', 'w', 'es', 't', '</w>']
Word After Merging = ('l', 'o', 'w', 'es', 't', '</w>')


__Iteration 2 :__

bigrams in the word = {('t', '</w>'), ('es', 't'), ('w', 'es'), ('o', 'w'), ('l', 'o')}
candidates for merge = ('es', 't')
word = ('l', 'o', 'w', 'es', 't', '</w>'), new_word = ['l', 'o', 'w']
Merge es and t   word = ('l', 'o', 'w', 'es', 't', '</w>'), new_word = ['l', 'o', 'w', 'est']
EXCEPTION   word = ('l', 'o', 'w', 'es', 't', '</w>'), new_word = ['l', 'o', 'w', 'est', '</w>']
Word After Merging = ('l', 'o', 'w', 'est', '</w>')


__Iteration 3 :__

bigrams in the word = {('w', 'est'), ('o', 'w'), ('est', '</w>'), ('l', 'o')}
candidates for merge = ('est', '</w>')
word = ('l', 'o', 'w', 'est', '</w>'), new_word = ['l', 'o', 'w']
Merge est and </w>   word = ('l', 'o', 'w', 'est', '</w>'), new_word = ['l', 'o', 'w', 'est</w>']
Word After Merging = ('l', 'o', 'w', 'est</w>')


__Iteration 4 :__

bigrams in the word = {('o', 'w'), ('l', 'o'), ('w', 'est</w>')}
candidates for merge = ('l', 'o')
word = ('l', 'o', 'w', 'est</w>'), new_word = []
Merge l and o   word = ('l', 'o', 'w', 'est</w>'), new_word = ['lo']
EXCEPTION   word = ('l', 'o', 'w', 'est</w>'), new_word = ['lo', 'w', 'est</w>']
Word After Merging = ('lo', 'w', 'est</w>')


__Iteration 5 :__

bigrams in the word = {('lo', 'w'), ('w', 'est</w>')}
candidates for merge = ('lo', 'w')
word = ('lo', 'w', 'est</w>'), new_word = []
Merge lo and w   word = ('lo', 'w', 'est</w>'), new_word = ['low']
EXCEPTION   word = ('lo', 'w', 'est</w>'), new_word = ['low', 'est</w>']
Word After Merging = ('low', 'est</w>')


__Iteration 6 :__

bigrams in the word = {('low', 'est</w>')}
candidates for merge = ('low', 'est</w>')


__Candidate is not in BPE merges, algorithm stops.__

('low', 'est')

In [8]:
encode("lowing")

__word split into characters :__ <tt>('l', 'o', 'w', 'i', 'n', 'g', '</w>')</tt>

__Iteration 1 :__

bigrams in the word = {('n', 'g'), ('i', 'n'), ('g', '</w>'), ('o', 'w'), ('w', 'i'), ('l', 'o')}
candidates for merge = ('l', 'o')
word = ('l', 'o', 'w', 'i', 'n', 'g', '</w>'), new_word = []
Merge l and o   word = ('l', 'o', 'w', 'i', 'n', 'g', '</w>'), new_word = ['lo']
EXCEPTION   word = ('l', 'o', 'w', 'i', 'n', 'g', '</w>'), new_word = ['lo', 'w', 'i', 'n', 'g', '</w>']
Word After Merging = ('lo', 'w', 'i', 'n', 'g', '</w>')


__Iteration 2 :__

bigrams in the word = {('lo', 'w'), ('n', 'g'), ('i', 'n'), ('g', '</w>'), ('w', 'i')}
candidates for merge = ('lo', 'w')
word = ('lo', 'w', 'i', 'n', 'g', '</w>'), new_word = []
Merge lo and w   word = ('lo', 'w', 'i', 'n', 'g', '</w>'), new_word = ['low']
EXCEPTION   word = ('lo', 'w', 'i', 'n', 'g', '</w>'), new_word = ['low', 'i', 'n', 'g', '</w>']
Word After Merging = ('low', 'i', 'n', 'g', '</w>')


__Iteration 3 :__

bigrams in the word = {('g', '</w>'), ('low', 'i'), ('n', 'g'), ('i', 'n')}
candidates for merge = ('g', '</w>')


__Candidate is not in BPE merges, algorithm stops.__

('low', 'i', 'n', 'g')

In [9]:
encode("highing")

__word split into characters :__ <tt>('h', 'i', 'g', 'h', 'i', 'n', 'g', '</w>')</tt>

__Iteration 1 :__

bigrams in the word = {('n', 'g'), ('i', 'n'), ('i', 'g'), ('g', '</w>'), ('g', 'h'), ('h', 'i')}
candidates for merge = ('n', 'g')


__Candidate is not in BPE merges, algorithm stops.__

('h', 'i', 'g', 'h', 'i', 'n', 'g')

## **SentencePiece in NLP, 자연어 처리에서의 SentencePiece**  
---

##### **SPM의 SentencePieceTrainer의 Arguments**  
- input : train file
- model_prefix : 만들 모델의 이름
- vocab_size : 단어 집합의 크기
- model_type : 사용할 모델, default는 unigram이며 bpe, char, word 등이 가능
- max_sentence_length : 문장의 최대 길이
- pad_id, pad_piece : PAD token id와 value
- unk_id, unk_piece : Unkown token id와 value
- bos_id, bos_piece : Begion of Sentence token id와 value
- eos_id, eos_piece : End of Sentence token id와 value
- user_defined_symbols : 사용자 정의 token  
  
vocab 생성이 완료되면 **imdb.model과 imdb.vocab 파일**이 생성됨. **vocab 파일에서 학습된 subword들 확인 가능**

### **IMDB Review**

In [11]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")
train_df = pd.read_csv("IMDb_Reviews.csv")
print("Train Data Length =", len(train_df))
print()

with open("IMDb_Reviews.txt", "w", encoding="utf8") as file:
    file.write("\n".join(train_df["review"]))

spm.SentencePieceTrainer.Train('--input=IMDb_Reviews.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

vocab_list = pd.read_csv("imdb.vocab", sep="\t", header=None, quoting=csv.QUOTE_NONE)
vocab_list.sample(10)
print(len(vocab_list))
print()

sp = spm.SentencePieceProcessor()
vocab_file = "imdb.model"
sp.load(vocab_file)
print()

lines = ["I didn't at all think of it this way.", "I have waited a long time for someone to film"]
for line in lines:
    print(line)
    print(sp.encode_as_pieces(line))        # 입력된 문장에 대해 subword sequence로 변환
    print(sp.encode_as_ids(line))           # 입력된 문장에 대해 integer sequence로 변환
    print()

print("Get Piece Size =", sp.GetPieceSize)                                                                                                      # 단어 집합의 크기
print("ID to Piece =", sp.IdToPiece(430))                                                                                                       # 정수 => subword 변환
print("Piece To ID =", sp.PieceToId("_character"))                                                                                              # subword => 정수 변환
print("Decode IDs =", sp.DecodeIds([41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]))                                                       # 정수 seq => 문장 변환
print("Decode Pieces =", sp.DecodePieces(['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']))        # subword seq => 문장 변환
print("Encode to Integer Seq =", sp.encode('I have waited a long time for someone to film', out_type=int))                                      # 문장 => 정수 seq 변환
print("Encode to Subword Seq =", sp.encode('I have waited a long time for someone to film', out_type=str))                                      # 문장 => subword seq 변환

Train Data Length = 50000

5000


I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 623, 4950, 4926, 138, 169, 378, 30, 58, 73, 413, 4945]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]

Get Piece Size = <bound method SentencePieceProcessor.GetPieceSize of <sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f6107597f60> >>
ID to Piece = ▁character
Piece To ID = 0
Decode IDs = I have waited a long time for someone to film
Decode Pieces = I have waited a long time for someone to film
Encode to Integer Seq = [41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]
Encode to Subword Seq = ['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']


### **Naver Movie Review**

In [13]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")
naver_df = pd.read_table('ratings.txt')

naver_df = naver_df.dropna(how="any")           # 결측치 제거
print('Length :',len(naver_df))
print()

with open("naver_review.txt", "w", encoding="utf8") as file:
    file.write("\n".join(naver_df["document"]))

spm.SentencePieceTrainer.Train("--input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999")
vocab_list = pd.read_csv('naver.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
len(vocab_list)
print()

sp = spm.SentencePieceProcessor()
vocab_file = "naver.model"
sp.load(vocab_file)

lines = ["뭐 이딴 것도 영화냐.", "진짜 최고의 영화입니다 ㅋㅋ"]
for line in lines:
  print(line)
  print(sp.encode_as_pieces(line))
  print(sp.encode_as_ids(line))
  print()

print("Get Piece Size =", sp.GetPieceSize)                                                          # 단어 집합의 크기
print("ID to Piece =", sp.IdToPiece(4))                                                             # 정수 => subword 변환
print("Piece To ID =", sp.PieceToId("영화"))                                                        # subword => 정수 변환
print("Decode IDs =", sp.DecodeIds([54, 200, 821, 85]))                                             # 정수 seq => 문장 변환
print("Decode Pieces =", sp.DecodePieces(['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']))            # subword seq => 문장 변환
print("Encode to Integer Seq =", sp.encode('진짜 최고의 영화입니다 ㅋㅋ', out_type=int))            # 문장 => 정수 seq 변환
print("Encode to Subword Seq =", sp.encode('진짜 최고의 영화입니다 ㅋㅋ', out_type=str))            # 문장 => subword seq 변환

Length : 199992


뭐 이딴 것도 영화냐.
['▁뭐', '▁이딴', '▁것도', '▁영화냐', '.']
[132, 966, 1296, 2590, 3276]

진짜 최고의 영화입니다 ㅋㅋ
['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 200, 821, 85]

Get Piece Size = <bound method SentencePieceProcessor.GetPieceSize of <sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f6107b8e2d0> >>
ID to Piece = 영화
Piece To ID = 4
Decode IDs = 진짜 최고의 영화입니다 ᄏᄏ
Decode Pieces = 진짜 최고의 영화입니다 ᄏᄏ
Encode to Integer Seq = [54, 200, 821, 85]
Encode to Subword Seq = ['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']


## **SubwordTextEncoder in NLP, 자연어 처리에서의 SubwordTextEncoder**

### **Tokenizing IMDB Review**

In [14]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")
train_df = pd.read_csv('IMDb_Reviews.csv')

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(train_df['review'], target_vocab_size=2 ** 13)    # subword로 이루어진 단어 집합 & Integer Encoding 수행
print(train_df['review'][20])                                                               # 리뷰 sample
print('Tokenized sample question = {}'.format(tokenizer.encode(train_df['review'][20])))    # Encoding된 리뷰 sample 확인
print()

sample_str = "It's mind-blowing to me that this film was even made."
encoded_str = tokenizer.encode(sample_str)          # Encoding 수행
print(f"Encoded Sentence = {encoded_str}")
decoded_str = tokenizer.decode(encoded_str)         # Decoding 수행
print(f"Decoded Sentence = {decoded_str}")
print()

print("단어 집합의 크기 =", tokenizer.vocab_size)
for token_str in encoded_str:
    print(f"{token_str} ======> {tokenizer.decode([token_str])}")

sample_str = "It's mind-blowing to me that this film was evenxyz made."            # even이라는 단어에 임의의 xyz 문자를 추가하여 subword 분리 작업 확인
encoded_str = tokenizer.encode(sample_str)          # Encoding 수행
print(f"Encoded Sentence = {encoded_str}")
decoded_str = tokenizer.decode(encoded_str)         # Decoding 수행
print(f"Decoded Sentence = {decoded_str}")
print()

for token_str in encoded_str:
    print(f"{token_str} ======> {tokenizer.decode([token_str])}")

Pretty bad PRC cheapie which I rarely bother to watch over again, and it's no wonder -- it's slow and creaky and dull as a butter knife. Mad doctor George Zucco is at it again, turning a dimwitted farmhand in overalls (Glenn Strange) into a wolf-man. Unfortunately, the makeup is virtually non-existent, consisting only of a beard and dimestore fangs for the most part. If it were not for Zucco and Strange's presence, along with the cute Anne Nagel, this would be completely unwatchable. Strange, who would go on to play Frankenstein's monster for Unuiversal in two years, does a Lenny impression from "Of Mice and Men", it seems.<br /><br />*1/2 (of Four)
Tokenized sample question = [1590, 4162, 132, 7107, 1892, 2983, 578, 76, 12, 4632, 3422, 7, 160, 175, 372, 2, 5, 39, 8051, 8, 84, 2652, 497, 39, 8051, 8, 1374, 5, 3461, 2012, 48, 5, 2263, 21, 4, 2992, 127, 4729, 711, 3, 1391, 8044, 3557, 1277, 8102, 2154, 5681, 9, 42, 15, 372, 2, 3773, 4, 3502, 2308, 467, 4890, 1503, 11, 3347, 1419, 8127, 2

### **Tokenizing Naver Movie Review**

In [17]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

train_data = train_data.dropna(how="any")

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(train_data['document'], target_vocab_size=2 ** 13)
print(train_data['document'][20])
print('Tokenized sample question: {}'.format(tokenizer.encode(train_data['document'][20])))

sample_str = train_data["document"][21]
encoded_str = tokenizer.encode(sample_str)          # Encoding 수행
print(f"Encoded Sentence = {encoded_str}")
decoded_str = tokenizer.decode(encoded_str)         # Decoding 수행
print(f"Decoded Sentence = {decoded_str}")
print()
assert decoded_str == sample_str

for token_str in encoded_str:
    print(f"{token_str} ======> {tokenizer.decode([token_str])}")

sample_str = "이 영화 굉장히 재밌다 킄핫핫ㅎ"
encoded_str = tokenizer.encode(sample_str)          # Encoding 수행
print(f"Encoded Sentence = {encoded_str}")
decoded_str = tokenizer.decode(encoded_str)         # Decoding 수행
print(f"Decoded Sentence = {decoded_str}")
print()
assert decoded_str == sample_str

for token_str in encoded_str:
    print(f"{token_str} ======> {tokenizer.decode([token_str])}")

나름 심오한 뜻도 있는 듯. 그냥 학생이 선생과 놀아나는 영화는 절대 아님
Tokenized sample question: [669, 4700, 17, 1749, 8, 96, 131, 1, 48, 2239, 4, 7466, 32, 1274, 2655, 7, 80, 749, 1254]
Encoded Sentence = [570, 892, 36, 584, 159, 7091, 201]
Decoded Sentence = 보면서 웃지 않는 건 불가능하다

Encoded Sentence = [4, 23, 1364, 2157, 8235, 8128, 8130, 8235, 8147, 8169, 8235, 8147, 8169, 393]
Decoded Sentence = 이 영화 굉장히 재밌다 킄핫핫ㅎ



## **HuggingFace Tokenizer**  


---

**BertWordPieceTokenizer** 이외에 **ByteLevelBPETokenizer**, **CharBPETokenizer**, **SentencePieceBPETokenizer** 등이 존재

* **BertWordPieceTokenizer :** BERT에서 사용된 워드피스 토크나이저(WordPiece Tokenizer)
* **CharBPETokenizer :** 오리지널 BPE
* **ByteLevelBPETokenizer :** BPE의 바이트 레벨 버전
* **SentencePieceBPETokenizer :** 앞서 본 패키지 센텐스피스(SentencePiece)와 호환되는 BPE 구현체

In [19]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

naver_df = pd.read_table('ratings.txt')
naver_df = naver_df.dropna(how = 'any')
with open('naver_review.txt', 'w', encoding='utf8') as file:
    file.write('\n'.join(naver_df['document']))

tokenizer = BertWordPieceTokenizer(lowercase=False, strip_accents=False)

data_file = 'naver_review.txt'
vocab_size = 30000
limit_alphabet = 6000
min_frequency = 5
tokenizer.train(files=data_file, vocab_size=vocab_size, limit_alphabet=limit_alphabet, min_frequency=min_frequency)
tokenizer.save("./data")

vocab_df = pd.read_fwf("./data", header=None)
encoded = tokenizer.encode("아 배고픈데 짜장면먹고싶다")
print("Tokenization Result =", encoded.tokens)
print("Integer Encoding =", encoded.ids)                # Integer Encoding의 결과 => 실제 DL 모델에 Input으로 사용
print("Decoding =", tokenizer.decode(encoded.ids))
print()

encoded = tokenizer.encode("커피 한잔의 여유를 즐기다")
print("Tokenization Result =", encoded.tokens)
print("Integer Encoding =", encoded.ids)                # Integer Encoding의 결과 => 실제 DL 모델에 Input으로 사용
print("Decoding =", tokenizer.decode(encoded.ids))

Tokenization Result = ['아', '배고', '##픈', '##데', '짜장면', '##먹고', '##싶다']
Integer Encoding = [2111, 20630, 4044, 3299, 24681, 7871, 7379]
Decoding = 아 배고픈데 짜장면먹고싶다

Tokenization Result = ['커피', '한잔', '##의', '여유', '##를', '즐기', '##다']
Integer Encoding = [12825, 25647, 3313, 12696, 3242, 10784, 3275]
Decoding = 커피 한잔의 여유를 즐기다


In [20]:
tokenizer = SentencePieceBPETokenizer()
tokenizer.train('naver_review.txt', vocab_size=10000, min_frequency=5)

encoded = tokenizer.encode("이 영화는 정말 재미있습니다.")
print(encoded.tokens)

['▁이', '▁영화는', '▁정말', '▁재미있', '습니다.']
