# import

In [1]:
import json
import re

In [2]:
def dump_jsonl(data, output_path, append=False):
    """Serialize an iterable of objects to a JSON Lines file.

    Every item of ``data`` is encoded with ``json.dumps`` (non-ASCII
    characters are kept as-is) and written on its own line.

    Args:
        data: Iterable of JSON-serializable objects.
        output_path: Destination file path, written as UTF-8.
        append: When true, add to the end of an existing file instead of
            overwriting it.
    """
    if append:
        mode = "a+"
    else:
        mode = "w"
    with open(output_path, mode, encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )


def load_jsonl(input_path) -> list:
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of ``input_path`` (opened as UTF-8) is parsed with
    ``json.loads``. Blank or whitespace-only lines, such as a trailing empty
    line at the end of the file, are skipped instead of raising.

    Args:
        input_path: Path of the JSON Lines file to read.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            # strip() instead of rstrip("\n|\r"): the rstrip argument is a
            # character set, so the old call also removed trailing "|".
            record = line.strip()
            if not record:
                # Tolerate empty lines rather than failing in json.loads("").
                continue
            data.append(json.loads(record))
    return data

In [3]:
# Target vocabulary size handed to WordPieceTrainer in the training cell.
vocab_size = 32000

# load data

In [4]:
# Training corpus. Despite the ".json" extension, the file is parsed line by
# line as JSON Lines (one JSON document per line) by load_jsonl.
data = load_jsonl("../data/sample.json")

def processing(text):
    """Collapse each run of spaces to a single space and trim both ends.

    Only the space character is collapsed; other whitespace (tabs, newlines)
    inside the text is left untouched, though it is trimmed from the ends.
    """
    return re.sub(r" +", r" ", text).strip()

def gen():
    """Yield the cleaned text of every string-valued field in ``data``.

    Walks the rows of the notebook-level ``data`` list in order and, for each
    row, emits ``processing(value)`` for the values that are ``str``.
    Non-string values are ignored.
    """
    for record in data:
        yield from (
            processing(value)
            for value in record.values()
            if isinstance(value, str)
        )

# train

In [5]:
# Named special tokens. List position is reused as the token id by the
# TemplateProcessing post-processor below, so do not reorder.
user_defined_symbols = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[BOS]", "[EOS]"]

# Placeholder tokens "[UNUSED0]" ... "[UNUSED99]" reserved in the vocabulary.
unused_token_num = 100
unused_list = [f"[UNUSED{i}]" for i in range(unused_token_num)]

# Full special-token list given to the trainer: named tokens first, then the
# placeholders.
whole_user_defined_symbols = [*user_defined_symbols, *unused_list]

In [6]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

# Tokenizer built around a WordPiece model created without a vocabulary; the
# vocabulary is learned later by train_from_iterator. "[UNK]" is configured as
# the model's unknown token.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [7]:
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents

# Normalization pipeline, in order: NFD, lowercasing, accent stripping.
# NOTE(review): Unicode NFD decomposes precomposed Hangul syllables into
# conjoining jamo and nothing here recomposes them. Confirm that a
# decomposed-jamo vocabulary is intended for Korean text.
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

In [8]:
from tokenizers.pre_tokenizers import Whitespace

# Pre-tokenize with the library's Whitespace splitter. The recorded decode
# output further down ('문장 .') shows the final "." ending up as its own token.
tokenizer.pre_tokenizer = Whitespace()

In [9]:
from tokenizers.processors import TemplateProcessing

# Wrap encodings BERT-style: "[CLS] A [SEP]" for a single sequence and
# "[CLS] A [SEP] B [SEP]" for a pair, with the second segment given type id 1.
# Token ids are taken from each symbol's position in user_defined_symbols.
# This assumes the trainer assigns the same ids to these tokens; the recorded
# outputs (ids 2 and 3 around each encoding) are consistent with that.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[(t, i) for i, t in enumerate(user_defined_symbols)],
)

In [10]:
from tokenizers.trainers import WordPieceTrainer

# Train the WordPiece vocabulary up to vocab_size entries, passing the named
# special tokens and the [UNUSEDn] placeholders as special tokens.
trainer = WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=whole_user_defined_symbols,
)
# gen() is called here, so every run of this cell trains from a fresh generator
# over the cleaned string fields of `data`.
tokenizer.train_from_iterator(gen(), trainer)






In [11]:
# Smoke test: encode one Korean sentence and show the resulting ids.
output = tokenizer.encode("테스트를 목적으로 하는 문장.")
print(output.ids)

# Decode before any decoder is attached; in the recorded output the final "."
# comes back separated by a space ('문장 .').
tokenizer.decode(output.ids)

[2, 22692, 5213, 2158, 19113, 120, 3]


'테스트를 목적으로 하는 문장 .'

In [12]:
from tokenizers import decoders

# Attach the WordPiece decoder and decode the same ids again; the recorded
# output now reads '문장.' without the extra space before the period.
tokenizer.decoder = decoders.WordPiece()
tokenizer.decode(output.ids)

'테스트를 목적으로 하는 문장.'

# convert to a transformers tokenizer and save

In [13]:
from transformers import ElectraTokenizerFast


# Wrap the trained `tokenizers` object in a transformers fast tokenizer so it
# can be saved and reloaded with save_pretrained / from_pretrained.
hf_tokenizer = ElectraTokenizerFast(tokenizer_object=tokenizer)

In [14]:
# Register the named special tokens on the Hugging Face tokenizer.
# "[PAD]" -> "pad_token", "[BOS]" -> "bos_token", ...: the tokenizer's
# special-token attributes all end in "_token". Setting plain "pad"/"bos"/...
# attributes does not touch that configuration, which left [BOS]/[EOS]
# unregistered as bos_token/eos_token.
special_tokens_dict = {
    f"{token[1:-1].lower()}_token": token for token in user_defined_symbols
}
# The same symbols are also listed as additional special tokens.
special_tokens_dict["additional_special_tokens"] = user_defined_symbols
# Last expression of the cell: the count of tokens newly added to the vocabulary.
hf_tokenizer.add_special_tokens(special_tokens_dict)

0

In [15]:
# Round trip through the transformers wrapper; the recorded output shows the
# sentence wrapped in [CLS] ... [SEP].
hf_tokenizer.decode(hf_tokenizer.encode("테스트를 목적으로 하는 문장."))

'[CLS] 테스트를 목적으로 하는 문장. [SEP]'

In [16]:
# Persist the tokenizer under ../models/; the call returns the paths of the
# files it wrote (shown in the cell output).
hf_tokenizer.save_pretrained("../models/")

('../models/tokenizer_config.json',
 '../models/special_tokens_map.json',
 '../models/vocab.txt',
 '../models/added_tokens.json',
 '../models/tokenizer.json')

In [18]:
# Reload the saved tokenizer from disk to check that the exported files work.
loaded_tokenizer = ElectraTokenizerFast.from_pretrained("../models/")

# Encode one sentence pair (a batch containing a single [first, second] pair).
# NOTE(review): the recorded output has 9 ids and no padding, so max_length=10
# did not change the result here. Presumably padding="max_length" and/or
# truncation=True was intended; confirm.
test = loaded_tokenizer(
    [["정확히 동작하는지", "확인하는 테스트 절차"]],
    max_length=10,
    padding=True,
)

# Show every field of the encoding (input_ids, token_type_ids, attention_mask).
for k, v in test.items():
    print(k, v)

# Decode each encoded row back to text to verify the round trip.
for ids in test['input_ids']:
    print(loaded_tokenizer.decode(ids))

input_ids [[2, 13964, 7552, 1762, 3, 7451, 5290, 8898, 3]]
token_type_ids [[0, 0, 0, 0, 0, 1, 1, 1, 1]]
attention_mask [[1, 1, 1, 1, 1, 1, 1, 1, 1]]
[CLS] 정확히 동작하는지 [SEP] 확인하는 테스트 절차 [SEP]
