# Tokenizer

Build a word-level tokenizer for our scramble and solve sequences.

In [None]:
import os
import json

from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import WordLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer
from transformers import PreTrainedTokenizerFast

# Load the solve records. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
with open("../solves.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Notebook display: number of loaded records.
len(data)

8716

In [22]:
def data_iterator(data):
    """Yield the training sequences for the tokenizer, two per row.

    For every row in ``data`` the ``"scramble"`` entry and then the
    ``"solve"`` entry are each joined with single spaces and yielded as one
    string. Both entries must be iterables of strings, as required by
    ``str.join``.
    """
    for record in data:
        for field in ("scramble", "solve"):
            yield " ".join(record[field])

In [None]:
# Train and save the tokenizer only when no saved copy exists yet, so that
# re-running the notebook does not retrain or overwrite it.
if not os.path.exists("../rubiks-tokenizer"):
    # Word-level vocabulary over whitespace-separated tokens.
    tokenizer = Tokenizer(WordLevel())
    tokenizer.pre_tokenizer = WhitespaceSplit()

    trainer = WordLevelTrainer(
        special_tokens=["<bos>", "<eos>", "<unk>", "<pad>"],
        show_progress=True,
    )

    # data_iterator yields two sequences per row (scramble and solve), so the
    # total reported for progress tracking is twice the number of rows.
    tokenizer.train_from_iterator(
        data_iterator(data), trainer=trainer, length=2 * len(data)
    )

    # The post-processor needs the ids the trained vocabulary assigned to the
    # boundary tokens.
    bos_id = tokenizer.token_to_id("<bos>")
    eos_id = tokenizer.token_to_id("<eos>")

    # Wrap every encoded sequence in <bos> ... <eos>; for pairs, the second
    # sequence and its closing <eos> get type id 1.
    tokenizer.post_processor = TemplateProcessing(
        single="<bos> $0 <eos>",
        pair="<bos> $A <eos> $B:1 <eos>:1",
        special_tokens=[("<bos>", bos_id), ("<eos>", eos_id)],
    )

    # Wrap the raw tokenizer in the transformers interface, registering the
    # special tokens by role, and write it to disk.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
    )
    tokenizer.save_pretrained("../rubiks-tokenizer")

In [24]:
# Load the tokenizer from the directory it is saved to, so later cells use the
# on-disk version whether or not training ran in this session.
tokenizer = PreTrainedTokenizerFast.from_pretrained("../rubiks-tokenizer")