# Create the tokenizer

In [1]:
import pandas as pd
from enum import Enum
from pathlib import Path
from dataclasses import dataclass

In [2]:
from transformers import PreTrainedTokenizerFast
from tokenizers import (
    Tokenizer,
    pre_tokenizers,
    normalizers,
    trainers,
    decoders,
    models
)

In [3]:
from configs import VOCAB_SIZE

In [4]:
class SpecialTokensStr(Enum):
    PAD = '[PAD]'
    CLS = '[CLS]'
    UNK = '[UNK]'
    MASK = '[MASK]'
    SOS = '[SOS]'
    EOS = '[EOS]'

    @classmethod
    def todict(cls):
        return {f'{token.name.lower()}_token': token.value for token in cls}

    @classmethod
    def tolist(cls):
        return list(cls.todict())

class SpecialTokensInt(Enum):
    PAD = 0
    CLS = 1
    UNK = 2
    MASK = 3
    SOS = 4
    EOS = 5

    @classmethod
    def todict(cls):
        return {token.name: token.value for token in cls}

In [15]:
@dataclass
class TokenizerImDB:
    vocab_size: int
    tokenizer_path: Path
    tokenizer: Tokenizer = None

    def __post_init__(self):
        if self.tokenizer_path.exists():
            self.load_tokenizer()

    def train(self, text_iterator):
        self.tokenizer = Tokenizer(models.BPE())

        self.tokenizer.normalizer = normalizers.Sequence(
            [
                normalizers.Lowercase(),
                normalizers.Strip(),
                normalizers.NFC(),
                normalizers.NFD(),
                normalizers.NFKC(),
                normalizers.NFKD(),
            ]
        )

        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

        self.tokenizer.decoder = decoders.ByteLevel()

        special_tokens = SpecialTokensStr.tolist()

        self.trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size, 
            special_tokens=special_tokens
        )

        self.tokenizer.train_from_iterator(text_iterator, self.trainer)

        self.tokenizer.save(str(self.tokenizer_path))

    def encoder(self, text: str, **kwargs) -> list[int]:
        return self.tokenizer.encode(text, **kwargs).ids

    def decoder(self, ids: list[int]) -> str:
        return self.tokenizer.decode(ids)

    def load_tokenizer(self):
        self.tokenizer = Tokenizer.from_file(str(self.tokenizer_path))

In [16]:
print(f'Special Tokens Sting: {SpecialTokensStr.todict()}')
print(f'Special Tokens Indices: {SpecialTokensInt.todict()}')

Special Tokens Sting: {'pad_token': '[PAD]', 'cls_token': '[CLS]', 'unk_token': '[UNK]', 'mask_token': '[MASK]', 'sos_token': '[SOS]', 'eos_token': '[EOS]'}
Special Tokens Indices: {'PAD': 0, 'CLS': 1, 'UNK': 2, 'MASK': 3, 'SOS': 4, 'EOS': 5}


In [17]:
file_dataset = Path('../data/imdb-reviews-pt-br.csv')
dataset = pd.read_csv(file_dataset)

def text_iterator(dataset: pd.DataFrame, language: str):
    match language:
        case 'pt':
            text_col = dataset['text_pt']
        case 'en':
            text_col = dataset['text_en']
    for text in text_col:
        yield text

In [18]:
tokenizer_path_pt = Path('artifacts/tokenizer_pt.json')
tokenizer_path_en = Path('artifacts/tokenizer_en.json')

tokenizer_pt = TokenizerImDB(vocab_size=VOCAB_SIZE, tokenizer_path=tokenizer_path_pt)
tokenizer_en = TokenizerImDB(vocab_size=VOCAB_SIZE, tokenizer_path=tokenizer_path_en)

# Train

In [19]:
tokenizer_pt.train(text_iterator(dataset, 'pt'))






In [20]:
tokenizer_en.train(text_iterator(dataset, 'en'))






# Loaded tokenizer

In [7]:
tokenizer_pt = TokenizerImDB(vocab_size=VOCAB_SIZE, tokenizer_path=tokenizer_path_pt)
tokenizer_en = TokenizerImDB(vocab_size=VOCAB_SIZE, tokenizer_path=tokenizer_path_en)

# Validation

In [21]:
text_pt = 'Olá como vai você?'
text_en = 'Hello, how are you?'

In [22]:
tokenizer_pt.encoder(text_pt)

[13344, 125, 218, 660, 259, 3878]

In [10]:
tokenizer_en.encoder(text_en)

[15752, 13, 368, 222, 199, 32]

In [11]:
tokenizer_pt.decoder(tokenizer_pt.encoder(text_pt))

'olá como vai você?'

In [12]:
tokenizer_en.decoder(tokenizer_en.encoder(text_en))

'hello, how are you?'

In [13]:
tokenizer_pt.encoder(tokenizer_pt.decoder(tokenizer_pt.encoder(text_pt)))

[13344, 125, 218, 660, 259, 3878]

In [14]:
tokenizer_en.encoder(tokenizer_en.decoder(tokenizer_en.encoder(text_en)))

[15752, 13, 368, 222, 199, 32]