In [7]:
r"""
T5Tokenizer迁移
"""

'\nT5Tokenizer迁移\n'

In [8]:
import os
from mindnlp.configs import DEFAULT_ROOT
from mindnlp.utils.download import cache_file
from tokenizers import Tokenizer, models

# Maps each published T5 checkpoint name to the location of its `tokenizer.json`.
# All entries follow the same URL pattern, so the table is generated from the names.
URL = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/tokenizer.json"
    for checkpoint in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
}

class T5Tokenizer():
    """Thin wrapper around a `tokenizers.Tokenizer` loaded from a T5 `tokenizer.json`.

    The wrapped tokenizer is stored in `self._tokenizer`; `encode` and `decode`
    delegate to it, and calling the instance returns the token ids of a text.
    """

    def __init__(
        self,
        tokenizer_file=None,
    ):
        """Create the wrapper, optionally loading a serialized tokenizer.

        Args:
            tokenizer_file: Path to a `tokenizer.json` file. When ``None`` no
                backend is loaded and `self._tokenizer` stays ``None``; `encode`
                and `decode` then fail with ``AttributeError`` until a backend
                is assigned.
        """
        # Always define the attribute so an unloaded instance is inspectable
        # instead of missing the attribute altogether.
        self._tokenizer = None
        if tokenizer_file is not None:
            # `from_file` is a static constructor, so there is no need to build
            # a throwaway Unigram tokenizer just to call it.
            self._tokenizer = Tokenizer.from_file(tokenizer_file)

    def __call__(self, text_input):
        """Tokenize `text_input` and return its token ids.

        Args:
            text_input: The text to tokenize.

        Returns:
            The `ids` attribute of the encoding returned by `encode`.
        """
        # The class has no callable base class, so `super().__call__` cannot be
        # used here; encode directly instead.
        return self.encode(text_input).ids

    @classmethod
    def from_pretrained(cls, size:str):
        """Build a tokenizer for one of the checkpoints listed in `URL`.

        The tokenizer file is resolved through `cache_file`, using
        ``<DEFAULT_ROOT>/tokenizers/<size>`` as the cache directory.

        Args:
            size: Checkpoint name; must be a key of `URL` (e.g. ``"t5-base"``).

        Returns:
            A new instance of `cls` loaded from the resolved file.

        Raises:
            KeyError: If `size` is not a key of `URL`.
        """
        if size not in URL:
            raise KeyError(
                f"Unknown T5 checkpoint {size!r}; expected one of {sorted(URL)}"
            )
        cache_dir = os.path.join(DEFAULT_ROOT, "tokenizers", size)
        path, _ = cache_file(None, url=URL[size], cache_dir=cache_dir)
        return cls(tokenizer_file=str(path))

    def encode(self, text_input):
        """Encode `text_input` with the wrapped tokenizer.

        Args:
            text_input: The text to tokenize.

        Returns:
            Whatever the wrapped tokenizer's `encode` returns.
        """
        return self._tokenizer.encode(text_input)

    def decode(self, ids: list):
        """Convert token ids back into text with the wrapped tokenizer.

        Args:
            ids: Token ids to decode.

        Returns:
            Whatever the wrapped tokenizer's `decode` returns for `ids`.
        """
        # Delegate to the backend; calling `self.decode` here would recurse
        # without end.
        return self._tokenizer.decode(ids)

In [9]:
from mindnlp.transforms import T5Tokenizer
from transformers import T5TokenizerFast

# Reference tokenizer (transformers) and the migrated one (mindnlp), same checkpoint.
pt_tokenizer = T5TokenizerFast.from_pretrained('t5-base')
ms_tokenizer = T5Tokenizer.from_pretrained('t5-base')

text = "Believing that faith can triumph over everything is in itself the greatest belief"

# Encode once per tokenizer and reuse the results in every comparison below.
pt_ids = pt_tokenizer.encode(text)
ms_encoding = ms_tokenizer.encode(text)

# Token ids: reference first, migrated second.
print(pt_ids)
print(ms_encoding.ids)

# Attention masks, in the same order.
print(pt_tokenizer(text).attention_mask)
print(ms_encoding.attention_mask)

# Round trip from ids back to text.
print(pt_tokenizer.decode(pt_ids))
print(ms_tokenizer.decode(ms_encoding.ids))

[493, 1896, 3745, 24, 3251, 54, 20020, 147, 762, 19, 16, 1402, 8, 4016, 7750, 1]
[493, 1896, 3745, 24, 3251, 54, 20020, 147, 762, 19, 16, 1402, 8, 4016, 7750, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Believing that faith can triumph over everything is in itself the greatest belief</s>
Believing that faith can triumph over everything is in itself the greatest belief


In [10]:
from mindnlp.transforms import T5Tokenizer
from mindspore.dataset import GeneratorDataset
from transformers import T5TokenizerFast

# Reference tokenizer and the tokenizer under test, both for the 't5-base' checkpoint.
pt_tokenizer = T5TokenizerFast.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

texts = ["i make a small mistake when i'm working!"]

# Build a dataset with a single 'text' column and apply the tokenizer as a map operation.
test_dataset = GeneratorDataset(texts, 'text').map(operations=tokenizer)

# Keep the first element of the first row yielded by the pipeline.
first_row = next(test_dataset.create_tuple_iterator())
dataset_after = first_row[0]

# Reference ids first, then the ids produced through the dataset pipeline.
print(pt_tokenizer.encode(texts[0]))
print(dataset_after)

[3, 23, 143, 3, 9, 422, 6202, 116, 3, 23, 31, 51, 464, 55, 1]
[   3   23  143    3    9  422 6202  116    3   23   31   51  464   55
    1]
