In [1]:
import torch
from transformers import T5Tokenizer, T5TokenizerFast

In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Bare last expression so the notebook displays the selected device.
device

device(type='cuda', index=0)

In [3]:
# Checkpoint name handed to from_pretrained() by the tokenizer-loading cells.
version = "t5-small"
# Sample sentence reused by the tokenization examples in the following cells.
sequence = "The quick brown fox jumps over the lazy dog."

# T5Tokenizer

In [4]:
# Load the T5Tokenizer for the checkpoint named in `version`
# (the repr shown in this cell's output reports is_fast=False).
# NOTE(review): the output also shows a "legacy behaviour" warning; presumably it
# appears because no `legacy` argument is passed here -- confirm against the linked PR.
tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(version)
# Bare last expression so the notebook displays the tokenizer's repr.
tokenizer

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


T5Tokenizer(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_4

## special ids and tokens

In [5]:
# Show the special-token ids and then their string forms, one list per line.
print(tokenizer.all_special_ids, tokenizer.all_special_tokens, sep="\n")

[1, 2, 0, 32099, 32098, 32097, 32096, 32095, 32094, 32093, 32092, 32091, 32090, 32089, 32088, 32087, 32086, 32085, 32084, 32083, 32082, 32081, 32080, 32079, 32078, 32077, 32076, 32075, 32074, 32073, 32072, 32071, 32070, 32069, 32068, 32067, 32066, 32065, 32064, 32063, 32062, 32061, 32060, 32059, 32058, 32057, 32056, 32055, 32054, 32053, 32052, 32051, 32050, 32049, 32048, 32047, 32046, 32045, 32044, 32043, 32042, 32041, 32040, 32039, 32038, 32037, 32036, 32035, 32034, 32033, 32032, 32031, 32030, 32029, 32028, 32027, 32026, 32025, 32024, 32023, 32022, 32021, 32020, 32019, 32018, 32017, 32016, 32015, 32014, 32013, 32012, 32011, 32010, 32009, 32008, 32007, 32006, 32005, 32004, 32003, 32002, 32001, 32000]
['</s>', '<unk>', '<pad>', '<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '

## tokenize(sequence) 对文本进行分词

In [5]:
# Split the sample sentence into tokens (the output shows pieces such as '▁The' and 'fox').
tokens = tokenizer.tokenize(sequence)
# Bare last expression so the notebook displays the token list.
tokens

['▁The',
 '▁quick',
 '▁brown',
 '▁',
 'fox',
 '▁jump',
 's',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog',
 '.']

## convert_tokens_to_ids 将分词后的token映射为数字

In [6]:
# Map each token produced by tokenize() to its integer id.
ids = tokenizer.convert_tokens_to_ids(tokens)
# Bare last expression so the notebook displays the id list.
ids

[37, 1704, 4216, 3, 20400, 4418, 7, 147, 8, 19743, 1782, 5]

## encode = tokenize + convert_tokens_to_ids

In [7]:
# Without special tokens, the printed ids equal the convert_tokens_to_ids result shown earlier.
print(tokenizer.encode(sequence, add_special_tokens=False))
# With special tokens, the displayed list gains a trailing id 1.
# `ids` is rebound here and reused by the decoding cells that follow.
ids = tokenizer.encode(sequence, add_special_tokens=True)
ids

[37, 1704, 4216, 3, 20400, 4418, 7, 147, 8, 19743, 1782, 5]


[37, 1704, 4216, 3, 20400, 4418, 7, 147, 8, 19743, 1782, 5, 1]

In [8]:
# There is no start-of-sequence token in the encoded ids; the only special id
# appended is the trailing 1, which this lookup shows to be '</s>'.
tokenizer.convert_ids_to_tokens([1])

['</s>']

## batch_encode_plus = batch encode

In [9]:
# Encode a one-element batch; the output is a mapping holding 'input_ids' and 'attention_mask'.
# NOTE(review): the transformers documentation marks batch_encode_plus as deprecated in favour
# of calling the tokenizer directly -- confirm for the installed version.
tokenizer.batch_encode_plus([sequence], add_special_tokens=True, padding=True)

{'input_ids': [[37, 1704, 4216, 3, 20400, 4418, 7, 147, 8, 19743, 1782, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## convert_ids_to_tokens 将数字映射为token

In [10]:
# Default conversion keeps special tokens, so '</s>' is present in the printed list.
print(tokenizer.convert_ids_to_tokens(ids))
# skip_special_tokens=True leaves '</s>' out; `tokens` is rebound to that result
# and is the input of the convert_tokens_to_string cell.
tokens = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
tokens

['▁The', '▁quick', '▁brown', '▁', 'fox', '▁jump', 's', '▁over', '▁the', '▁lazy', '▁dog', '.', '</s>']


['▁The',
 '▁quick',
 '▁brown',
 '▁',
 'fox',
 '▁jump',
 's',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog',
 '.']

## convert_tokens_to_string 将token转换为string

In [11]:
# Join the token pieces back into one string (the output shows the original sentence).
tokenizer.convert_tokens_to_string(tokens)

'The quick brown fox jumps over the lazy dog.'

## decode = convert_ids_to_tokens + convert_tokens_to_string

In [12]:
# Decode the ids twice -- first keeping special tokens, then skipping them --
# and print one result per line.
print(
    tokenizer.decode(ids),
    tokenizer.decode(ids, skip_special_tokens=True),
    sep="\n",
)

The quick brown fox jumps over the lazy dog.</s>
The quick brown fox jumps over the lazy dog.


## batch_decode = batch decode

In [13]:
# Decode a one-element batch with special tokens kept (False) and then skipped (True),
# printing each resulting list on its own line.
print(
    *(
        tokenizer.batch_decode([ids], skip_special_tokens=skip_special)
        for skip_special in (False, True)
    ),
    sep="\n",
)

['The quick brown fox jumps over the lazy dog.</s>']
['The quick brown fox jumps over the lazy dog.']


## tokenizer([sequence])

In [14]:
# Encode a single sentence by calling the tokenizer object directly.
inputs = tokenizer(
    sequence,                          # a single sentence
    truncation=True,                   # truncate input that exceeds max_length
    padding=True,                      # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    # max_length=max_length,           # maximum length; the model maximum is used when unset
    add_special_tokens=True,           # add special tokens to the text
    return_length=True,                # also return the sequence length
    return_overflowing_tokens=False,   # if True, return the chunks cut off by truncation instead of dropping them
    return_tensors="pt",               # output format: np, pt, tf or jax
)

print(inputs.keys())
# Token ids, attention mask (marks which positions are text) and total length,
# each printed on its own line(s).
print(inputs["input_ids"], inputs["attention_mask"], inputs["length"], sep="\n")

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[   37,  1704,  4216,     3, 20400,  4418,     7,   147,     8, 19743,
          1782,     5,     1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([13])


In [15]:
# Encode a batch of two identical sentences with the tokenizer loaded above and
# move the resulting tensors to `device`.
inputs = tokenizer(
    [sequence] * 2,                     # batch of sentences
    truncation = True,                  # truncate input that exceeds max_length
    padding = True,                     # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    # max_length = max_length,          # maximum length; the model maximum is used when unset
    add_special_tokens = True,          # add special tokens to the text (explicit, matching the other tokenizer(...) cells)
    return_length = True,               # also return the sequence length
    return_overflowing_tokens = False,  # if True, return the chunks cut off by truncation instead of dropping them
    return_tensors = "pt"               # output format: np, pt, tf or jax
).to(device)

print(inputs.keys())
print(inputs["input_ids"])      # token ids
print(inputs["attention_mask"]) # marks which positions are text
print(inputs["length"])         # length of each sequence

dict_keys(['input_ids', 'length', 'attention_mask'])
tensor([[   37,  1704,  4216,     3, 20400,  4418,     7,   147,     8, 19743,
          1782,     5,     1],
        [   37,  1704,  4216,     3, 20400,  4418,     7,   147,     8, 19743,
          1782,     5,     1]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([13, 13], device='cuda:0')


# T5TokenizerFast

In [16]:
# Rebind `tokenizer` to the T5TokenizerFast for the same checkpoint
# (the repr shown in this cell's output reports is_fast=True).
# Cells that run after this one use the fast tokenizer.
tokenizer: T5TokenizerFast = T5TokenizerFast.from_pretrained(version)
# Bare last expression so the notebook displays the tokenizer's repr.
tokenizer

T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_i

In [17]:
# Encode a batch of two identical sentences with the fast tokenizer and move the
# resulting tensors to `device`.
inputs = tokenizer(
    [sequence] * 2,                     # batch of sentences
    truncation = True,                  # truncate input that exceeds max_length
    padding = True,                     # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    # max_length = max_length,          # maximum length; the model maximum is used when unset
    add_special_tokens = True,          # add special tokens to the text
    return_length = True,               # also return the sequence length
    return_overflowing_tokens = False,  # if True, return the chunks cut off by truncation instead of dropping them
    return_tensors = "pt"               # output format: np, pt, tf or jax
).to(device)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"]) # marks which positions are text
# Author's note: with the fast tokenizer, `length` is the longest (padded) length
# rather than each sequence's own length, so it is not a reliable text length.
print(inputs["length"])
# Reliable alternative: count the positions the attention mask marks as text in each row.
print(inputs["attention_mask"].sum(dim=1))

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[   37,  1704,  4216,     3, 20400,  4418,     7,   147,     8, 19743,
          1782,     5,     1],
        [   37,  1704,  4216,     3, 20400,  4418,     7,   147,     8, 19743,
          1782,     5,     1]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([13, 13], device='cuda:0')
