In [33]:
import torch
from transformers import ReformerTokenizer, ReformerTokenizerFast

In [2]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda', index=0)

In [5]:
# Shared settings used by the cells below.
# Checkpoint name handed to `from_pretrained` for both tokenizer classes.
version = "google/reformer-crime-and-punishment"
# Sample sentence that every tokenizer call in this notebook operates on.
sequence = "The quick brown fox jumps over the lazy dog."
# Only referenced by the commented-out `max_length` arguments in the cells shown here.
max_length = 20

# ReformerTokenizer

In [12]:
# Load the slow tokenizer (its repr reports `is_fast=False`) for the checkpoint
# named in `version`, then display it to inspect vocab size and special tokens.
tokenizer: ReformerTokenizer = ReformerTokenizer.from_pretrained(version)
tokenizer

ReformerTokenizer(name_or_path='google/reformer-crime-and-punishment', vocab_size=320, model_max_length=524288, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## special ids and tokens

In [20]:
# The loaded tokenizer lists only `eos_token` and `unk_token` as special tokens,
# so reuse the EOS token as the padding token for the `padding=True` calls below.
tokenizer.pad_token = tokenizer.eos_token

In [21]:
# Show the special tokens known to the tokenizer: first as ids, then as strings.
for special_view in (tokenizer.all_special_ids, tokenizer.all_special_tokens):
    print(special_view)

[2, 0]
['</s>', '<unk>']


## tokenize(sequence) 对文本进行分词

In [22]:
# Split the raw sentence into sub-word token strings (no ids yet).
tokens = tokenizer.tokenize(sequence)
tokens

['▁The',
 '▁qu',
 'i',
 'ck',
 '▁b',
 'r',
 'ow',
 'n',
 '▁f',
 'o',
 'x',
 '▁',
 'j',
 'um',
 'p',
 's',
 '▁o',
 'ver',
 '▁the',
 '▁l',
 'a',
 'z',
 'y',
 '▁do',
 'g',
 '.']

## convert_tokens_to_ids 将分词后的token映射为数字

In [23]:
# Map each token string produced by `tokenize` to its integer vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[140,
 243,
 264,
 134,
 17,
 267,
 77,
 263,
 22,
 262,
 297,
 258,
 304,
 177,
 279,
 266,
 14,
 89,
 13,
 35,
 261,
 299,
 272,
 137,
 275,
 278]

## encode = tokenize + convert_tokens_to_ids

In [24]:
# encode() performs tokenize + convert_tokens_to_ids in a single call.
ids_without_special = tokenizer.encode(sequence, add_special_tokens=False)
print(ids_without_special)

# Keep the variant that requests special tokens; later cells decode `ids`.
ids = tokenizer.encode(sequence, add_special_tokens=True)
ids

[140, 243, 264, 134, 17, 267, 77, 263, 22, 262, 297, 258, 304, 177, 279, 266, 14, 89, 13, 35, 261, 299, 272, 137, 275, 278]


[140,
 243,
 264,
 134,
 17,
 267,
 77,
 263,
 22,
 262,
 297,
 258,
 304,
 177,
 279,
 266,
 14,
 89,
 13,
 35,
 261,
 299,
 272,
 137,
 275,
 278]

In [25]:
# Look up the token strings stored under two hand-picked vocabulary ids.
tokenizer.convert_ids_to_tokens([101, 102])

['ere', '▁S']

## batch_encode_plus = batch encode

In [26]:
# Encode a one-element batch; the result holds `input_ids` and `attention_mask`.
# With a single sentence there is nothing to pad against, so `padding=True`
# leaves the ids unchanged here.
tokenizer.batch_encode_plus([sequence], add_special_tokens=True, padding=True)

{'input_ids': [[140, 243, 264, 134, 17, 267, 77, 263, 22, 262, 297, 258, 304, 177, 279, 266, 14, 89, 13, 35, 261, 299, 272, 137, 275, 278]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## convert_ids_to_tokens 将数字映射为token

In [27]:
# Reverse mapping: ids -> token strings, first with every token kept ...
tokens_with_special = tokenizer.convert_ids_to_tokens(ids)
print(tokens_with_special)

# ... then with special tokens filtered out; `tokens` feeds the next cell.
tokens = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
tokens

['▁The', '▁qu', 'i', 'ck', '▁b', 'r', 'ow', 'n', '▁f', 'o', 'x', '▁', 'j', 'um', 'p', 's', '▁o', 'ver', '▁the', '▁l', 'a', 'z', 'y', '▁do', 'g', '.']


['▁The',
 '▁qu',
 'i',
 'ck',
 '▁b',
 'r',
 'ow',
 'n',
 '▁f',
 'o',
 'x',
 '▁',
 'j',
 'um',
 'p',
 's',
 '▁o',
 'ver',
 '▁the',
 '▁l',
 'a',
 'z',
 'y',
 '▁do',
 'g',
 '.']

## convert_tokens_to_string 将token转换为string

In [28]:
# Join the token strings back into one plain-text sentence.
tokenizer.convert_tokens_to_string(tokens)

'The quick brown fox jumps over the lazy dog.'

## decode = convert_ids_to_tokens + convert_tokens_to_string

In [29]:
# decode() goes from ids straight back to text; print it once keeping special
# tokens (the default) and once with them stripped.
for skip_special in (False, True):
    print(tokenizer.decode(ids, skip_special_tokens=skip_special))

The quick brown fox jumps over the lazy dog.
The quick brown fox jumps over the lazy dog.


## batch_decode = batch decode

In [30]:
# batch_decode() is decode() applied to a list of id sequences; print it once
# keeping special tokens (the default) and once with them stripped.
id_batch = [ids]
for skip_special in (False, True):
    print(tokenizer.batch_decode(id_batch, skip_special_tokens=skip_special))

['The quick brown fox jumps over the lazy dog.']
['The quick brown fox jumps over the lazy dog.']


## tokenizer([sequence])

In [31]:
# Options for encoding a single sentence through the tokenizer's __call__.
single_options = dict(
    truncation=True,                  # truncate anything longer than max_length
    padding=True,                     # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    # max_length=max_length,          # maximum length; when unset the model maximum is used
    add_special_tokens=True,          # add the special tokens to the text
    return_length=True,               # also return the encoded length
    return_attention_mask=True,       # also return the attention mask
    return_overflowing_tokens=False,  # True would also return the pieces cut off by truncation instead of dropping them
    return_tensors="pt",              # output format: np / pt / tf / jax
)
inputs = tokenizer(sequence, **single_options).to(device)

print(inputs.keys())
# input_ids: token ids; attention_mask: marks real tokens; length: total length.
for field in ("input_ids", "attention_mask", "length"):
    print(inputs[field])

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]], device='cuda:0')
tensor([26], device='cuda:0')


In [32]:
# Options for encoding a batch of sentences through the tokenizer's __call__.
batch_options = dict(
    truncation=True,                  # truncate anything longer than max_length
    padding=True,                     # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    # max_length=max_length,          # maximum length; when unset the model maximum is used
    return_length=True,               # also return the encoded length
    return_attention_mask=True,       # also return the attention mask
    return_overflowing_tokens=False,  # True would also return the pieces cut off by truncation instead of dropping them
    return_tensors="pt",              # output format: np / pt / tf / jax
)
sentence_batch = [sequence] * 2
inputs = tokenizer(sentence_batch, **batch_options).to(device)

print(inputs.keys())
# input_ids: token ids; attention_mask: marks real tokens; length: per-sentence length.
for field in ("input_ids", "attention_mask", "length"):
    print(inputs[field])

dict_keys(['input_ids', 'length', 'attention_mask'])
tensor([[140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278],
        [140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]], device='cuda:0')
tensor([26, 26], device='cuda:0')


# ReformerTokenizerFast

In [34]:
# Rebind `tokenizer` to the fast implementation (its repr reports `is_fast=True`)
# of the same checkpoint. This is a new object, so the `pad_token` assigned to
# the slow tokenizer earlier is not present on it.
tokenizer: ReformerTokenizerFast = ReformerTokenizerFast.from_pretrained(version)
tokenizer

ReformerTokenizerFast(name_or_path='google/reformer-crime-and-punishment', vocab_size=320, model_max_length=524288, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [36]:
# The fast tokenizer was freshly loaded and has no pad token of its own (its
# special tokens are only eos/unk). Reuse EOS as the pad token so that padding
# can be enabled; the guard keeps this cell idempotent on re-runs.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer(
    [sequence] * 2,                     # batch of sentences
    truncation = True,                  # truncate anything longer than max_length
    padding = True,                     # padding strategy: True, 'longest', 'max_length' or 'do_not_pad';
                                        # required so unequal-length sentences can be stacked into one "pt" tensor
    # max_length = max_length,          # maximum length; when unset the model maximum is used
    add_special_tokens = True,          # add the special tokens to the text
    return_length = True,               # also return the encoded length
    return_attention_mask = True,       # also return the attention mask
    return_overflowing_tokens = False,  # True would also return the pieces cut off by truncation instead of dropping them
    return_tensors = "pt",              # output format: np / pt / tf / jax
).to(device)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"]) # marks which positions hold real tokens
# NOTE(review): the original author remarked that the fast tokenizer reports the
# longest (padded) length here rather than each sentence's own length. Both
# sentences are identical in this cell, so the two coincide — confirm with
# unequal-length input before relying on `length`.
print(inputs["length"])

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278],
        [140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]], device='cuda:0')
tensor([26, 26], device='cuda:0')
