https://huggingface.co/docs/transformers/model_doc/reformer

In [1]:
import torch
from transformers import ReformerModel, ReformerTokenizer

In [2]:
# Use the first CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cuda', index=0)

In [5]:
# Length cap for tokenization. The only place it is referenced in this notebook
# is a commented-out `max_length=` argument of the tokenizer call.
max_length = 20
# Example sentence that gets tokenized and fed to the model.
sequence = "The quick brown fox jumps over the lazy dog."
# Checkpoint name passed to `from_pretrained` for both the tokenizer and the model.
version = "google/reformer-crime-and-punishment"

# ReformerTokenizer

In [6]:
# Load the pretrained tokenizer for the checkpoint named by `version`.
tokenizer: ReformerTokenizer = ReformerTokenizer.from_pretrained(
    pretrained_model_name_or_path=version,
)
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

ReformerTokenizer(name_or_path='google/reformer-crime-and-punishment', vocab_size=320, model_max_length=524288, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## tokenizer([sequence])

In [8]:
# Tokenize a batch made of two copies of `sequence` and return PyTorch tensors.
inputs = tokenizer(
    [sequence] * 2,                     # batch of sentences
    truncation = True,                  # truncate inputs longer than max_length
    # padding = True,                   # padding strategy: [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,          # maximum length; defaults to the model's maximum length when unset
    add_special_tokens = True,          # add the special tokens to the text
    return_length = True,               # also return the valid length of each sequence
    return_overflowing_tokens = False,  # return every text chunk (for long texts, tokens beyond the truncation length are dropped by default; with return_overflowing_tokens=True all token chunks are returned)
    return_tensors = "pt"               # output format: np / pt / tf / jax
).to(device)
# Only the device is passed to `.to()`: the tokenizer returns a BatchEncoding,
# whose `.to()` takes a device and no dtype, and the token ids have to stay
# integer tensors to be usable as embedding indices. (The dtype-aware `.to()`
# discussed in https://github.com/huggingface/transformers/issues/16359
# belongs to BatchFeature, not BatchEncoding.)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"]) # marks which positions hold real tokens
print(inputs["length"])         # valid token length of each sequence

dict_keys(['input_ids', 'length', 'attention_mask'])
tensor([[140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278],
        [140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]], device='cuda:0')
tensor([26, 26], device='cuda:0')


In [9]:
# Print the token-id tensor on its own (the same tensor the tokenizer cell already printed).
print(inputs["input_ids"])

tensor([[140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278],
        [140, 243, 264, 134,  17, 267,  77, 263,  22, 262, 297, 258, 304, 177,
         279, 266,  14,  89,  13,  35, 261, 299, 272, 137, 275, 278]],
       device='cuda:0')


# ReformerModel

The bare Reformer Model transformer outputting raw hidden-states without any specific head on top. Reformer was proposed in "Reformer: The Efficient Transformer" by Nikita Kitaev, Łukasz Kaiser, and Anselm Levskaya.

In [12]:
# Load the pretrained Reformer weights for `version` and move the model to `device`.
# Half precision is requested only when running on CUDA: this notebook falls back
# to the CPU when no GPU is available, and float16 inference on CPU is poorly
# supported (several ops are missing or very slow there), so float32 is used instead.
model: ReformerModel = ReformerModel.from_pretrained(
    version,
    torch_dtype = torch.float16 if device.type == "cuda" else torch.float32,
).to(device)
# Bare expression so the notebook displays the module tree.
model

ReformerModel(
  (embeddings): ReformerEmbeddings(
    (word_embeddings): Embedding(320, 256)
    (position_embeddings): AxialPositionEmbeddings(
      (weights): ParameterList(
          (0): Parameter containing: [torch.float32 of size 512x1x64 (cuda:0)]
          (1): Parameter containing: [torch.float32 of size 1x1024x192 (cuda:0)]
      )
    )
  )
  (encoder): ReformerEncoder(
    (layers): ModuleList(
      (0): ReformerLayer(
        (attention): ReformerAttention(
          (layer_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
          (self_attention): LocalSelfAttention(
            (query): Linear(in_features=256, out_features=128, bias=False)
            (key): Linear(in_features=256, out_features=128, bias=False)
            (value): Linear(in_features=256, out_features=128, bias=False)
          )
          (output): ReformerSelfOutput(
            (dense): Linear(in_features=128, out_features=256, bias=False)
          )
        )
        (feed_forward): 

In [13]:
# Put the model in evaluation mode and run a single forward pass with
# autograd disabled (inference only).
model.eval()
with torch.inference_mode():
    outputs = model(
        input_ids = inputs["input_ids"],
        attention_mask = inputs["attention_mask"],
        # Request the optional outputs explicitly: later cells inspect
        # `outputs.hidden_states` and `outputs.attentions`, which are not
        # populated unless these flags are set.
        output_hidden_states = True,
        output_attentions = True,
    )
# Bare expression so the notebook displays the returned ReformerModelOutput.
outputs

ReformerModelOutput(last_hidden_state=tensor([[[-0.1733,  0.2357,  0.3284,  ...,  0.6766,  0.4675,  0.1250],
         [-0.1559,  0.2170,  0.7596,  ...,  0.2714,  0.6945,  0.2284],
         [ 0.3256,  0.7184,  0.3327,  ..., -0.7029,  0.2363,  0.1679],
         ...,
         [-0.7055,  0.1181, -0.6565,  ...,  3.2471,  1.2641, -0.4303],
         [ 0.8279, -0.5580,  0.0164,  ...,  0.3295, -1.9403, -0.1466],
         [ 0.6393,  0.2772,  0.0120,  ..., -0.1929,  0.4733, -3.3677]],

        [[-0.1733,  0.2357,  0.3284,  ...,  0.6766,  0.4675,  0.1250],
         [-0.1559,  0.2170,  0.7596,  ...,  0.2714,  0.6945,  0.2284],
         [ 0.3256,  0.7184,  0.3327,  ..., -0.7029,  0.2363,  0.1679],
         ...,
         [-0.7055,  0.1181, -0.6565,  ...,  3.2471,  1.2641, -0.4303],
         [ 0.8279, -0.5580,  0.0164,  ...,  0.3295, -1.9403, -0.1466],
         [ 0.6393,  0.2772,  0.0120,  ..., -0.1929,  0.4733, -3.3677]]],
       device='cuda:0'), past_buckets_states=[(None, tensor([[[ 0.1464,  0.401

In [16]:
# Output of the last layer: show its shape.
# The first two dimensions of the recorded result match the batch of 2 sequences
# and the 26 tokens per sequence reported by the tokenizer cell.
outputs.last_hidden_state.shape

torch.Size([2, 26, 512])

In [17]:
# Per-layer hidden states of the forward pass.
# NOTE(review): no output is recorded for this cell; presumably this field is only
# populated when the forward pass is called with output_hidden_states=True — confirm.
outputs.hidden_states

In [20]:
# Attention weights of the forward pass.
# NOTE(review): no output is recorded for this cell; presumably this field is only
# populated when the forward pass is called with output_attentions=True — confirm.
outputs.attentions