https://huggingface.co/docs/transformers/main/model_doc/flan-t5

In [1]:
import torch
from transformers import T5Model, T5Tokenizer

In [2]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and the model.
version = "google/flan-t5-small"

# Example texts: the first is tokenized for the encoder side,
# the second for the decoder side.
encoder_input, decoder_input = (
    "Studies have been shown that owning a dog is good for you",
    "Studies show that",
)

# T5Tokenizer

In [4]:
# Load the tokenizer that belongs to the `version` checkpoint.
tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path=version,
)
# Last expression of the cell: display the tokenizer's repr (vocab size, special tokens, ...).
tokenizer

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


T5Tokenizer(name_or_path='google/flan-t5-small', vocab_size=32100, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', 

## tokenizer([sequence])

In [5]:
# Show how the encoder text is split into subword pieces (strings, not ids).
tokenizer.tokenize(encoder_input)

['▁Studies',
 '▁have',
 '▁been',
 '▁shown',
 '▁that',
 '▁own',
 'ing',
 '▁',
 'a',
 '▁dog',
 '▁is',
 '▁good',
 '▁for',
 '▁you']

In [6]:
# Tokenize the encoder text into tensors and move them to `device`.
# Only the device is passed to `.to()`: token ids and the attention mask are
# integer tensors and must stay integer, so no floating-point dtype is given.
# This matches how the decoder inputs are moved.
encoder_inputs = tokenizer(
    encoder_input,                      # input sentence(s) to encode
    truncation = True,                  # truncate inputs that exceed max_length
    padding = True,                     # padding strategy, one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,          # maximum length; defaults to the model's maximum length when unset
    add_special_tokens = True,          # add the special tokens to the text
    return_length = True,               # also return the sequence length
    return_overflowing_tokens = False,  # whether to return the overflowing token chunks (by default, tokens beyond the truncation length are dropped; with True every chunk is returned)
    return_tensors = "pt"               # output tensor format: np / pt / tf / jax
).to(device)

print(encoder_inputs.keys())
print(encoder_inputs["input_ids"])
print(encoder_inputs["attention_mask"]) # 1 marks a real token (as opposed to padding)
print(encoder_inputs["length"])         # number of tokens in the sequence

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[6536,   43,  118, 2008,   24,  293,   53,    3,    9, 1782,   19,  207,
           21,   25,    1]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([15], device='cuda:0')


In [7]:
# Show how the decoder text is split into subword pieces (strings, not ids).
tokenizer.tokenize(decoder_input)

['▁Studies', '▁show', '▁that']

In [8]:
# Tokenize the decoder text into tensors and move them to `device`.
decoder_inputs = tokenizer(
    decoder_input,                    # input sentence(s) to encode
    # --- what gets added / removed ---
    add_special_tokens=True,          # add the special tokens to the text
    padding=True,                     # padding strategy, one of [True, 'longest', 'max_length', 'do_not_pad']
    truncation=True,                  # truncate inputs that exceed max_length
    # max_length=max_length,          # maximum length; defaults to the model's maximum length when unset
    # --- what gets returned ---
    return_tensors="pt",              # output tensor format: np / pt / tf / jax
    return_length=True,               # also return the sequence length
    return_overflowing_tokens=False,  # whether to return the overflowing token chunks (dropped by default)
).to(device)

print(decoder_inputs.keys())
# One field per line: token ids, attention mask (1 marks a real token),
# and the number of tokens in the sequence.
print(
    decoder_inputs["input_ids"],
    decoder_inputs["attention_mask"],
    decoder_inputs["length"],
    sep="\n",
)

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[6536,  504,   24,    1]], device='cuda:0')
tensor([[1, 1, 1, 1]], device='cuda:0')
tensor([4], device='cuda:0')


# T5Model

The bare T5 Model transformer outputting raw hidden-states without any specific head on top.

In [9]:
# Load the pretrained weights of the `version` checkpoint in half precision.
# NOTE(review): float16 is requested even when `device` falls back to CPU;
# confirm that half precision on CPU is intended.
model: T5Model = T5Model.from_pretrained(version, torch_dtype=torch.float16)
# Move the module's parameters to the selected device.
model = model.to(device)
# Last expression of the cell: display the module tree.
model

T5Model(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): Linear(in_features=

In [12]:
# Switch the model to evaluation mode before running the forward pass.
model.eval()

# Run one forward pass without autograd tracking: the encoder receives the
# tokenized encoder text, the decoder receives the tokenized decoder text.
with torch.inference_mode():
    outputs = model(
        input_ids=encoder_inputs.input_ids,
        attention_mask=encoder_inputs.attention_mask,
        decoder_input_ids=decoder_inputs.input_ids,
        decoder_attention_mask=decoder_inputs.attention_mask,
    )

# The result is a Seq2SeqModelOutput; display it as the cell output.
outputs

Seq2SeqModelOutput(last_hidden_state=tensor([[[ 0.8067, -0.1053, -0.0626,  ..., -0.0833, -0.1984,  0.2641],
         [ 0.8427, -0.1310, -0.0666,  ..., -0.0688, -0.1848,  0.2535],
         [ 1.0123, -0.1345, -0.0779,  ..., -0.0833, -0.1490,  0.2490],
         [ 1.0462, -0.1486, -0.1723,  ..., -0.0240, -0.1080,  0.1971]]],
       device='cuda:0'), past_key_values=((tensor([[[[ 0.2883,  0.9149, -0.9261,  ..., -0.6955,  1.9378,  1.4641],
          [-0.5891, -0.0793,  0.0257,  ...,  0.6827, -0.3829,  1.1592],
          [ 0.5566,  1.2770, -0.5993,  ...,  0.4500, -0.2466,  0.6266],
          [-0.3573,  0.1031,  2.2171,  ...,  0.7543,  0.2273,  0.6642]],

         [[-0.0835, -0.4179,  1.0919,  ...,  1.0454, -0.4170,  0.9568],
          [ 1.0788, -0.2012, -0.9339,  ...,  1.3904, -0.4810,  1.1273],
          [ 1.5544, -0.1026,  0.2404,  ...,  0.8335, -1.3271,  1.0959],
          [ 1.3747,  3.3671,  2.7419,  ...,  0.0274,  1.8002,  0.4336]],

         [[ 0.3703,  1.0281, -1.1772,  ..., -0.1946, -

In [13]:
# Output of the last layer: inspect its shape.
outputs.last_hidden_state.shape

torch.Size([1, 4, 512])

In [14]:
# Display the last-layer hidden states themselves.
outputs.last_hidden_state

tensor([[[ 0.8067, -0.1053, -0.0626,  ..., -0.0833, -0.1984,  0.2641],
         [ 0.8427, -0.1310, -0.0666,  ..., -0.0688, -0.1848,  0.2535],
         [ 1.0123, -0.1345, -0.0779,  ..., -0.0833, -0.1490,  0.2490],
         [ 1.0462, -0.1486, -0.1723,  ..., -0.0240, -0.1080,  0.1971]]],
       device='cuda:0')

In [15]:
# Number of entries in the returned key/value cache
# (presumably one entry per decoder layer; confirm against the model config).
len(outputs.past_key_values)

8

In [16]:
# Walk the key/value cache entry by entry and print the shape of every cached
# tensor. Each entry is itself an iterable of tensors; presumably these are the
# self-attention key/value followed by the cross-attention key/value (confirm
# against the T5 documentation).
for layer_cache in outputs.past_key_values:
    for cached_tensor in layer_cache:
        print(cached_tensor.shape)
    # Visual separator between consecutive cache entries.
    print("-" * 25)

torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64])
torch.Size([1, 6, 15, 64])
-------------------------
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64])
torch.Size([1, 6, 15, 64])
-------------------------
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64])
torch.Size([1, 6, 15, 64])
-------------------------
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64])
torch.Size([1, 6, 15, 64])
-------------------------
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64])
torch.Size([1, 6, 15, 64])
-------------------------
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64])
torch.Size([1, 6, 15, 64])
-------------------------
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64])
torch.Size([1, 6, 15, 64])
-------------------------
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 4, 64])
torch.Size([1, 6, 15, 64