https://huggingface.co/docs/transformers/model_doc/gpt2

In [1]:
import torch
from transformers import GPT2Model, GPT2Tokenizer

In [2]:
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device  # notebook display of the selected device

device(type='cuda', index=0)

In [3]:
# Length limit for tokenization; in this notebook it is only referenced by a
# commented-out tokenizer argument.
max_length = 20
# Example sentence that gets tokenized and fed to the model.
sequence = "The quick brown fox jumps over the lazy dog."
# Checkpoint identifier passed to every `from_pretrained` call below.
version = "gpt2"

# GPT2Tokenizer

In [4]:
# Load the tokenizer that belongs to the `version` checkpoint.
tokenizer: GPT2Tokenizer = GPT2Tokenizer.from_pretrained(version)
tokenizer  # notebook display of the tokenizer's configuration

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [5]:
# The loaded tokenizer defines bos/eos/unk tokens but no pad token, so one has
# to be set manually; the end-of-sequence token is reused for padding here.
tokenizer.pad_token = tokenizer.eos_token

## tokenizer([sequence])

In [6]:
# Tokenize a small batch and move the resulting tensors to the target device.
inputs = tokenizer(
    [sequence] * 2,  # batch of sentences (the same sentence twice)
    truncation=True,  # truncate anything longer than max_length
    padding=True,  # padding strategy: one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length=max_length,  # maximum length; defaults to the model's maximum length when unset
    add_special_tokens=True,  # add the special tokens to the text
    return_length=True,  # also return the effective length of each sequence
    # Return every chunk of the text: by default, tokens beyond the truncation
    # length are dropped; with return_overflowing_tokens=True all token chunks
    # are returned instead.
    return_overflowing_tokens=False,
    return_tensors="pt",  # output format: np / pt / tf / jax
    # Only move the batch; do not pass a dtype. `BatchEncoding.to` is documented to
    # take a device, and the token ids / attention mask must stay integer tensors.
    # See https://github.com/huggingface/transformers/issues/16359
).to(device)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"])  # marks which positions are real tokens
print(inputs["length"])  # effective token count of each sequence

dict_keys(['input_ids', 'length', 'attention_mask'])
tensor([[  464,  2068,  7586, 21831, 18045,   625,   262, 16931,  3290,    13],
        [  464,  2068,  7586, 21831, 18045,   625,   262, 16931,  3290,    13]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([10, 10], device='cuda:0')


In [7]:
# Show the batch of token ids on its own.
print(inputs["input_ids"])

tensor([[  464,  2068,  7586, 21831, 18045,   625,   262, 16931,  3290,    13],
        [  464,  2068,  7586, 21831, 18045,   625,   262, 16931,  3290,    13]],
       device='cuda:0')


# GPT2Model

The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.

In [12]:
# Load the pretrained weights as float16, then move the module to `device`.
# NOTE(review): float16 is requested even when `device` falls back to the CPU;
# confirm the installed torch build supports the needed half-precision CPU ops.
model: GPT2Model = GPT2Model.from_pretrained(version, torch_dtype=torch.float16)
model = model.to(device)
model  # notebook display of the module tree

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [13]:
# Put the module into evaluation mode before running it.
model.eval()

# One forward pass under inference mode, feeding only the token ids and the
# attention mask taken from the tokenizer output.
with torch.inference_mode():
    outputs = model(
        **{name: inputs[name] for name in ("input_ids", "attention_mask")}
    )
outputs  # notebook display of the full model output

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.0470, -0.0333, -0.1626,  ..., -0.1337, -0.0571, -0.1059],
         [ 0.1794, -0.5020, -0.8029,  ...,  0.1106,  0.6057, -0.4551],
         [ 0.4096,  0.6703, -0.3369,  ...,  0.5203,  0.1552, -0.5944],
         ...,
         [ 0.3133,  0.3148, -0.3117,  ...,  0.0921,  0.0111, -0.3207],
         [ 0.1388, -0.1685, -1.0656,  ...,  0.1676,  0.1104, -0.0937],
         [ 0.1967, -0.3877, -0.1826,  ..., -0.0813,  0.2733, -0.1671]],

        [[-0.0470, -0.0333, -0.1626,  ..., -0.1337, -0.0571, -0.1059],
         [ 0.1794, -0.5020, -0.8029,  ...,  0.1106,  0.6057, -0.4551],
         [ 0.4096,  0.6703, -0.3369,  ...,  0.5203,  0.1552, -0.5944],
         ...,
         [ 0.3133,  0.3148, -0.3117,  ...,  0.0921,  0.0111, -0.3207],
         [ 0.1388, -0.1685, -1.0656,  ...,  0.1676,  0.1104, -0.0937],
         [ 0.1967, -0.3877, -0.1826,  ..., -0.0813,  0.2733, -0.1671]]],
       device='cuda:0'), past_key_values=((tensor([[[[-0

In [14]:
# Output of the last layer; the shape displayed for this run is [2, 10, 768].
outputs.last_hidden_state.shape

torch.Size([2, 10, 768])

In [16]:
# Print the shape of every cached tensor, one group per entry of
# `past_key_values`, with a dashed rule closing each group.
# In the recorded run each entry holds two tensors of shape [2, 12, 10, 64]
# (presumably batch, heads, tokens, head size; confirm against the model docs).
separator = "-" * 25
for layer_cache in outputs.past_key_values:
    for cached_tensor in layer_cache:
        print(cached_tensor.shape)
    print(separator)

torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------
torch.Size([2, 12, 10, 64])
torch.Size([2, 12, 10, 64])
-------------------------


In [17]:
# Nothing was displayed for this cell in the recorded run.
# NOTE(review): presumably None because the forward pass did not pass
# output_hidden_states=True; confirm against the GPT2Model documentation.
outputs.hidden_states

In [18]:
# Nothing was displayed for this cell in the recorded run.
# NOTE(review): presumably None because the forward pass did not pass
# output_attentions=True; confirm against the GPT2Model documentation.
outputs.attentions

In [19]:
# Nothing was displayed for this cell in the recorded run.
# NOTE(review): presumably None; cross-attention weights likely require both
# output_attentions=True and a model configured with cross-attention. Confirm
# against the GPT2Model documentation.
outputs.cross_attentions