https://huggingface.co/docs/transformers/main/model_doc/bert

In [1]:
import torch
from transformers import BertModel, BertTokenizer
from transformers import AutoModel, AutoTokenizer

In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Example sentence that the tokenizer cells encode.
sequence = "The quick brown fox jumps over the lazy dog."
# Checkpoint name handed to the from_pretrained() calls in this notebook.
version = "bert-base-uncased"
# Token-length cap; in the visible cells it is only referenced by commented-out
# `max_length = max_length` arguments.
max_length = 20

# BertTokenizer

In [4]:
# Build the BERT tokenizer for the `version` checkpoint; the bare name on the last
# line displays its configuration as the cell output.
tokenizer: BertTokenizer = BertTokenizer.from_pretrained(
    version,
)
tokenizer

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/vocab.txt (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000223DFCAB210>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: fe814dfc-8c80-4552-9514-4470ad71102d)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

## tokenizer([sequence])

In [5]:
# Tokenize a batch of two identical sentences and move the resulting tensors to `device`.
# Only the device is passed to `.to()`: the encoding holds integer token ids and masks,
# which must keep their integer dtype for the embedding lookup, and `BatchEncoding.to`
# does not take a dtype argument.
# Background: https://github.com/huggingface/transformers/issues/16359
inputs = tokenizer(
    [sequence] * 2,                     # batch of sentences
    truncation = True,                  # truncate inputs that exceed max_length
    padding = True,                     # padding strategy, one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,          # maximum length; defaults to the model's maximum when not set
    add_special_tokens = True,          # add the special tokens to the text
    return_length = True,               # also return the encoded length
    return_overflowing_tokens = False,  # return every text chunk (by default, tokens beyond the truncation length are dropped; with return_overflowing_tokens=True all token chunks are returned)
    return_tensors = "pt"               # output tensor format: np / pt / tf / jax
).to(device)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"]) # marks which positions hold real text
print(inputs["length"])         # encoded length of each sentence

dict_keys(['input_ids', 'token_type_ids', 'length', 'attention_mask'])
tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([12, 12], device='cuda:0')


In [6]:
# Show the token ids from the tokenizer call above once more, on their own.
print(inputs["input_ids"])

tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102]], device='cuda:0')


# BertModel

The bare Bert Model transformer outputting raw hidden-states without any specific head on top.

In [26]:
# > from_pretrained Parameters
#   pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*)
#   model_args (sequence of positional arguments, *optional*)
#   config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*)
#   state_dict (`Dict[str, torch.Tensor]`, *optional*)
#   cache_dir (`Union[str, os.PathLike]`, *optional*)
#   from_tf (`bool`, *optional*, defaults to `False`)
#   from_flax (`bool`, *optional*, defaults to `False`)
#   ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`)
#   force_download (`bool`, *optional*, defaults to `False`)
#   resume_download (`bool`, *optional*, defaults to `False`)
#   proxies (`Dict[str, str]`, *optional*)
#   output_loading_info(`bool`, *optional*, defaults to `False`)
#   local_files_only(`bool`, *optional*, defaults to `False`)
#   token (`str` or `bool`, *optional*)
#   revision (`str`, *optional*, defaults to `"main"`)
# > Parameters for big model inference
#   low_cpu_mem_usage(`bool`, *optional*)
#   torch_dtype: `torch.float16` or `torch.bfloat16` or `torch.float` or `"auto"`
#   device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*)
#   max_memory (`Dict`, *optional*)
#   offload_folder (`str` or `os.PathLike`, *optional*)
#   offload_state_dict (`bool`, *optional*)
#   load_in_8bit (`bool`, *optional*, defaults to `False`)
#   quantization_config (`Dict`, *optional*)
#   subfolder (`str`, *optional*, defaults to `""`)
#   variant (`str`, *optional*)
#   use_safetensors (`bool`, *optional*, defaults to `None`)
#   **kwargs

# Load the bare BERT encoder and move it to `device`.
# Half precision is requested only when running on CUDA. `device` can fall back to the
# CPU, where float16 support is incomplete in some PyTorch builds, so float32 is used there.
model: BertModel = BertModel.from_pretrained(
    version,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [27]:
# Put the model in evaluation mode, then run one forward pass without autograd tracking.
# Only the token ids and the attention mask are forwarded; the other entries of
# `inputs` are not passed to the model.
model.eval()
with torch.inference_mode():
    outputs = model(**{name: inputs[name] for name in ("input_ids", "attention_mask")})
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3608,  0.2271, -0.3030,  ..., -0.4224,  0.6949,  0.6213],
         [-0.3276, -0.3762, -0.5044,  ..., -0.3660,  1.1588, -0.2188],
         [-0.4000, -0.4212,  0.4903,  ..., -0.4081,  0.8508, -0.0882],
         ...,
         [ 0.6786,  0.0645,  0.2290,  ..., -0.2903,  0.4909,  0.6316],
         [-0.1088, -0.1644, -0.2961,  ...,  0.2168,  0.2916, -0.5030],
         [ 0.7099,  0.4367, -0.4851,  ..., -0.0067, -0.1472, -0.2670]],

        [[-0.3608,  0.2271, -0.3030,  ..., -0.4224,  0.6949,  0.6213],
         [-0.3276, -0.3762, -0.5044,  ..., -0.3660,  1.1588, -0.2188],
         [-0.4000, -0.4212,  0.4903,  ..., -0.4081,  0.8508, -0.0882],
         ...,
         [ 0.6786,  0.0645,  0.2290,  ..., -0.2903,  0.4909,  0.6316],
         [-0.1088, -0.1644, -0.2961,  ...,  0.2168,  0.2916, -0.5030],
         [ 0.7099,  0.4367, -0.4851,  ..., -0.0067, -0.1472, -0.2670]]],
       device='cuda:0'), pooler_output=tensor([[-0.82

In [28]:
# 最后一层的输出
# Output of the last layer. The recorded shape [2, 12, 768] lines up with the batch of
# 2 sentences, the 12 token ids per sentence, and the 768-wide layers in the model printout.
outputs.last_hidden_state.shape

torch.Size([2, 12, 768])

In [29]:
# 对文字长度进行pool
# Pooled output: one vector per sentence, with the token-length axis gone.
# NOTE(review): per the Hugging Face BERT docs this is derived from the first ([CLS])
# token rather than pooled over all tokens — confirm.
outputs.pooler_output.shape

torch.Size([2, 768])

In [30]:
# No output was recorded for this cell. Presumably the field is None because the forward
# call above did not pass output_hidden_states=True — TODO confirm.
outputs.hidden_states

In [31]:
# No output was recorded for this cell. Presumably the field is None for this
# forward call — TODO confirm.
outputs.past_key_values

In [32]:
# No output was recorded for this cell. Presumably the field is None because the forward
# call above did not pass output_attentions=True — TODO confirm.
outputs.attentions

In [33]:
# No output was recorded for this cell. Presumably the field is None for this
# forward call — TODO confirm.
outputs.cross_attentions

# AutoTokenizer

https://huggingface.co/docs/transformers/main/autoclass_tutorial#autotokenizer

In [34]:
# AutoTokenizer is a factory: from_pretrained() returns the tokenizer class that matches
# the checkpoint, not an `AutoTokenizer` instance, so the variable is left unannotated.
# Note that this rebinds `tokenizer`, replacing the BertTokenizer created earlier.
tokenizer = AutoTokenizer.from_pretrained(version)

## tokenizer([sequence])

In [35]:
# Encode the same sentence twice as one batch and move the resulting tensors to `device`.
inputs = tokenizer(
    [sequence, sequence],             # batch of sentences
    truncation=True,                  # truncate inputs that exceed max_length
    padding=True,                     # padding strategy, one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length=max_length,          # maximum length; defaults to the model's maximum when not set
    add_special_tokens=True,          # add the special tokens to the text
    return_length=True,               # also return the encoded length
    return_attention_mask=True,       # also return the attention mask
    return_overflowing_tokens=False,  # True would also return the chunks that truncation drops by default
    return_tensors="pt",              # output tensor format: np / pt / tf / jax
).to(device)

# Printed one per line, in this order:
#   keys           - names of the returned fields
#   input_ids      - token ids
#   attention_mask - marks which positions hold real text
#   length         - encoded length of each sentence
print(
    inputs.keys(),
    inputs["input_ids"],
    inputs["attention_mask"],
    inputs["length"],
    sep="\n",
)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'length'])
tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([12, 12], device='cuda:0')


# AutoModel

https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModel

In [36]:
# AutoModel is a factory: from_pretrained() returns the concrete model class for the
# checkpoint, not an `AutoModel` instance, so the variable is left unannotated.
# Half precision is requested only when running on CUDA. `device` can fall back to the
# CPU, where float16 support is incomplete in some PyTorch builds, so float32 is used there.
# Note that this rebinds `model`, replacing the BertModel loaded earlier.
model = AutoModel.from_pretrained(
    version,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)

In [37]:
# Put the model in evaluation mode, then run one forward pass without autograd tracking.
# Only the token ids and the attention mask are forwarded; the other entries of
# `inputs` are not passed to the model.
model.eval()
with torch.inference_mode():
    outputs = model(**{name: inputs[name] for name in ("input_ids", "attention_mask")})
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3608,  0.2271, -0.3030,  ..., -0.4224,  0.6949,  0.6213],
         [-0.3276, -0.3762, -0.5044,  ..., -0.3660,  1.1588, -0.2188],
         [-0.4000, -0.4212,  0.4903,  ..., -0.4081,  0.8508, -0.0882],
         ...,
         [ 0.6786,  0.0645,  0.2290,  ..., -0.2903,  0.4909,  0.6316],
         [-0.1088, -0.1644, -0.2961,  ...,  0.2168,  0.2916, -0.5030],
         [ 0.7099,  0.4367, -0.4851,  ..., -0.0067, -0.1472, -0.2670]],

        [[-0.3608,  0.2271, -0.3030,  ..., -0.4224,  0.6949,  0.6213],
         [-0.3276, -0.3762, -0.5044,  ..., -0.3660,  1.1588, -0.2188],
         [-0.4000, -0.4212,  0.4903,  ..., -0.4081,  0.8508, -0.0882],
         ...,
         [ 0.6786,  0.0645,  0.2290,  ..., -0.2903,  0.4909,  0.6316],
         [-0.1088, -0.1644, -0.2961,  ...,  0.2168,  0.2916, -0.5030],
         [ 0.7099,  0.4367, -0.4851,  ..., -0.0067, -0.1472, -0.2670]]],
       device='cuda:0'), pooler_output=tensor([[-0.82

In [38]:
# 最后一层的输出
# Output of the last layer. The recorded shape [2, 12, 768] lines up with the batch of
# 2 sentences and the 12 token ids per sentence shown by the tokenizer cell.
outputs.last_hidden_state.shape

torch.Size([2, 12, 768])

In [39]:
# 对文字长度进行pool
# Pooled output: one vector per sentence, with the token-length axis gone.
# NOTE(review): per the Hugging Face BERT docs this is derived from the first ([CLS])
# token rather than pooled over all tokens — confirm.
outputs.pooler_output.shape

torch.Size([2, 768])

In [40]:
# No output was recorded for this cell. Presumably the field is None because the forward
# call above did not pass output_hidden_states=True — TODO confirm.
outputs.hidden_states

In [41]:
# No output was recorded for this cell. Presumably the field is None for this
# forward call — TODO confirm.
outputs.past_key_values

In [42]:
# No output was recorded for this cell. Presumably the field is None because the forward
# call above did not pass output_attentions=True — TODO confirm.
outputs.attentions

In [43]:
# No output was recorded for this cell. Presumably the field is None for this
# forward call — TODO confirm.
outputs.cross_attentions