In [1]:
import torch
from transformers import BlipProcessor, BlipTextModel

In [2]:
# Run on the first CUDA GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Hub checkpoint id used for both the processor and the text model in the cells below.
version = "Salesforce/blip-image-captioning-base"

# Small batch of sample captions; the last one is longer, so padding is exercised.
text = [
    "a photo of 2 cats",
    "a photo of a dog",
    "a plane in the blue sky",
]

# BlipProcessor

In [4]:
# Load the processor published with the checkpoint. Its repr (shown below) lists the two
# components it bundles: a BlipImageProcessor and a BertTokenizerFast.
processor: BlipProcessor = BlipProcessor.from_pretrained(
    version,
)
processor

BlipProcessor:
- image_processor: BlipImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "BlipImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "BlipProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 384,
    "width": 384
  }
}

- tokenizer: BertTokenizerFast(name_or_path='Salesforce/blip-image-captioning-base', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

## processor

In [5]:
# Tokenize the captions into one padded batch of PyTorch tensors.
inputs = processor(
    text=text,                # a single string or a list of strings
    return_tensors="pt",      # output tensor format: "np", "pt", "tf" or "jax"
    padding=True,             # padding strategy: True / "longest" / "max_length" / "do_not_pad"
    # max_length=max_length,  # when using max_length, also set padding="max_length"
    add_special_tokens=True,  # add the special tokens ([CLS] ... [SEP]) around each caption
)
# Move to the target device only. A text-only batch holds integer tensors (input_ids and
# attention_mask), so a float16 dtype argument has nothing to cast, and the tokenizer's
# BatchEncoding.to() does not accept a positional dtype in every Transformers release.
inputs = inputs.to(device)
inputs

{'input_ids': tensor([[ 101, 1037, 6302, 1997, 1016, 8870,  102,    0],
        [ 101, 1037, 6302, 1997, 1037, 3899,  102,    0],
        [ 101, 1037, 4946, 1999, 1996, 2630, 3712,  102]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [6]:
# Token ids of the batch, one row per caption. Judging by the decoded strings printed in the
# batch_decode section, ids 101 / 102 / 0 correspond to [CLS] / [SEP] / [PAD].
inputs["input_ids"]

tensor([[ 101, 1037, 6302, 1997, 1016, 8870,  102,    0],
        [ 101, 1037, 6302, 1997, 1037, 3899,  102,    0],
        [ 101, 1037, 4946, 1999, 1996, 2630, 3712,  102]], device='cuda:0')

## batch_decode

In [7]:
# Decode the whole batch twice: first keeping the special tokens ([CLS], [SEP], [PAD]),
# then with them stripped so only the original captions remain.
batch_ids = inputs["input_ids"]
for drop_special in (False, True):
    print(processor.batch_decode(batch_ids, skip_special_tokens=drop_special))

['[CLS] a photo of 2 cats [SEP] [PAD]', '[CLS] a photo of a dog [SEP] [PAD]', '[CLS] a plane in the blue sky [SEP]']
['a photo of 2 cats', 'a photo of a dog', 'a plane in the blue sky']


## decode

In [8]:
# Decode a single sequence (the first caption), with and without special tokens.
first_caption_ids = inputs["input_ids"][0]
raw_text = processor.decode(first_caption_ids)
clean_text = processor.decode(first_caption_ids, skip_special_tokens=True)
print(raw_text)
print(clean_text)

[CLS] a photo of 2 cats [SEP] [PAD]
a photo of 2 cats


# BlipTextModel

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers.

In [9]:
# Load the text model in half precision, then place it on the selected device.
# NOTE(review): the warning printed below lists encoder.* and pooler.* parameters as
# "newly initialized", i.e. they were not loaded from this checkpoint. The activations
# inspected in later cells may therefore come from untrained weights - confirm that
# loading BlipTextModel directly from the captioning checkpoint is intended.
text_model: BlipTextModel = BlipTextModel.from_pretrained(
    version,
    torch_dtype=torch.float16,
)
text_model = text_model.to(device)
text_model

Some weights of BlipTextModel were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['encoder.layer.5.output.dense.bias', 'encoder.layer.11.crossattention.output.dense.bias', 'encoder.layer.9.attention.self.query.bias', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.3.crossattention.self.value.weight', 'encoder.layer.4.output.LayerNorm.bias', 'encoder.layer.7.output.LayerNorm.bias', 'encoder.layer.3.output.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.5.attention.self.query.bias', 'encoder.layer.4.attention.output.dense.weight', 'encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.8.crossattention.self.key.bias', 'encoder.layer.9.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.self.key.weight', 'pooler.dense.bias', 'encoder.layer.9.attention.self.key.bias', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.9.cros

BlipTextModel(
  (embeddings): BlipTextEmbeddings(
    (word_embeddings): Embedding(30524, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): BlipTextEncoder(
    (layer): ModuleList(
      (0-11): 12 x BlipTextLayer(
        (attention): BlipTextAttention(
          (self): BlipTextSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): BlipTextSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )

In [10]:
# Inference only: put the module in eval mode and run the forward pass without autograd.
text_model.eval()
with torch.inference_mode():
    # The tokenized batch holds exactly these two tensors (see the printed `inputs` above).
    outputs = text_model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-2.2979, -1.7190, -0.8251,  ..., -0.5837,  2.1027,  1.2016],
         [-1.1430, -1.1887,  0.7231,  ..., -0.7695, -0.9998,  1.8963],
         [-0.3444, -1.5037,  0.2573,  ..., -1.1309,  0.2408,  2.7538],
         ...,
         [-0.7810, -0.5631, -1.3456,  ..., -1.0601,  0.8383,  2.9879],
         [-0.1547,  0.0396,  0.6592,  ...,  0.0644, -0.0628,  2.0471],
         [-1.5574,  0.0929,  0.1392,  ..., -1.7714,  1.4565,  2.4659]],

        [[-1.8166, -1.3529, -0.7097,  ..., -1.0158,  2.0907,  1.1875],
         [-0.8335, -1.0432,  0.7825,  ..., -1.0618, -0.7747,  1.9698],
         [ 0.0682, -1.1981,  0.2835,  ..., -1.4119,  0.1731,  2.5956],
         ...,
         [ 0.3892, -1.7910,  0.1197,  ...,  0.0439,  0.8506,  3.1678],
         [ 0.0994,  0.3958,  0.7491,  ..., -0.2503, -0.1343,  1.8826],
         [-1.0241,  0.4199,  0.1813,  ..., -2.0382,  1.3635,  2.2894]],

        [[-1.6700, -0.9050, -0.9313,  ..., -0.2866,  

In [11]:
# Output of the last layer: one vector per token. The shape printed below is
# (3, 8, 768), matching the 3 captions padded to 8 tokens each.
outputs.last_hidden_state.shape

torch.Size([3, 8, 768])

In [12]:
# Pooled output: the token (sequence-length) dimension is collapsed, leaving one vector
# per caption - shape (3, 768) below.
# NOTE(review): presumably derived from the first ([CLS]) token as in BERT-style poolers,
# rather than an average over tokens - confirm against the model implementation.
outputs.pooler_output.shape

torch.Size([3, 768])

In [13]:
# Per-layer hidden states of the forward pass.
# NOTE(review): this cell displays nothing, which suggests the value is None - presumably
# it is only populated when the model is called with output_hidden_states=True; confirm.
outputs.hidden_states

In [14]:
# Per-layer attention weights of the forward pass.
# NOTE(review): this cell displays nothing, which suggests the value is None - presumably
# it is only populated when the model is called with output_attentions=True; confirm.
outputs.attentions