# Image Captioning, Image-to-text : BLIP-2

## BLIP

**BLIP (Bootstrapping Language-Image Pre-training)**

**Model settings in Blip2Config**

In [2]:
from transformers import Blip2Config

In [3]:
# Hub identifier of the BLIP-2 checkpoint; later cells reuse `model_name`
# when loading the model itself.
model_name = "Salesforce/blip2-opt-2.7b"

# Load the configuration object for that checkpoint and print it for inspection.
config = Blip2Config.from_pretrained(model_name)
print(config)

Blip2Config {
  "architectures": [
    "Blip2ForConditionalGeneration"
  ],
  "image_text_hidden_size": 256,
  "image_token_index": null,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "model_type": "blip-2",
  "num_query_tokens": 32,
  "qformer_config": {
    "classifier_dropout": null,
    "model_type": "blip_2_qformer"
  },
  "text_config": {
    "_name_or_path": "facebook/opt-2.7b",
    "activation_dropout": 0.0,
    "architectures": [
      "OPTForCausalLM"
    ],
    "eos_token_id": 50118,
    "ffn_dim": 10240,
    "hidden_size": 2560,
    "model_type": "opt",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "prefix": "</s>",
    "torch_dtype": "float16",
    "word_embed_proj_dim": 2560
  },
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "use_decoder_only_language_model": true,
  "vision_config": {
    "dropout": 0.0,
    "initializer_factor": 1.0,
    "model_type": "blip_2_vision_model",
    "num_channels": 3,
    "projection_dim": 

**Model structure of Blip2ForConditionalGeneration**

In [4]:
import torch
from transformers import Blip2ForConditionalGeneration

2024-11-07 00:33:20.855043: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-07 00:33:20.949538: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-07 00:33:20.953333: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-11-07 00:33:20.953343: I tensorflow/stream_executor/cuda

In [5]:
# Instantiate the model from the checkpoint named by `model_name`, requesting
# float16 weights and automatic device placement.
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Marker printed in front of a submodule name, indexed by nesting depth.
# Direct children of the model (depth 0) are printed without any marker.
_TREE_MARKERS = (None, "L", "| L", "|  L")


def _print_module_tree(module, depth=0):
    """Print the names of ``module``'s descendants, at most four levels deep."""
    if depth >= len(_TREE_MARKERS):
        return
    marker = _TREE_MARKERS[depth]
    for child_name, child in module.named_children():
        if marker is None:
            print(child_name)
        else:
            print(marker, child_name)
        _print_module_tree(child, depth + 1)


_print_module_tree(model)

vision_model
L embeddings
| L patch_embedding
L encoder
| L layers
|  L 0
|  L 1
|  L 2
|  L 3
|  L 4
|  L 5
|  L 6
|  L 7
|  L 8
|  L 9
|  L 10
|  L 11
|  L 12
|  L 13
|  L 14
|  L 15
|  L 16
|  L 17
|  L 18
|  L 19
|  L 20
|  L 21
|  L 22
|  L 23
|  L 24
|  L 25
|  L 26
|  L 27
|  L 28
|  L 29
|  L 30
|  L 31
|  L 32
|  L 33
|  L 34
|  L 35
|  L 36
|  L 37
|  L 38
L post_layernorm
qformer
L layernorm
L dropout
L encoder
| L layer
|  L 0
|  L 1
|  L 2
|  L 3
|  L 4
|  L 5
|  L 6
|  L 7
|  L 8
|  L 9
|  L 10
|  L 11
language_projection
language_model
L model
| L decoder
|  L embed_tokens
|  L embed_positions
|  L final_layer_norm
|  L layers
L lm_head


**BLIP-2 모델의 vision_model**

In [7]:
import torch
from datasets import load_dataset
from transformers import Blip2Processor, Blip2ForConditionalGeneration

In [8]:
# Checkpoint shared by the processor and the model.
model_name = "Salesforce/blip2-opt-2.7b"

# Load the processor, then the model with float16 weights and automatic
# device placement.
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Load the sample dataset and take the image of the first row in its
# "test" split.
dataset = load_dataset("huggingface/cats-image")

image = dataset["test"][0]["image"]

In [10]:
# Preprocess the image into PyTorch tensors, then move them to the model's
# device as float16 (matching the dtype the model was loaded with).
inputs = processor(images=image, return_tensors="pt").to(
    model.device, dtype=torch.float16
)

In [11]:
# Run the vision encoder on the preprocessed pixels and keep its final hidden
# states. This is inference only, so gradient tracking is disabled to avoid
# building an autograd graph (and retaining its activations) for this pass.
with torch.no_grad():
    image_embeds = model.vision_model(
        inputs["pixel_values"], return_dict=True
    ).last_hidden_state

In [12]:
# Show the vision encoder's architecture, then the embeddings it produced and
# their shape, each on its own line.
print(model.vision_model, image_embeds, image_embeds.shape, sep="\n")

Blip2VisionModel(
  (embeddings): Blip2VisionEmbeddings(
    (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (encoder): Blip2Encoder(
    (layers): ModuleList(
      (0-38): 39 x Blip2EncoderLayer(
        (self_attn): Blip2Attention(
          (dropout): Dropout(p=0.0, inplace=False)
          (qkv): Linear(in_features=1408, out_features=4224, bias=True)
          (projection): Linear(in_features=1408, out_features=1408, bias=True)
        )
        (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Blip2MLP(
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        )
        (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      )
    )
  )
  (post_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
)
tensor([[[ 0.1372,  0.1934,  0.5537,  ..., -0.2

**qformer module**

In [13]:
# All-ones integer mask shaped like `image_embeds` without its last
# dimension; it is handed to the Q-Former as `encoder_attention_mask` below.
image_attention_mask = torch.ones(
    image_embeds.shape[:-1],
    dtype=torch.long,
    device=model.device,
)

In [14]:
# Shape of the model's `query_tokens` tensor.
model.query_tokens.shape

torch.Size([1, 32, 768])

In [15]:
# Broadcast the query tokens along the first dimension so there is one set
# per entry in the image batch; the other dimensions are kept as they are.
query_tokens = model.query_tokens.expand(image_embeds.size(0), -1, -1)

In [16]:
# Shape of the expanded query tokens.
query_tokens.shape

torch.Size([1, 32, 768])

In [17]:
# Feed the query tokens to the Q-Former, with the image embeddings (and their
# mask) supplied as the encoder states. This is inference only, so gradient
# tracking is disabled to avoid building an autograd graph for the forward
# pass.
with torch.no_grad():
    query_outputs = model.qformer(
        query_embeds=query_tokens,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_attention_mask,
        return_dict=True,
    )

In [18]:
# Keep only the final hidden states of the Q-Former output.
# `getattr` with a fallback makes this cell safe to re-run: on a second run
# `query_outputs` is already the tensor and has no `last_hidden_state`
# attribute, so it is left unchanged instead of raising AttributeError.
query_outputs = getattr(query_outputs, "last_hidden_state", query_outputs)

In [19]:
# Shapes of the image mask, the query tokens and the Q-Former output, each on
# its own line.
print(
    image_attention_mask.shape,
    query_tokens.shape,
    query_outputs.shape,
    sep="\n",
)

torch.Size([1, 257])
torch.Size([1, 32, 768])
torch.Size([1, 32, 768])


**language_model**

In [20]:
# Pass the Q-Former output through the model's `language_projection` module;
# the result is concatenated with the text token embeddings further below.
language_model_inputs = model.language_projection(query_outputs)

In [22]:
# Shape of the projected Q-Former output.
language_model_inputs.shape

torch.Size([1, 32, 2560])

In [23]:
# All-ones integer mask covering every position of the projected query
# outputs (the shape of `language_model_inputs` without its last dimension).
language_attention_mask = torch.ones(
    language_model_inputs.shape[:-1], dtype=torch.long, device=model.device
)

In [24]:
# One BOS token id per image in the batch: an int64 tensor of shape
# (batch, 1) created directly on the model's device.
input_ids = torch.full(
    (inputs["pixel_values"].shape[0], 1),
    model.config.text_config.bos_token_id,
    dtype=torch.long,
    device=model.device,
)

In [25]:
# Display the starting token ids built in the previous cell.
input_ids

tensor([[2]], device='cuda:0')

In [26]:
# Full attention mask: the mask for the projected query outputs followed by
# an all-ones mask for the text token ids, joined along dimension 1.
attention_mask = torch.cat(
    (language_attention_mask, torch.ones_like(input_ids).to(model.device)),
    dim=1,
)

In [27]:
# Display the combined attention mask.
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')

In [28]:
# Embed the text token ids with the model's input embedding layer and append
# them, along dimension 1, to the projected query outputs.
inputs_embeds = torch.cat(
    (
        language_model_inputs,
        model.get_input_embeddings()(input_ids).to(model.device),
    ),
    dim=1,
)

In [30]:
# Shape of the concatenated embeddings given to the language model.
inputs_embeds.shape

torch.Size([1, 33, 2560])

In [31]:
# Generate token ids with the language model directly from the prepared
# embeddings and mask, using max_length=50 as the length limit.
outputs = model.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=attention_mask,
    max_length=50,
)

In [32]:
# Display the generated token ids.
outputs

tensor([[ 7109, 10017, 11963,    15,    10, 16433, 50118]], device='cuda:0')

In [33]:
# Shape of the generated token id tensor.
outputs.size()

torch.Size([1, 7])

**Image captioning**

In [34]:
import torch
from datasets import load_dataset
from transformers import Blip2Processor, Blip2ForConditionalGeneration

In [35]:
# Checkpoint used for the end-to-end captioning example.
model_name = "Salesforce/blip2-opt-2.7b"

# Load the processor, then the model with float16 weights and automatic
# device placement.
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
# Load the sample dataset and take the image of the first row in its
# "test" split.
dataset = load_dataset("huggingface/cats-image")

image = dataset["test"][0]["image"]

# Preprocess the image alone (no text prompt) and move the tensors to the
# model's device as float16.
inputs = processor(images=image, return_tensors="pt").to(
    model.device, dtype=torch.float16
)

# Generate caption token ids (max_length=50) and show them raw.
generated_ids = model.generate(**inputs, max_length=50)
print(generated_ids)

# Decode the first sequence to text, dropping special tokens and surrounding
# whitespace.
generated_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True
)[0].strip()
print(generated_text)

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


tensor([[    2,  7109, 10017, 11963,    15,    10, 16433, 50118]],
       device='cuda:0')
two cats laying on a couch


In [37]:
# Prompted generation: condition the model on a question about the image.
# The prompt text is part of the model input, so the typo in the original
# wording ("th image") is corrected to "the image".
prompt = "Question: Describe the location of the image. Answer:"
inputs = processor(images=image, text=prompt, return_tensors="pt").to(
    model.device, dtype=torch.float16
)

# Generate with the default settings, then decode the first sequence to
# text, dropping special tokens and surrounding whitespace.
generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True
)[0].strip()
print(generated_text)

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


A couch


# 문서 질의 응답: LayoutLM