In [1]:
%%capture
!pip install torch torchvision transformers
!pip install diffusers["torch"] transformers

In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, Dataset
from transformers import CLIPProcessor, CLIPModel, CLIPVisionModelWithProjection, AutoProcessor, CLIPTextModelWithProjection
from diffusers import UnCLIPScheduler, DiffusionPipeline, DDPMScheduler, StableUnCLIPPipeline, UnCLIPPipeline, StableUnCLIPImg2ImgPipeline
from PIL import Image
import seaborn as sns
import requests

In [3]:
from transformers import BlipConfig, BlipModel, BlipTextConfig, BlipVisionConfig, Blip2Processor, Blip2Model, Blip2ForConditionalGeneration

In [4]:
# Load the processor that ships with the BLIP-2 OPT-2.7B checkpoint; it is used
# below to turn an (image, text) pair into model-ready tensors.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

In [None]:
# Load the conditional-generation variant of BLIP-2 rather than the bare
# Blip2Model: later cells call model.generate() on image + text inputs, which is
# implemented by Blip2ForConditionalGeneration.
# The weights are loaded in half precision and moved to the GPU so that they sit
# on the same device, and use the same dtype, as the processor outputs, which
# are sent to "cuda" as float16 before being fed to the model.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
).to("cuda")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Download the BLIP demo picture and decode it as an RGB PIL image.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
# The timeout (seconds) keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors directly instead of letting PIL fail
# later on a non-image error page. convert() fully decodes the image, so the
# response can safely be closed when the `with` block exits.
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    raw_image = Image.open(response.raw).convert('RGB')

In [15]:
# Text prompt that is passed to the processor together with the image.
question = "how many dogs are in the picture?"

In [16]:
# Preprocess the image/question pair into PyTorch tensors, then move the batch
# to the GPU as float16.
inputs = processor(raw_image, question, return_tensors="pt")
inputs = inputs.to("cuda", torch.float16)

In [17]:
# Generate token ids conditioned on the image and prompt, then decode the first
# (only) sequence back to text and print it without surrounding whitespace.
out = model.generate(**inputs)
answer = processor.decode(out[0], skip_special_tokens=True)
print(answer.strip())






In [55]:
# Keep a handle on the vision encoder sub-module so it can be run on its own;
# the bare name on the last line displays its layer structure.
vision_model = model.vision_model
vision_model

Blip2VisionModel(
  (embeddings): Blip2VisionEmbeddings(
    (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (encoder): Blip2Encoder(
    (layers): ModuleList(
      (0-38): 39 x Blip2EncoderLayer(
        (self_attn): Blip2Attention(
          (dropout): Dropout(p=0.0, inplace=False)
          (qkv): Linear(in_features=1408, out_features=4224, bias=True)
          (projection): Linear(in_features=1408, out_features=1408, bias=True)
        )
        (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Blip2MLP(
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        )
        (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      )
    )
  )
  (post_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
)

In [51]:
# Keep a handle on the language-model sub-module so it can be run on its own;
# the bare name on the last line displays its layer structure.
lang_model = model.language_model
lang_model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 2560, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
      (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): Laye

In [13]:
# Display one Q-Former encoder layer (index 2) to inspect the sub-modules it is
# built from.
model.qformer.encoder.layer[2]

Blip2QFormerLayer(
  (attention): Blip2QFormerAttention(
    (attention): Blip2QFormerMultiHeadAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): Blip2QFormerSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (crossattention): Blip2QFormerAttention(
    (attention): Blip2QFormerMultiHeadAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=1408, out_features=768, bias=True)
      (value): Linear(in_features=1408, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): Blip2QFormerSelfOutput(
      (dense): L

In [64]:
# Run the complete model once on the prepared inputs and list the fields of the
# returned output object.
# This is inference only: disabling autograd stops PyTorch from recording a
# graph (and keeping the intermediate activations alive through the global
# `output`) for a backward pass that never happens.
with torch.no_grad():
    output = model(**inputs)
output.keys()

odict_keys(['logits', 'vision_outputs', 'qformer_outputs', 'language_model_outputs'])

In [19]:
# Take the Q-Former part of the full forward-pass output and list its fields.
q = output.qformer_outputs 
q.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [20]:
# List the entries the processor produced for the image/question pair.
inputs.keys()

dict_keys(['pixel_values', 'input_ids', 'attention_mask'])

In [56]:
# Construct layer by layer: run only the vision encoder on the preprocessed
# image, additionally requesting its attention maps.
# Inference only, so autograd is disabled; otherwise the global `x` would keep
# the whole encoder's autograd graph and activations alive on the GPU.
with torch.no_grad():
    x = vision_model(inputs['pixel_values'], output_attentions=True)

In [57]:
# List the fields returned by the stand-alone vision encoder call.
x.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'attentions'])

In [24]:
# Shape of the vision encoder's final hidden states
# ([1, 257, 1408] in the recorded run).
x['last_hidden_state'].size()

torch.Size([1, 257, 1408])

In [27]:
# Shape of the vision encoder's pooled output ([1, 1408] in the recorded run).
x['pooler_output'].size()

torch.Size([1, 1408])

In [68]:
# Number of attention entries returned by the vision encoder (39 in the
# recorded run, matching the 39 encoder layers shown in its repr).
len(x['attentions'])

39

In [31]:
# Construct layer by layer: grab the Q-Former's top-level LayerNorm and dropout
# modules (presumably applied to its input before the encoder layers -- confirm
# against the transformers source).
q_norm, q_dropout = model.qformer.layernorm, model.qformer.dropout

In [74]:
# The Q-Former consumes the model's learned query embeddings, not tokenised
# text. Passing input_ids as its first argument feeds [1, 9] integer ids into a
# 768-wide LayerNorm, which raises a RuntimeError. Instead, broadcast the learned
# queries over the batch and let them cross-attend to the vision encoder output.
image_embeds = x['last_hidden_state']
query_tokens = model.query_tokens.expand(image_embeds.shape[0], -1, -1)
# Every image position is a real (non-padding) token, so the mask is all ones.
image_attention_mask = torch.ones(
    image_embeds.shape[:-1], dtype=torch.long, device=image_embeds.device
)
# Inference only: no autograd graph is needed.
with torch.no_grad():
    qformer_out = model.qformer(
        query_embeds=query_tokens,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_attention_mask,
    )
qformer_out['last_hidden_state'].shape

In [50]:
# Display only the `attention` sub-module (as opposed to `crossattention`) of
# Q-Former encoder layer 2.
model.qformer.encoder.layer[2].attention

Blip2QFormerAttention(
  (attention): Blip2QFormerMultiHeadAttention(
    (query): Linear(in_features=768, out_features=768, bias=True)
    (key): Linear(in_features=768, out_features=768, bias=True)
    (value): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (output): Blip2QFormerSelfOutput(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [89]:
# Shape of the Q-Former hidden states produced during the full forward pass
# ([1, 32, 768] in the recorded run).
output['qformer_outputs']['last_hidden_state'].shape

torch.Size([1, 32, 768])

In [85]:
# Display the architecture of the complete model.
model

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

In [92]:
# Run the language model on the text tokens alone (no image information) so its
# output can be compared with the language-model output of the full model.
# Inference only, so autograd is disabled to avoid retaining the decoder's
# activations through the global `lang_out`.
with torch.no_grad():
    lang_out = lang_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

In [93]:
# List the fields returned by the stand-alone language-model call.
lang_out.keys()

odict_keys(['logits', 'past_key_values'])

In [95]:
# List the fields of the language-model output nested inside the full model's
# output, for comparison with the stand-alone call.
output['language_model_outputs'].keys()

odict_keys(['logits', 'past_key_values'])

In [101]:
# Print the logits size of the text-only language-model run, then that of the
# full model. In the recorded run the sequence lengths are 9 and 41, which is
# consistent with 32 Q-Former query positions being prepended to the 9 text
# tokens.
for logits in (lang_out['logits'], output['language_model_outputs']['logits']):
    print(logits.size())

torch.Size([1, 9, 50272])
torch.Size([1, 41, 50272])
