In [1]:
%%capture
!pip install torch torchvision transformers
!pip install diffusers["torch"] transformers

In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, Dataset
from transformers import CLIPProcessor, CLIPModel, CLIPVisionModelWithProjection, AutoProcessor, CLIPTextModelWithProjection
from diffusers import UnCLIPScheduler, DiffusionPipeline, DDPMScheduler, StableUnCLIPPipeline, UnCLIPPipeline, StableUnCLIPImg2ImgPipeline
from PIL import Image
import seaborn as sns
import requests

In [3]:
from transformers import BlipConfig, BlipModel, BlipTextConfig, BlipVisionConfig, Blip2Processor, Blip2Model, Blip2ForConditionalGeneration

In [4]:
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

In [5]:
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", 
                                                      torch_dtype=torch.float16, 
                                                      device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
vision_model = model.vision_model
vision_model

Blip2VisionModel(
  (embeddings): Blip2VisionEmbeddings(
    (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (encoder): Blip2Encoder(
    (layers): ModuleList(
      (0-38): 39 x Blip2EncoderLayer(
        (self_attn): Blip2Attention(
          (dropout): Dropout(p=0.0, inplace=False)
          (qkv): Linear(in_features=1408, out_features=4224, bias=True)
          (projection): Linear(in_features=1408, out_features=1408, bias=True)
        )
        (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Blip2MLP(
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        )
        (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      )
    )
  )
  (post_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
)

In [33]:
lang_model = model.language_model

In [37]:
qformer = model.qformer

In [38]:
qformer

Blip2QFormerModel(
  (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder): Blip2QFormerEncoder(
    (layer): ModuleList(
      (0): Blip2QFormerLayer(
        (attention): Blip2QFormerAttention(
          (attention): Blip2QFormerMultiHeadAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): Blip2QFormerSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (crossattention): Blip2QFormerAttention(
          (attention): Blip2QFormerMultiHeadAttention(
            (query):

In [23]:
model.qformer.encoder.layer[11]

Blip2QFormerLayer(
  (attention): Blip2QFormerAttention(
    (attention): Blip2QFormerMultiHeadAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): Blip2QFormerSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate_query): Blip2QFormerIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output_query): Blip2QFormerOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [41]:
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

In [42]:
question = "how many dogs are in the picture?"

In [43]:
inputs = processor(raw_image, question, return_tensors="pt").to("cuda", torch.float16)

In [47]:
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True).strip())




In [44]:
output = model(**inputs)

In [49]:
q = output.qformer_outputs 

In [51]:
q.keys()

odict_keys(['last_hidden_state', 'pooler_output'])