## BLIP-2 

Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models

In [1]:
# Caption an image and answer questions about it.
# BLIP-2 links a ViT image encoder to a language model (OPT-2.7B in this notebook) through the Q-Former.

from transformers import (
    Blip2VisionConfig,
    Blip2VisionModel,
    Blip2QFormerModel,
    OPTConfig,
    Blip2Config,
    Blip2ForConditionalGeneration,
)


In [None]:
from PIL import Image
import requests
from transformers import Blip2Processor
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint shared by the processor and the model in this notebook.
model_to_use = "Salesforce/blip2-opt-2.7b"

processor = Blip2Processor.from_pretrained(model_to_use)

# Load the weights in half precision; the inputs below are cast to the same dtype.
# NOTE(review): float16 on a CPU-only machine may be slow or unsupported
# depending on the installed torch version - confirm before relying on CPU.
model = Blip2ForConditionalGeneration.from_pretrained(
    model_to_use, torch_dtype=torch.float16
)
model.to(device)

# Download the demo image. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error into a clear
# exception instead of an opaque PIL decoding failure on an error page.
url = "http://images.cocodataset.org/val2017/000000029769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Image-only inputs (no text prompt), moved to the model's device and dtype.
inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

# Generate token ids, decode the first (only) sequence and print the text.
generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)


In [None]:
# Display the PIL image opened above as this cell's output.
image

In [None]:
# Show the concrete class of the object returned by Image.open.
type(image)

### With provided text prompt

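Generation works with or without a text prompt: without one, the model produces text from the image alone (as in the cell above); with one, the model continues the prompt — here, answering a question about the image.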

In [None]:
# Visual question answering: the prompt is passed to the processor together
# with the image, and the model generates the text that follows "Answer:".
# The prompt deliberately has no trailing space: a dangling space is tokenized
# on its own and tends to degrade the continuation, and print() below already
# inserts a single space between the prompt and the answer.
prompt = "Question: How many cats are there? Answer:"

# Image + text inputs, moved to the model's device and cast to half precision.
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)

# Generate, decode the first (only) sequence and show it next to the prompt.
generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(prompt, generated_text)


In [None]:
from transformers import AutoProcessor

# Same captioning flow as before, but the processor is resolved through
# AutoProcessor from the checkpoint name.
processor_auto = AutoProcessor.from_pretrained(model_to_use)

# The model was loaded in float16 and moved to `device`, so the inputs must be
# moved and cast the same way (as the earlier cells do); otherwise generate()
# fails on a device or dtype mismatch.
inputs = processor_auto(image, return_tensors="pt").to(device, torch.float16)

# Allow up to 50 newly generated tokens, then decode the first (only) sequence.
generated_ids = model.generate(**inputs, max_new_tokens=50)
generated_text_2 = processor_auto.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text_2)

In [2]:

# Build a Blip2Config with no arguments, i.e. from the library defaults.
configuration = Blip2Config()

# Instantiate the model from the config alone, so its weights are randomly
# initialized rather than pretrained. NOTE: this rebinds `model`, replacing the
# pretrained checkpoint loaded earlier in the notebook.
model = Blip2ForConditionalGeneration(configuration)

In [3]:
# Read the configuration back from the model and print it. The printed
# Blip2Config nests separate sub-configs for the vision model, the Q-Former and
# the text model (vision_config, qformer_config, text_config).
configuration = model.config

print(configuration)

Blip2Config {
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "model_type": "blip-2",
  "num_query_tokens": 32,
  "qformer_config": {
    "model_type": "blip_2_qformer"
  },
  "text_config": {
    "model_type": "opt"
  },
  "transformers_version": "4.43.3",
  "use_decoder_only_language_model": true,
  "vision_config": {
    "model_type": "blip_2_vision_model"
  }
}



In [4]:
# Build only the vision component: a Blip2VisionConfig with no arguments
# (library defaults) and a Blip2VisionModel constructed from that config.
# NOTE: `model` is rebound again here and now refers to the vision model only.
configuration_vision = Blip2VisionConfig()
model = Blip2VisionModel(configuration_vision)
# Read the config back from the vision model for display.
configuration_2 = model.config

print(configuration_2)

In [None]:
# Display the current `model` (the Blip2VisionModel built above) as the cell output.
model