In [8]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [2]:
# Load BLIP processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# The processor prepares the image and question for the VQA model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-large")

# The model is trained for Visual Question Answering tasks
# It can answer questions about objects, actions, locations, and yes/no questions
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-large")

In [None]:
img_url = 'https://github.com/HichamBouzalim/BLIP-for-image-captioning/blob/main/Picture.jpg?raw=true'


In [11]:
response = requests.get(img_url)
response.raise_for_status() 

In [3]:
# Image URL 
img_url = 'https://github.com/HichamBouzalim/BLIP-for-image-captioning/blob/main/Picture.jpg?raw=true'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

In [4]:
# Specify the question you want to ask about the image
question = "Describe to me who is in the picture?"

In [None]:
# Use the processor to prepare inputs for VQA (image + question)
inputs = processor(raw_image, question, return_tensors="pt")


In [6]:
# Generate the answer from the model
out = model.generate(**inputs)

In [7]:
# Decode and print the answer to the question
answer = processor.decode(out[0], skip_special_tokens=True)
print(f"Answer: {answer}")

Answer: describe to me who is in the picture?
