<a href="https://colab.research.google.com/github/Hatim23-M/Image-Captioning/blob/main/Blip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
from transformers import BlipImageProcessor, BlipForConditionalGeneration, AutoTokenizer
from PIL import Image

In [3]:
# Load the model
model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-large')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-23): 24 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (projection): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1024,),

In [5]:
tokenizer = AutoTokenizer.from_pretrained('Salesforce/blip-image-captioning-large')
feature_extractor = BlipImageProcessor.from_pretrained('Salesforce/blip-image-captioning-large')

In [6]:
def caption_prediction(image_path):
  img = Image.open(image_path) 
  if img.mode != "RGB":
    img = img.convert(mode = "RGB")

  pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, max_length = 100)
  captions = tokenizer.batch_decode(output_ids, skip_special_tokens = True)

  return captions

In [7]:
caption_prediction('/content/Image2.png')

['there are two horses standing together in a field under a cloudy sky']

In [8]:
from transformers import BlipProcessor

In [9]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")

In [42]:
def caption_with_init_txt(image_path ,text):
  image = Image.open(image_path)
  if image.mode != "RGB":
    image = image.convert(mode = "RGB")

  inputs = processor(image, text, return_tensors="pt")
  inputs = inputs.to(device)

  out = model.generate(**inputs, max_length = 50)

  caption = processor.decode(out[0], skip_special_tokens=True)

  return caption

In [51]:
caption_with_init_txt('/content/Image3.png', "")

'our early adopters poster with four people and a sun'