# Importing Libraries
- Model used : https://huggingface.co/Salesforce/blip-image-captioning-large
- Advantage : Produces more accurate captions that match the image content
- Disadvantage : Prediction is slow

In [2]:
import json
import os
import random

import requests
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Setting up random Seed

In [3]:
# Seed Python's and PyTorch's global random generators with one shared value
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x273bef7a690>

# Initiating Model

In [4]:
# Checkpoint identifier shared by the processor and the model
MODEL_ID = "Salesforce/blip-image-captioning-large"
# Use the GPU when one is present; otherwise load on the CPU instead of failing in .to("cuda")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)

# Preprocessing & Function Definition

In [5]:
def predict(image_path, num_captions):
    """Sample several captions for one image with the BLIP model.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file. It is opened with PIL and converted to RGB.
    num_captions : int
        Number of captions to sample.

    Returns
    -------
    list of str
        ``num_captions`` decoded captions, each passed through
        ``str.capitalize()``. Empty when ``num_captions`` is zero or negative.

    Notes
    -----
    Every call that generates captions draws a fresh seed from the global
    ``random`` generator and reseeds both ``random`` and ``torch`` with it, so
    the global random state is modified as a side effect.
    """
    # Nothing to generate: return before touching the image or the random state
    if num_captions <= 0:
        return []

    # Open inside a context manager so the file handle is released right away;
    # only the converted RGB copy is used afterwards.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')

    # Place the inputs on the device the model is on rather than assuming "cuda"
    inputs = processor(image, return_tensors="pt").to(model.device)

    # Reseed so each image is sampled from its own (still reproducible) seed
    random_seed = random.randint(999, 1000000)
    random.seed(random_seed)
    torch.manual_seed(random_seed)

    # A single call already yields num_captions sampled sequences
    out = model.generate(
        **inputs,
        num_return_sequences=num_captions,
        do_sample=True,
        top_k=100,
        temperature=0.7,
        max_length=50,
    )

    return [
        processor.decode(sequence, skip_special_tokens=True).capitalize()
        for sequence in out[:num_captions]
    ]

# Preparing Path of Images

In [6]:
# Only regular files with one of these extensions are treated as images, so
# sub-directories and stray files (e.g. Thumbs.db, .DS_Store) are skipped.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp', '.tif', '.tiff')

# os.listdir gives no ordering guarantee; sorting fixes the order in which the
# images are captioned, which keeps the seeded sampling repeatable between runs.
# Paths keep the 'img/<filename>' form used as dictionary keys later on.
image_path = [
    'img/' + filename
    for filename in sorted(os.listdir('img'))
    if filename.lower().endswith(IMAGE_EXTENSIONS)
    and os.path.isfile(os.path.join('img', filename))
]

# Generating captions

In [7]:
# Map every image path to its list of five generated captions
Final = {}
for img_file in image_path:
    Final[img_file] = predict(img_file, 5)

# Printing Captions

In [8]:
# For each image: file name, one line per caption, then a blank separator line
for img_file, caption_list in Final.items():
    file_name = img_file.rsplit("/", 1)[-1]
    print(file_name)
    for text in caption_list:
        print(f'Caption : {text}')
    print()

Image1.png
Caption : Marvin adonis of manchester united during a match against liverpool united
Caption : Leroy is a very good player in soccer, just playing for the team
Caption : He is running with the ball on the field in a soccer game
Caption : Ronald ronald in action during a game against liverpool
Caption : Dancer on a soccer field in the middle of a run

Image2.png
Caption : Gloomy storm clouds hover over two horses in a field
Caption : Cloudy skies above two horses in a field with grass and rocks
Caption : Muddy two horses with a blue patch on their forehead and tail
Caption : Horsebacks are standing in a field under cloudy skies and black clouds
Caption : Boone of horses with blue tags on their heads standing in a field

Image3.png
Caption : Logos of five people are all about their roles
Caption : Diagram of a group of three people, with various emotions
Caption : Screenshot of people are all looking at the camera around the sun
Caption : Seating for a live dinner at the sun b

In [9]:
# Persist the {image path: captions} mapping as JSON text.
# `json` is imported in the notebook's import cell at the top.
# The encoding is set explicitly so the file does not depend on the platform default.
with open('Captions_BLIP.txt', 'w', encoding='utf-8') as convert_file:
    convert_file.write(json.dumps(Final))