# Importing Libraries
- Model used : https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
- Advantage : It is fast
- Disadvantage : Predictions are often inaccurate

In [1]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import os
import torch
import random
from PIL import Image

# Initializing the Model

In [2]:
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Checking GPU

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

# Caption Generation

In [4]:
def predict_step(image_path, num_captions):
    """Generate up to ``num_captions`` distinct sampled captions for an image.

    Each caption comes from a separate sampling call to ``model.generate``.
    Captions already collected are skipped, so the returned list contains no
    duplicates.

    Args:
        image_path: Path of the image file to caption.
        num_captions: Number of distinct captions wanted.

    Returns:
        A list of caption strings in the order they were generated. It holds
        at most ``num_captions`` entries and may be shorter when sampling
        keeps producing captions that were already collected.
    """
    captions = []
    if num_captions <= 0:
        return captions

    # Open through a context manager so the file handle is released;
    # convert() returns a new, fully loaded 3-channel image that stays
    # usable after the file is closed.
    with Image.open(image_path) as raw_image:
        i_image = raw_image.convert(mode="RGB")

    # Preprocessing
    pixel_values = feature_extractor(images=[i_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Bound the number of sampling calls so that an image which only ever
    # yields a few distinct captions cannot loop forever.
    max_attempts = num_captions * 5
    seen = set()

    for _ in range(max_attempts):
        if len(captions) >= num_captions:
            break

        # Fresh seed for torch's sampler on every attempt. Python's own RNG
        # is deliberately not re-seeded from its own output.
        random_seed = random.randint(999, 1000000)
        torch.random.manual_seed(random_seed)

        sampled_output_ids = model.generate(pixel_values, do_sample=True)
        preds = tokenizer.batch_decode(sampled_output_ids, skip_special_tokens=True)

        # Filter out duplicates across ALL attempts, not just within one batch.
        for pred in preds:
            pred = pred.strip()
            if pred in seen:
                continue
            seen.add(pred)
            captions.append(pred)
            if len(captions) >= num_captions:
                break

    return captions

# Preparing Paths of Images

In [5]:
# Collect the paths of the files stored directly inside the 'img' folder.
# Sub-directories (for example notebook checkpoint folders) are skipped
# because they cannot be opened as images, and the result is sorted so the
# processing order does not depend on the file system's listing order.
image_path = sorted(
    'img/' + filename
    for filename in os.listdir('img')
    if os.path.isfile(os.path.join('img', filename))
)

# Predicting

In [6]:
# Map every image path to its list of five generated captions.
Final = {img_file: predict_step(img_file, 5) for img_file in image_path}



# Printing

In [7]:
# Print each image's file name followed by its captions and a blank line.
for img_key, caption_list in Final.items():
    # Keep only the file name, i.e. the part after the last "/".
    report_lines = [img_key.rsplit("/", 1)[-1]]
    report_lines.extend(f'Caption : {text}' for text in caption_list)
    print("\n".join(report_lines))
    print()

Image1.png
Caption : a player in an old time soccer arena making a kick
Caption : a man dressed in white kicks a soccer ball
Caption : man kicking soccer ball in stadium with audience watching
Caption : young male soccer player playing with a soccer ball
Caption : a man on a field with a soccer ball

Image2.png
Caption : woman in a pony field with short black hair
Caption : a woman is standing in a grassy field next to a horse
Caption : cows are grazing on a dry field
Caption : the woman is holding a white and black horse
Caption : a person sitting on top of a brown horse with no shirt on

Image3.png
Caption : a collage of pictures of people wearing tie-dye
Caption : a poster showing a girl with cat signs on it
Caption : pictures from a women in the 1950s
Caption : a collage of several different photographs of female tennis players
Caption : poster showing a number of different images on one page



In [8]:
import json
  
# Store the path -> captions mapping as a single JSON document.
with open('Captions_ViT.txt', mode='w') as captions_file:
    serialized = json.dumps(Final)
    captions_file.write(serialized)