<a href="https://colab.research.google.com/github/Naveenand/Computer-vision/blob/main/image_caption_generator_using_pre_trained_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install -q kaggle
!pip install datasets rouge_score

In [None]:
!cp kaggle.json ~/.kaggle/
!chmod 600 kaggle.json
!kaggle datasets download -d adityajn105/flickr8k

Downloading flickr8k.zip to /content
100% 1.04G/1.04G [00:04<00:00, 245MB/s]
100% 1.04G/1.04G [00:04<00:00, 228MB/s]


In [None]:
!unzip /content/flickr8k.zip

In [None]:
import os

In [None]:
import numpy as np
from tqdm.notebook import tqdm
from datasets import load_metric

In [None]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [None]:
caption_file = '/content/captions.txt'
image_directory = '/content/Images'

In [None]:
print(len(os.listdir(image_directory)))

8091


#Load the Captions Data

In [None]:
from tqdm import tqdm

def create_image_caption_mapping(caption_file):
    # Open the caption file
    with open(caption_file, 'r') as f:
        next(f)
        captions_doc = f.read()

    # Create a mapping of images to captions
    mapping = {}


    for line in tqdm(captions_doc.split('\n')):
        tokens = line.split(',')
        if len(tokens) < 2:
            continue
        image_id, caption = tokens[0], tokens[1:]
        image_id = image_id.split('.')[0]
        caption = " ".join(caption)
        if image_id not in mapping:
            mapping[image_id] = []
        mapping[image_id].append(caption)

    return mapping

In [None]:
mapping = create_image_caption_mapping(caption_file)

100%|██████████| 40456/40456 [00:00<00:00, 157639.09it/s]


In [None]:
len(mapping)

8091

In [None]:
mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [None]:
import numpy as np
from PIL import Image

# Open the image
image = Image.open("/content/Images/1000268201_693b08cb0e.jpg")

# Convert the image to a NumPy array
image_array = np.array(image)

# Get the shape of the array
height, width, channels = image_array.shape

# Print the shape
print("Image Width:", width)
print("Image Height:", height)
print("Number of Channels:", channels)


Image Width: 375
Image Height: 500
Number of Channels: 3


#Model

In [None]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

In [None]:
def generate_caption(image_path, text, max_length=None):
    # Load the image
    raw_image = Image.open(image_path)

    # Conditional image captioning
    inputs = processor(raw_image, text, return_tensors="pt")
    out = model.generate(**inputs,max_new_tokens=max_length)
    return processor.decode(out[0], skip_special_tokens=True)

In [None]:
def generate_captions_for_images(image_directory, mapping, max_images=5):
    # Initialize a dictionary to store generated captions
    generated_captions = {}

    # Variable to keep track of the number of processed images
    processed_images = 0

    for image_name, captions in mapping.items():
        if processed_images >= max_images:
            break  # Exit the loop if the maximum number of images is reached

        image_path = os.path.join(image_directory, image_name + ".jpg")
        for caption in captions:
            generated_caption = generate_caption(image_path, caption)
            if image_name not in generated_captions:
                generated_captions[image_name] = [generated_caption]
            else:
                generated_captions[image_name].append(generated_caption)

        processed_images += 1  # Increment the processed image count

    return generated_captions

In [None]:
max_length = 30

In [None]:
generated_captions = generate_captions_for_images(image_directory, mapping)



In [None]:
generated_captions.keys()

dict_keys(['1000268201_693b08cb0e', '1001773457_577c3a7d70', '1002674143_1b742ab4b8', '1003163366_44323f5815', '1007129816_e794419615'])

In [None]:
list(mapping.values())[:5]

[['A child in a pink dress is climbing up a set of stairs in an entry way .',
  'A girl going into a wooden building .',
  'A little girl climbing into a wooden playhouse .',
  'A little girl climbing the stairs to her playhouse .',
  'A little girl in a pink dress going into a wooden cabin .'],
 ['A black dog and a spotted dog are fighting',
  'A black dog and a tri-colored dog playing with each other on the road .',
  'A black dog and a white dog with brown spots are staring at each other in the street .',
  'Two dogs of different breeds looking at each other on the road .',
  'Two dogs on pavement moving toward each other .'],
 ['A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .',
  'A little girl is sitting in front of a large painted rainbow .',
  'A small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it .',
  'There is a girl with pigtails sitting in front of a rainbow painting .',
  'Young girl w

#ROUGE

In [None]:
rouge_metric = load_metric('rouge')

In [None]:
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

In [None]:
reference = list(mapping.values())[:5]
rouge_metric.add(prediction =  list(generated_captions.values())[:5], reference = reference )
score = rouge_metric.compute()
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
print('rouge_dict ', rouge_dict )
records.append(rouge_dict)

rouge_dict  {'rouge1': 0.9620253164556963, 'rouge2': 0.9365079365079365, 'rougeL': 0.9620253164556963, 'rougeLsum': 0.9620253164556963}


In [None]:
rouge_dict

{'rouge1': 0.9620253164556963,
 'rouge2': 0.9365079365079365,
 'rougeL': 0.9620253164556963,
 'rougeLsum': 0.9620253164556963}

In [None]:
def rouge_function(mapping,generated_captions):
  rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
  reference = list(mapping.values())[:5]
  rouge_metric.add(prediction =  list(generated_captions.values())[:5], reference = reference )
  score = rouge_metric.compute()
  rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
  return rouge_dict

In [None]:
def functionone(caption_file,image_directory):
  mapping = create_image_caption_mapping(caption_file)
  generated_captions = generate_captions_for_images(image_directory, mapping)
  rouge_function(mapping,generated_captions)
  print('rouge score',rouge_function)