# Image Caption Generator

In [1]:
#Importing Libraries

import gc
import os
import random
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import image as mpimg
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical



In [2]:
# Mount Google Drive at /content/drive (the dataset paths used by this notebook live under it).
# Colab-only: the google.colab package is not available outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




---



In [3]:
# Set up paths (both located on the mounted Google Drive):
#   image_dir    - directory that holds the image files
#   caption_file - text file that holds the image captions
image_dir = '/content/drive/MyDrive/Image_Caption_Generator/dataset/Images'
caption_file = '/content/drive/MyDrive/Image_Caption_Generator/dataset/captions.txt'

In [4]:
#Pre-processing images
def preprocess_image(image_path, target_size=(299, 299)):
    """Load an image from disk and prepare it as input for InceptionV3.

    Args:
        image_path: Path of the image file to load.
        target_size: (height, width) the image is resized to while loading.
            Defaults to 299x299.

    Returns:
        Float array of the resized image with pixel values scaled to [-1, 1].
    """
    img = load_img(image_path, target_size=target_size)
    # The ImageNet weights of InceptionV3 were trained on inputs scaled to
    # [-1, 1]; Keras' own preprocess_input applies exactly that scaling.
    # Dividing by 255 (range [0, 1]) feeds the network out-of-distribution
    # inputs and degrades the extracted features.
    return preprocess_input(img_to_array(img))


In [8]:
#Loading captions line by line to avoid excessive memory usage
def load_captions_in_batches(caption_file):
    """Parse a captions file into ``{image_id: [caption, ...]}``.

    Each usable line has the form ``<image filename>,<caption>``. The image id
    is the filename without its extension; every caption is lower-cased,
    stripped of punctuation and wrapped as ``<start> ... <end>``.

    Args:
        caption_file: Path of the comma-separated captions text file.

    Returns:
        dict mapping each image id to the list of its processed captions.
        Lines that contain no comma are ignored.
    """
    captions = {}
    table = str.maketrans('', '', string.punctuation)

    with open(caption_file, 'r') as file:
        # Iterate over the file object directly so the file is streamed
        # instead of being read into memory in one go.
        for line_number, line in enumerate(file):
            # Split on the FIRST comma only: captions may contain commas
            # themselves and must not be discarded because of that.
            tokens = line.strip().split(',', 1)
            if len(tokens) != 2:
                continue
            image_id, caption = tokens

            # Skip a leading "image,caption" header row, if the file has one,
            # so it does not become a bogus image id.
            if line_number == 0 and (image_id.strip().lower(), caption.strip().lower()) == ('image', 'caption'):
                continue

            image_id_no_extension = os.path.splitext(image_id)[0]
            caption = caption.lower().translate(table)
            captions.setdefault(image_id_no_extension, []).append(f"<start> {caption} <end>")

    return captions

In [9]:
# Parse the captions file into {image_id: [caption, ...]} for the whole dataset
captions = load_captions_in_batches(caption_file )

In [10]:
#Splitting dataset into train, validation, and test (64% train, 16% validation, 20% test)
# The first split holds out 20% of the image ids for testing; the second split takes
# 20% of the remaining 80% (16% overall) for validation, leaving 64% for training.
# random_state is fixed so the partition is reproducible.
image_keys = list(captions.keys())
train_keys, test_keys = train_test_split(image_keys, test_size=0.2, random_state=42)
train_keys, val_keys = train_test_split(train_keys, test_size=0.2, random_state=42)


In [11]:
# Partition the caption dictionary by split (no encoding happens here):
# each dict keeps only the image ids that belong to its split.
train_captions = {k: captions[k] for k in train_keys}
val_captions = {k: captions[k] for k in val_keys}
test_captions = {k: captions[k] for k in test_keys}



In [12]:
#Tokenize Captions using Keras tokenizer
def tokenize_captions(captions):
    """Fit a Keras tokenizer on every caption and derive vocabulary statistics.

    Args:
        captions: dict mapping image id to a list of caption strings.

    Returns:
        Tuple ``(tokenizer, vocab_size, max_length)`` where ``vocab_size`` is
        the number of distinct words plus one (index 0 is reserved for
        padding) and ``max_length`` is the largest whitespace-separated word
        count of any caption.

    Raises:
        ValueError: If ``captions`` contains no caption at all.
    """
    all_captions = [cap for caps in captions.values() for cap in caps]
    # The default Tokenizer filters remove '<' and '>', which would turn the
    # sequence markers '<start>' / '<end>' into the ordinary words 'start' /
    # 'end' and make them indistinguishable from real caption words. This is
    # the default filter set minus the two angle brackets.
    marker_safe_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
    tokenizer = Tokenizer(filters=marker_safe_filters)
    tokenizer.fit_on_texts(all_captions)
    vocab_size = len(tokenizer.word_index) + 1
    max_length = max(len(seq.split()) for seq in all_captions)
    return tokenizer, vocab_size, max_length

In [13]:
# Fit the tokenizer on every caption in the dataset.
# NOTE(review): this includes the validation and test captions, so the vocabulary is
# not derived from the training split alone — confirm this is intended.
tokenizer, vocab_size, max_length = tokenize_captions(captions)

In [14]:
#Feature Extraction using InceptionV3
# Load InceptionV3 with ImageNet weights and build an encoder that maps the same
# input to the output of the network's second-to-last layer.
base_model = InceptionV3(weights='imagenet')
encoder_model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)


In [15]:
def extract_features(image_dict, model, image_dir, batch_size=32):
    """Run the encoder over a set of images, one chunk at a time.

    Args:
        image_dict: dict mapping image id to the image's filename.
        model: Model whose ``predict`` turns a stacked image array into one
            feature row per image.
        image_dir: Directory that contains the image files.
        batch_size: Number of images preprocessed and predicted together.

    Returns:
        dict mapping image id to its feature row.
    """
    features = {}
    image_keys = list(image_dict.keys())

    for start in range(0, len(image_keys), batch_size):
        chunk_keys = image_keys[start:start + batch_size]

        # Preprocess every image of the chunk and stack them into one array
        chunk_array = np.array([
            preprocess_image(os.path.join(image_dir, image_dict[key]))
            for key in chunk_keys
        ])
        chunk_features = model.predict(chunk_array, verbose=1)

        # Rows come back in the same order the images were stacked
        for position, key in enumerate(chunk_keys):
            features[key] = chunk_features[position]

        # Drop the chunk's temporaries before moving on to the next one
        del chunk_array, chunk_features
        gc.collect()

    return features


In [None]:
# Map each image id (filename without extension) to the actual filename in image_dir
image_files = os.listdir(image_dir)
image_id_to_filename = {os.path.splitext(f)[0]: f for f in image_files}

# For each split keep only the ids that have a matching image file
train_image_files = {k: image_id_to_filename[k] for k in train_keys if k in image_id_to_filename}
val_image_files = {k: image_id_to_filename[k] for k in val_keys if k in image_id_to_filename}
test_image_files = {k: image_id_to_filename[k] for k in test_keys if k in image_id_to_filename}

# Extract features from the images for the training, validation, and testing sets
train_features = extract_features(train_image_files, encoder_model, image_dir, batch_size=32)
val_features = extract_features(val_image_files, encoder_model, image_dir, batch_size=32)
test_features = extract_features(test_image_files, encoder_model, image_dir, batch_size=32)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step
[1m1/1[0m [32

In [None]:
#Encoding captions
def encode_captions(captions_dict, tokenizer, max_length):
    """Turn every caption into a fixed-length row of token ids.

    Args:
        captions_dict: dict mapping image id to a list of caption strings.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every row is padded to (zeros appended at the end).

    Returns:
        dict mapping image id to a list with one padded id row per caption.
    """
    encoded = {}
    for image_id, image_captions in captions_dict.items():
        padded_rows = []
        for text in image_captions:
            token_ids = tokenizer.texts_to_sequences([text])[0]
            padded = pad_sequences([token_ids], maxlen=max_length, padding='post')
            padded_rows.append(padded[0])
        encoded[image_id] = padded_rows
    return encoded

In [None]:
# Encode the captions of each split as padded token-id rows of length max_length
train_encoded = encode_captions(train_captions, tokenizer, max_length)
val_encoded = encode_captions(val_captions, tokenizer, max_length)
test_encoded = encode_captions(test_captions, tokenizer, max_length)


In [None]:
#Encoder-Decoder Model
embedding_dim = 256
lstm_units = 256
# Image branch: project the 2048-dimensional image feature vector to embedding_dim
image_input = Input(shape=(2048,))
image_dense = Dense(embedding_dim, activation='relu')(image_input)

# Caption branch: embed the token ids (index 0 is masked as padding) and
# summarise the partial caption with an LSTM
caption_input = Input(shape=(max_length,))
caption_embed = Embedding(vocab_size, embedding_dim, mask_zero=True)(caption_input)
decoder_lstm = LSTM(lstm_units)(caption_embed)

# Merge both branches by element-wise addition (embedding_dim and lstm_units are both 256)
# and predict a probability distribution over the vocabulary
decoder_combined = Add()([image_dense, decoder_lstm])
output = Dense(vocab_size, activation='softmax')(decoder_combined)

# Targets are one-hot vectors, hence categorical cross-entropy
model = Model(inputs=[image_input, caption_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy')



In [None]:
#Data Generator for training
def data_generator(features, captions, batch_size, tokenizer, max_length, vocab_size):
    """Endlessly yield next-word training batches for the captioning model.

    The images are walked in groups of ``batch_size``. For every image the
    first encoded caption is expanded into one sample per word: the input is
    the caption prefix (zero-padded at the end to ``max_length``) and the
    target is the one-hot vector of the word that follows the prefix. A batch
    therefore holds the samples of up to ``batch_size`` images, so its number
    of rows varies with the caption lengths.

    Args:
        features: dict mapping image id to its feature vector.
        captions: dict mapping image id to a list of encoded captions
            (sequences of token ids, zero-padded at the end).
        batch_size: Number of images grouped into one yielded batch.
        tokenizer: Unused; kept so existing callers keep working.
        max_length: Length every input sequence is padded to.
        vocab_size: Width of the one-hot target vectors.

    Yields:
        ``((image_features, input_sequences), targets)`` with dtypes float32,
        int32 and float32 and the same number of rows in all three arrays.

    Raises:
        ValueError: If ``features`` is empty or no image has a caption with at
            least two tokens (otherwise the generator would spin forever
            without ever producing a batch).
    """
    keys = list(features.keys())
    if not keys:
        raise ValueError("data_generator received no image features")

    while True:
        produced_batch = False

        for start in range(0, len(keys), batch_size):
            # Fresh containers for every batch; reusing them across batches
            # would accumulate samples from earlier batches.
            batch_images = []
            batch_sequences = []
            batch_target_ids = []

            for key in keys[start:start + batch_size]:
                image_captions = captions.get(key)
                if image_captions is None or len(image_captions) == 0:
                    continue

                # Take the first caption and drop the padding zeros so that no
                # sample is generated whose target is the padding index.
                seq = [int(token) for token in image_captions[0] if token != 0]

                for j in range(1, len(seq)):
                    # Keep at most the last max_length tokens of the prefix
                    prefix = seq[max(0, j - max_length):j]
                    in_seq = np.zeros(max_length, dtype=np.int32)
                    in_seq[:len(prefix)] = prefix

                    batch_images.append(features[key])
                    batch_sequences.append(in_seq)
                    batch_target_ids.append(seq[j])

            if not batch_target_ids:
                continue

            # One-hot encode all targets of the batch in a single step
            batch_targets = np.zeros((len(batch_target_ids), vocab_size), dtype=np.float32)
            batch_targets[np.arange(len(batch_target_ids)), batch_target_ids] = 1.0

            produced_batch = True
            yield (
                (
                    np.array(batch_images, dtype=np.float32),
                    np.array(batch_sequences, dtype=np.int32),
                ),
                batch_targets,
            )

        if not produced_batch:
            raise ValueError("data_generator found no caption with at least two tokens")


In [408]:
# Creating training data generator
# Feeds the training image features together with their encoded captions,
# grouping 32 images per batch.
train_generator = data_generator(
    features=train_features,
    captions=train_encoded,
    batch_size=32,
    tokenizer=tokenizer,
    max_length=max_length,
    vocab_size=vocab_size
)



In [409]:
# Creating validation data generator
# Same configuration as the training generator, but fed with the validation split.
val_generator = data_generator(
    features=val_features,
    captions=val_encoded,
    batch_size=32,
    tokenizer=tokenizer,
    max_length=max_length,
    vocab_size=vocab_size
)

## Training the model

In [410]:
# Number of generator batches per epoch: one step per full group of 32 images
# (32 matches the batch_size passed to both generators).
steps_per_epoch = len(train_encoded) // 32
validation_steps = len(val_encoded) // 32

In [411]:
# Defining EarlyStopping and reducing learning rate callback
# Both callbacks watch the validation loss (available because model.fit receives
# validation_data). Watching the training loss instead would keep training while
# the model overfits, and restore_best_weights would then restore the most
# overfitted weights rather than the ones that generalise best.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6, verbose=1)

In [None]:
# Train the model; `history` records the per-epoch metrics used by the cells below.
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_generator,
    validation_steps=validation_steps,
    epochs=10,
    callbacks=[early_stopping, reduce_lr],  # the missing comma here was a SyntaxError
    verbose=1,
)

In [None]:
# Show the per-epoch metrics recorded by model.fit
print(history.history)

In [296]:
#Plotting training and validation loss curves

def plot_training_history(history):
    """Draw the training loss per epoch, plus the validation loss if recorded.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
    """
    plt.figure(figsize=(10, 6))

    # The training curve is always drawn; the validation curve only when present
    curves = [('loss', 'Training Loss')]
    if 'val_loss' in history.history:
        curves.append(('val_loss', 'Validation Loss'))
    for metric_name, legend_label in curves:
        plt.plot(history.history[metric_name], label=legend_label)

    plt.title('Model Loss Progression')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()
plot_training_history(history)

NameError: name 'history' is not defined

In [None]:
# Saving the trained model to Google Drive as a single HDF5 (.h5) file
model.save('/content/drive/MyDrive/Image_Caption_Generator/model.h5')


In [None]:
def generate_caption(image, model, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Args:
        image: Feature vector of the image, either 1-D or already carrying a
            leading batch axis of size one.
        model: Trained captioning model taking ``[image, sequence]``.
        tokenizer: Fitted tokenizer used to train the model.
        max_length: Padded sequence length the model expects; also the maximum
            number of decoding steps.

    Returns:
        The generated caption as a space-separated string without the start
        and end markers (empty if the model ends the caption immediately).
    """
    # Depending on the tokenizer's filters the sequence markers are stored
    # either verbatim or with the angle brackets stripped ('start' / 'end');
    # use whichever form the vocabulary actually contains.
    start_word = '<start>' if '<start>' in tokenizer.word_index else 'start'
    end_word = '<end>' if '<end>' in tokenizer.word_index else 'end'

    # model.predict needs a batch axis; stored features are plain 1-D vectors
    image = np.asarray(image)
    if image.ndim == 1:
        image = image[np.newaxis, :]

    caption = [start_word]

    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([caption])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')

        pred = model.predict([image, sequence], verbose=0)
        pred_word_idx = int(np.argmax(pred))

        word = tokenizer.index_word.get(pred_word_idx)
        # Index 0 is padding and has no word: treat it like the end marker
        # instead of appending None to the caption.
        if word is None or word == end_word:
            break
        caption.append(word)

    return ' '.join(caption[1:])

In [None]:
def visualize_caption_predictions(model, tokenizer, image_dir, test_keys, max_length, num_samples=5, encoder=None):
    """Show random test images with their generated captions as titles.

    Args:
        model: Trained captioning model.
        tokenizer: Fitted tokenizer used to train the model.
        image_dir: Directory that contains the image files.
        test_keys: Image ids (filenames, with or without extension) to sample from.
        max_length: Padded sequence length the model expects.
        num_samples: Maximum number of images to display.
        encoder: Feature-extraction model; defaults to the notebook-level
            ``encoder_model``.
    """
    if encoder is None:
        encoder = encoder_model

    # Caption keys carry no file extension, so map every stem back to the real
    # filename; keys that already are filenames are accepted unchanged.
    filenames = os.listdir(image_dir)
    stem_to_file = {os.path.splitext(name)[0]: name for name in filenames}
    for name in filenames:
        stem_to_file.setdefault(name, name)

    candidates = [key for key in test_keys if key in stem_to_file]
    # random.sample raises if asked for more items than are available
    sample_keys = random.sample(candidates, min(num_samples, len(candidates)))
    if not sample_keys:
        print("No test images available to visualize.")
        return

    plt.figure(figsize=(15, 10))

    for i, key in enumerate(sample_keys):
        img_path = os.path.join(image_dir, stem_to_file[key])
        img = mpimg.imread(img_path)
        img_array = preprocess_image(img_path)
        img_features = encoder.predict(np.expand_dims(img_array, axis=0), verbose=0)
        caption = generate_caption(img_features, model, tokenizer, max_length)

        plt.subplot(1, len(sample_keys), i+1)
        plt.imshow(img)
        plt.axis('off')
        plt.title(caption, fontsize=12)

    plt.show()
visualize_caption_predictions(model, tokenizer, image_dir, test_keys, max_length)

In [None]:
#Evaluating the generated caption using BLEU score against multiple references

def evaluate_caption(generated_caption, reference_captions):
    """Average smoothed sentence-BLEU of a caption against each reference.

    Args:
        generated_caption: Generated caption as a whitespace-separated string.
        reference_captions: Iterable of reference caption strings; the
            ``<start>`` / ``<end>`` markers are ignored if present.

    Returns:
        Mean of the per-reference BLEU scores as a float, or 0.0 when the
        generated caption or every reference is empty.
    """
    generated = generated_caption.split()

    if not generated:
        return 0.0

    # Built once instead of once per reference
    smoothing = SmoothingFunction().method4
    # Generated captions never contain the sequence markers, so leaving them
    # in the references would penalise every caption for two missing tokens.
    markers = {'<start>', '<end>'}

    bleu_scores = []
    for reference in reference_captions:
        reference_tokens = [token for token in reference.split() if token not in markers]
        if not reference_tokens:
            continue

        score = sentence_bleu([reference_tokens], generated, smoothing_function=smoothing)
        bleu_scores.append(score)

    # np.mean of an empty list is NaN (with a warning); report 0.0 instead
    if not bleu_scores:
        return 0.0
    return float(np.mean(bleu_scores))

In [None]:
#Evaluate model performance on the test set using BLEU score for all references

def evaluate_on_test_set(model, tokenizer, test_features, test_captions, max_length):
    """Generate a caption for every test image and report the mean BLEU score.

    Args:
        model: Trained captioning model.
        tokenizer: Fitted tokenizer used to train the model.
        test_features: dict mapping image id to its feature vector.
        test_captions: dict mapping image id to its reference caption STRINGS
            (not the integer-encoded captions).
        max_length: Padded sequence length the model expects.

    Returns:
        Average BLEU score over all test images as a float (0.0 if there are
        no test images).
    """
    bleu_scores = []

    for key, img_features in test_features.items():
        reference_captions = test_captions[key]
        # Stored features are 1-D vectors; the model needs a leading batch axis
        img_features = np.asarray(img_features)
        if img_features.ndim == 1:
            img_features = img_features[np.newaxis, :]
        generated_caption = generate_caption(img_features, model, tokenizer, max_length)
        score = evaluate_caption(generated_caption, reference_captions)
        bleu_scores.append(score)

    # np.mean of an empty list is NaN (with a warning); report 0.0 instead
    avg_bleu_score = float(np.mean(bleu_scores)) if bleu_scores else 0.0
    print(f"Average BLEU score on test set: {avg_bleu_score}")
    return avg_bleu_score


In [None]:
# BLEU compares words, so the references must be the caption strings in
# test_captions; test_encoded holds integer arrays, which cannot be split into words.
evaluate_on_test_set(model, tokenizer, test_features, test_captions, max_length)
visualize_caption_predictions(model, tokenizer, image_dir, test_keys, max_length)