In [None]:
import os
import re

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.efficientnet import preprocess_input
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Add, Concatenate, Layer, MultiHeadAttention, Dropout, LayerNormalization
from tensorflow.keras.callbacks import EarlyStopping, History, ReduceLROnPlateau
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, TFVisionEncoderDecoderModel

# Metrics for evaluating the generated captions: BLEU, METEOR, ROUGE, CIDEr
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

# Ensure nltk resources are downloaded
import nltk

# Fetch the corpora one by one: the stop-word list and the WordNet data
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# required packages
# pip install tensorflow transformers nltk matplotlib pycocoevalcap pandas numpy scikit-learn

In [None]:
# Load the captions file
captions_file = './flickr8k/captions.txt'
# Read the CSV and relabel its columns as 'image' / 'caption' in one expression
captions = pd.read_csv(captions_file).set_axis(['image', 'caption'], axis=1)

# Display the first few rows of the captions file
captions.head()

In [None]:
# Display sample image
def display_image(image_path):
    """Show the image file at ``image_path`` with the axes hidden."""
    plt.imshow(plt.imread(image_path))
    plt.axis('off')
    plt.show()

display_image('./flickr8k/Images/96420612_feb18fc6c6.jpg')

In [None]:
# Text preprocessing
def text_preprocessing(data):
    """Clean the ``caption`` column of ``data`` and wrap each caption in sequence markers.

    Each caption is lower-cased, every character outside ``a-z`` is replaced by
    a space, English stop words and single-character tokens are dropped, the
    remaining tokens are lemmatized, and the result is wrapped as
    ``"startseq <tokens> endseq"``.

    Args:
        data: DataFrame with a ``caption`` column.

    Returns:
        The same DataFrame object; its ``caption`` column is overwritten in place.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def _clean(text):
        # Anything that is not a lowercase ASCII letter becomes a separator;
        # str.split() then absorbs runs of whitespace.
        letters_only = re.sub(r'[^a-z]', ' ', text.lower())
        tokens = [
            lemmatizer.lemmatize(word)
            for word in letters_only.split()
            if word not in stop_words and len(word) > 1
        ]
        return f"startseq {' '.join(tokens)} endseq"

    # Fill missing captions BEFORE casting to str: astype(str) would turn NaN
    # into the literal text 'nan', which would then survive as a caption word.
    data['caption'] = data['caption'].fillna('').astype(str).apply(_clean)
    return data

captions = text_preprocessing(captions)
captions.head()


In [None]:
# Sample 7000 entries from the dataset
# captions_sample = captions.sample(n=7000, random_state=42).reset_index(drop=True)

# Group the captions by image: {image file name: [caption, caption, ...]}
captions_dict = {}
for img, caption in zip(captions['image'], captions['caption']):
    # setdefault creates the list the first time an image is seen
    captions_dict.setdefault(img, []).append(caption)

print(f"Number of images: {len(captions_dict)}")

# Hold out 20% of the images (the split is per image, not per caption)
train_images, val_images = train_test_split(
    list(captions_dict),
    test_size=0.2,
    random_state=42,
)

# Flatten the per-image caption lists into a single corpus
all_captions = [text for image_captions in captions_dict.values() for text in image_captions]

print(f"Number of captions: {len(all_captions)}")

# Build the word index from the whole corpus (training and validation captions alike)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Vocabulary size is the word-index size plus one; the longest caption is
# measured in whitespace-separated words
vocab_size = 1 + len(tokenizer.word_index)
max_length = max(map(len, map(str.split, all_captions)))

print(f"Vocabulary size: {vocab_size}")
print(f"Maximum caption length: {max_length}")

# Encode the captions
def encode_captions(captions):
    """Convert caption strings to token-id rows, padded at the end to ``max_length``.

    Uses the module-level ``tokenizer`` and ``max_length``.
    """
    token_ids = tokenizer.texts_to_sequences(captions)
    padded = pad_sequences(token_ids, maxlen=max_length, padding='post')
    return padded

# Per-split lookup tables: image file name -> array of its encoded captions
train_captions = dict((img, encode_captions(captions_dict[img])) for img in train_images)
val_captions = dict((img, encode_captions(captions_dict[img])) for img in val_images)

print(f"Number of training images: {len(train_captions)}")
print(f"Number of validation images: {len(val_captions)}")


In [None]:
# Image encoder: EfficientNetB0 with ImageNet weights, no classification head,
# and average pooling applied to the final feature map
base_model = EfficientNetB0(include_top=False, pooling='avg', weights='imagenet')
model_extract = Model(inputs=base_model.input, outputs=base_model.output)

# function to extract features from an image
def extract_features(img_path, target_size=(224, 224)):
    """Encode one image file with ``model_extract``.

    Args:
        img_path: Path of the image file to load.
        target_size: (height, width) the image is resized to on load.
            Defaults to (224, 224).

    Returns:
        NumPy array holding the encoder output for the single image, with a
        leading batch axis of size 1.
    """
    img = image.load_img(img_path, target_size=target_size)
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    batch = preprocess_input(batch)
    # This function is called once per image for the whole dataset. Calling
    # the model directly avoids the per-call setup cost of Model.predict,
    # which is designed for large, batched inputs.
    return model_extract(batch, training=False).numpy()

# Encode every training image, keyed by image file name
train_features = {}
for img in train_images:
    train_features[img] = extract_features(os.path.join('./flickr8k/Images', img))

# Same for the validation images
val_features = {}
for img in val_images:
    val_features[img] = extract_features(os.path.join('./flickr8k/Images', img))

# Persist both dictionaries to .npy files
np.save('train_features.npy', train_features)
np.save('val_features.npy', val_features)





In [None]:
# Load DistilBERT model and tokenizer
distil_bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distil_bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

def distil_bert_encode(texts):
    """Run ``texts`` through DistilBERT and return its ``last_hidden_state``.

    The texts are tokenized as one padded batch, truncated to the module-level
    ``max_length``, and the model is called in inference mode.
    """
    # NOTE(review): max_length was measured in whitespace-separated words, but
    # is applied here to DistilBERT tokens -- confirm no caption is truncated.
    encoded = distil_bert_tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )
    return distil_bert_model(encoded, training=False).last_hidden_state

# DistilBERT hidden states for each image's captions, keyed by image file name
train_distil_bert_features = {}
for img in train_images:
    train_distil_bert_features[img] = distil_bert_encode(captions_dict[img])

val_distil_bert_features = {}
for img in val_images:
    val_distil_bert_features[img] = distil_bert_encode(captions_dict[img])

# Display the shapes of the extracted features (first image of each split)
print(train_distil_bert_features[train_images[0]].shape)
print(val_distil_bert_features[val_images[0]].shape)

# Persist both dictionaries to .npy files
np.save('train_distil_bert_features.npy', train_distil_bert_features)
np.save('val_distil_bert_features.npy', val_distil_bert_features)


In [None]:
# bahdanau attention
class BahdanauAttention(Layer):
    """Additive attention layer.

    Scores each position along axis 1 of ``features`` against ``hidden`` with
    ``V(tanh(W1(features) + W2(hidden)))`` and returns the score-weighted sum
    of ``features``.

    Args:
        units: Width of the two projection layers ``W1`` and ``W2``.
        **kwargs: Standard Keras ``Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, units, **kwargs):
        # Forward base-layer kwargs so the layer can be named and can be
        # rebuilt from the dictionary returned by get_config().
        super().__init__(**kwargs)
        self.units = units
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, features, hidden):
        """Return ``(context_vector, attention_weights)``.

        ``hidden`` gets a new axis 1 so it broadcasts against ``features``;
        the softmax and the weighted sum both run over axis 1 of ``features``.
        """
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

    def get_config(self):
        """Expose ``units`` so a saved model can re-create this layer."""
        config = super().get_config()
        config.update({'units': self.units})
        return config

In [None]:
def _load_feature_dict(path):
    """Load a dictionary that was written to ``path`` with ``np.save``.

    ``allow_pickle=True`` is needed because the file holds a Python object;
    ``.item()`` unwraps it. Only use this on files produced by this notebook --
    unpickling untrusted files can execute arbitrary code.
    """
    return np.load(path, allow_pickle=True).item()

# Load the image features
train_features = _load_feature_dict('train_features.npy')
val_features = _load_feature_dict('val_features.npy')

# Load the DistilBERT caption features
train_distil_bert_features = _load_feature_dict('train_distil_bert_features.npy')
val_distil_bert_features = _load_feature_dict('val_distil_bert_features.npy')


# Model with a more complex combination of image and text features
def build_model(vocab_size, max_length, image_feature_dim=None):
    """Build and compile the next-word prediction model.

    The model takes an image feature vector and a caption prefix of
    ``max_length`` token ids and outputs a softmax distribution over the
    vocabulary.

    Args:
        vocab_size: Number of token ids; used for the embedding table and the
            output layer width.
        max_length: Length of the token-id input.
        image_feature_dim: Width of the image feature vector. When ``None``
            (default) it is read from the first entry of the module-level
            ``train_features`` dictionary.

    Returns:
        A compiled Keras ``Model`` with inputs ``[image_input, text_input]``,
        trained with sparse categorical cross-entropy and Adam.
    """
    units = 256
    if image_feature_dim is None:
        image_feature_dim = train_features[train_images[0]].shape[1]

    # Image branch: project to `units`, then add a length-1 axis so the
    # attention layer receives a (batch, 1, units) tensor. A Reshape layer is
    # used instead of a raw tf op so the graph is made of Keras layers only.
    image_input = Input(shape=(image_feature_dim,))
    img_features = Dense(units, activation='relu')(image_input)
    img_features = tf.keras.layers.Reshape((1, units))(img_features)
    img_features = Dropout(0.5)(img_features)

    # Text branch: embed the prefix, run the LSTM, keep the last time step.
    text_input = Input(shape=(max_length,))
    text_features = Embedding(vocab_size, units)(text_input)
    text_features = LSTM(units, return_sequences=True)(text_features)
    text_features = Dropout(0.5)(text_features)
    last_text_state = text_features[:, -1, :]

    # NOTE(review): the image tensor has a single position on axis 1, so the
    # softmax over that axis inside BahdanauAttention always yields weight 1
    # and the context vector equals the projected image vector.
    attention_layer = BahdanauAttention(units)
    context_vector, _ = attention_layer(img_features, last_text_state)

    combined_features = Concatenate()([context_vector, last_text_state])
    output = Dense(vocab_size, activation='softmax')(combined_features)

    model = Model(inputs=[image_input, text_input], outputs=output)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return model

model = build_model(vocab_size, max_length)

model.summary()


In [None]:
# Training the model
# NOTE: this History instance is never handed to fit(); the name `history` is
# rebound to the return value of model.fit(...) further down in this cell.
history = History()

# Data generator
def data_generator(features, captions, batch_size):
    """Endlessly yield next-word training batches.

    For every image in a slice of ``batch_size`` images and every caption of
    that image, each caption prefix becomes one sample: the image feature
    vector, the prefix padded to the module-level ``max_length``, and the
    token that follows the prefix.

    Args:
        features: Dict mapping image name to a feature array; row 0 is used.
        captions: Dict mapping image name to its encoded (padded) captions.
        batch_size: Number of images (not samples) per yielded batch.

    Yields:
        ``((image_features, prefixes), next_tokens)`` as NumPy arrays.
    """
    # Snapshot the key order once instead of rebuilding the list per batch.
    image_ids = list(features.keys())
    n = len(captions)
    while True:
        for i in range(0, n, batch_size):
            x1, x2, y = [], [], []
            for img in image_ids[i:i + batch_size]:
                img_features = features[img][0]
                for caption in captions[img]:
                    # Encoded captions are zero-padded at the end. Drop the
                    # padding so no sample has the padding id as its target
                    # or a padded tail inside its prefix.
                    seq = [token for token in caption if token != 0]
                    for k in range(1, len(seq)):
                        x1.append(img_features)
                        x2.append(pad_sequences([seq[:k]], maxlen=max_length)[0])
                        y.append(seq[k])
            # The two inputs are grouped in a tuple rather than a list so they
            # are treated as two separate model inputs (NOTE(review): some
            # Keras versions try to stack a list into a single tensor).
            yield (np.array(x1), np.array(x2)), np.array(y)

# Training the model with more epochs, early stopping, and learning rate scheduling
batch_size = 64
epochs = 50
# data_generator steps through the images in strides of batch_size, i.e. it
# emits ceil(n / batch_size) batches per pass. Rounding up keeps one epoch (and
# one validation run) equal to exactly one pass, and avoids 0 steps when a
# split has fewer than batch_size images.
steps_per_epoch = int(np.ceil(len(train_captions) / batch_size))
validation_steps = int(np.ceil(len(val_captions) / batch_size))

# Callbacks (EarlyStopping / ReduceLROnPlateau are imported from
# tensorflow.keras.callbacks at the top of the notebook)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, min_lr=1e-7)
history_callback = History()

# Train the model; `history` is the object returned by fit() and is what
# plot_loss reads below
history = model.fit(
    data_generator(train_features, train_captions, batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=data_generator(val_features, val_captions, batch_size),
    validation_steps=validation_steps,
    callbacks=[early_stopping, lr_scheduler, history_callback]
)

# Plot training and validation loss
def plot_loss(history):
    """Draw the per-epoch ``loss`` and ``val_loss`` curves stored in ``history.history``."""
    for metric, legend_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
        plt.plot(history.history[metric], label=legend_label)
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

plot_loss(history)




In [None]:
# Beam search implementation for caption generation using the trained model

def beam_search(image_features, beam_width=3):
    """Generate a caption for ``image_features`` with beam search.

    Uses the module-level ``model``, ``tokenizer`` and ``max_length``.

    Args:
        image_features: Image feature batch passed as the model's first input.
        beam_width: Number of partial captions kept after every step, and the
            number of continuations tried per partial caption.

    Returns:
        The best caption as a space-separated string, without the
        ``startseq`` / ``endseq`` markers.
    """
    start_token = tokenizer.word_index['startseq']
    end_token = tokenizer.word_index.get('endseq')

    def _finished(seq):
        return seq[-1] == end_token or len(seq) >= max_length

    # Each beam is (token ids, summed log-probability).
    beams = [([start_token], 0.0)]

    while not all(_finished(seq) for seq, _ in beams):
        candidates = []
        for seq, score in beams:
            if _finished(seq):
                # A caption that has emitted endseq is carried over unchanged
                # instead of being extended past its end.
                candidates.append((seq, score))
                continue
            padded = pad_sequences([seq], maxlen=max_length)
            preds = model.predict([image_features, padded], verbose=0)[0]
            for w in np.argsort(preds)[-beam_width:]:
                # Sum log-probabilities: the score of a sequence is the
                # product of its step probabilities, not their sum. The small
                # constant guards against log(0).
                step_score = float(np.log(preds[w] + 1e-12))
                candidates.append((seq + [int(w)], score + step_score))
        # Keep the beam_width best candidates; the best one ends up last.
        beams = sorted(candidates, key=lambda beam: beam[1])[-beam_width:]

    best_sequence = beams[-1][0]
    words = []
    for token in best_sequence[1:]:  # skip startseq
        if token == end_token:
            break
        word = tokenizer.index_word.get(token)
        if word:  # ids without a word (e.g. the padding id 0) are skipped
            words.append(word)
    return ' '.join(words)


# NOTE: the beam-search caption function above is good, but ... (this remark was left unfinished -- TODO: complete or remove it)



In [None]:
# making a caption for a single image
def make_caption(image_path):
    """Generate a caption for one image and look up a reference caption.

    Args:
        image_path: Path of the image file.

    Returns:
        ``[generated_caption, original_caption]``. ``original_caption`` is the
        first row of the module-level ``captions`` table whose ``image`` equals
        the file name of ``image_path``, or ``None`` when the table has no row
        for that file (previously this case raised ``IndexError``).
    """
    img = extract_features(image_path)
    caption_beam_search = beam_search(img, beam_width=7)

    # basename handles whatever path separator the platform uses
    image_name = os.path.basename(image_path)
    matches = captions.loc[captions['image'] == image_name, 'caption']
    original_caption = matches.iloc[0] if len(matches) else None
    return [caption_beam_search, original_caption]

# Display the image and caption
def display_image_caption(image_path):
    """Show an image, then print its generated caption, a reference caption and BLEU.

    BLEU is computed on word tokens with the reference as the single
    reference and the generated caption as the hypothesis. The
    ``startseq`` / ``endseq`` markers are removed from the reference first,
    because the generated caption does not contain them. The score is
    reported as ``None`` when no reference caption is available.
    """
    display_image(image_path)

    generated, reference = make_caption(image_path)
    if reference is None:
        bleu = None
    else:
        reference_tokens = [
            word for word in reference.split() if word not in ('startseq', 'endseq')
        ]
        # sentence_bleu(references, hypothesis): a list of tokenized
        # references first, then the tokenized hypothesis.
        bleu = sentence_bleu([reference_tokens], generated.split())
    print(f"Caption using Beam Search: {generated}, original caption: {reference} \n bleu score: {bleu}")

# display_image_caption('./flickr30k/Images/1000092795.jpg')
# display_image_caption('./flickr30k/Images/10002456.jpg')
# display_image_caption('./flickr30k/Images/1001545525.jpg')
# display_image_caption('./archive/Images/115684808_cb01227802.jpg')
# display_image_caption('./archive/Images/1178705300_c224d9a4f1.jpg')
# display_image_caption('./archive/Images/2040941056_7f5fd50794.jpg')

