## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.layers import Embedding, LSTM, add, Concatenate, Reshape, concatenate, Bidirectional
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap

# Global plotting defaults applied to every figure drawn later in the notebook.
plt.rcParams['font.size'] = 12
sns.set_style("dark")
# NOTE(review): this suppresses every warning category for the whole session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

## Data Reading

In [None]:
# Directory the image files are loaded from; file names are joined onto it.
image_path = '../input/flickr8k/Images'

In [None]:
# Load the captions table; later cells read its `image` and `caption` columns.
data = pd.read_csv("../input/flickr8k/captions.txt")
# Notebook preview of the first rows.
data.head()

In [None]:
def readImage(path,img_size=224):
    """Load an image file as a square RGB array with values divided by 255.

    Args:
        path: Location of the image file.
        img_size: Side length, in pixels, the image is resized to.

    Returns:
        The array produced by ``img_to_array`` scaled by 1/255.
    """
    target = (img_size, img_size)
    pixels = img_to_array(load_img(path, color_mode='rgb', target_size=target))
    return pixels / 255.

def display_images(temp_df):
    """Plot up to 15 images with their wrapped captions on a 5x5 subplot grid.

    Frames with fewer than 15 rows are plotted in full instead of raising on
    the missing rows; rows beyond the 15th are ignored.

    Args:
        temp_df: DataFrame with an ``image`` column (file names relative to
            the module-level ``image_path``) and a ``caption`` column.
    """
    max_images = 15
    shown = temp_df.reset_index(drop=True).head(max_images)

    plt.figure(figsize = (20 , 20))
    # Spacing is a figure-wide setting, so it is applied once rather than
    # once per subplot.
    plt.subplots_adjust(hspace = 0.7, wspace = 0.3)

    pairs = zip(shown['image'], shown['caption'])
    for position, (file_name, caption) in enumerate(pairs, start=1):
        plt.subplot(5 , 5, position)
        # Resolve against the shared image directory, as the other cells do.
        image = readImage(os.path.join(image_path, file_name))
        plt.imshow(image)
        # Wrap at 20 characters so long captions stay within the subplot width.
        plt.title("\n".join(wrap(caption, 20)))
        plt.axis("off")

In [None]:
display_images(data.sample(15))

In [None]:
def text_preprocessing(data):
    """Normalise the ``caption`` column and wrap captions in sequence markers.

    Each caption is lower-cased, stripped of every character that is not a
    lowercase letter or whitespace, split on whitespace, filtered to words
    longer than one character, and re-joined with single spaces between the
    ``startseq`` and ``endseq`` markers.

    The character stripping uses a real regular expression. A plain
    ``str.replace`` with a pattern string only replaces that literal text,
    which left punctuation and digits in the captions.

    Args:
        data: DataFrame with a string ``caption`` column. It is modified in
            place.

    Returns:
        The same DataFrame object, with ``caption`` rewritten.
    """
    def _wrap_with_markers(text):
        # split() also collapses runs of whitespace left by removed characters.
        words = [word for word in text.split() if len(word) > 1]
        return " ".join(["startseq", *words, "endseq"])

    cleaned = data['caption'].str.lower()
    # Whitespace must survive this step, otherwise all words would be fused.
    cleaned = cleaned.str.replace(r"[^a-z\s]", "", regex=True)
    data['caption'] = cleaned.apply(_wrap_with_markers)
    return data

## PreProcessed Text


In [None]:
# Clean the captions (text_preprocessing also rewrites `data` in place).
data = text_preprocessing(data)
# Flat list of every cleaned caption, used to fit the tokenizer.
captions = data['caption'].tolist()
# Notebook preview of the first ten cleaned captions.
captions[:10]

## Tokenization 

In [None]:

tokenizer = Tokenizer()

# Fit the tokenizer on the caption data to create a word vocabulary.
tokenizer.fit_on_texts(captions)

# One more than the number of known words: Keras word indices start at 1,
# leaving index 0 free for the padding value used by pad_sequences.
vocab_size = len(tokenizer.word_index) + 1

# Longest caption measured in whitespace-separated words (markers included).
max_length = max(len(caption.split()) for caption in captions)

# Identify unique image filenames and split the dataset into training and validation sets.
images = data['image'].unique().tolist()
nimages = len(images)

# First 85% of the images (in order of first appearance) go to training.
split_index = round(0.85*nimages)
train_images = images[:split_index]
val_images = images[split_index:]

# Segment the data into training and validation sets based on image filenames.
# The held-out frame is named `test` but is the one passed to the validation
# generator later in the notebook.
train = data[data['image'].isin(train_images)]
test = data[data['image'].isin(val_images)]

train.reset_index(inplace=True,drop=True)
test.reset_index(inplace=True,drop=True)

# Notebook preview: integer encoding of the second caption.
tokenizer.texts_to_sequences([captions[1]])[0]



## Image Feature Extraction

In [None]:

# Location of the locally stored DenseNet201 weights file.
weights_path = '/kaggle/input/densenet/densenet201_weights_tf_dim_ordering_tf_kernels.h5'

# Build the architecture without downloading weights, then load them from disk.
densenet_model = DenseNet201(weights=None, include_top=True)
densenet_model.load_weights(weights_path)

# Feature extractor: same input, output taken from the second-to-last layer.
fe = Model(inputs=densenet_model.input, outputs=densenet_model.layers[-2].output)

# Image directory and the square size every image is resized to.
image_path = '../input/flickr8k/Images'
img_size = 224

# Maps image file name -> extracted feature array (leading batch axis of 1).
features = {}

for image_filename in tqdm(data['image'].unique().tolist()):
    img_path = os.path.join(image_path, image_filename)

    # Report and skip files that are listed in the captions but not on disk.
    if not os.path.exists(img_path):
        print(f"Image file {image_filename} not found at {img_path}")
        continue

    # Load, convert to an array and scale the pixel values by 1/255.
    img = img_to_array(load_img(img_path, target_size=(img_size, img_size))) / 255.0
    # Add the batch axis expected by predict().
    img = img[np.newaxis, ...]
    feature = fe.predict(img, verbose=0)
    features[image_filename] = feature

## Data Generation

In [None]:
class CustomDataGenerator(Sequence):
    """Keras ``Sequence`` that expands caption rows into next-word samples.

    Every row of the DataFrame pairs an image file name with one caption.
    For a caption encoded as ``[w1, ..., wk]`` the generator emits ``k - 1``
    samples: the image feature, the left-padded prefix ``[w1, ..., wi]`` and
    the one-hot encoding of ``w(i+1)``.

    Args:
        df: DataFrame holding the image and caption columns (copied).
        X_col: Name of the column with image file names.
        y_col: Name of the column with caption strings.
        batch_size: Number of DataFrame rows (captions) per batch. The number
            of samples in a batch is larger because each caption is expanded.
        directory: Image directory; stored but not read by this class.
        tokenizer: Object providing ``texts_to_sequences``.
        vocab_size: Number of classes for the one-hot targets.
        max_length: Length every input sequence is padded to.
        features: Mapping from image file name to a feature array whose first
            axis is a batch axis (element 0 is used).
        shuffle: Whether to reshuffle the rows at the end of each epoch.
    """

    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer,
                 vocab_size, max_length, features,shuffle=True):
        # Let the Keras base class set up its own state.
        super().__init__()
        # Work on a copy so shuffling never touches the caller's frame.
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.features = features
        self.shuffle = shuffle
        self.n = len(self.df)

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch when shuffling is enabled."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return self.n // self.batch_size

    def __getitem__(self,index):
        """Return ``((image_features, input_sequences), targets)`` for batch ``index``."""
        start = index * self.batch_size
        batch = self.df.iloc[start:start + self.batch_size,:]
        X1, X2, y = self.__get_data(batch)
        return (X1, X2), y

    def __get_data(self,batch):
        """Expand every (image, caption) row of ``batch`` into training samples.

        Rows are walked one by one so each caption is expanded exactly once.
        Looking captions up again by image name would repeat every caption
        once per occurrence of its image in the batch.
        """
        X1, X2, y = [], [], []

        for image, caption in zip(batch[self.X_col], batch[self.y_col]):
            # Stored features carry a leading batch axis; take the single row.
            feature = self.features[image][0]
            seq = self.tokenizer.texts_to_sequences([caption])[0]

            # One sample per prefix: predict token i from tokens 0..i-1.
            for i in range(1, len(seq)):
                in_seq = pad_sequences([seq[:i]], maxlen=self.max_length)[0]
                out_seq = to_categorical([seq[i]], num_classes=self.vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)

        return np.array(X1), np.array(X2), np.array(y)

## Modelling

In [None]:
# Two inputs: a pre-extracted image feature vector and a padded token sequence.
# NOTE(review): 1920 must equal the width of the vectors stored in `features`
# (the output of the `fe` extractor) — confirm if the backbone changes.

input1 = Input(shape=(1920,))
input2 = Input(shape=(max_length,))

# Project image features to 256 dimensions, then add a length-1 time axis so
# they can be joined with the embedded word sequence.
img_features = Dense(256, activation='relu')(input1)
img_features_reshaped = Reshape((1, 256), input_shape=(256,))(img_features)

# Embed each token index as a 256-dimensional vector (no masking of index 0).
sentence_features = Embedding(vocab_size, 256, mask_zero=False)(input2)

# Concatenate along the time axis: the image vector becomes the first step.
merged = concatenate([img_features_reshaped,sentence_features],axis=1)

# Encode the joined sequence with an LSTM, then re-inject the projected image
# features through an element-wise addition before the dense head.
sentence_features = LSTM(256)(merged)
x = Dropout(0.5)(sentence_features)
x = add([x, img_features])
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)

# Probability distribution over the vocabulary for the next word.
output = Dense(vocab_size, activation='softmax')(x)

# Targets are one-hot vectors, hence categorical cross-entropy.
caption_model = Model(inputs=[input1,input2], outputs=output)
caption_model.compile(loss='categorical_crossentropy',optimizer='adam')

In [None]:
from tensorflow.keras.utils import plot_model

## Model Visualization and Training

In [None]:
# Render a diagram of the captioning model's layer graph.
plot_model(caption_model)

In [None]:
# Print the layer-by-layer summary of the captioning model.
caption_model.summary()

In [None]:
# Build the batch generators for training and validation. Both share the
# tokenizer, vocabulary size, padding length and pre-extracted features;
# only the source DataFrame differs.

# Training data generator (rows of `train`, 64 captions per batch).
train_generator = CustomDataGenerator(
    df=train,X_col='image'
    ,y_col='caption',
    batch_size=64,
    directory=image_path,
    tokenizer=tokenizer,
    vocab_size=vocab_size,
    max_length=max_length,
    features=features
)

# Validation data generator (rows of the held-out `test` frame).
validation_generator = CustomDataGenerator(
    df=test,
    X_col='image',
    y_col='caption',
    batch_size=64,
    directory=image_path,
    tokenizer=tokenizer,
    vocab_size=vocab_size,
    max_length=max_length,
    features=features
)

In [None]:
# Save the model to `model_name` whenever the validation loss reaches a new minimum.
model_name = "model.h5"
checkpoint = ModelCheckpoint(
    model_name,
    monitor="val_loss",
    mode="min",
    save_best_only = True,
    verbose=1
                            )

# Early stopping to stop training if validation loss doesn't improve after a certain number of epochs
# NOTE(review): the training cell runs only 5 epochs, so a patience of 5 can
# never be exhausted; lower the patience or raise the epoch count if early
# stopping is meant to take effect.
earlystopping = EarlyStopping(
    monitor='val_loss', # Metric to monitor for early stopping
    min_delta = 0, # Minimum change in the monitored quantity to qualify as improvement
    patience = 5, # Number of epochs with no improvement after which training will be stopped
    verbose = 1, # Verbosity mode (1: displays messages about early stopping)
    restore_best_weights=True # Restore the weights of the best model
                            )

# Reduce learning rate when the validation loss has stopped improving
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss', # Metric to monitor for reducing learning rate
    patience=3, # Number of epochs to wait before applying the reduction factor
    verbose=1, # Verbosity mode (1: displays messages about reducing learning rate)
    factor=0.2, # Factor by which the learning rate will be reduced
    min_lr=0.00000001  # Minimum learning rate allowed after reduction
                                           )

In [None]:
# Train the caption generation model using the previously defined generators and callbacks.
# `history` keeps the object returned by fit() for later inspection.

history = caption_model.fit(
        train_generator,
        epochs=5,
        validation_data=validation_generator,
        callbacks=[checkpoint,earlystopping,learning_rate_reduction])

## Caption Generation 

In [None]:
def idx_to_word(integer,tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Args:
        integer: Word index to resolve.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first matching word, or ``None`` when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

In [None]:
def predict_caption(model, image, tokenizer, max_length, features):
    """Greedily decode a caption for one image.

    Starting from ``startseq``, the text generated so far is encoded and
    padded, the model predicts the next-word distribution, and the most
    probable word is appended. Decoding stops after ``max_length`` words,
    when ``endseq`` is produced, or when the predicted index has no word.

    Args:
        model: Captioning model whose ``predict`` takes
            ``[image_feature, padded_sequence]``.
        image: Key of the image in ``features``.
        tokenizer: Tokenizer used to encode text and resolve indices.
        max_length: Padding length and maximum number of decoding steps.
        features: Mapping from image key to its feature array.

    Returns:
        The generated text, including the leading ``startseq`` marker and the
        trailing ``endseq`` marker when one was produced.
    """
    feature = features[image]
    in_text = "startseq"

    for _ in range(max_length):
        # Encode everything generated so far and pad it to the model's input length.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)

        # verbose=0: predict() runs once per word, so the default progress
        # output would flood the notebook (same setting as feature extraction).
        y_pred = model.predict([feature, sequence], verbose=0)
        word = idx_to_word(np.argmax(y_pred), tokenizer)

        # An index without a word (e.g. the padding index) ends decoding.
        if word is None:
            break

        in_text += " " + word

        # The end marker is kept in the returned text.
        if word == 'endseq':
            break

    return in_text

## Taking 15 random samples for caption prediction

In [None]:
# Draw 15 random held-out rows and renumber them 0..14.
samples = test.sample(15).reset_index(drop=True)

In [None]:
# Replace each sampled caption with the model's prediction so the results grid
# shows generated text. predict_caption works from the pre-extracted
# `features`, so the image file itself does not need to be loaded here.
# NOTE: after this loop `samples['caption']` no longer holds the ground truth.
for index, record in samples.iterrows():
    caption = predict_caption(caption_model, record['image'], tokenizer, max_length, features)
    samples.loc[index,'caption'] = caption

## Results

In [None]:
# Show the sampled images with the captions currently stored in `samples`.
display_images(samples)

In [None]:
# Sanity check on the caption lists used for scoring.
# NOTE: `generated_captions` and `reference_captions` are created by the BLEU
# cell that follows in the file, so that cell has to be executed first.
print(f"Number of images (generated): {len(generated_captions)}")
print(f"Number of images (reference): {len(reference_captions)}")

# Per-image sizes of the generated entry and of its reference entry.
for i, generated in enumerate(generated_captions):
    print(f"Image {i+1} - Generated: {len(generated)}, Reference: {len(reference_captions[i])}")


In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Score the generated captions of the sampled images with sentence-level BLEU.
#
# References are taken from the held-out `test` frame, not from
# `samples['caption']`: that column is overwritten with the model's own
# predictions earlier in the notebook, so using it would compare every
# prediction with itself. Looking the image up in `test` also supplies all of
# its ground-truth captions as references.
# NOTE(review): hypotheses and references both keep the startseq/endseq
# markers, which always match; strip them on both sides for a stricter score.

generated_captions = []  # one token list per sampled image
reference_captions = []  # one list of reference token lists per sampled image

for index, record in samples.iterrows():
    generated_caption = predict_caption(caption_model, record['image'], tokenizer, max_length, features)
    generated_captions.append(generated_caption.split())

    ground_truth = test.loc[test['image'] == record['image'], 'caption']
    reference_captions.append([reference.split() for reference in ground_truth])

# sentence_bleu takes (list of reference token lists, hypothesis token list).
bleu_scores = [
    sentence_bleu(references, hypothesis)
    for references, hypothesis in zip(reference_captions, generated_captions)
]

# Average BLEU score across all samples (0.0 when there is nothing to score).
avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
print("Average BLEU Score:", avg_bleu_score)


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.util import ngrams

# Function to calculate ROUGE-N scores
def rouge_n(reference, candidate, n=2):
    """Return the ROUGE-N F1 score between two whitespace-tokenised texts.

    Overlap is the clipped n-gram count: an n-gram counts as many times as it
    occurs in both texts, so repeated n-grams are not collapsed and identical
    texts always score 1.0.

    Args:
        reference: Reference text.
        candidate: Candidate (generated) text.
        n: Size of the n-grams to compare.

    Returns:
        The F1 score as a float in [0, 1]. 0.0 is returned when either text
        has fewer than ``n`` tokens or when no n-gram is shared.
    """
    from collections import Counter

    def _ngram_counts(text):
        tokens = text.split()
        # zip over n shifted views of the token list yields the n-grams.
        return Counter(zip(*(tokens[i:] for i in range(n))))

    reference_counts = _ngram_counts(reference)
    candidate_counts = _ngram_counts(candidate)
    reference_total = sum(reference_counts.values())
    candidate_total = sum(candidate_counts.values())

    # Nothing to compare: avoid dividing by zero.
    if not reference_total or not candidate_total:
        return 0.0

    # Counter intersection keeps the minimum count of every shared n-gram.
    overlap = sum((reference_counts & candidate_counts).values())
    if not overlap:
        return 0.0

    precision = overlap / candidate_total
    recall = overlap / reference_total
    return 2 * precision * recall / (precision + recall)

# Example usage:
# Toy sentences to demonstrate rouge_n; they are not model output.
# Note that `generated_caption` is reassigned here.
reference_caption = "This is a reference caption"
generated_caption = "This is a generated caption"

# Bigram (n=2) overlap between the two example sentences.
rouge_2_score = rouge_n(reference_caption, generated_caption, n=2)
print(f"ROUGE-2 score: {rouge_2_score}")
