In [None]:
import tensorflow as tf
import random
import numpy as np
import os
import warnings

def configure_gpus():
    """Turn on memory growth for every visible GPU and report the device counts.

    Prints a notice when TensorFlow lists no GPU. A ``RuntimeError`` raised
    while configuring the devices is printed rather than propagated.
    """
    physical_devices = tf.config.list_physical_devices('GPU')
    if not physical_devices:
        print("No GPU is available.")
        return

    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        logical_devices = tf.config.list_logical_devices('GPU')
        print(f"{len(physical_devices)} Physical GPU(s), {len(logical_devices)} Logical GPU(s) configured.")
    except RuntimeError as err:
        print(f"RuntimeError in configuring GPUs: {err}")

def seed_everything(seed=0):
    """Seed Python, NumPy and TensorFlow RNGs and export the related env vars.

    Args:
        seed: Integer seed handed to ``random``, ``numpy.random`` and
            ``tf.random``; also written (as a string) to ``PYTHONHASHSEED``.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

def check_jupyter_notebook():
    """Detect an IPython/Jupyter kernel and raise its IOPub rate limits.

    ``get_ipython`` only exists inside IPython; the ``NameError`` raised
    elsewhere is used as the "not a notebook" signal.
    """
    try:
        cfg = get_ipython().config
    except NameError:
        print("Not running in a Jupyter Notebook environment.")
    else:
        print("Jupyter Notebook environment detected. Configuring...")
        cfg.NotebookApp.iopub_msg_rate_limit = 20000.0
        cfg.NotebookApp.rate_limit_window = 10.0

# Reset Keras global state and silence ALL Python warnings for the session.
# NOTE(review): the blanket "ignore" filter also hides deprecation warnings.
tf.keras.backend.clear_session()
warnings.filterwarnings("ignore")

# Configure GPU memory growth first, then seed the RNGs
# (random, numpy, tensorflow) with the default seed of 0.
configure_gpus()
seed_everything()

# Raise the IOPub rate limits when running under IPython/Jupyter.
check_jupyter_notebook()

# Multi-GPU strategy, currently disabled.
#strategy = tf.distribute.MirroredStrategy()

In [None]:
import pickle
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib #for saving model files as pkl files
import os
import seaborn as sns
import cv2
import imgaug.augmenters as iaa
sns.set(palette='muted',style='white')
import tensorflow as tf
from tensorflow.keras.layers import Dense,GlobalAveragePooling2D, Input, Embedding, LSTM,Dot,Reshape,Concatenate,BatchNormalization, GlobalMaxPooling2D, Dropout, Add
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import sentence_bleu #bleu score
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import GRU
tf.compat.v1.enable_eager_execution()
import os
import math
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import numpy as np
import pandas as pd
import re
import nltk

In [None]:
# Tab-separated label files; the loops below read the 'ID' and 'caption' columns.
TRAIN_CSV = 'ImageCLEFmedical_Caption_2023_caption_prediction_train_labels.csv'
VALID_CSV = 'ImageCLEFmedical_Caption_2023_caption_prediction_valid_labels.csv'


# Load datasets
df_train = pd.read_csv(TRAIN_CSV, delimiter='\t')
df_valid = pd.read_csv(VALID_CSV, delimiter='\t')


# Accumulators filled by the caption-processing loops below:
image_caption_pairs = {}   # image path -> list of processed captions
caption_ls = []            # every processed caption, in row order
vocab_list = set()         # unique tokens (despite the name, this is a set)
cap_len = []               # token count of each processed caption

def process_caption(text):
    """Normalise a raw caption and wrap it in ``<START>`` / ``<END>`` markers.

    Steps: lowercase and strip, turn hyphens into spaces, drop every character
    that is not alphanumeric or whitespace, then keep only words that are
    longer than four characters and contain no digit.

    Args:
        text: Raw caption. Anything that is not a ``str`` (for example the
            ``NaN`` pandas produces for an empty CSV field) is treated as an
            empty caption instead of raising ``AttributeError``.

    Returns:
        The surviving words joined by single spaces between ``<START>`` and
        ``<END>``. When no word survives the result is exactly
        ``'<START> <END>'``, so splitting on a space never yields an empty token.
    """
    if not isinstance(text, str):
        text = ''

    # Lowercase first so the character filter below only ever sees a-z.
    cleaned = re.sub(r'-', ' ', text.lower().strip())
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned)

    # Keep words longer than 4 characters that contain no digit at all
    # (this also covers purely numeric words).
    kept_words = [
        word for word in cleaned.split()
        if len(word) > 4 and not any(char.isdigit() for char in word)
    ]

    # Joining one list avoids the double space that string concatenation
    # produced when kept_words was empty.
    return ' '.join(['<START>', *kept_words, '<END>'])


def _collect_captions(frame, image_dir, desc):
    """Process every caption in ``frame`` and record it in the module-level accumulators.

    For each row the image path is built as ``image_dir + ID + '.jpg'``. Rows
    whose image file does not exist are reported and skipped. For the others
    the processed caption is appended to ``image_caption_pairs[img_path]``
    and to ``caption_ls``, its tokens are added to ``vocab_list`` and its
    token count to ``cap_len``.

    Args:
        frame: DataFrame with 'ID' and 'caption' columns.
        image_dir: Directory prefix (including the trailing slash) of the images.
        desc: Label shown on the tqdm progress bar.
    """
    for _, row in tqdm(frame.iterrows(), total=frame.shape[0], desc=desc):
        img_id, img_caption = row['ID'], row['caption']
        img_path = image_dir + str(img_id) + '.jpg'

        if not os.path.exists(img_path):
            print(f"File not found: {img_path}")
            continue

        caption = process_caption(img_caption)

        # The dict is keyed by image PATH. The previous membership test used
        # img_id, which never matched, so the caption list was reset on every row.
        image_caption_pairs.setdefault(img_path, []).append(caption)

        caption_ls.append(caption)
        words = caption.split(' ')
        vocab_list.update(words)
        cap_len.append(len(words))


# Process training images and captions
_collect_captions(df_train,
                  'ImageCLEFmedical_Caption_2023_train_images/',
                  "Training Image and Caption Data")

# Process validation images and captions
_collect_captions(df_valid,
                  'ImageCLEFmedical_Caption_2023_valid_images/',
                  "Validation Image and Caption Data")

In [None]:
from collections import Counter

# Histogram: caption length (in tokens) -> number of captions with that length.
frequency = Counter(cap_len)

# Total number of captions
total_items = sum(frequency.values())

# Walk the lengths from SHORTEST to LONGEST so the cumulative count gives a
# real percentile of caption length. Sorting by frequency (as before) returned
# whichever rare length happened to cross the threshold.
sorted_frequency = sorted(frequency.items())

# Smallest length that covers at least 99% of all captions.
coverage = 0.99
coverage_mark = total_items * coverage
cumulative = 0
max_length = 0  # stays 0 when cap_len is empty
for num, freq in sorted_frequency:
    cumulative += freq
    max_length = num
    if cumulative >= coverage_mark:
        print(f"The item at the 99% cumulative frequency mark is: {num}")
        break

print("The maximum length is:", max_length)
print("# of words in Vocabulary set:", len(vocab_list))

# Report whether the sentinel tokens, and the empty string, made it into the vocabulary.
words_to_check = ["<START>", "<END>", ""]
for word in words_to_check:
    print(f"'{word}' is in the list." if word in vocab_list else f"'{word}' is not in the list.")

# Flag captions that are empty or whitespace-only.
empty_string_found = False
for index, caption in enumerate(caption_ls):
    if caption.strip() != "":
        continue
    empty_string_found = True
    print(f"Empty string found at index {index}")

if not empty_string_found:
    print("No empty strings found in the list.")

In [None]:
# Pre-built dataset; later cells use it as a dict of image path -> caption list.
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
# NOTE(review): `data` comes from this pickle while `vocab_list` was built from the CSVs
# above; confirm both describe the same captions.
data = pd.read_pickle('pickles_clef23/data.pkl')
max_pad = 100              # padded caption length fed to the decoder
df = data                  # alias of `data`
input_size = (224,224)     # image height/width passed to tf.image.resize
batch_size=16
embedding_dim = 300        # matches the 300-d GloVe vectors loaded later
dense_dim = 512            # width of the image projection layer
lstm_units = dense_dim     # must equal dense_dim: the projection is the LSTM initial state
dropout_rate = 0.2
vocab_size = len(vocab_list)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# `data` maps image path -> list of captions; only the FIRST caption of each
# image (cap[0]) is used to fit the tokenizer.
# With the Keras Tokenizer defaults (lower=True, '<' and '>' in `filters`) the
# markers '<START>'/'<END>' are stored as 'start'/'end', which is what the
# prediction cells compare against.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
all_captions = [cap[0] for cap in data.values()]  # Extract all captions
tokenizer.fit_on_texts(all_captions)

# Disabled alternative: pre-tokenize and pad every caption up front.
# Tokenization currently happens lazily in Dataset.__getitem__.
# # Tokenize and pad all captions
# tokenized_captions = tokenizer.texts_to_sequences(all_captions)
# padded_captions = pad_sequences(tokenized_captions, maxlen=max_pad, padding='post')

# # Add tokenized captions back to the data dictionary
# for i, key in enumerate(data.keys()):
#     data[key] = padded_captions[i]

In [None]:
import numpy as np

def train_val_test_split(caption_data, train_size=0.8, val_size=0.15, shuffle=True):
    """Partition a caption dictionary into train, validation and test dictionaries.

    Args:
        caption_data: Mapping of image name to its caption data.
        train_size: Fraction of the keys assigned to the training split.
        val_size: Fraction of the keys assigned to the validation split.
        shuffle: When true, shuffle the keys with ``np.random.shuffle`` first.

    Returns:
        ``(training_data, validation_data, test_data)``; the test split
        receives every key left over after the first two.
    """
    image_names = list(caption_data.keys())
    if shuffle:
        np.random.shuffle(image_names)

    n_total = len(caption_data)
    train_stop = int(n_total * train_size)
    val_stop = train_stop + int(n_total * val_size)

    def subset(names):
        # Rebuild a dict holding only the requested keys.
        return {name: caption_data[name] for name in names}

    return (
        subset(image_names[:train_stop]),
        subset(image_names[train_stop:val_stop]),
        subset(image_names[val_stop:]),
    )

# Split the image -> captions dict with the default 80% / 15% / remainder
# fractions. The keys are shuffled with NumPy's global RNG (seeded in the first cell).
train_data, val_data, test_data = train_val_test_split(data)

print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(val_data))
print("Number of test samples: ", len(test_data))

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence

class Dataset(Sequence):
    """Per-sample loader yielding ``(image, decoder_input, decoder_target)``.

    Args:
        data: Mapping of image path to its caption(s), either a list of
            caption strings or a single string.
        input_size: ``(height, width)`` the image is resized to.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_pad: Length captions are padded to (post-padding with zeros).
        augment: Whether to apply random augmentation to the image.
    """
    def __init__(self, data, input_size, tokenizer, max_pad, augment=False):
        self.images = list(data.keys())
        self.captions = list(data.values())
        self.input_size = input_size
        self.tokenizer = tokenizer
        self.max_pad = max_pad
        self.augment = augment  # Flag to control augmentation

    def __getitem__(self, i):
        """Load sample ``i``.

        Returns:
            image: float32 tensor of shape ``input_size + (3,)`` scaled to [0, 1].
            caption_seq: int array of length ``max_pad`` (decoder input).
            target_seq: ``caption_seq`` shifted left by one, last slot 0.
        """
        image_path = self.images[i]
        image = tf.io.read_file(image_path)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, self.input_size)

        # Scale to [0, 1] BEFORE augmenting: random_brightness adds an absolute
        # delta, so max_delta=0.1 on the 0-255 scale was effectively a no-op.
        image = tf.cast(image, tf.float32) / 255.0
        if self.augment:
            image = self.apply_augmentation(image)

        caption = self.captions[i]
        if isinstance(caption, str):
            # A bare string would be tokenised character by character.
            caption = [caption]
        # Use the first caption only, matching how the tokenizer was fitted.
        caption_seq = self.tokenizer.texts_to_sequences(caption[:1])
        caption_seq = pad_sequences(caption_seq, maxlen=self.max_pad, padding='post')[0]

        # Creating the target sequence by shifting the caption
        target_seq = np.zeros_like(caption_seq)
        target_seq[:-1] = caption_seq[1:]

        return image, caption_seq, target_seq

    def apply_augmentation(self, image):
        """Randomly flip and brightness-jitter an image that is already in [0, 1]."""
        # Randomly flip the image horizontally
        image = tf.image.random_flip_left_right(image)

        # Randomly adjust brightness by up to +/-10% of the full range
        image = tf.image.random_brightness(image, max_delta=0.1)

        # Keep the pixel values inside the range the model is fed elsewhere.
        return tf.clip_by_value(image, 0.0, 1.0)

    def __len__(self):
        """Number of samples."""
        return len(self.images)

class Dataloader(Sequence):
    """Batches samples from a ``Dataset`` for ``model.fit`` / ``model.predict``.

    Args:
        dataset: Indexable object returning ``(image, caption, target)``.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the sample order at the end of every epoch.
    """
    def __init__(self, dataset, batch_size=1, shuffle=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.dataset))

    def __getitem__(self, i):
        """Return batch ``i`` as ``([images, captions], targets)``.

        The final batch may hold fewer than ``batch_size`` samples. A batch
        with no usable sample is returned as three empty arrays.
        """
        start = i * self.batch_size
        stop = min((i + 1) * self.batch_size, len(self.dataset))
        indexes = self.indexes[start:stop]

        batch_images = []
        batch_captions = []
        batch_targets = []

        for idx in indexes:
            image, caption, target = self.dataset[idx]
            if image is not None and caption is not None:
                batch_images.append(image)
                batch_captions.append(caption)
                batch_targets.append(target)

        if batch_images and batch_captions:
            batch_images = np.stack(batch_images, axis=0)
            batch_captions = np.stack(batch_captions, axis=0)
            batch_targets = np.stack(batch_targets, axis=0)
        else:
            batch_images = np.array([])
            batch_captions = np.array([])
            batch_targets = np.array([])

        return [batch_images, batch_captions], batch_targets

    def __len__(self):
        """Number of batches, counting the final partial batch.

        Ceiling division: floor division silently dropped up to
        ``batch_size - 1`` samples from every pass over the data.
        """
        return (len(self.indexes) + self.batch_size - 1) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [None]:

# Augmentation is applied to the training split only.
train_dataset = Dataset(train_data, input_size, tokenizer, max_pad, augment=True)
val_dataset = Dataset(val_data, input_size, tokenizer, max_pad, augment=False)
test_dataset = Dataset(test_data, input_size, tokenizer, max_pad, augment=False)

train_dataloader = Dataloader(train_dataset, batch_size=batch_size)
val_dataloader = Dataloader(val_dataset, batch_size=batch_size)
test_dataloader = Dataloader(test_dataset, batch_size=batch_size)

# Check the number of batches
print(f"Number of batches in train_dataloader: {len(train_dataloader)}")
print(f"Number of batches in val_dataloader: {len(val_dataloader)}")
print(f"Number of batches in test_dataloader: {len(test_dataloader)}")

# Pull one batch from each dataloader; each batch is ([images, captions], targets).
train_batch = next(iter(train_dataloader))
val_batch = next(iter(val_dataloader))
test_batch = next(iter(test_dataloader))

train_images, train_captions = train_batch[0]
train_targets = train_batch[1]

valid_images, valid_captions = val_batch[0]
valid_targets = val_batch[1]

test_images, test_captions = test_batch[0]
test_targets = test_batch[1]

print(f"Shape of train images: {train_images.shape}")
print(f"Shape of train captions: {train_captions.shape}")
print(f"Shape of train targets: {train_targets.shape}")

# These three lines previously printed the validation shapes under "train" labels.
print(f"Shape of valid images: {valid_images.shape}")
print(f"Shape of valid captions: {valid_captions.shape}")
print(f"Shape of valid targets: {valid_targets.shape}")


print(f"Shape of test images: {test_images.shape}")
print(f"Shape of test captions: {test_captions.shape}")
print(f"Shape of test targets: {test_targets.shape}")

In [None]:
chexnet_weights = "brucechou1983_CheXNet_Keras_0.3.0_weights.h5"

def create_chexnet(chexnet_weights = chexnet_weights):
    """Build a DenseNet121 backbone initialised from a CheXNet weight file.

    The 14-unit sigmoid head is attached only so that the architecture matches
    the weight file. The returned model stops at the global-average-pooling
    layer, i.e. it outputs the pooled feature vector rather than the 14 scores.

    Args:
        chexnet_weights: Path to the CheXNet weights in ``.h5`` format.

    Returns:
        A ``tf.keras.Model`` mapping an image batch to pooled DenseNet features.
    """
    # weights=None: every layer is overwritten by load_weights() below, so
    # downloading the default ImageNet weights only wastes time and breaks
    # offline runs.
    model = tf.keras.applications.DenseNet121(include_top=False, weights=None)

    # Recreate the CheXNet head so the weight file's layers line up with the model.
    x = model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(14, activation="sigmoid", name="chexnet_output")(x)

    chexnet = tf.keras.Model(inputs = model.input,outputs = x)
    chexnet.load_weights(chexnet_weights)

    # Drop the classification head: layers[-2] is the global average pooling layer.
    chexnet = tf.keras.Model(inputs = model.input,outputs = chexnet.layers[-2].output)
    return chexnet

class Image_encoder(tf.keras.layers.Layer):
    """Layer that returns CheXNet backbone features for a batch of images.

    The wrapped CheXNet model is frozen (not trainable).

    Args:
        name: Layer name. It is now forwarded to the Keras base class;
            previously the argument was accepted but ignored.
        **kwargs: Any other standard ``tf.keras.layers.Layer`` arguments.
    """
    def __init__(self,name = "image_encoder_block", **kwargs):
        super().__init__(name=name, **kwargs)
        self.chexnet = create_chexnet()
        self.chexnet.trainable = False


    def call(self,data):
        """Run ``data`` through the frozen CheXNet model and return its output."""
        op = self.chexnet(data)
        return op

In [None]:
# Load the 300-d GloVe vectors into a word -> vector dict.
glove = {}
vocab_size = len(vocab_list)
with open('ChestIU X-Ray Dataset/glove_6B/glove.6B.300d.txt',encoding='utf-8') as f:
    for line in f:
        word = line.split()
        if not word:
            continue  # tolerate blank lines
        glove[word[0]] = np.asarray(word[1:], dtype='float32')
print(f"Number of words loaded from GloVe: {len(glove)}")
print(f"Vocabulary size: {vocab_size}")
embedding_dim = 300

# Weight matrix for the embedding layer: row i holds the GloVe vector of the
# word with tokenizer index i; rows without a GloVe match stay all-zero.
embedding_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in tokenizer.word_index.items():
    # tokenizer.word_index is built independently of vocab_list (it includes
    # the OOV token and is not capped by num_words), so it can hold indices
    # past the last row. Skip those instead of raising IndexError.
    if i >= embedding_matrix.shape[0]:
        continue
    embedding_vector = glove.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector[:embedding_dim]

In [None]:
tf.keras.backend.clear_session()
from tensorflow.keras.regularizers import l2

# Model inputs: an RGB image and the padded decoder-input token ids.
image = Input(shape = (input_size + (3,))) #shape = 224,224,3
caption = Input(shape = (max_pad,))

# Image branch: frozen CheXNet features projected to dense_dim.
img_encoder = Image_encoder()
bk_feat = img_encoder(image)
image_dense = Dense(dense_dim,
                    activation = 'relu',
                    name = 'Image_dense',
                    use_bias=False,  # was the string 'False', which is truthy and enabled the bias
                    kernel_regularizer=l2(0.01))
image_bkbone = image_dense(bk_feat)
# Add a time axis so the image vector broadcasts over every decoder step.
image_dense_op = tf.keras.backend.expand_dims(image_bkbone,axis=1)

# Disabled alternative: initialise the embedding from the GloVe matrix.
# embedding = Embedding(input_dim  = vocab_size+1,
#                               output_dim = embedding_dim,
#                               input_length = max_pad,
#                               mask_zero = True,
#                               weights = [embedding_matrix],
#                               name = 'embedding')

# Text branch: trainable embedding; index 0 is treated as padding (mask_zero).
embedding = Embedding(input_dim=vocab_size+1,
                      output_dim=embedding_dim,
                      input_length=max_pad,
                      mask_zero=True,
                      trainable=True,
                      name='embedding')

embed_op = embedding(caption)

lstm_layer = LSTM(units = lstm_units,
                  return_sequences= True,
                  return_state = True)

# The projected image features seed both the hidden and the cell state,
# which is why lstm_units must equal dense_dim.
lstm_op,lstm_h,lstm_c = lstm_layer(embed_op,initial_state = [image_bkbone,image_bkbone]) #op_shape = batch_size*input_length*lstm_units

lstm_op = BatchNormalization()(lstm_op)
add = Add()([image_dense_op,lstm_op])

# Per-timestep distribution over the vocabulary (plus the padding index 0).
op_dense = Dense(vocab_size+1,
                 activation = 'softmax',
                 name = 'output_dense')

output = op_dense(add)

model = tf.keras.Model(inputs = [image,caption], outputs = output)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the captioning model.
model.summary()

In [None]:
loss_func = tf.keras.losses.SparseCategoricalCrossentropy()

def custom_loss(y_true, y_pred):
    """Sparse categorical cross-entropy that ignores padding positions.

    Args:
        y_true: Integer token ids, with 0 marking padding.
        y_pred: Per-token class probabilities (one extra trailing axis).

    Returns:
        Scalar: the masked per-token losses averaged over ALL positions
        (padded positions contribute 0 to the sum).
    """
    # Positions holding a real token (id != 0).
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))

    # Per-token loss. The module-level `loss_func` object reduces to a single
    # scalar, so multiplying its result by the mask could not mask individual
    # tokens; the functional form keeps one value per position.
    loss_ = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    # Match dtypes, then zero out the padded positions.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ = loss_*mask

    # Mean over all the values
    return tf.reduce_mean(loss_)

# # @tf.keras.utils.register_keras_serializable()
def custom_accuracy(y_true, y_pred):
    """Token-level accuracy computed over non-padding positions only.

    Args:
        y_true: Target token ids, with 0 marking padding.
        y_pred: Per-token class scores; the arg-max over the last axis is
            taken as the predicted token.

    Returns:
        Correct non-padding predictions divided by the number of non-padding
        tokens (plus Keras epsilon to avoid division by zero).
    """
    labels = tf.cast(y_true, tf.int32)
    predicted = tf.argmax(y_pred, axis=-1, output_type=tf.int32)

    # 1.0 where the target is a real token, 0.0 on padding.
    not_padding = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
    hits = tf.cast(tf.equal(labels, predicted), dtype=tf.float32) * not_padding

    return tf.reduce_sum(hits) / (tf.reduce_sum(not_padding) + tf.keras.backend.epsilon())



# Adam with an initial learning rate of 1e-3.
# NOTE(review): the model is compiled with `loss_func`, not `custom_loss`;
# confirm which of the two is intended.
lr = 10**-3
optimizer = tf.keras.optimizers.Adam(learning_rate = lr)   
model.compile(optimizer=optimizer,loss=loss_func, metrics= [custom_accuracy])

In [None]:
# NOTE(review): clear_session() runs after the model was built and compiled;
# confirm it is still wanted here.
tf.keras.backend.clear_session()
model_save = "saved_models/cnn_lstm.keras"
# Training callbacks:
#  - EarlyStopping: stop after 5 epochs without improvement.
#  - ModelCheckpoint: keep only the best full model at `model_save`.
#  - ReduceLROnPlateau: divide the LR by 10 after 2 stagnant val_loss epochs, floor 1e-7.
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience = 5,
                                     verbose = 2),
    tf.keras.callbacks.ModelCheckpoint(filepath=model_save,
                                       save_best_only = True,
                                       save_weights_only = False,
                                       verbose = 2),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=2, min_lr=10**-7, verbose = 2)]

In [None]:
# Validate on the validation split. The test split was used here before, which
# let early stopping, checkpoint selection and LR scheduling see test data.
history = model.fit(train_dataloader,
          validation_data = val_dataloader,
          epochs = 50,
          callbacks = my_callbacks)

In [None]:
# Pull the per-epoch curves out of the Keras History object. The accuracy keys
# follow the metric function's name (custom_accuracy).
loss = history.history['loss']
val_loss = history.history['val_loss']
acc = history.history['custom_accuracy']
val_acc = history.history['val_custom_accuracy']

epochs = range(1, len(loss) + 1)

# One panel per metric: training curve in blue, validation curve in red.
plt.figure(figsize=(12, 6))
panels = [
    (loss, 'Training Loss', val_loss, 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    (acc, 'Training Accuracy', val_acc, 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
]
for slot, (train_curve, train_label, val_curve, val_label, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(epochs, train_curve, 'b-', label=train_label)
    plt.plot(epochs, val_curve, 'r-', label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

In [None]:
# Greedy (free-running) decoding demo on the first image of the first test batch.
# The previous version referenced `caption_model`, `vectorization` and
# `SEQ_LENGTH`, none of which are defined in this notebook, and called
# `.numpy()` on an ndarray. It now uses the notebook's own `model`/`tokenizer`.
for batch in test_dataloader:
    # Unpack the batch
    (img_batch, caption_batch), _ = batch  # No need for targets during inference
    sample_img = np.asarray(img_batch[0])  # Dataloader already yields NumPy arrays

    print("Image shape:", sample_img.shape)
    plt.imshow(sample_img)
    plt.title("Sample Image")
    plt.show()

    # Add the batch axis expected by the model
    sample_batch = np.expand_dims(sample_img, axis=0)
    print("Image shape after reshaping:", sample_batch.shape)

    # The tokenizer stores the '<START>'/'<END>' markers as 'start'/'end'.
    start_index = tokenizer.word_index['start']
    end_index = tokenizer.word_index['end']
    max_decoded_sentence_length = max_pad - 1

    # Decoder input starts as [start, 0, 0, ...] and is filled one token per step.
    decoder_input = np.zeros((1, max_pad), dtype='int32')
    decoder_input[0, 0] = start_index

    decoded_words = []
    for i in range(max_decoded_sentence_length):
        predictions = model.predict([sample_batch, decoder_input], verbose=0)
        # Output position i is the prediction for the token following input i.
        sampled_token_index = int(np.argmax(predictions[0, i, :]))
        if sampled_token_index in (0, end_index):  # padding or end marker
            break
        decoded_words.append(tokenizer.index_word.get(sampled_token_index, ''))
        decoder_input[0, i + 1] = sampled_token_index

    decoded_caption = ' '.join(word for word in decoded_words if word)
    print("Predicted Caption:", decoded_caption)

    # Break after displaying the first image and its caption
    break


In [None]:
# Teacher-forced prediction for one test sample: the model sees the TRUE
# caption as decoder input and predicts the next token at every position.
for batch in test_dataloader:
    (img_batch, caption_batch), _ = batch
    plt_img = img_batch[0]
    sample_img = img_batch[0:1]  # keep the batch axis (batch of size 1)
    sample_cap = caption_batch[0:1]

    # Display the sample image
    plt.imshow(plt_img)
    plt.title("Sample Image")
    plt.show()

    # Predict caption
    pred = model.predict([sample_img, sample_cap])
    predicted_token_indices = np.argmax(pred, axis=-1)[0]  # arg-max token per time step

    predicted_words = []
    for idx in predicted_token_indices:
        word = tokenizer.index_word.get(idx, '')
        if word == 'end':  # the tokenizer stores '<END>' as 'end'
            break
        # Skip the start marker and the OOV token. The previous list
        # ('<start>', '<pad>') could never match a tokenizer entry; this now
        # agrees with the batch-prediction cell.
        if word and word not in ['start', '<OOV>']:
            predicted_words.append(word)

    predicted_caption = ' '.join(predicted_words)
    print("Predicted Caption:", predicted_caption)

    # Find the actual caption
    actual_tokens = [int(token) for token in sample_cap[0] if token != 0]  # drop padding (id 0)
    actual_words = [tokenizer.index_word.get(token, '') for token in actual_tokens]
    actual_caption = ' '.join(word for word in actual_words if word not in ['start', 'end', '<pad>'])
    print("Actual Caption:", actual_caption)
    break

In [None]:
from tqdm import tqdm

# Teacher-forced predictions for the whole test set, collected next to the
# reference captions decoded from the same batches.
predicted_captions = []
actual_captions = []

for batch in tqdm(test_dataloader, desc="Predicting for Test Dataset..."):
    (img_batch, caption_batch), _ = batch

    # Arg-max token id at every time step for every sample in the batch
    pred = model.predict([img_batch, caption_batch])
    predicted_token_indices_batch = np.argmax(pred, axis=-1)

    for idx, predicted_token_indices in enumerate(predicted_token_indices_batch):
        predicted_words = []
        for token_idx in predicted_token_indices:
            word = tokenizer.index_word.get(token_idx, '')
            if word == 'end':
                # Stop at the end marker
                break
            if not word or word in ['start', '<OOV>']:
                # Unknown ids, the start marker and the OOV token are skipped
                continue
            predicted_words.append(word)
        predicted_caption = ' '.join(predicted_words)
        predicted_captions.append(predicted_caption)

        # Reference caption: non-padding ids mapped back to words
        actual_tokens = [int(token) for token in caption_batch[idx] if token != 0]
        actual_words = [tokenizer.index_word.get(token, '') for token in actual_tokens]
        actual_caption = ' '.join(
            word for word in actual_words if word not in ['start', 'end', '<pad>']
        )
        actual_captions.append(actual_caption)

In [None]:
def get_bleu(reference,prediction):
    """Score one prediction against one reference with cumulative BLEU-1 to BLEU-4.

    Args:
        reference: Reference caption as a whitespace-separated string.
        prediction: Predicted caption as a whitespace-separated string.

    Returns:
        Tuple ``(bleu1, bleu2, bleu3, bleu4)``.
    """
    references = [reference.split()]  # sentence_bleu expects a list of references
    hypothesis = prediction.split()
    weight_sets = (
        (1,0,0,0),
        (0.5,0.5,0,0),
        (0.33,0.33,0.33,0),
        (0.25,0.25,0.25,0.25),
    )
    return tuple(
        sentence_bleu(references, hypothesis, weights=weights)
        for weights in weight_sets
    )

In [None]:
from nltk.translate.bleu_score import sentence_bleu
def mean_bleu(pred_ls, act_ls, **kwargs):
    """Average sentence-level BLEU-1 to BLEU-4 over paired captions.

    Args:
        pred_ls: Predicted captions, as strings or already-tokenised lists.
        act_ls: Reference captions, aligned with ``pred_ls``.
        **kwargs: Accepted for backward compatibility; unused.

    Returns:
        Tuple ``(bleu1, bleu2, bleu3, bleu4)`` of mean scores.
    """
    def _tokens(caption):
        # sentence_bleu works on token sequences. Passing a raw string made it
        # iterate over characters, i.e. it computed character-level BLEU.
        return caption.split() if isinstance(caption, str) else list(caption)

    bleu1, bleu2, bleu3, bleu4 = [], [], [], []

    for k in range (len(pred_ls)):
        # Tokenize the true and predicted captions
        true_tokens = _tokens(act_ls[k])
        predict_tokens = _tokens(pred_ls[k])

        # Calculate BLEU scores
        bleu1.append(sentence_bleu([true_tokens], predict_tokens, weights=(1, 0, 0, 0)))
        bleu2.append(sentence_bleu([true_tokens], predict_tokens, weights=(0.5, 0.5, 0, 0)))
        bleu3.append(sentence_bleu([true_tokens], predict_tokens, weights=(0.33, 0.33, 0.33, 0)))
        bleu4.append(sentence_bleu([true_tokens], predict_tokens, weights=(0.25, 0.25, 0.25, 0.25)))

    return np.array(bleu1).mean(), np.array(bleu2).mean(), np.array(bleu3).mean(), np.array(bleu4).mean()
# Corpus-level report: mean sentence BLEU over the test-set predictions collected above.
bleu1,bleu2,bleu3,bleu4 = mean_bleu(predicted_captions,actual_captions)
print ("Bleu Score 1-gram: ",bleu1)
print ("Bleu Score 2-gram: ",bleu2)
print ("Bleu Score 3-gram: ",bleu3)
print ("Bleu Score 4-gram: ",bleu4)

In [None]:
from rouge import Rouge

def calculate_rouge_scores(predictions, references):
    """Return the averaged ROUGE scores of ``predictions`` against ``references``.

    Args:
        predictions: Predicted caption strings.
        references: Reference caption strings, aligned with ``predictions``.

    Returns:
        The result of ``Rouge().get_scores(..., avg=True)``.
    """
    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return scores


# Keep only the pairs where BOTH captions are non-empty, filtering the two
# lists together so they stay aligned. Previously only empty references were
# dropped, so an empty prediction still reached the scorer.
scored_pairs = [
    (pred, act)
    for pred, act in zip(predicted_captions, actual_captions)
    if pred.strip() and act.strip()
]
filtered_predicted_captions = [pred for pred, _ in scored_pairs]
filtered_actual_captions = [act for _, act in scored_pairs]

# The function is now defined before its first call (the earlier call raised
# NameError on a fresh kernel) and the duplicate call is removed.
if scored_pairs:
    rouge_scores = calculate_rouge_scores(filtered_predicted_captions, filtered_actual_captions)
    print(rouge_scores)
else:
    rouge_scores = None
    print("No non-empty caption pairs to score.")

ROUGE-1: Measures the overlap of unigrams (single words) between the predicted and actual captions.

Recall (r): 18.75% of the words in the actual captions are also found in the predicted captions.
Precision (p): 26.29% of the words in the predicted captions are correct (i.e., they appear in the actual captions).
F1-score (f): The harmonic mean of precision and recall is 20.92%, indicating the overall unigram overlap efficiency.
ROUGE-2: Measures the overlap of bigrams (pairs of consecutive words) between the predicted and actual captions.

Recall (r): 4.30% of the bigrams in the actual captions are also found in the predicted captions.
Precision (p): 5.22% of the bigrams in the predicted captions are correct.
F1-score (f): The harmonic mean of precision and recall is 4.50%, indicating the model's efficiency in capturing two-word phrases.
ROUGE-L: Focuses on the longest common subsequence and can capture longer-term dependencies than ROUGE-1 or ROUGE-2, giving a sense of the overall structure captured by the predictions.

Recall (r): On average, the longest common subsequence shared with the prediction covers 17.88% of the words in the actual caption.
Precision (p): On average, that same longest common subsequence covers 24.97% of the words in the predicted caption.
F1-score (f): The harmonic mean of precision and recall is 19.93%, indicating the model's effectiveness in capturing longer sequence patterns.

In [None]:
# List the positions of empty (or whitespace-only) reference captions...
for i, caption in enumerate(actual_captions):
    if caption.strip():
        continue
    print(f"Empty actual caption at index {i}")

# ...and of empty predictions.
for i, caption in enumerate(predicted_captions):
    if caption.strip():
        continue
    print(f"Empty predicted caption at index {i}")