## Loading Modules

In [None]:
import os 
# import pickle
import numpy as np
from tqdm import tqdm

from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Embedding,LSTM,Dense,add,Dropout

In [None]:
import shutil

# Path to the text file containing image names
file_path = r'D:\med-image-captioning\data\captions\random_sample.txt'

# Read the file and extract image names (the text before the first comma of each line).
# - names are stripped so a trailing newline never becomes part of the file name
# - blank lines are skipped
# - duplicates are dropped (first-seen order kept) so an image that appears on
#   several caption lines is copied only once
with open(file_path, 'r') as file:
    image_names = list(dict.fromkeys(
        name
        for name in (line.split(',')[0].strip() for line in file)
        if name
    ))

# Define the source and destination folder paths
source_folder_path = r'D:\thesis\medicat_release\release\figures'
destination_folder_path = r'D:\med-image-captioning\images'

# Ensure the destination folder exists
os.makedirs(destination_folder_path, exist_ok=True)

# Iterate over the list of image names
for image_name in image_names:
    source_image_path = os.path.join(source_folder_path, image_name)

    # isfile rather than exists: a directory that happens to match the name
    # must not be handed to shutil.copy2
    if os.path.isfile(source_image_path):
        destination_image_path = os.path.join(destination_folder_path, image_name)

        # Copy the image (with its metadata) to the destination folder
        shutil.copy2(source_image_path, destination_image_path)
        print(f"Copied {image_name} to destination folder.")
    else:
        print(f"Image {image_name} not found in source folder.")


## Image Feature Extraction

In [None]:
# Load the stock VGG16 network, then restructure it so that the model's output
# is taken from its second-to-last layer instead of its final layer.
model = VGG16()
penultimate_layer = model.layers[-2]
model = Model(inputs=model.inputs, outputs=penultimate_layer.output)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

In [None]:
## extract features from each photo in the directory
# NOTE: `model` must still be the VGG16 feature extractor when this cell runs;
# the same name is rebound to the captioning model in the "Model Building" section.
features={}
directory=r'D:\med-image-captioning\images'

for img_name in tqdm(os.listdir(directory)):
    filename=os.path.join(directory,img_name)

    # sub-directories cannot be loaded as images
    if not os.path.isfile(filename):
        continue

    # load an image from file, resized to 224x224;
    # a file that cannot be opened/decoded as an image (e.g. Thumbs.db) is
    # reported and skipped instead of aborting the whole extraction run
    try:
        image=load_img(filename,target_size=(224,224))
    except OSError as err:
        print(f"Skipping {img_name}: {err}")
        continue

    # convert the image pixels to a numpy array
    image=img_to_array(image)

    # add a leading batch axis of size 1 for the model
    image=np.expand_dims(image,axis=0)

    # prepare the image for the VGG model
    image=preprocess_input(image)

    # get features
    feature=model.predict(image,verbose=0)

    # get image id: the file name up to its first dot (the caption mapping
    # derives its keys the same way, so the two dictionaries line up)
    image_id=img_name.split('.')[0]

    # save the features
    features[image_id]=feature

In [None]:
# # save to file
# pickle.dump(features,open(r'D:\med-image-captioning\data\image_features\features.pkl','wb'))

## Loading the captions

In [None]:
# read the whole captions text file into one string (plain text, not a pickle)
captions_path=r'D:\med-image-captioning\data\captions\random_sample.txt'
with open(captions_path,'r') as captions_file:
    captions_doc=captions_file.read()

In [None]:
# create mapping of image captions: image id -> list of caption strings
mapping={}

for line in tqdm(captions_doc.split('\n')):
    # split on the FIRST comma only: everything after it is the caption, which
    # may itself contain commas (a plain split(',') would cut the caption off
    # at its first comma)
    tokens=line.split(',',1)

    # blank lines and lines without a comma carry no caption
    if len(tokens) < 2:
        continue

    image_id,caption=tokens[0].strip(),tokens[1].strip()

    # remove the file extension: keep the name up to its first dot
    image_id=image_id.split('.')[0]

    mapping.setdefault(image_id,[]).append(caption)




In [None]:
# number of distinct image ids present in the caption mapping
print(len(mapping))

In [None]:
## add start and end sequence to the captions
# The lists inside `mapping` are modified in place. The startswith() guard keeps
# the cell idempotent: re-running it does not wrap an already wrapped caption again.
for key,captions in mapping.items():
    for i,caption in enumerate(captions):
        if not caption.startswith('<start> '):
            captions[i]='<start> '+caption+' <end>'

In [None]:
# spot-check the captions stored for one hard-coded image id
# (raises KeyError if that id is not a key of `mapping`)
print(mapping['0a60e80445d6be21f3da582eeeec7bec83d82e74_4-Figure1-1'])

In [None]:
# flatten the per-image caption lists into one list, in mapping order
# (a plain comprehension instead of one used only for its append() side effect)
all_captions=[caption for captions in mapping.values() for caption in captions]

In [None]:
# total number of captions across all images
len(all_captions)

In [None]:
# peek at the first five captions
print(all_captions[:5])

In [None]:
# tokenize the text
#
# The Keras Tokenizer's default `filters` remove '<' and '>', which would turn the
# '<start>' / '<end>' markers into the ordinary words 'start' / 'end'. The caption
# decoder later compares generated words against the literal '<end>', so the
# filter list below is the default one minus those two characters, keeping the
# markers intact as vocabulary entries.
tokenizer=Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(all_captions)

# +1 because word indices start at 1; index 0 is left for padding
vocab_size=len(tokenizer.word_index)+1

In [None]:
# vocabulary size as computed above: len(tokenizer.word_index) + 1
print(vocab_size)

In [None]:
# Longest caption measured in tokenizer tokens, not whitespace-separated words.
# The tokenizer also splits on the characters it filters out, so "a,b" is one
# whitespace word but two tokens; a limit based on str.split() can therefore be
# too small, and pad_sequences would then cut tokens off such captions.
max_length=max(len(sequence) for sequence in tokenizer.texts_to_sequences(all_captions))
print(max_length)

## Train test split

In [None]:
# 80/20 split over the image ids, taken in mapping order (no shuffling)
image_ids=list(mapping)
split=int(len(image_ids)*0.8)
train,test=image_ids[:split],image_ids[split:]


In [None]:
# create data generator
def data_generator(data_keys,mapping,features,tokenizer,max_length,vocab_size,batch_size):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into (prefix -> next token) pairs: for a token
    sequence t0..tk, pair i has input t0..t(i-1) and target ti. A batch is
    emitted after every `batch_size` images, so the number of rows per batch
    depends on how many tokens the captions of those images contain.

    Args:
        data_keys: image ids to iterate over (cycled forever).
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; row 0 is used as the
            image's feature vector.
        tokenizer: fitted tokenizer providing texts_to_sequences().
        max_length: length the input token prefixes are padded to.
        vocab_size: number of classes for the one-hot encoded targets.
        batch_size: number of images (not rows) per yielded batch.

    Yields:
        ([X1, X2], y) where X1 holds the image feature vectors, X2 the padded
        input prefixes and y the one-hot encoded next tokens.

    Raises:
        ValueError: if batch_size is not positive, or if a full pass over
            data_keys produces no training pair at all (otherwise the
            generator would spin forever without yielding).
    """
    if batch_size<1:
        raise ValueError("batch_size must be a positive integer")

    X1,X2,y=list(),list(),list()
    n=0
    while True:
        pairs_this_pass=0
        for key in data_keys:
            # images without extracted features (e.g. never copied to the
            # image folder) cannot be trained on; skip them instead of
            # failing with KeyError in the middle of training
            if key not in features:
                continue
            n+=1
            image_feature=features[key][0]

            for caption in mapping[key]:
                sequence=tokenizer.texts_to_sequences([caption])[0]

                # split into input prefix and next-token target pairs
                for i in range(1,len(sequence)):
                    X1.append(image_feature)
                    X2.append(sequence[:i])
                    y.append(sequence[i])
                    pairs_this_pass+=1

            if n==batch_size:
                # never hand Keras an empty batch (possible when every
                # caption of these images has fewer than two tokens)
                if X1:
                    # pad and one-hot encode once per batch instead of once per pair
                    yield [np.array(X1),pad_sequences(X2,maxlen=max_length)],to_categorical(y,num_classes=vocab_size)
                X1,X2,y=list(),list(),list()
                n=0

        if pairs_this_pass==0:
            raise ValueError("data_generator found no usable (image, caption) pairs in data_keys")





## Model Building

In [None]:
# encoder
# image feature model: a 4096-dimensional vector per image
# (presumably the VGG16 features extracted earlier - confirm the size matches),
# regularised with dropout and projected down to 256 units
imputs1=Input(shape=(4096,))
fe1=Dropout(0.4)(imputs1)
fe2=Dense(256,activation='relu')(fe1)

# sequence feature layer: padded token ids of length max_length are embedded
# (mask_zero=True so that index 0 is treated as padding), then summarised by an
# LSTM into a single 256-dimensional vector
inputs2=Input(shape=(max_length,))
se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
se2=Dropout(0.4)(se1)
se3=LSTM(256)(se2)

# decoder: element-wise sum of the two 256-d branches, followed by a dense
# layer and a softmax over the vocabulary (one probability per word index)
decoder1=add([fe2,se3])
decoder2=Dense(256,activation='relu')(decoder1)
outputs=Dense(vocab_size,activation='softmax')(decoder2)

# model definition
# NOTE: this rebinds the global name `model`, which up to here referred to the
# model built in the "Image Feature Extraction" section.
model=Model(inputs=[imputs1,inputs2],outputs=outputs)
model.compile(loss='categorical_crossentropy',optimizer='adam')



In [None]:
# plot the model
# plot_model(model,show_shapes=True)

## Train the model

In [None]:
epochs = 10
batch_size = 5
# number of generator batches that make up one pass over the training images;
# at least 1 so fit() still runs when there are fewer than batch_size images
steps = max(1, len(train)//batch_size)

for i in tqdm(range(epochs)):
    # a fresh generator per epoch, so every epoch starts at the first training image
    generator=data_generator(train,mapping,features,tokenizer,max_length,vocab_size,batch_size)

    # Pass the generator itself and let Keras draw `steps` batches from it.
    # Passing next(generator) with steps_per_epoch=1 would hand fit() a single
    # batch per epoch and leave the rest of the training set unused.
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [None]:
# model.save(r'D:\med-image-captioning\model\model_50.h5')

## Generate captions

In [None]:
def index_to_word(integer,tokenizer):
    """Return the word the tokenizer assigned to index `integer`, or None.

    Uses the tokenizer's reverse lookup table `index_word` when it is present
    and populated (one dictionary lookup instead of scanning the whole
    vocabulary for every generated word). Falls back to scanning `word_index`
    for tokenizer-like objects that only expose that mapping.

    Args:
        integer: word index to look up (a Python int or a numpy integer).
        tokenizer: object with a `word_index` dict (word -> index) and,
            optionally, an `index_word` dict (index -> word).

    Returns:
        The matching word, or None when no word has that index.
    """
    index_word=getattr(tokenizer,'index_word',None)
    if index_word:
        return index_word.get(integer)

    for word,index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [None]:
# generate a description for an image
def predict_caption(model,image,tokenizer,max_length):
    """Greedily decode a caption for one image.

    Starting from "<start>", the current text is tokenised and padded, the
    model predicts the next word, and the most probable word is appended.
    Decoding stops after `max_length` words, when the predicted index has no
    word, or when the end marker is produced.

    Args:
        model: captioning model; called as model.predict([image, sequence]).
        image: feature array for the image, passed to the model unchanged
            (presumably one entry of `features` - confirm at the call site).
        tokenizer: fitted tokenizer used for text <-> index conversion.
        max_length: padded input length and maximum number of generated words.

    Returns:
        The generated text, beginning with "<start>".
    """
    start_token,end_token="<start>","<end>"

    # Resolve the end marker through the tokenizer rather than comparing against
    # the literal string only: depending on the tokenizer's filters, "<end>" may
    # be stored in the vocabulary as "end", in which case a plain
    # word=='<end>' check never fires and decoding always runs to max_length.
    end_ids=tokenizer.texts_to_sequences([end_token])[0]
    end_id=end_ids[0] if len(end_ids)==1 else None

    in_text=start_token

    for _ in range(max_length):
        sequence=tokenizer.texts_to_sequences([in_text])[0]
        sequence=pad_sequences([sequence],maxlen=max_length)

        yhat=model.predict([image,sequence],verbose=0)

        # convert probabilities to the index of the most likely word
        word_id=int(np.argmax(yhat))

        # convert index to word
        word=index_to_word(word_id,tokenizer)

        # stop if the index has no word (e.g. the padding index 0)
        if word is None:
            break

        # append as input for generating the next word
        in_text+=" "+word

        # stop once the end marker has been generated
        if word==end_token or word_id==end_id:
            break
    return in_text

In [None]:
# evaluate the model

def _bleu_tokens(text):
    """Round-trip `text` through the tokenizer and split it into words.

    Generated captions consist of tokenizer vocabulary words, so references
    must be normalised the same way; compared as raw text they could never
    match on any word the tokenizer stores in a different form.
    """
    return tokenizer.sequences_to_texts(tokenizer.texts_to_sequences([text]))[0].split()

actual,predicted=list(),list()

for key in test:
    # Images without extracted features cannot be captioned. Skip them with an
    # explicit check: a try/except KeyError around the whole body would also
    # hide unrelated KeyErrors raised during prediction.
    if key not in features:
        continue

    # get actual captions
    captions=mapping[key]

    # generate caption
    y_pred=predict_caption(model,features[key],tokenizer,max_length)

    # split into words; one list of reference token lists per image
    actual.append([_bleu_tokens(caption) for caption in captions])
    predicted.append(_bleu_tokens(y_pred))

# calculate BLEU score




In [None]:
from nltk.translate.bleu_score import corpus_bleu
# calculate BLEU score (the metric is spelled BLEU; the labels previously read "BLUE")
# BLEU-1 weights unigram precision only, BLEU-2 weights unigrams and bigrams equally
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

## Visualize the results

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
def generate_caption(image_name,image_dir=r'D:\med-image-captioning\images'):
    """Print the actual and predicted captions of an image and display it.

    Relies on the notebook globals `mapping`, `features`, `model`, `tokenizer`
    and `max_length`.

    Args:
        image_name: file name of the image, including its extension.
        image_dir: folder containing the image; defaults to the folder the
            notebook copies its images into.

    Raises:
        FileNotFoundError: if the image file does not exist in image_dir.
        KeyError: if the image id has no entry in `mapping` or `features`.
    """
    # os.path.join instead of string concatenation: the former raw-string
    # prefix ended in two literal backslashes, producing a doubled separator
    img_path=os.path.join(image_dir,image_name)

    # image id: the file name up to its first dot, the same rule that was used
    # to build the keys of `mapping` and `features`
    img_name=image_name.split('.')[0]

    # the context manager closes the image file once it has been drawn
    with Image.open(img_path) as image:
        captions=mapping[img_name]

        print("----------------Actual Captions----------------")
        print(captions)
        y_pred=predict_caption(model,features[img_name],tokenizer,max_length)
        print("----------------Predicted Caption----------------")
        print(y_pred)
        plt.imshow(image)


In [None]:
# demo: show actual vs. predicted captions for one hard-coded sample image
generate_caption('0a60e80445d6be21f3da582eeeec7bec83d82e74_4-Figure1-1.png')