In this project, we use a CNN together with an RNN (LSTM) to build an image caption generator.

# Load the data

token_pt = 'Flickr8k/Flickr8k.lemma.token.txt'
train_pt = 'Flickr8k/Flickr_8k.trainImages.txt'
test_pt = 'Flickr8k/Flickr_8k.testImages.txt'
glove_pt = 'Flickr8k/glove.6B.200d.txt'
image_pt = 'Flicker8k_Dataset/'

# Read the whole caption file; the context manager closes the handle as soon
# as the text is in memory instead of leaving it open for the whole session.
with open(token_pt, 'r') as token_file:
    doc = token_file.read()

#example of the image id and description; in this file each image has 5 captions
print(doc[:1000])

#build a dictionary mapping each image id to the list of all its captions

descriptions = {}
for raw_line in doc.split('\n'):
    if len(raw_line) <= 2:
        # too short to hold an id and a caption
        continue
    fields = raw_line.split()
    # the first field is the image file name plus a caption marker; the id is
    # everything before the first '.'
    image_id = fields[0].split('.')[0]
    image_desc = ' '.join(fields[1:])
    # captions that share an id are collected in one list
    descriptions.setdefault(image_id, []).append(image_desc)

dict(list(descriptions.items())[:2])

# Clean the data

import string
from spacy.lang.en import stop_words as spacy_stopwords

punc = string.punctuation
stopwd = spacy_stopwords.STOP_WORDS

# Normalise every caption in place: lowercase each token, then drop stop words
# and tokens that occur inside the punctuation string (`punc` is a str, so the
# membership test is a substring test).
for desc_list in descriptions.values():
    for idx, caption in enumerate(desc_list):
        lowered = [token.lower() for token in caption.split()]
        kept = [token for token in lowered
                if not (token in stopwd or token in punc)]
        desc_list[idx] = ' '.join(kept)

dict(list(descriptions.items())[:2]) #the words are now lowercase and stand-alone punctuation tokens are removed

#flatten the cleaned captions back into "<image id> <caption>" lines
lines = [
    image_key + ' ' + caption
    for image_key, captions in descriptions.items()
    for caption in captions
]
new_descriptions = '\n'.join(lines)

print(new_descriptions[:1000])

#visualize one sample image together with its cleaned descriptions
import matplotlib.pyplot as plt
pic = '1305564994_00513f9a5b.jpg'
x = plt.imread(image_pt+pic)
plt.imshow(x)
plt.show()
# notebook-style display of the captions stored under the same image id
descriptions['1305564994_00513f9a5b']

def _load_image_ids(path):
    """Return the set of image ids listed in the split file at *path*.

    Every line is treated as an image file name; its id is the part before
    the first '.'.  Lines of length 0 or 1 are ignored.  The file is closed
    before returning.
    """
    with open(path, 'r') as split_file:
        return {
            line.split('.')[0]
            for line in split_file.read().split('\n')
            if len(line) > 1
        }


#load the train data
train = _load_image_ids(train_pt)

#load the test data
test = _load_image_ids(test_pt)

#locate the train and test image paths based on the split files
import glob

img = glob.glob(image_pt + '*.jpg')#give the path for all the images


def _read_file_names(path):
    """Return the set of newline-separated entries in the file at *path*.

    Leading/trailing whitespace of the whole file is stripped first; the file
    is closed before returning.
    """
    with open(path, 'r') as listing:
        return set(listing.read().strip().split('\n'))


#get all paths of the image for training
# (a path matches when the part after the image_pt prefix is a listed name)
train_image = _read_file_names(train_pt)
train_image_collection = [p for p in img if p[len(image_pt):] in train_image]

#get all paths of the image for testing
test_image = _read_file_names(test_pt)
test_image_collection = [p for p in img if p[len(image_pt):] in test_image]


#get the descriptions of train images
train_desc = {}

for line in new_descriptions.split('\n'):
    tokens = line.split()
    if not tokens:
        # a blank line has no image id; tokens[0] would raise IndexError
        continue
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id not in train:
        continue
    # Wrap the caption in start/end markers.  Joining a single token list keeps
    # exactly one space between tokens even when the cleaned caption is empty,
    # so no empty-string token reaches the word counts later on.
    desc = ' '.join(['startseq'] + image_desc + ['endseq'])
    train_desc.setdefault(image_id, []).append(desc)

#view the first two entries of the train descriptions
dict(list(train_desc.items())[:2])

#flatten the per-image caption lists into one list of training captions
all_train_captions = [
    caption
    for captions in train_desc.values()
    for caption in captions
]

#Create the vocabulary for train images
# Distinct tokens of the cleaned captions (taken from `descriptions`) for
# every image id that has an entry in train_desc.

Vocab = set()
for image_key in train_desc:
    for caption in descriptions[image_key]:
        Vocab.update(caption.split())

print('Original Vocabulary Size: %d' % len(Vocab))

#shrink the vocabulary by keeping only words that occur at least 15 times
word_freq_thd = 15
word_freq = {}
nsents = len(all_train_captions)
for caption in all_train_captions:
    for token in caption.split(' '):
        if token in word_freq:
            word_freq[token] += 1
        else:
            word_freq[token] = 1
# words keep the order in which they were first seen
vocab = [token for token, count in word_freq.items() if count >= word_freq_thd]

print('Vocabulary = %d' % (len(vocab))) #in the original run the vocabulary decreased from 2320 to 521

# Number the vocabulary words starting from 1 and build both lookup directions.
ixtoword = dict(enumerate(vocab, start=1))
wordtoix = {word: index for index, word in ixtoword.items()}
ix = len(vocab) + 1  # first index that is not assigned to a word

# one more than the number of words, so index 0 is covered as well
vocab_size = len(ixtoword) + 1

#find the max length (in whitespace-separated tokens) of the train descriptions
all_desc = [caption for key in train_desc.keys() for caption in train_desc[key]]
lines = all_desc
token_counts = (len(caption.split()) for caption in lines)
max_length = max(token_counts)

print('Description Length: %d' % max_length)

## Glove embedding

import numpy as np

# Width of one embedding row; it has to equal the vector length stored in the
# GloVe file (glove_pt names the 200d set).
embedding_dim = 200

# Load the GloVe vectors into a {word: vector} lookup.  The context manager
# closes the file once every line has been parsed.
embeddings_index = {}
with open(glove_pt, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

#make a matrix for the vocabulary: row i holds the vector of the word with
#index i; words missing from GloVe (and the unused row 0) stay all-zero
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Build the model

!pip3 install --upgrade tensorflow

pip install Keras

import tensorflow as tf

#Using the pretrained network InceptionV3
from tensorflow import keras
from keras.applications.inception_v3 import InceptionV3

original_model = InceptionV3(weights='imagenet')

#remove the classify layer in the InceptionV3
from keras.models import Model

# The feature extractor gets its own name because `model` is re-bound to the
# captioning network further down in this file; encode() must keep using the
# InceptionV3 network no matter what `model` points to at call time.
image_encoder = Model(original_model.input, original_model.layers[-2].output)
model = image_encoder  # alias kept for code that still refers to `model`

#define the function to reshape the image for InceptionV3
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input

# Private handle on the Keras image helpers: the global name `image` is later
# re-used for a feature vector, which would otherwise break preprocess().
_keras_image = image

def preprocess(image_path):
    """Load the image at *image_path* and prepare it as InceptionV3 input.

    The image is loaded with target size 299x299, converted to an array,
    given a leading batch axis and passed through InceptionV3's
    ``preprocess_input``.
    """
    img = _keras_image.load_img(image_path, target_size=(299, 299))
    x = _keras_image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

#define the function to extract image vectors
def encode(image):
    """Return the feature vector of the image file at path *image*.

    The preprocessed image is run through the truncated InceptionV3 network
    and the batch axis of the prediction is dropped, leaving a 1-D array of
    length ``prediction.shape[1]`` (expected to be 2048, the input width the
    captioning model is built with -- TODO confirm).
    """
    batch = preprocess(image)
    fea_vec = image_encoder.predict(batch)
    return np.reshape(fea_vec, fea_vec.shape[1])

#extract the vectors from train and test images
# Each dictionary is keyed by the image path with the image_pt prefix removed.

##train data
train_vecs = {
    path[len(image_pt):]: encode(path)
    for path in train_image_collection
}

##test data
test_vecs = {
    path[len(image_pt):]: encode(path)
    for path in test_image_collection
}

from keras.layers import LSTM, Embedding, Dense, Activation, Flatten, Reshape, Dropout
from keras import Input, layers
from keras.layers.merge import add

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

#build the feature extractor model
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

#build the description sequence model
inputs2 = Input(shape=(max_length,))
# Keep a direct reference to the embedding layer so that its weights can be
# set below without relying on its position in model.layers.
embedding_layer = Embedding(vocab_size, 200, mask_zero=True)
se1 = embedding_layer(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

#decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

#merge two models
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

# use pre-fixed weights for the embedding layer and make it not trainable;
# this is done before compile() so the frozen state is what gets compiled
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
# model compile
model.compile(loss='categorical_crossentropy', optimizer='adam')


# define the data generator function
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    """Yield training batches for the captioning model, forever.

    Parameters
    ----------
    descriptions : dict
        Maps an image id to a list of caption strings.
    photos : dict
        Maps ``image id + '.jpg'`` to the feature vector of that image.
    wordtoix : dict
        Maps a word to its integer index; words not in it are dropped.
    max_length : int
        Length every input sequence is padded to.
    num_photos_per_batch : int
        Number of images whose samples are gathered into one batch.

    Yields
    ------
    tuple
        ``((X1, X2), y)``: X1 stacks the photo vectors, X2 the padded partial
        word-index sequences and y the one-hot encoded next word.  The one-hot
        width is read from the module-level ``vocab_size``.

    Every caption contributes one sample per prefix: the first i word indices
    are the input and word i is the target.
    """
    X1, X2, y = list(), list(), list()
    n=0
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n+=1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data
            if n==num_photos_per_batch:
                # Model.fit expects a generator to produce (inputs, targets)
                # tuples; a list here can be taken for three separate inputs,
                # so both nesting levels are tuples.
                yield (np.array(X1), np.array(X2)), np.array(y)
                X1, X2, y = list(), list(), list()
                n=0

#train the model for only 5 epochs due to the storage
epochs, batch_size = 5, 3
# one step consumes the captions of `batch_size` images
steps = len(train_desc) // batch_size

generator = data_generator(
    train_desc,
    train_vecs,
    wordtoix,
    max_length,
    batch_size,
)
model.fit(generator, steps_per_epoch=steps, epochs=epochs, verbose=1)

## Generate the next word prediction based on the greedy search

def greedySearch(photo):
    """Generate a caption for *photo* by always taking the most likely word.

    *photo* is passed to ``model.predict`` together with the padded word-index
    sequence generated so far.  Generation starts from 'startseq' and stops
    after ``max_length`` steps, when 'endseq' is predicted, or when the
    predicted index has no entry in ``ixtoword`` (index 0 is never assigned
    to a word).

    Returns the generated words joined by spaces, without the 'startseq' and
    'endseq' markers.
    """
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        word = ixtoword.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            # The end marker is never appended, so nothing has to be trimmed
            # from the tail afterwards; this keeps the last real word when the
            # loop runs out of steps without ever predicting 'endseq'.
            break
        in_text += ' ' + word

    # drop the leading 'startseq'
    return ' '.join(in_text.split()[1:])

def beam_search_predictions(image, beam_index = 3):
    """Generate a caption for *image* with beam search of width *beam_index*.

    *image* is passed to ``model.predict`` together with each padded partial
    caption.  Every step extends each kept hypothesis with its *beam_index*
    most likely next words and keeps the *beam_index* best extensions, until
    the hypotheses contain ``max_length`` word indices.

    Returns the words of the best hypothesis up to (not including) 'endseq',
    joined by spaces and without the leading 'startseq'.
    """
    start = [wordtoix["startseq"]]
    start_word = [[start, 0.0]]
    while len(start_word[0][0]) < max_length:
        temp = []
        for s in start_word:
            # pad_sequences is the name imported by this file; the default
            # (pre) padding is the same one used for training and greedy search
            par_caps = pad_sequences([s[0]], maxlen=max_length)
            preds = model.predict([image, par_caps], verbose=0)
            # Getting the top <beam_index>(n) predictions and creating a
            # new list so as to put them via the model again
            word_preds = np.argsort(preds[0])[-beam_index:]
            for w in word_preds:
                next_cap, score = s[0][:], s[1]
                next_cap.append(int(w))
                # Accumulate log-probabilities so the score is the log of the
                # product of the word probabilities; the small constant avoids
                # log(0).
                score += float(np.log(preds[0][w] + 1e-12))
                temp.append([next_cap, score])

        # Sorting according to the scores and keeping the best <beam_index>
        start_word = sorted(temp, reverse=False, key=lambda l: l[1])
        start_word = start_word[-beam_index:]

    best = start_word[-1][0]
    final_caption = []
    for i in best[1:]:  # best[0] is the 'startseq' index
        word = ixtoword.get(i)
        if word is None:
            # index 0 has no word attached
            continue
        if word == 'endseq':
            break
        final_caption.append(word)

    return ' '.join(final_caption)

## Test the performance

#pick up an image from test data
import matplotlib.pyplot as plt
pic = '241346971_c100650320.jpg'
x = plt.imread(image_pt+pic)
plt.imshow(x)
plt.show()
# show the reference captions of the image that is actually displayed
# (the id is the file name up to the first '.')
descriptions[pic.split('.')[0]]

pic = '241346971_c100650320.jpg'
# The feature vector is stored as `photo`, not `image`: `image` is the name of
# the imported keras.preprocessing module and must not be overwritten.
photo = test_vecs[pic].reshape((1,2048))
x=plt.imread(image_pt+pic)
plt.imshow(x)
plt.show()

print("Greedy Search:",greedySearch(photo))
# same caption request for several beam widths
for k in (3, 5, 7, 10):
    print("Beam Search, K = %d:" % k,beam_search_predictions(photo, beam_index = k))