# Project description 
The following blog post is chosen as framework for this project. Listed below is also a few possible tweaks to improve its performance:

*   Other pre-trained model: ResNET, Inception...
*   Added approaches from other papers

[How to Develop a Deep Learning Photo Caption Generator from Scratch](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/)

# **Import**

In [116]:
import numpy as np
from numpy import array
from os import listdir
import os
import sys
import string

# Image processing, pre-trained VGG16 & InceptionResNetV2
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import VGG16
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.vgg16 import preprocess_input as preprocess_VGG16
from keras.applications.inception_resnet_v2 import preprocess_input as preprocess_InceptionResNet

# Text processing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical #plot_model
from keras.models import Model
from keras.models import load_model
from keras.layers import Input, Dense, LSTM, CuDNNLSTM, Embedding, Dropout, Bidirectional
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.initializers import Constant

# Helper functions for loading and preprocessing




## Data generation

In [2]:
def data_generator(descriptions, photos, tokenizer, max_length):
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            yield [[in_img, in_seq], out_word]

## Load some file

In [3]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    
    # read all text
    text = file.read()

    # close the file
    file.close()
    return text

## Load descriptions

In [4]:
# extract descriptions for images
def load_descriptions(doc):
    mapping = dict()
    
    # process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        if len(line) < 2:
            continue
            
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        
        # remove filename from image id
        image_id = image_id.split('.')[0]
        
        # convert description tokens back to string
        image_desc = ' '.join(image_desc)
        
        # create the list if needed
        if image_id not in mapping:
            mapping[image_id] = list()
            
        # store description
        mapping[image_id].append(image_desc)
        
    return mapping

## Clean text in order to reduce the size of the vocabulary

In [5]:
import string
 
def clean_descriptions(descriptions):
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            # tokenize
            desc = desc.split()
            
            # convert to lower case
            desc = [word.lower() for word in desc]
            
            # remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            
            # remove hanging 's' and 'a'
            desc = [word for word in desc if len(word)>1]
            
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            
            # store as string
            desc_list[i] =  ' '.join(desc)


## Convert descriptions to vocabulary and save to file

In [6]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
        
    return all_desc

# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    lines = list()
    
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
            
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

## Load a pre-defined set of identifiers given the sets filename

In [7]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
            
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
        
    return set(dataset)

## Load clean description into memory

In [8]:
# load clean descriptions into memory, descriptions is a dictionary with image id as key and descriptions as values
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
                
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            
            # store
            descriptions[image_id].append(desc)
            
    return descriptions

## Loads the entire set of photo descriptions and returns a subset of interest for a given set of photo identifiers

In [9]:
# load photo features, connects features to photos
def load_photo_features(filename, dataset):
    # load all features
    all_features = np.load(filename).item() # all_features = load(open(filename, 'rb'))
    
    # filter features
    features = {k: all_features[k] for k in dataset}
    
    return features



## Convert the dictionary of descriptions into a list of strings and l fit a Tokenizer given the loaded photo description text

In [10]:
# covert a dictionary of descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
        
    return all_desc
 
# fit a tokenizer given caption descriptions, creates a indexes for each unique word in descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

## Create sequences

In [44]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = list(), list(), list()
    
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0] # text to list of indices defined by tokenizer
        
        # split one sequence into multiple X,Y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0] # pads sequence to length of max sequence
            
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size_train)[0]
            
            # store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

## Define length of description with most words

In [12]:
# calculate the length of the description with the most words (also defines padding)
def max_length(descriptions):
    lines = to_lines(descriptions)
    max_val = max(len(d.split()) for d in lines) # Added variable to return due to error(?)
    return max_val

## Extract features from VGG16 or InceptionResNet

In [13]:
class WrongModelType(Exception):
    pass

# extract features from each photo in the directory
def extract_features(directory, model_type):
    # load the model    
    if model_type.lower() == 'vgg16':
        model = VGG16()
    elif model_type.lower() == 'inceptionresnet':
        model = InceptionResNetV2()
    else:
        raise WrongModelType('Pick a valid model_type! Either "VGG16" or "InceptionResNet"')
    
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)

    # summarize
    print(model.summary())

    # extract features from each photo
    features = dict()
    for name in listdir(directory):
        # load an image from file
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        
        # reshape data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        
        # prepare the image
        if model_type.lower() is 'vgg16':
            image = preprocess_VGG16(image)
        elif model_type.lower() is 'inceptionresnet':
            image = preprocess_InceptionResNet(image)
        
        # get features
        feature = model.predict(image, verbose=0)
        
        # get image id
        image_id = name.split('.')[0]
        
        # store feature
        features[image_id] = feature
        print('>%s' % name)
    
    return features

def extract_single_features(filename, model_type):
    # load the model   
    if model_type.lower() == 'vgg16':
        model = VGG16()
    elif model_type.lower() == 'inceptionresnet':
        model = InceptionResNetV2()
    else:
        raise WrongModelType('Pick a valid model_type! Either "VGG16" or "InceptionResNet"')
        
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    
    # load the photo
    image = load_img(filename, target_size=(224, 224))
    
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    
    # reshape data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    
    # prepare the image
    if model_type.lower() is 'vgg16':
        image = preprocess_VGG16(image)
    elif model_type.lower() is 'inceptionresnet':
        image = preprocess_InceptionResNet(image)
    
    # get features
    feature = model.predict(image, verbose=0)
    return feature

## Map integer to word

In [14]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

## Generate image description

In [15]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo,sequence], verbose=0)
        # convert probability to integer
        yhat = np.argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

## Evaluate model with BLEU score

In [16]:
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer_train, photos[key], max_length)
        
        # store actual and predicted
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())

    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [106]:
from nltk.translate.bleu_score import corpus_bleu

def BLEU(y_true, y_pred):
    pred_list = []
    true_list = []
    for i, caption in enumerate(y_pred): # loop over all predictions (n_samples,)
        pred_text = ''
        for index in caption:
            word = word_for_id(index, tokenizer) # convert the prediced word sequence(indices) to words using tokenizer 
            if word is None: # stop if we cannot map the word
                break
            pred_text += ' ' + word
            if word == 'endseq': # stop if we predict the end of the sequence
                break
        pred_list[i] = pred_text[1:-7].split() # BLEU only cares for the predicted sentence converted to a list of words
        
    for i, captions in enumerate(y_true): # loop over y_true (nSamples, 5) where 5 is captions per sample image
        true_captions = []
        for j, caption in enumerate(captions): # each caption sequence to be converted to sentence
            true_text = ''
            for index in caption: # for each index in caption convert to word and add up
                word = word_for_id(index, tokenizer)
                if word is None: # stop if we cannot map the word
                    break
                true_text += ' ' + word
                if word == 'endseq': # stop if we predict the end of the sequence
                    break
            true_captions[j] = true_text[1:-7].split()
        true_list[i] = true_captions
    
    bleu1 = mean([corpus_bleu(true_list[i], pred_list[i], weights=(1.0, 0, 0, 0)) for i in len(pred_list)])
    #bleu2 = mean([corpus_bleu(true_list[i], pred_list[i], weights=(0.5, 0.5, 0, 0)) for i in len(pred_list)])
    #bleu3 = mean([corpus_bleu(true_list[i], pred_list[i], weights=(0.33, 0.33, 0.33, 0)) for i in len(pred_list)])
    #bleu4 = mean([corpus_bleu(true_list[i], pred_list[i], weights=(0.25, 0.25, 0.25, 0.25)) for i in len(pred_list)])
    
            
    return blue1 #, bleu2, bleu3, bleu4

## Preparations for defining models

In [17]:
filename = 'Flickr8k_text/Flickr8k.token.txt'
# load descriptions
doc = load_doc(filename)
# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# clean descriptions
clean_descriptions(descriptions)
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# save to file
save_descriptions(descriptions, 'descriptions.txt')

Loaded: 8092 
Vocabulary Size: 8763


In [37]:
# load training dataset (6K)
filename_train = 'Flickr8k_text/Flickr_8k.trainImages.txt'
filename_val = 'Flickr8k_text/Flickr_8k.devImages.txt'
train = load_set(filename_train)
val = load_set(filename_val)
print('Dataset train: %d' % len(train))
print('Dataset val: %d' % len(val))

# descriptions
descriptions_train = load_clean_descriptions('descriptions.txt', train)
descriptions_val = load_clean_descriptions('descriptions.txt', val)
print('Descriptions: train=%d' % len(descriptions_train))
print('Descriptions: val=%d' % len(descriptions_val))

# photo features
features_train = load_photo_features('features_VGG.npy', train)
features_val = load_photo_features('features_VGG.npy', val)
print('Photos: train=%d' % len(features_train))
print('Photos: val=%d' % len(features_val))

# prepare tokenizer
tokenizer_train = create_tokenizer(descriptions_train)
tokenizer_val = create_tokenizer(descriptions_val)
np.save('tokenizer_train.npy',tokenizer_train)
np.save('tokenizer_val.npy',tokenizer_val)
vocab_size_train = len(tokenizer_train.word_index) + 1
vocab_size_val = len(tokenizer_val.word_index) + 1
print('Vocabulary Size train: %d' % vocab_size_train)
print('Vocabulary Size val: %d' % vocab_size_val)

# determine the maximum sequence length
max_len_train = max_length(descriptions_train)
max_len_val = max_length(descriptions_val)
print('Description Length train: %d' % max_len_train)
print('Description Length val: %d' % max_len_val)

Dataset train: 6000
Dataset val: 1000
Descriptions: train=6000
Descriptions: val=1000
Photos: train=6000
Photos: val=1000
Vocabulary Size train: 7579
Vocabulary Size val: 3317
Description Length train: 34
Description Length val: 31


# Models

## Standard caption model

In [107]:
def define_model_base(vocab_size, max_length):
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256)(inputs2) #, mask_zero=True
    se2 = Dropout(0.5)(se1)
    se3 = CuDNNLSTM(256)(se2)
    
    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[BLEU])
    
    # summarize model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

## Modified caption model (to be continued...)

### Create model with GLOVE embedding

In [109]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'),encoding="utf-8")
iter = 0
for line in f:
    iter+=1
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [110]:
EMBEDDING_DIM = 300 #Since we use 'glove.6B.300d.txt'
word_index = tokenizer_train.word_index 

embedding_matrix = np.zeros((vocab_size_train, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [111]:
embedding_layer = Embedding(vocab_size_train,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_len_train,
                            trainable=False)

In [112]:
def define_model_glove(vocab_size, max_length):
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = embedding_layer(inputs2) #, mask_zero=True
    se2 = Dropout(0.5)(se1)
    se3 = CuDNNLSTM(256)(se2)
    
    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    # summarize model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

### Create model with bidirectional LSTM

In [130]:
def define_model_bidirectional(vocab_size, max_length):
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256)(inputs2) #, mask_zero=True
    se2 = Dropout(0.5)(se1)
    se3 = Bidirectional(CuDNNLSTM(256),merge_mode='ave')(se2)
    se4 = Dense(256, activation='relu')(se3)
    
    # decoder model
    de1 = add([fe2, se4])
    de2 = Dense(256, activation='relu')(de1)
    de3 = Bidirectional(CuDNNLSTM(256),merge_mode='ave')(de2)
    outputs = Dense(vocab_size, activation='softmax')(de3)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[BLEU])
    
    # summarize model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

### Inspiration:
* [Image captioning in Keras](https://github.com/danieljl/keras-image-captioning)
* [Keras implementation of image captioning model](https://medium.com/@faizanmustafa75/keras-implementation-of-image-captioning-model-3a7ab68e67d4)
* [Image Captioning using InceptionV3 and beam search](https://github.com/yashk2810/Image-Captioning)
* [Recurrent Neural Networks, Image Captioning, LSTM](https://www.youtube.com/watch?v=cO0a0QYmFm8)

# Training and tweaking

## VGG16 feature extraction 

In [None]:
#extract features from all images
directory = 'Flickr8k_Dataset'
features = extract_features(directory,model_type='VGG16p')

# save to file
np.save('features_VGG16.npy',features)
#imported_features = np.load('features.npy').item()

## InceptionResNet feature extraction

In [None]:
#extract features from all images
directory = 'Flickr8k_Dataset'
features = extract_features(directory,model_type='InceptionResNet')

# save to file
np.save('features_InceptionResNet.npy',features)
#imported_features = np.load('features.npy').item()

## Base model training

In [108]:
# define the model
model_base = define_model_base(vocab_size_train, max_len)

# train the model, run epochs manually and save after each epoch
num_epochs = 10
steps_train = len(descriptions_train)
steps_val = len(descriptions_val)

# callbacks
ES = EarlyStopping(monitor='val_loss') #, restore_best_weights=True

# create the data generator
generator_train = data_generator(descriptions_train, features_train, tokenizer_train, max_len_train)
generator_val = data_generator(descriptions_val, features_val, tokenizer_train, max_len_train)
# TODO: ADD VALIDATION GENERATOR

# fit for one epoch
model_base.fit_generator(generator_train, epochs=num_epochs, steps_per_epoch=steps_train, verbose=1, callbacks=[ES], validation_data=generator_val, validation_steps=steps_val)

# save model
model_base.save('model_base.h5')

TypeError: Tensor objects are not iterable when eager execution is not enabled. To iterate over this tensor use tf.map_fn.

## Modified model

In [131]:
# define the model
model_mod = define_model_bidirectional(vocab_size_train, max_len_train)

# train the model, run epochs manually and save after each epoch
num_epochs = 10
steps_train = len(descriptions_train)
steps_val = len(descriptions_val)

# callbacks
ES = EarlyStopping(monitor='val_loss') #, restore_best_weights=True

# create the data generator
generator_train = data_generator(descriptions_train, features_train, tokenizer_train, max_len_train)
generator_val = data_generator(descriptions_val, features_val, tokenizer_train, max_len_train)
# TODO: ADD VALIDATION GENERATOR

# fit for one epoch
model_mod.fit_generator(generator_train, epochs=num_epochs, steps_per_epoch=steps_train, verbose=1, validation_data=generator_val, validation_steps=steps_val)
#, callbacks=[ES]

# save model
model_mod.save('model_modified.h5')

ValueError: Input 0 is incompatible with layer bidirectional_12: expected ndim=3, found ndim=2

# Testing

In [None]:
# load test set
filename = 'Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features_InceptionResNet.npy', test)
print('Photos: test=%d' % len(test_features))
 
# load the model
filename = 'model_modified.h5'
model = load_model(filename)

# evaluate model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

## Generate caption for single image

In [None]:
# load the tokenizer
tokenizer = np.load('tokenizer.npy').item()

# pre-define the max sequence length (from training)
max_length = 34

# load the model
model = load_model('model_3.h5')

# load and prepare the photograph
photo = extract_single_features('dog-frisbee.jpg','VGG16')

# generate description
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

# EXTRA: Possible extensions

*  Pre-trained models: InceptionResNetV2 https://arxiv.org/pdf/1602.07261.pdf
*  Pre-trained Word Vectors [Pre-trained word embeddings](https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
*  Bidirectional - [Wrappers](https://keras.io/layers/wrappers/)
*  Tune Model
*  (Attention)