# Project description 
The following blog post was chosen as the framework for this project. Listed below are a few possible tweaks to improve its performance:

*   Other pre-trained models: ResNet, Inception, ...
*   Added approaches from other papers

[How to Develop a Deep Learning Photo Caption Generator from Scratch](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/)

# **Import**

In [1]:
import numpy as np
from os import listdir

# Image processing, pre-trained VGG16 & InceptionResNetV2
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import VGG16
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.vgg16 import preprocess_input as preprocess_VGG16
from keras.applications.inception_resnet_v2 import preprocess_input as preprocess_InceptionResNet

# Text processing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical #plot_model
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


# Helper functions for loading and preprocessing




## Data generation

In [2]:
def data_generator(descriptions, photos, tokenizer, max_length):
    """Yield training samples forever, one image's worth per step.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        photos: dict mapping image id -> feature array; only element ``[0]``
            of each array is used.
        tokenizer: fitted tokenizer, passed through to ``create_sequences``.
        max_length: length the input word sequences are padded to.

    Yields:
        ``([image_features, input_sequences], next_words)`` built from all
        captions of a single image.
    """
    # loop forever over the images; the caller bounds an epoch with steps_per_epoch
    while True:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            # Yield an (inputs, targets) tuple rather than a list: newer Keras
            # versions interpret a top-level list as a list of inputs only.
            yield [in_img, in_seq], out_word

## Load a text file

In [3]:
# load doc into memory
def load_doc(filename):
    """Return the whole content of the text file ``filename`` as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The file's text, unmodified (newlines included).
    """
    # the context manager closes the file even if read() raises
    with open(filename, 'r') as file:
        return file.read()

## Load descriptions

In [4]:
# extract descriptions for images
def load_descriptions(doc):
    """Group the captions found in ``doc`` by image id.

    Each useful line has the form ``<image file>[#n] <caption words...>``.

    Args:
        doc: full text of the captions file.

    Returns:
        dict mapping image id (first token up to its first '.') to the list
        of caption strings for that image, in file order.
    """
    mapping = dict()

    # process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()

        # A usable line needs an image id plus at least one caption word.
        # Checking the token count (not the raw line length) also skips
        # whitespace-only lines, which would otherwise fail on tokens[0].
        if len(tokens) < 2:
            continue

        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]

        # remove filename extension (and anything after it) from image id
        image_id = image_id.split('.')[0]

        # convert description tokens back to string and store it
        mapping.setdefault(image_id, []).append(' '.join(image_desc))

    return mapping

## Clean text in order to reduce the size of the vocabulary

In [5]:
import string
 
def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    Each caption is lower-cased, stripped of punctuation, and reduced to its
    purely alphabetic words longer than one character. The caption lists are
    modified in place; nothing is returned.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            # lower-case each word and drop its punctuation
            words = (tok.lower().translate(strip_punct) for tok in caption.split())

            # keep words longer than one character (drops hanging 's' and 'a')
            # that contain letters only (drops tokens with digits)
            kept = [w for w in words if len(w) > 1 and w.isalpha()]

            # store back as a single string
            captions[idx] = ' '.join(kept)


## Convert descriptions to vocabulary and save to file

In [6]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated words over all captions."""
    vocabulary = set()

    for captions in descriptions.values():
        for caption in captions:
            vocabulary.update(caption.split())

    return vocabulary

# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write all captions to ``filename`` as ``<image id> <caption>`` lines.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: path of the file to (over)write.

    Lines are joined with '\\n' and no trailing newline is written.
    """
    lines = [key + ' ' + desc
             for key, desc_list in descriptions.items()
             for desc in desc_list]

    # the context manager closes the file even if write() raises
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

## Load a pre-defined set of identifiers given the sets filename

In [7]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    """Read a list of image file names and return their identifiers as a set.

    The identifier of a line is the text before its first '.'; empty lines
    are ignored.
    """
    rows = load_doc(filename).split('\n')

    # one identifier per non-empty line; the set removes duplicates
    return {row.split('.')[0] for row in rows if len(row) >= 1}

## Load clean description into memory

In [8]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    """Load the cleaned captions of the images listed in ``dataset``.

    Args:
        filename: path of a file with one ``<image id> <caption>`` per line.
        dataset: collection of image ids to keep.

    Returns:
        dict mapping image id -> list of captions, each wrapped as
        ``'startseq <caption> endseq'``.
    """
    # load document
    doc = load_doc(filename)
    descriptions = dict()

    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()

        # skip blank lines (e.g. a trailing newline); they have no image id
        if not tokens:
            continue

        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]

        # skip images not in the set
        if image_id not in dataset:
            continue

        # wrap description in start/end tokens and store it
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        descriptions.setdefault(image_id, []).append(desc)

    return descriptions

## Load the entire set of photo features and return the subset of interest for a given set of photo identifiers

In [9]:
# load photo features
def load_photo_features(filename, dataset):
    """Load a saved feature dictionary and keep only the requested images.

    Args:
        filename: ``.npy`` file holding a dict (image id -> feature array)
            written with ``np.save``.
        dataset: iterable of image ids to keep.

    Returns:
        dict mapping each id in ``dataset`` to its feature array.

    Raises:
        KeyError: if an id in ``dataset`` has no stored feature.
    """
    # The dict is stored as a pickled 0-d object array, so it can only be read
    # with allow_pickle=True (the default is False since NumPy 1.16.3).
    # NOTE: unpickling executes code from the file - only load trusted files.
    all_features = np.load(filename, allow_pickle=True).item()

    # filter features
    features = {k: all_features[k] for k in dataset}

    return features



## Convert the dictionary of descriptions into a list of strings and fit a Tokenizer on the loaded photo description text

In [10]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten the per-image caption lists into one list, in dict order."""
    flattened = []
    for captions in descriptions.values():
        flattened.extend(captions)

    return flattened
 
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    """Return a Keras ``Tokenizer`` fitted on every caption in ``descriptions``."""
    fitted = Tokenizer()
    # the tokenizer is fitted on the flat list of caption strings
    fitted.fit_on_texts(to_lines(descriptions))
    return fitted

## Create sequences

In [11]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size=None):
    """Expand the captions of one image into next-word training samples.

    For every caption and every prefix of its encoded word sequence, one
    sample is produced: (photo, padded prefix) -> one-hot next word.

    Args:
        tokenizer: fitted tokenizer used to encode the captions.
        max_length: length the prefixes are padded to.
        desc_list: caption strings of the image.
        photo: feature vector of the image, repeated for every sample.
        vocab_size: number of classes of the one-hot output. Defaults to
            ``len(tokenizer.word_index) + 1`` so the function no longer
            depends on a module-level ``vocab_size`` variable.

    Returns:
        Three arrays ``(photos, input_sequences, output_words)`` with one row
        per sample.
    """
    if vocab_size is None:
        # +1 because index 0 is not assigned to any word in word_index
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()

    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]

        # split one sequence into multiple X,Y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]

            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]

            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            # store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

## Define length of description with most words

In [12]:
# calculate the length of the description with the most words
def max_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``."""
    word_counts = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(word_counts)

## Extract features from VGG16 or InceptionResNet

In [13]:
# extract features from each photo in the directory
def extract_features(directory, model_type):
    """Run every image in ``directory`` through a pre-trained CNN.

    Args:
        directory: path of a folder; every entry in it is loaded as an image.
        model_type: ``'VGG16'`` or ``'InceptionResNet'`` (case-insensitive).

    Returns:
        dict mapping image id (file name up to its first '.') to the array
        returned by ``model.predict`` for that image.

    Raises:
        ValueError: if ``model_type`` does not name a supported model.
    """
    # constructor, matching preprocess_input and input size for each backbone
    backbones = {
        'vgg16': (VGG16, preprocess_VGG16, (224, 224)),
        'inceptionresnet': (InceptionResNetV2, preprocess_InceptionResNet, (299, 299)),
    }

    # compare by value: `is` on a freshly lower-cased string never matches
    key = model_type.lower()
    if key not in backbones:
        raise ValueError(
            "unknown model_type %r, expected one of %s" % (model_type, sorted(backbones))
        )
    build_model, preprocess, target_size = backbones[key]

    # load the model
    model = build_model()

    # re-structure the model: use the output of the layer before the final
    # classification layer. Indexing layers[-2] avoids relying on
    # model.layers.pop() actually mutating the model.
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

    # summarize (summary() prints by itself)
    model.summary()

    # extract features from each photo
    features = dict()
    for name in listdir(directory):
        # load an image from file, resized to what the backbone expects
        filename = directory + '/' + name
        image = load_img(filename, target_size=target_size)

        # convert the image pixels to a numpy array
        image = img_to_array(image)

        # add a leading batch axis of size 1 for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))

        # prepare the image with the backbone's own preprocessing
        image = preprocess(image)

        # get features
        feature = model.predict(image, verbose=0)

        # get image id
        image_id = name.split('.')[0]

        # store feature
        features[image_id] = feature
        print('>%s' % name)

    return features

## Map integer to word

In [14]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` is ``integer``.

    Returns ``None`` when no word has that index.
    """
    matches = (
        candidate
        for candidate, position in tokenizer.word_index.items()
        if position == integer
    )
    # first match in dict order, or None if there is none
    return next(matches, None)

## Generate image description

In [15]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    """Greedily generate a caption for one photo.

    Starting from ``'startseq'``, the most probable next word is appended
    until ``'endseq'`` is produced, an index cannot be mapped to a word, or
    ``max_length`` words have been added.

    Args:
        model: trained caption model taking ``[photo, sequence]``.
        tokenizer: tokenizer used at training time.
        photo: feature array of the photo, passed to ``model.predict`` as is.
        max_length: padding length of the input sequence, also the maximum
            number of generated words.

    Returns:
        The generated text, including the leading ``'startseq'`` and, if it
        was predicted, the trailing ``'endseq'``.
    """
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for _ in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo, sequence], verbose=0)
        # convert probability to integer (np.argmax: a bare `argmax` is not in scope)
        yhat = np.argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

## Evaluate model with BLEU score

In [16]:
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    """Print BLEU-1 to BLEU-4 of the model's captions against the references.

    For every image in ``descriptions`` a caption is generated with
    ``generate_desc`` and compared with the image's reference captions.
    Nothing is returned; the four scores are printed.
    """
    # NOTE(review): `corpus_bleu` is not imported in the visible part of this
    # file - presumably nltk.translate.bleu_score.corpus_bleu; confirm it is
    # in scope before running.
    references, hypotheses = [], []

    # step over the whole set
    for image_id, captions in descriptions.items():
        # generate description
        generated = generate_desc(model, tokenizer, photos[image_id], max_length)

        # store tokenised references and prediction
        references.append([caption.split() for caption in captions])
        hypotheses.append(generated.split())

    # n-gram weights for each reported score
    reports = (
        ('BLEU-1: %f', (1.0, 0, 0, 0)),
        ('BLEU-2: %f', (0.5, 0.5, 0, 0)),
        ('BLEU-3: %f', (0.3, 0.3, 0.3, 0)),
        ('BLEU-4: %f', (0.25, 0.25, 0.25, 0.25)),
    )
    for template, ngram_weights in reports:
        print(template % corpus_bleu(references, hypotheses, weights=ngram_weights))

# Models

## Standard caption model

In [17]:
def define_model(vocab_size, max_length, feature_dim=4096):
    """Build and compile the standard merge-style caption model.

    The photo feature vector and the LSTM encoding of the partial caption are
    each projected to 256 units, added, and decoded into a softmax over the
    vocabulary.

    Args:
        vocab_size: number of output classes / rows of the embedding.
        max_length: length of the padded input word sequence.
        feature_dim: width of the photo feature vector. The default 4096
            keeps the previous behaviour; pass the width of the stored
            features when another feature extractor is used.

    Returns:
        The compiled Keras model with inputs ``[photo, sequence]``.
    """
    # feature extractor model
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summarize model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

## Modified caption model (to be continued...)

In [18]:
def define_modified_model(vocab_size, max_length, feature_dim=4096):
    """Build and compile the modified caption model (work in progress).

    Currently the same architecture as the standard model: photo features and
    the LSTM encoding of the partial caption are each projected to 256 units,
    added, and decoded into a softmax over the vocabulary.

    Args:
        vocab_size: number of output classes / rows of the embedding.
        max_length: length of the padded input word sequence.
        feature_dim: width of the photo feature vector. The default 4096
            keeps the previous behaviour; it must equal the width of the
            features the model is trained on, so pass the actual width when
            they come from a feature extractor other than VGG16.

    Returns:
        The compiled Keras model with inputs ``[photo, sequence]``.
    """
    # feature extractor model
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # planned variant: se3 = Bidirectional(LSTM(256))(se2)

    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summarize model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

# Training and tweaking

## VGG16 feature extraction 

In [None]:
# Extract features from all images in the dataset folder with the VGG16
# backbone. `features` is the dict returned by extract_features
# (image id -> model.predict output).
directory = 'Flicker8k_Dataset'
features = extract_features(directory,model_type='VGG16')

# save to file; the training cell reads the features back through
# load_photo_features
np.save('features_VGG16.npy',features)
#imported_features = np.load('features.npy').item()

## InceptionResNet feature extraction

In [None]:
# Extract features from all images in the dataset folder with the
# InceptionResNetV2 backbone.
directory = 'Flicker8k_Dataset'
features = extract_features(directory,model_type='InceptionResNet')

# Save to the file name the modified-model and test cells load. Writing to
# 'features_VGG16.npy' here would overwrite the VGG16 features.
np.save('features_InceptionResNet.npy',features)
#imported_features = np.load('features_InceptionResNet.npy').item()

## Example model training

In [20]:
# load training dataset (6K)
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# photo features: same file name the VGG16 feature extraction cell writes
train_features = load_photo_features('features_VGG16.npy', train)
print('Photos: train=%d' % len(train_features))

# prepare tokenizer and keep a copy on disk for caption generation later
tokenizer = create_tokenizer(train_descriptions)
np.save('tokenizer.npy',tokenizer)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
max_len = max_length(train_descriptions)
print('Description Length: %d' % max_len)

# define the model
model_VGG16 = define_model(vocab_size, max_len)

# the generator yields one image per step, so one epoch is one pass over all
# training images
num_epochs = 10
steps = len(train_descriptions)

# callbacks: monitor the training loss. No validation data is passed to
# fit_generator, so 'val_loss' is never available to the callback.
ES = EarlyStopping(monitor='loss', restore_best_weights=True)

# create the data generator
generator = data_generator(train_descriptions, train_features, tokenizer, max_len)
# TODO: add a validation generator, then monitor 'val_loss' instead

# fit for up to num_epochs epochs (early stopping may end training sooner)
model_VGG16.fit_generator(generator, epochs=num_epochs, steps_per_epoch=steps, verbose=1, callbacks=[ES])

# save model
model_VGG16.save('model_VGG16.h5')

Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7579
Description Length: 34
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 34, 256)      1940224     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 4096)         0           input_3[0][0]                    
______

KeyboardInterrupt: 

## Modified model

In [None]:
# load training dataset (6K)
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# photo features
train_features = load_photo_features('features_InceptionResNet.npy', train)
print('Photos: train=%d' % len(train_features))

# prepare tokenizer and keep a copy on disk for caption generation later
tokenizer = create_tokenizer(train_descriptions)
np.save('tokenizer.npy',tokenizer)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
# NOTE(review): this rebinds the name `max_length` from the helper function to
# an int, so the cell cannot be re-run without re-running the cell that
# defines max_length(). The name is kept because the testing cell reads
# `max_length` as an int.
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

# define the model
# NOTE(review): the photo input width of define_modified_model must match the
# width of the vectors stored in features_InceptionResNet.npy - verify.
model_mod = define_modified_model(vocab_size, max_length)

# the generator yields one image per step, so one epoch is one pass over all
# training images
num_epochs = 5
steps = len(train_descriptions)

# callbacks: monitor the training loss. No validation data is passed to
# fit_generator, so 'val_loss' is never available to the callback.
ES = EarlyStopping(monitor='loss', restore_best_weights=True)

# create the data generator, padding to the same length the model was built
# with (previously `max_len`, a variable defined only in another cell)
generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
# TODO: add a validation generator, then monitor 'val_loss' instead

# fit for up to num_epochs epochs (early stopping may end training sooner)
model_mod.fit_generator(generator, epochs=num_epochs, steps_per_epoch=steps, verbose=1, callbacks=[ES])

# save model
model_mod.save('model_modified.h5')

# Testing

In [None]:
# load_model is not among the notebook's top-level imports
from keras.models import load_model

# load test set
filename = 'Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features_InceptionResNet.npy', test)
print('Photos: test=%d' % len(test_features))

# load the model
filename = 'model_modified.h5'
model = load_model(filename)

# evaluate model; `tokenizer` and `max_length` (an int at this point) come
# from the modified-model training cell
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

## Generate caption for single image

In [None]:
# load_model is not among the notebook's top-level imports
from keras.models import load_model

# load the tokenizer; it was stored with np.save as a pickled object, which
# np.load only reads with allow_pickle=True (only load trusted files)
tokenizer = np.load('tokenizer.npy', allow_pickle=True).item()

# pre-define the max sequence length (from training)
max_length = 34

# load the model
model = load_model('model_modified.h5')

# Load and prepare the photograph. extract_features() works on a whole
# directory and returns a dict, so the single image is processed here.
# NOTE(review): this must be the same backbone, layer and input size that
# produced the features model_modified.h5 was trained on - confirm.
backbone = InceptionResNetV2()
backbone = Model(inputs=backbone.inputs, outputs=backbone.layers[-2].output)
image = load_img('example.jpg', target_size=(299, 299))
image = img_to_array(image)
# add a leading batch axis of size 1 for the model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
image = preprocess_InceptionResNet(image)
photo = backbone.predict(image, verbose=0)

# generate description
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

# EXTRA: Possible extensions

*  Alternative pre-trained models: InceptionResNetV2 https://arxiv.org/pdf/1602.07261.pdf
*  Pre-trained Word Vectors [Pre-trained word embeddings](https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
*  Tune Model
*   Bi-directional - [Wrappers](https://keras.io/layers/wrappers/)
*  Attention - Further development...