In [None]:
import random, json, string, pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
import keras.layers
from keras.layers.wrappers import TimeDistributed
import keras.models
import keras.optimizers
import keras.callbacks
from keras.preprocessing import image
import keras.preprocessing.text
import keras.preprocessing.sequence
from keras.applications import vgg16
from keras.applications import resnet50
import seq2seq
import nltk.translate.bleu_score
from nltk import word_tokenize

%matplotlib inline

In [None]:
# VGG-16 architecture with ImageNet weights
# include_top=True keeps the network's fully connected classifier layers.
image_model = vgg16.VGG16(weights='imagenet', include_top = True)

In [None]:
# Drop the final layer of the network and rewire the model so that its output
# is the output of whichever layer is now last.
image_model.layers.pop()
image_model.outputs = [image_model.layers[-1].output]
# Clear the outbound nodes of the new last layer so it no longer points at the popped layer.
image_model.layers[-1].outbound_nodes = []
image_model.summary()

In [None]:
# ResNet-50 architecture with ImageNet weights
# NOTE: this rebinds `image_model`; if the VGG-16 cells were run first, that model is replaced.
# include_top=False leaves out the fully connected classifier layers.
image_model = resnet50.ResNet50(weights='imagenet', include_top = False)

In [None]:
# Print the layer-by-layer summary of the image model currently bound to `image_model`.
image_model.summary()

In [None]:
# Image file names for each split: one name per line, surrounding whitespace removed.

with open('Data/train_images.txt','rb') as file:
    train_path = [line.decode().strip() for line in file]

with open('Data/val_images.txt','rb') as file:
    val_path = [line.decode().strip() for line in file]

with open('Data/test_images.txt','rb') as file:
    test_path = [line.decode().strip() for line in file]

In [None]:
# Source (English) captions: one caption per line, each prefixed with the
# '[START] ' marker token.

with open('Data/train.en','rb') as file:
    caption_source_train = ['[START] ' + line.decode().strip() for line in file]

with open('Data/val.en','rb') as file:
    caption_source_val = ['[START] ' + line.decode().strip() for line in file]

with open('Data/test2016.en','rb') as file:
    caption_source_test = ['[START] ' + line.decode().strip() for line in file]

In [None]:
# Target (French) captions: one caption per line, each prefixed with the
# '[START] ' marker token.
# (The heading line was previously missing its '#', which made the cell a SyntaxError.)

with open('Data/train.fr','rb') as file:
    caption_target_train = ['[START] ' + line.decode().strip() for line in file]

with open('Data/val.fr','rb') as file:
    caption_target_val = ['[START] ' + line.decode().strip() for line in file]

with open('Data/test2016.fr','rb') as file:
    caption_target_test = ['[START] ' + line.decode().strip() for line in file]

In [None]:
# Compute training image features.
#
# The pre-processing function and the output file used below are the VGG-16
# ones, so the feature width is 4096 to agree with the validation and test
# feature cells. For ResNet-50, switch to the commented alternatives.

n_train_images = 29000
feature_dim = 4096
#feature_dim = 2048

batch_size = 290
n_batches = n_train_images // batch_size
features = np.zeros((n_train_images, feature_dim), dtype=np.float32)
index = 0  # position of the next image to read from train_path

for b in range(n_batches):
    batch = np.zeros((batch_size, 224, 224, 3))
    print(('Computing features for batch %d of %d') % (b + 1, n_batches))

    for i in range(batch_size):
        img_path = 'Data/flickr30k-images/' + train_path[index]
        img = image.load_img(img_path, target_size=(224, 224))
        batch[i, :, :, :] = image.img_to_array(img)
        index = index + 1

    batch = vgg16.preprocess_input(batch)
    #batch = resnet50.preprocess_input(batch)
    features[b * batch_size : (b + 1) * batch_size, :] = np.reshape(image_model.predict(batch),
                                                                    (batch_size, feature_dim))
    print(('Batch loaded for batch %d of %d') % (b + 1, n_batches))

# Use a context manager so the pickle file is flushed and closed.
with open('Data/vgg16_image_train_features.p', 'wb') as features_file:
#with open('Data/resnet50_image_train_features.p', 'wb') as features_file:
    pickle.dump({'features': features, 'source caption': caption_source_train,
                 'target caption': caption_target_train}, features_file)

In [None]:
# Compute validation image features.

features = np.zeros((1014, 4096), dtype=np.float32)
#features = np.zeros((1014, 2048), dtype=np.float32)
val = np.zeros((1014, 224, 224, 3))

for i in range(0, 1014):
    img_path = 'Data/flickr30k-images/' + val_path[i]
    img = image.load_img(img_path, target_size=(224, 224))
    img = image.img_to_array(img)
    val[i, :, :, :] = img

val = vgg16.preprocess_input(val)
#val = resnet50.preprocess_input(val)
features[:, :] = np.reshape(image_model.predict(val), (1014, 4096))
#features[:, :] = np.reshape(image_model.predict(val), (1014, 2048))

# Use a context manager so the pickle file is flushed and closed.
with open('Data/vgg16_image_val_features.p', 'wb') as features_file:
#with open('Data/resnet50_image_val_features.p', 'wb') as features_file:
    pickle.dump({'features': features, 'source caption': caption_source_val,
                 'target caption': caption_target_val}, features_file)

In [None]:
# Compute testing image features.

features = np.zeros((1000, 4096), dtype=np.float32)
#features = np.zeros((1000, 2048), dtype=np.float32)
test = np.zeros((1000, 224, 224, 3))

for i in range(0, 1000):
    img_path = 'Data/flickr30k-images/' + test_path[i]
    img = image.load_img(img_path, target_size=(224, 224))
    img = image.img_to_array(img)
    test[i, :, :, :] = img

test = vgg16.preprocess_input(test)
#test = resnet50.preprocess_input(test)
features[:, :] = np.reshape(image_model.predict(test), (1000, 4096))
#features[:, :] = np.reshape(image_model.predict(test), (1000, 2048))

# Use a context manager so the pickle file is flushed and closed.
with open('Data/vgg16_image_test_features.p', 'wb') as features_file:
#with open('Data/resnet50_image_test_features.p', 'wb') as features_file:
    pickle.dump({'features': features, 'source caption': caption_source_test,
                 'target caption': caption_target_test}, features_file)

In [None]:
# Build the source-language vocabulary and encode every split with it.

vocabularySize = 5000

# The tokenizer is fitted on the training captions only; the filter list
# leaves '[' and ']' alone so the '[START]' marker is not stripped.
tokenizer = keras.preprocessing.text.Tokenizer(vocabularySize, filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(caption_source_train)

# Sentences -> sequences of word ids, same tokenizer for all three splits.
captionSequences, val_captionSequences, test_captionSequences = (
    tokenizer.texts_to_sequences(captions)
    for captions in (caption_source_train, caption_source_val, caption_source_test)
)

# Lookup tables in both directions.
word2id = tokenizer.word_index
id2word = dict((idx, word) for word, idx in word2id.items())

# Length (in tokens) of the longest training caption.
maxSequenceLength = max(map(len, captionSequences))

# Sanity-check output.
first_sequence = captionSequences[0]
print('Original string', caption_source_train[0])
print('Sequence of Word Ids', first_sequence)
print('Word Ids back to Words', [id2word[idx] for idx in first_sequence])
print('Max Sequence Length', maxSequenceLength)
print('Vocabulary Size', vocabularySize)

In [None]:
# Pad the source sequences and derive shifted input/output pairs.

# Every sequence is post-padded / post-truncated to one more than the longest
# training caption, so that both shifted views below have maxSequenceLength steps.
padded_length = maxSequenceLength + 1

text, val_text, test_text = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen = padded_length,
                                               padding = 'post', truncating = 'post')
    for sequences in (captionSequences, val_captionSequences, test_captionSequences)
)

# Language-model pairs: the input drops the last position, the output drops the
# first one and gets a trailing singleton axis.
inputText, outputText = text[:, :-1], np.expand_dims(text[:, 1:], -1)
val_inputText, val_outputText = val_text[:, :-1], np.expand_dims(val_text[:, 1:], -1)
test_inputText, test_outputText = test_text[:, :-1], np.expand_dims(test_text[:, 1:], -1)

# Id 0 is the padding value; give it a printable name in both lookup tables.
id2word[0] = 'END'
word2id['END'] = 0

print(text.shape)
print([id2word[idx] for idx in text[0]], " ")

In [None]:
# Build the target-language vocabulary and encode every split with it.

vocabularySize = 5000

# Fitted on the training captions only, with the same filter list as the source tokenizer.
lab_tokenizer = keras.preprocessing.text.Tokenizer(vocabularySize, filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
lab_tokenizer.fit_on_texts(caption_target_train)

# Sentences -> sequences of word ids, same tokenizer for all three splits.
lab_captionSequences, lab_val_captionSequences, lab_test_captionSequences = (
    lab_tokenizer.texts_to_sequences(captions)
    for captions in (caption_target_train, caption_target_val, caption_target_test)
)

# Lookup tables in both directions.
lab_word2id = lab_tokenizer.word_index
lab_id2word = dict((idx, word) for word, idx in lab_word2id.items())

# Sanity-check output. The maximum length printed is the one computed from the source captions.
first_sequence = lab_captionSequences[0]
print('Original string', caption_target_train[0])
print('Sequence of Word Ids', first_sequence)
print('Word Ids back to Words', [lab_id2word[idx] for idx in first_sequence])
print('Max Sequence Length', maxSequenceLength)
print('Vocabulary Size', vocabularySize)

In [None]:
# Pad the target sequences and build the translation labels.

# Pad / truncate to the same length as the source side, then drop the first
# position of every sequence.
padded_length = maxSequenceLength + 1

output, val_output, test_output = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen = padded_length,
                                               padding = 'post', truncating = 'post')[:, 1:]
    for sequences in (lab_captionSequences, lab_val_captionSequences, lab_test_captionSequences)
)

# Id 0 is the padding value; give it a printable name in both lookup tables.
lab_id2word[0] = 'END'
lab_word2id['END'] = 0

print(output.shape)
print([lab_id2word[idx] for idx in output[0]], '')

# Add a trailing singleton axis to the label arrays (done after the printout above).
output, val_output, test_output = (
    np.expand_dims(labels, -1) for labels in (output, val_output, test_output)
)

In [None]:
# SimpleRNN for language modeling. I later abandoned this approach in favor of end to end model: Seq2Seq.
#
# Pipeline: word ids -> 300-d embeddings -> dropout -> SimpleRNN(512) returning
# every time step -> dropout -> per-step dense layer over the vocabulary -> softmax.

words = keras.layers.Input(batch_shape=(None, maxSequenceLength), name = "input")
embeddings = keras.layers.embeddings.Embedding(vocabularySize, 300, name = "embeddings")(words)
dropout1 = keras.layers.Dropout(0.5)(embeddings)
hiddenStates = keras.layers.SimpleRNN(512, return_sequences = True, input_shape=(maxSequenceLength, 300), 
                                      name = "rnn")(dropout1)
dropout2 = keras.layers.Dropout(0.5)(hiddenStates)
# TimeDistributed applies the wrapped layer independently at every time step.
denseOutput = TimeDistributed(keras.layers.Dense(vocabularySize), name = "linear")(dropout2)   
predictions = TimeDistributed(keras.layers.Activation("softmax"), name = "softmax")(denseOutput)                                      

text_model = keras.models.Model(input = words, output = predictions)

# The sparse loss takes integer word ids as targets rather than one-hot vectors.
text_model.compile(loss='sparse_categorical_crossentropy', optimizer = keras.optimizers.Adam(lr = 0.001))

text_model.summary()

In [None]:
# Train model.

# Keep only the weights with the lowest validation loss. They are written under
# Models/ because the weight-loading cells read 'Models/text_model.hdf5'.
checkpointer = keras.callbacks.ModelCheckpoint(filepath='Models/text_model.hdf5', save_weights_only = True,
                                               save_best_only = True, monitor = 'val_loss')

text_model.fit(inputText, outputText, validation_data = (val_inputText, val_outputText), batch_size = 290, nb_epoch = 30, 
               callbacks = [checkpointer])

In [None]:
# Load weights.
# Restores the language model's weights from the file under Models/.
text_model.load_weights('Models/text_model.hdf5')

In [None]:
# Predict on validation set.
# Only the first 10 validation sequences are scored; the last expression displays the result's shape.
val_probability = text_model.predict(val_inputText[:10])
val_probability.shape

In [None]:
# Greedy decoding of the language-model predictions: at every time step take
# the word id ranked first by descending probability and look it up.

val_prediction = np.empty([10, 38])
val_caption = []

for sample in range(10):
    best_ids = [(-val_probability[sample, step, :]).argsort()[0] for step in range(38)]
    caption = [id2word[word_id] for word_id in best_ids]

    val_caption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Generate truths.
# Prints the ground-truth next-word sequences for the same 10 validation
# samples. The labels live in `val_outputText` (word ids with a trailing
# singleton axis); the previously referenced `val_outputLabels` is never defined.

for i in range(10):
    caption = [id2word[val_outputText[i, j, 0]] for j in range(38)]

    print('%d.' %i)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Remove layers after hidden layers for feature extraction.
# Same input/embedding/dropout/RNN stack as the language model, but the model's
# output is the RNN hidden-state sequence. Layer names match the language model
# so that its weights can be loaded by name.
print('Building training model...')

words = keras.layers.Input(batch_shape=(None, maxSequenceLength), name = "input")
embeddings = keras.layers.embeddings.Embedding(vocabularySize, 300, name = "embeddings")(words)
dropout1 = keras.layers.Dropout(0.5)(embeddings)
hiddenStates = keras.layers.SimpleRNN(512, return_sequences = True, input_shape=(maxSequenceLength, 300), 
                                      name = "rnn")(dropout1)

# NOTE: rebinds `text_model`, replacing the full language model defined earlier.
text_model = keras.models.Model(input = words, output = hiddenStates)

text_model.compile(loss='sparse_categorical_crossentropy', optimizer = keras.optimizers.Adam(lr = 0.001))

text_model.summary()

In [None]:
# Load weights.
# by_name=True matches weights to layers through their names.
text_model.load_weights('Models/text_model.hdf5', by_name = True)

In [None]:
# Compute training text features.

features = np.zeros((29000, 38, 512), dtype=np.float32)

batch_size = 290
n_batches = 100
index = 0

for b in range(n_batches):
    # outputText carries a trailing singleton axis (added with np.expand_dims);
    # the model input is 2-D (samples, time steps), so select that axis away.
    batch = outputText[b * batch_size:(b + 1) * batch_size, :, 0]
    print(('Computing features for batch %d of %d') % (b + 1, n_batches))

    features[b * batch_size : (b + 1) * batch_size, :, :] = text_model.predict_on_batch(batch)
    print(('Batch loaded for batch %d of %d') % (b + 1, n_batches))

# Use a context manager so the pickle file is flushed and closed.
with open('Data/text_train_features.p', 'wb') as features_file:
    pickle.dump({'features': features, 'source caption': caption_source_train,
                 'target caption': caption_target_train}, features_file)

In [None]:
# Compute validation text features

# val_outputText carries a trailing singleton axis; the model input is 2-D
# (samples, time steps), so select that axis away before predicting.
features = text_model.predict_on_batch(val_outputText[:, :, 0])

# Use a context manager so the pickle file is flushed and closed.
with open('Data/text_val_features.p', 'wb') as features_file:
    pickle.dump({'features': features, 'source caption': caption_source_val,
                 'target caption': caption_target_val}, features_file)

In [None]:
# Compute testing text features

# test_outputText carries a trailing singleton axis; the model input is 2-D
# (samples, time steps), so select that axis away before predicting.
features = text_model.predict_on_batch(test_outputText[:, :, 0])

# Store the TEST captions next to the test features (the validation captions
# were written here before), and close the file via a context manager.
with open('Data/text_test_features.p', 'wb') as features_file:
    pickle.dump({'features': features, 'source caption': caption_source_test,
                 'target caption': caption_target_test}, features_file)

In [None]:
# Load features.

def _load_features(path):
    """Return the 'features' entry of the pickle file at `path`, closing the file afterwards."""
    with open(path, 'rb') as features_file:
        return pickle.load(features_file)['features']

train_imageFeatures = _load_features('Data/vgg16_image_train_features.p')
val_imageFeatures = _load_features('Data/vgg16_image_val_features.p')
test_imageFeatures = _load_features('Data/vgg16_image_test_features.p')
#train_imageFeatures = _load_features('Data/resnet50_image_train_features.p')
#val_imageFeatures = _load_features('Data/resnet50_image_val_features.p')
#test_imageFeatures = _load_features('Data/resnet50_image_test_features.p')

train_textFeatures = _load_features('Data/text_train_features.p')
val_textFeatures = _load_features('Data/text_val_features.p')
# Read the test features from the test file (the training file was loaded here before).
test_textFeatures = _load_features('Data/text_test_features.p')

In [None]:
# LSTM for inference.
# Input: per-time-step feature vectors of width 4608 over 38 steps, with a
# fixed batch size of 1 (see batch_shape). Pipeline: batch normalization ->
# dropout -> LSTM(512) returning every step -> dropout -> dense over the
# vocabulary -> softmax.
print('Building training model...')

inputs = keras.layers.Input(batch_shape = (1, 38, 4608))
normalize = keras.layers.normalization.BatchNormalization()(inputs)
dropout1 = keras.layers.Dropout(0.5)(normalize)
hiddenStates = keras.layers.LSTM(512, stateful = False, return_sequences = True, batch_input_shape=(38, 4608))(dropout1)
dropout2 = keras.layers.Dropout(0.5)(hiddenStates)
denseOutput = keras.layers.Dense(vocabularySize)(dropout2)   
predictions = keras.layers.Activation("softmax")(denseOutput)                                      

inference_model = keras.models.Model(input = inputs, output = predictions)

# The sparse loss takes integer word ids as targets rather than one-hot vectors.
inference_model.compile(loss='sparse_categorical_crossentropy', optimizer = keras.optimizers.Adam(lr = 0.001))

inference_model.summary()

In [None]:
# Define generator for training.

def DataGenerator(image_features, text_features, output, batch_size):
    """Yield an endless stream of random (batch, labels) pairs.

    For every sample an example index is drawn uniformly at random; the image
    features and text features of that example are concatenated along the
    feature axis, and the labels of the SAME example are used as the target.

    Args:
        image_features: array indexed by example. Either one feature vector per
            example (it is then repeated at every time step) or one vector per
            example and time step.
        text_features: array of shape (examples, time steps, text width).
        output: label array indexed like the features, shape
            (examples, time steps, 1).
        batch_size: number of samples in each yielded batch.

    Yields:
        (batch, labels) where batch has shape
        (batch_size, time steps, image width + text width) and labels has
        shape (batch_size, time steps, 1).
    """
    # Sizes are taken from the data instead of being hard-coded.
    time_steps = text_features.shape[1]
    feature_dim = image_features.shape[-1] + text_features.shape[-1]

    while True:
        batch = np.zeros((batch_size, time_steps, feature_dim))
        labels = np.zeros((batch_size, time_steps, 1))

        for i in range(0, batch_size):
            index = np.random.randint(len(image_features))

            image_feature = image_features[index]
            if image_feature.ndim == 1:
                # One vector per image: repeat it for every time step.
                image_feature = np.tile(image_feature, (time_steps, 1))

            batch[i, :, :] = np.concatenate((image_feature, text_features[index]), axis = -1)
            # Labels must come from the sampled example, not from the batch slot.
            labels[i, :, :] = output[index]

        yield batch, labels

In [None]:
# Train model.

# Keep only the weights with the lowest validation loss.
checkpointer = keras.callbacks.ModelCheckpoint(filepath='vgg16_inference_model_.hdf5', save_weights_only = True,
                                               save_best_only = True, monitor = 'val_loss')
#checkpointer = keras.callbacks.ModelCheckpoint(filepath='resnet50_inference_model_.hdf5', save_weights_only = True,
                                               #save_best_only = True, monitor = 'val_loss')

# Training and validation batches (100 samples each) come from DataGenerator.
# NOTE(review): the model's input has a fixed batch size of 1 (batch_shape) — confirm 100-sample batches are accepted.
inference_model.fit_generator(DataGenerator(train_imageFeatures, train_textFeatures, output, 100), 2900, nb_epoch = 200,
                              validation_data = DataGenerator(val_imageFeatures, val_textFeatures, val_output, 100),
                              nb_val_samples = 1000, nb_worker = 1, max_q_size = 20, pickle_safe = False, 
                              callbacks = [checkpointer])

In [None]:
# Load weights.
# Reads the checkpoint file written by the training cell's ModelCheckpoint callback.
inference_model.load_weights('vgg16_inference_model_.hdf5')
#inference_model.load_weights('resnet50_inference_model_.hdf5')

In [None]:
# Predict on validation set.
# Assemble the first 10 validation samples into the model's single input
# (image features and text features concatenated per time step) and score them.

batch = np.zeros((10, 38, 4608))
labels = np.zeros((10, 38, 1))

for i in range(0, 10):
    # Local names are chosen so the imported `image` module and the `text`
    # array defined earlier are not overwritten.
    image_feature = val_imageFeatures[i]
    if image_feature.ndim == 1:
        # One vector per image: repeat it for every time step.
        image_feature = np.tile(image_feature, (38, 1))
    text_feature = val_textFeatures[i, :, :]
    batch[i, :, :] = np.concatenate((image_feature, text_feature), axis = -1)
    labels[i, :, :] = val_output[i, :, :]

# The model takes ONE input tensor and was built with a fixed batch size of 1.
val_probability = inference_model.predict(batch, batch_size = 1)
val_probability.shape

In [None]:
# Greedy decoding with the target dictionary: at every time step take the word
# id ranked first by descending probability and look it up.

val_prediction = np.empty([10, 38])
val_caption = []

for sample in range(10):
    best_ids = [(-val_probability[sample, step, :]).argsort()[0] for step in range(38)]
    caption = [lab_id2word[word_id] for word_id in best_ids]

    val_caption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Print the ground-truth target captions of the first 10 validation samples.

for sample in range(10):
    caption = [lab_id2word[val_output[sample, step, 0]] for step in range(38)]

    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Seq2Seq for inference
# Input: one-hot encoded sequences (38 steps x 5000 classes). The Seq2Seq block
# produces 38 output steps of width 512, and a per-step softmax layer maps each
# of them to a distribution over 5000 classes.

rnn = seq2seq.Seq2Seq(batch_input_shape = (None, 38, 5000), hidden_dim = 512, output_dim=512, output_length=38, depth=1, peek=True)
predictions = TimeDistributed(keras.layers.Dense(5000, activation='softmax'))(rnn.output)

s2s_inference_model = keras.models.Model(input = rnn.input, output = predictions)

# Non-sparse cross-entropy: targets must be one-hot encoded.
s2s_inference_model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08, decay=0.0))

s2s_inference_model.summary()

In [None]:
# Define generator for training.

def TextDataGenerator(inputText, outputText, batch_size):
    """Yield an endless stream of random one-hot (batch, labels) pairs.

    Each sample is drawn uniformly at random; its row of word ids in
    `inputText` and in `outputText` is one-hot encoded over 5000 classes.

    Yields:
        (batch, labels), both of shape (batch_size, 38, 5000).
    """
    one_hot = keras.utils.np_utils.to_categorical

    while True:
        batch = np.zeros((batch_size, 38, 5000))
        labels = np.zeros((batch_size, 38, 5000))

        for row in range(batch_size):
            index = np.random.randint(len(inputText))
            source_ids = inputText[index, :]
            target_ids = outputText[index, :]
            batch[row, :, :] = np.expand_dims(one_hot(source_ids, 5000), 0)
            labels[row, :, :] = np.expand_dims(one_hot(target_ids, 5000), 0)

        yield batch, labels

In [None]:
# Train model.

# Keep only the weights with the lowest validation loss, and log to TensorBoard.
checkpointer = keras.callbacks.ModelCheckpoint(filepath='Models/s2s_inference_model.hdf5', save_weights_only = True,
                                               save_best_only = True, monitor = 'val_loss')
log = keras.callbacks.TensorBoard(log_dir='Logs/s2s_inference_model', histogram_freq=10, write_graph=True, write_images=True)

# Inputs are the source word ids (`outputText`), targets the target word ids
# (`output`); both are one-hot encoded by TextDataGenerator in batches of 100.
s2s_inference_model.fit_generator(TextDataGenerator(outputText, output, 100), 
                                      validation_data = TextDataGenerator(val_outputText, val_output, 100), 
                                      samples_per_epoch=2900, nb_epoch = 500, nb_val_samples = 1000, 
                                      callbacks = [checkpointer, log], nb_worker = 1)

In [None]:
# Load weights.
# Reads the checkpoint file written by the training cell's ModelCheckpoint callback.
s2s_inference_model.load_weights('Models/s2s_inference_model.hdf5')

In [None]:
# Score the first 10 validation samples with the Seq2Seq model.

batch = np.zeros((10, 38, 5000))

# One-hot encode each sample's word ids over 5000 classes.
for row in range(10):
    token_ids = val_outputText[row, :]
    batch[row, :, :] = np.expand_dims(keras.utils.np_utils.to_categorical(token_ids, 5000), 0)

val_probability = s2s_inference_model.predict_on_batch(batch)
val_probability.shape

In [None]:
# Greedy decoding with the target dictionary: at every time step take the word
# id ranked first by descending probability and look it up.

val_prediction = np.empty([10, 38])
val_caption = []

for sample in range(10):
    best_ids = [(-val_probability[sample, step, :]).argsort()[0] for step in range(38)]
    caption = [lab_id2word[word_id] for word_id in best_ids]

    val_caption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Reuse baseline model.
# Rebuilds the Seq2Seq architecture and rebinds `s2s_inference_model`; this
# cell only constructs the graph and does not load any weights.

rnn = seq2seq.Seq2Seq(batch_input_shape = (None, 38, 5000), hidden_dim = 512, output_dim=512, output_length=38, depth=1, peek=True)
predictions = TimeDistributed(keras.layers.Dense(5000, activation='softmax'))(rnn.output)

s2s_inference_model = keras.models.Model(input = rnn.input, output = predictions)

In [None]:
# Freeze layers.
# Mark every layer of the Seq2Seq model as non-trainable.

for layer in s2s_inference_model.layers:
    layer.trainable = False
    
s2s_inference_model.summary()

In [None]:
# VGG16/ResNet50 + S2S for inference
# Two inputs: per-step image features (4096 wide for VGG-16, 2048 for
# ResNet-50) and one-hot text (5000 wide). The image features are projected to
# 128 dimensions, concatenated with the text input, mapped back to a 5000-wide
# softmax and then passed through the Seq2Seq model built above.

image_input = keras.layers.Input((38, 4096))
#image_input = keras.layers.Input((38, 2048))
image_dense = keras.layers.Dense(128, activation='tanh')(image_input)
text_input = keras.layers.Input((38, 5000))
merge = keras.layers.merge((image_dense, text_input), mode='concat')
dense = keras.layers.Dense(5000, activation='softmax')(merge)
inference = s2s_inference_model(dense)

# The commented ResNet variant is named `resnet50_s2s_inference_model`, the
# name the ResNet-50 evaluation cells refer to.
vgg16_s2s_inference_model = keras.models.Model(input = [image_input, text_input], output = inference)
#resnet50_s2s_inference_model = keras.models.Model(input = [image_input, text_input], output = inference)

# Load weights.
# The last layer of the combined model is the nested Seq2Seq model; load the baseline weights into it.
vgg16_s2s_inference_model.layers[-1].load_weights('Models/s2s_inference_model.hdf5')
#resnet50_s2s_inference_model.layers[-1].load_weights('Models/s2s_inference_model.hdf5')

vgg16_s2s_inference_model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0))
#resnet50_s2s_inference_model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0))

vgg16_s2s_inference_model.summary()
#resnet50_s2s_inference_model.summary()

In [None]:
# Define generator for training.
def DataGenerator(inputImage, inputText, output, batch_size):
    """Yield an endless stream of random ([images, text], labels) batches.

    NOTE: this definition replaces the earlier `DataGenerator` of the same
    name once the cell has been run.

    Args:
        inputImage: 2-D array with one image feature vector per example.
        inputText: word-id array with a trailing singleton axis, indexed like
            `inputImage`; one-hot encoded over 5000 classes.
        output: target word-id array with a trailing singleton axis, indexed
            like `inputImage`; one-hot encoded over 5000 classes.
        batch_size: number of samples in each yielded batch.

    Yields:
        ([images, text], labels) with shapes (batch_size, 38, image width),
        (batch_size, 38, 5000) and (batch_size, 38, 5000).
    """
    # Take the image feature width from the data (4096 for VGG-16, 2048 for ResNet-50).
    image_dim = inputImage.shape[1]
    one_hot = keras.utils.np_utils.to_categorical

    while True:
        batch = [np.zeros((batch_size, 38, image_dim)), np.zeros((batch_size, 38, 5000))]
        labels = np.zeros((batch_size, 38, 5000))

        for i in range(0, batch_size):
            index = np.random.randint(len(inputText))
            # Repeat the image vector at each of the 38 time steps.
            batch[0][i, :, :] = np.tile(inputImage[index, :], (38, 1))
            batch[1][i, :, :] = one_hot(inputText[index, :, :], 5000)
            # Labels come from the `output` argument of the sampled example
            # (the global `outputText` was used here before).
            labels[i, :, :] = one_hot(output[index, :, :], 5000)

        yield batch, labels

In [None]:
# Train model.

# Keep only the weights with the lowest validation loss, and log to TensorBoard.
checkpointer = keras.callbacks.ModelCheckpoint(filepath='Models/vgg16_s2s_inference_model.hdf5', save_weights_only = True,
                                               save_best_only = True, monitor = 'val_loss')
#checkpointer = keras.callbacks.ModelCheckpoint(filepath='Models/resnet50_s2s_inference_model.hdf5', save_weights_only = True,
                                               #save_best_only = True, monitor = 'val_loss')
log = keras.callbacks.TensorBoard(log_dir='Logs/vgg16_s2s_inference_model', histogram_freq=10, write_graph=True, write_images=True)
#log = keras.callbacks.TensorBoard(log_dir='Logs/resnet50_s2s_inference_model', histogram_freq=10, write_graph=True, write_images=True)

# Batches of 50 samples are drawn by DataGenerator from the image features,
# the source word ids (`outputText`) and the target word ids (`output`).
vgg16_s2s_inference_model.fit_generator(DataGenerator(train_imageFeatures, outputText, output, 50), 
                                        validation_data = DataGenerator(val_imageFeatures, val_outputText, val_output, 50), 
                                        samples_per_epoch=2900, nb_epoch = 200, nb_val_samples = 1000, 
                                        callbacks = [checkpointer, log], nb_worker = 1)
#resnet50_s2s_inference_model.fit_generator(DataGenerator(train_imageFeatures, outputText, output, 50), 
                                           #validation_data = DataGenerator(val_imageFeatures, val_outputText, val_output, 50), 
                                           #samples_per_epoch=2900, nb_epoch = 200, nb_val_samples = 1000, 
                                           #callbacks = [checkpointer, log], nb_worker = 1)

## Seq2Seq

In [None]:
# Load weights.
# Restores the Seq2Seq baseline from its training checkpoint before test-set evaluation.
s2s_inference_model.load_weights('Models/s2s_inference_model.hdf5')

In [None]:
# Score the 1000 test samples with the Seq2Seq model.

batch = np.zeros((1000, 38, 5000))

# One-hot encode each sample's word ids over 5000 classes.
for row in range(1000):
    token_ids = test_outputText[row, :]
    batch[row] = np.expand_dims(keras.utils.np_utils.to_categorical(token_ids, 5000), 0)

test_probability = s2s_inference_model.predict_on_batch(batch)
test_probability.shape

In [None]:
# Greedy decoding of the test predictions. Ids are stored in test_prediction
# and also mapped to tokens for BLEU scoring.
# NOTE(review): ids are looked up in the SOURCE dictionary (id2word), as are the
# ground-truth ids in the truth cell; the two must stay in sync.

test_prediction = np.empty([1000, 38])
test_caption = []

for sample in range(1000):
    caption = []

    for step in range(38):
        word_id = (-test_probability[sample, step, :]).argsort()[0]
        test_prediction[sample, step] = word_id
        caption.append(id2word[word_id]) # Use source dictionary because nltk.translate.bleu_score does not work for French.

    test_caption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Collect the ground-truth test captions, both as ids (test_truth) and as
# tokens (test_truecaption), and print them.

test_truth = np.empty([1000, 38])
test_truecaption = []

for sample in range(1000):
    truth_ids = [int(test_output[sample, step]) for step in range(38)]
    test_truth[sample, :] = truth_ids
    # Use source dictionary because nltk.translate.bleu_score does not work for French.
    caption = [id2word[word_id] for word_id in truth_ids]

    test_truecaption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Calculate BLEU score
# One sentence-level BLEU score per test caption, then the mean over the test set.

reference = test_truecaption
candidate = test_caption

smoothing = nltk.translate.bleu_score.SmoothingFunction().method4
BLEUscore = np.zeros(1000)

for i in range(1000):
    # sentence_bleu expects a LIST of reference token lists; passing the bare
    # token list makes every word count as a separate reference sentence.
    BLEUscore[i] = nltk.translate.bleu_score.sentence_bleu([reference[i]], candidate[i],
                                                           smoothing_function=smoothing)

# Average once, after all scores are available.
BLEUaverage = np.mean(BLEUscore)

print(BLEUaverage)

#bleu_score = float(len(set(tuple(cap) for cap in reference) & set(tuple(cap) for cap in candidate))) / len(candidate)
#print("BLEU-1 score = ", bleu_score)

### S2S:
### BLEU-1 = 0.122849222101

## VGG-16 + Seq2Seq

In [None]:
# Load weights.
# Restores the VGG-16 + Seq2Seq model from its training checkpoint.
vgg16_s2s_inference_model.load_weights('Models/vgg16_s2s_inference_model.hdf5')

In [None]:
# Predict on testing set.
# Build the two model inputs for the 1000 test samples: the image feature
# vector repeated at each of the 38 time steps, and the one-hot encoded text.

batch = [np.zeros((1000, 38, 4096)), np.zeros((1000, 38, 5000))]

for i in range(1000):
    # Local names are chosen so the imported `image` module and the `text`
    # array defined earlier are not overwritten.
    image_feature = np.tile(test_imageFeatures[i, :], (38, 1))
    text_onehot = keras.utils.np_utils.to_categorical(test_outputText[i, :, :], 5000)
    batch[0][i, :, :] = image_feature
    batch[1][i, :, :] = text_onehot

test_probability = vgg16_s2s_inference_model.predict_on_batch(batch)
test_probability.shape

In [None]:
# Greedy decoding of the test predictions. Ids are stored in test_prediction
# and also mapped to tokens for BLEU scoring.
# NOTE(review): ids are looked up in the SOURCE dictionary (id2word), as are the
# ground-truth ids in the truth cell; the two must stay in sync.

test_prediction = np.empty([1000, 38])
test_caption = []

for sample in range(1000):
    caption = []

    for step in range(38):
        word_id = (-test_probability[sample, step, :]).argsort()[0]
        test_prediction[sample, step] = word_id
        caption.append(id2word[word_id]) # Use source dictionary because nltk.translate.bleu_score does not work for French.

    test_caption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Collect the ground-truth test captions, both as ids (test_truth) and as
# tokens (test_truecaption), and print them.

test_truth = np.empty([1000, 38])
test_truecaption = []

for sample in range(1000):
    truth_ids = [int(test_output[sample, step]) for step in range(38)]
    test_truth[sample, :] = truth_ids
    # Use source dictionary because nltk.translate.bleu_score does not work for French.
    caption = [id2word[word_id] for word_id in truth_ids]

    test_truecaption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Calculate BLEU score
# One sentence-level BLEU score per test caption, then the mean over the test set.

reference = test_truecaption
candidate = test_caption

smoothing = nltk.translate.bleu_score.SmoothingFunction().method4
BLEUscore = np.zeros(1000)

for i in range(1000):
    # sentence_bleu expects a LIST of reference token lists; passing the bare
    # token list makes every word count as a separate reference sentence.
    BLEUscore[i] = nltk.translate.bleu_score.sentence_bleu([reference[i]], candidate[i],
                                                           smoothing_function=smoothing)

BLEUaverage = np.mean(BLEUscore)

print(BLEUaverage)

### VGG16 + S2S:
### BLEU-1 = 0.171233798533

## ResNet-50 + Seq2Seq

In [None]:
# Load weights.
# NOTE(review): `resnet50_s2s_inference_model` is only created by commented-out
# code in the model-definition cell — confirm the ResNet-50 variant has been
# built before running this.
resnet50_s2s_inference_model.load_weights('Models/resnet50_s2s_inference_model.hdf5')

In [None]:
# Predict on testing set.
# Build the two model inputs for the 1000 test samples: the image feature
# vector repeated at each of the 38 time steps, and the one-hot encoded text.
# NOTE(review): this expects `test_imageFeatures` to hold the 2048-wide
# ResNet-50 features — confirm the ResNet-50 feature files were loaded.

batch = [np.zeros((1000, 38, 2048)), np.zeros((1000, 38, 5000))]

for i in range(1000):
    # Local names are chosen so the imported `image` module and the `text`
    # array defined earlier are not overwritten.
    image_feature = np.tile(test_imageFeatures[i, :], (38, 1))
    text_onehot = keras.utils.np_utils.to_categorical(test_outputText[i, :, :], 5000)
    batch[0][i, :, :] = image_feature
    batch[1][i, :, :] = text_onehot

test_probability = resnet50_s2s_inference_model.predict_on_batch(batch)
test_probability.shape

In [None]:
# Greedy decoding of the test predictions. Ids are stored in test_prediction
# and also mapped to tokens for BLEU scoring.
# NOTE(review): ids are looked up in the SOURCE dictionary (id2word), as are the
# ground-truth ids in the truth cell; the two must stay in sync.

test_prediction = np.empty([1000, 38])
test_caption = []

for sample in range(1000):
    caption = []

    for step in range(38):
        word_id = (-test_probability[sample, step, :]).argsort()[0]
        test_prediction[sample, step] = word_id
        caption.append(id2word[word_id]) # Use source dictionary because nltk.translate.bleu_score does not work for French.

    test_caption.append(caption)
    print('%d.' % sample)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Generate truths.
# Collect the ground-truth test captions as ids (test_truth) and as tokens
# (test_truecaption). All 38 positions are used, starting at position 0, like
# the other truth cells: `test_output` already has its first position removed,
# so an extra +1 offset would drop the first real word and leave the reference
# one token shorter than the 38-token candidates.

test_truth = np.empty([1000, 38])
test_truecaption = []

for i in range(1000):
    caption = []

    for j in range(38):
        truth = int(test_output[i, j])
        test_truth[i, j] = truth
        caption.append(id2word[truth]) # Use source dictionary because nltk.translate.bleu_score does not work for French.

    test_truecaption.append(caption)
    print('%d.' %i)

    for word in caption:
        print(word, end = ' ')

    print('\n')

In [None]:
# Calculate BLEU score.
# One sentence-level BLEU score per test caption, then the mean over the test set.

reference = test_truecaption
candidate = test_caption

smoothing = nltk.translate.bleu_score.SmoothingFunction().method4
BLEUscore = np.zeros(1000)

for i in range(1000):
    # sentence_bleu expects a LIST of reference token lists; passing the bare
    # token list makes every word count as a separate reference sentence.
    BLEUscore[i] = nltk.translate.bleu_score.sentence_bleu([reference[i]], hypothesis=candidate[i],
                                                           smoothing_function=smoothing)

BLEUaverage = np.mean(BLEUscore)

print(BLEUaverage)

### ResNet50 + S2S: 
### BLEU-1 = 0.17116628201

In [None]:
# Show the 10 predicted captions with the highest BLEU scores.
top_captions = []

# Sample indices ordered by descending BLEU score.
ranking = (-BLEUscore).argsort()

for rank in range(10):
    best = candidate[ranking[rank]]
    top_captions.append(best)
    print('%d.' % rank)

    for position in range(38):
        print(best[position], end=' ')

    print('\n')

In [None]:
# Show the ground-truth captions of the 10 samples with the highest BLEU scores.
top_truecaptions = []

# Sample indices ordered by descending BLEU score.
ranking = (-BLEUscore).argsort()

for i in range(10):
    true_caption = reference[ranking[i]]
    top_truecaptions.append(true_caption)
    print('%d.' %i)

    # Iterate over the caption itself instead of a fixed range(38): the
    # reference captions may hold fewer than 38 tokens.
    for word in true_caption:
        print(word, end=' ')

    print('\n')