In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers import Embedding,GRU,TimeDistributed,RepeatVector,Merge
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
import cv2
import numpy as np
from vgg16 import Vgg16

from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import PIL.Image

import json
from tqdm import tqdm

from keras.optimizers import SGD, RMSprop, Adam

from utils import *
import cPickle as pickle
from matplotlib import pyplot as plt


Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [2]:
class ImageData(object):
    """Record for one image: its id, its file name, its captions and its pixels."""

    def __init__(self, id, name):
        # Identity of the image, stored exactly as passed in.
        self.id, self.name = id, name
        # Both start out as empty lists; captions are added through
        # appendCaption and `image` is meant to be overwritten by the caller.
        self.image = []
        self.captions = []

    def appendCaption(self, caption):
        """Add ``caption`` to the end of this image's caption list."""
        self.captions.extend([caption])
        
class ImageEntry(object):
    """Pairs one image with exactly one caption."""

    def __init__(self, image, caption):
        # Both values are kept by reference; nothing is copied or validated.
        self.image, self.caption = image, caption
        
              

# Data Building

In [22]:
# Location used when build_data_dict() is called without arguments.
# NOTE(review): absolute, machine-specific path — pass `annotation_path` when running elsewhere.
_DEFAULT_ANNOTATION_PATH = "/home/docker/fastai-courses/deeplearning1/nbs/persistent/coco/raw_annotations/captions_val2014.json"


def build_data_dict(annotation_path=_DEFAULT_ANNOTATION_PATH):
    """Load a caption annotation JSON file into ``ImageData`` records.

    The file must contain an ``"images"`` list (entries with ``"id"`` and
    ``"file_name"``) and an ``"annotations"`` list (entries with
    ``"image_id"`` and ``"caption"``).

    Args:
        annotation_path: path of the JSON annotation file. Defaults to the
            location the notebook was originally written against.

    Returns:
        dict mapping each image file name to its ``ImageData``, with every
        caption from the annotation list appended to the matching image.

    Raises:
        KeyError: if an annotation refers to an ``image_id`` that is not in
            the ``"images"`` list, or an expected key is missing.
    """
    with open(annotation_path) as data_file:
        data = json.load(data_file)

    imagesJson = data["images"]

    # Index by image id first, because annotations reference images by id.
    id2ImageDataDict = {imageJson["id"]: ImageData(imageJson["id"], imageJson["file_name"])
                        for imageJson in imagesJson}

    for annotationJson in data["annotations"]:
        id2ImageDataDict[annotationJson["image_id"]].appendCaption(annotationJson["caption"])

    # Re-key the same objects by file name, which is how images are found on disk.
    return {imageJson["file_name"]: id2ImageDataDict[imageJson["id"]]
            for imageJson in imagesJson}

def construct_image_data_arr(base_path,fileName2ImageDataDict):
    """Attach resized pixel data to each ``ImageData`` and return the usable ones.

    Every entry of ``base_path`` is opened with PIL, resized to 224x224 using
    ``PIL.Image.NEAREST`` and stored as a NumPy array on the ``ImageData``
    registered under that file name. The ``ImageData`` objects are modified
    in place.

    Args:
        base_path: directory whose entries are all treated as image files.
        fileName2ImageDataDict: dict mapping file name -> ``ImageData``.

    Returns:
        list of the ``ImageData`` values of ``fileName2ImageDataDict`` whose
        ``image`` has shape ``(224, 224, 3)``. Entries that never received an
        image, or whose array has any other shape, are left out.

    Raises:
        KeyError: if a file in ``base_path`` has no entry in
            ``fileName2ImageDataDict``.
    """
    for image_file_name in tqdm(listdir(base_path)):
        img = PIL.Image.open(join(base_path, image_file_name))
        img = img.resize((224, 224), PIL.Image.NEAREST)

        # One conversion is enough; the array is stored directly on the record.
        fileName2ImageDataDict[image_file_name].image = np.asarray(img)

    # .values() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # np.asarray is still needed here: records without a file keep `image = []`.
    return [imageData for imageData in fileName2ImageDataDict.values()
            if np.asarray(imageData.image).shape == (224, 224, 3)]

def constructImageEntryArr(imageDataArr):
    """Flatten image records into one ``ImageEntry`` per (image, caption) pair.

    Args:
        imageDataArr: iterable of objects exposing ``image`` and ``captions``.

    Returns:
        list of ``ImageEntry``, ordered by image and then by caption. An image
        with N captions contributes N entries that all share the same
        ``image`` object.
    """
    return [ImageEntry(record.image, text)
            for record in imageDataArr
            for text in record.captions]

def construct_images_concat_t(image_data_arr):
    """Stack the ``image`` of every record into one array and move axis 3 to position 1.

    Args:
        image_data_arr: sequence of objects whose ``image`` attributes are
            equally shaped 3-D arrays.

    Returns:
        4-D array: the images stacked along a new leading axis, then
        transposed with the axis order ``(0, 3, 1, 2)``.
    """
    stacked = np.stack([record.image for record in image_data_arr], axis=0)
    return stacked.transpose(0, 3, 1, 2)


def get_unique_words(captions, sort=False):
    """Return the distinct whitespace-separated tokens found in ``captions``.

    Args:
        captions: iterable of strings.
        sort: when True the tokens are returned in sorted order, which makes
            the vocabulary (and any index built from it) reproducible across
            interpreter sessions. The default keeps the historical behaviour,
            where the order is whatever iterating the underlying ``set``
            yields.

    Returns:
        list containing each token exactly once.
    """
    all_words = [word for caption in captions for word in caption.split()]
    unique_words = set(all_words)

    if sort:
        return sorted(unique_words)
    return list(unique_words)

def get_index_word_dicts(unique_words):
    """Build forward and reverse lookup tables for a vocabulary.

    Each word receives its position in ``unique_words`` as its index.

    NOTE(review): indices start at 0, and captions are later padded through
    keras ``pad_sequences``, whose pad value is presumably 0 as well. The
    word at index 0 may therefore be indistinguishable from padding —
    confirm.

    Args:
        unique_words: iterable of words.

    Returns:
        tuple ``(word_index, index_word)``: word -> index and index -> word.
    """
    numbered = list(enumerate(unique_words))
    index_word = dict(numbered)
    word_index = {word: position for position, word in numbered}
    return (word_index, index_word)

def get_train_captions_indexed(captions, word2index, MAX_CAPTION_LEN):
    """Turn caption strings into a padded array of word indices.

    Each caption is split on whitespace and every token is looked up in
    ``word2index``. The resulting index lists are handed to keras
    ``sequence.pad_sequences`` with ``maxlen=MAX_CAPTION_LEN`` and
    ``padding='post'``.

    Args:
        captions: iterable of caption strings.
        word2index: dict mapping token -> integer index.
        MAX_CAPTION_LEN: length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        whatever ``pad_sequences`` returns for the indexed captions.

    Raises:
        KeyError: if a caption contains a token missing from ``word2index``.
    """
    def to_indexes(caption):
        return [word2index[token] for token in caption.split()]

    indexed_captions = [to_indexes(caption) for caption in captions]
    return sequence.pad_sequences(indexed_captions, maxlen=MAX_CAPTION_LEN, padding='post')
    
    

In [6]:
# Root folder for every artifact this notebook reads or writes.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
save_path = "/home/docker/fastai-courses/deeplearning1/nbs/persistent/coco/"
# Sub-locations are built by plain string concatenation. The ones ending in "/"
# keep that trailing slash because later cells append file names to them directly.
images_path = save_path+"raw_images/val2014"
image_data_arr_path = save_path+"imageDataArr/"
images_concat_t_path = save_path+"imagesConcatT/"
captions_path = save_path+"captions/"
# Scratch folder for the per-window next-word arrays.
np_save_path = save_path+"temp/"
model_path = save_path+"models/"


# Images

In [None]:
fileName_2_image_data_dict = build_data_dict()

In [None]:
image_data_arr = construct_image_data_arr(images_path,fileName_2_image_data_dict)

In [7]:
# train_images_concat_t = construct_images_concat_t(image_data_arr)
train_images_concat_t = load_array(images_concat_t_path + 'val_imagesConcatT_1000.bc')

In [8]:
test_images_concat_t = load_array(images_concat_t_path+'val_imagesConcatT_last_1000.bc')

In [None]:
# save_array(images_concat_t_path+ 'val_imagesConcatT.bc', imagestConcatT)

In [9]:
# NR_INSTANCES = len(imagestConcatT)
NR_INSTANCES = 1000

In [10]:
train_images_concat_t = train_images_concat_t[:NR_INSTANCES]
print(train_images_concat_t.shape)


(1000, 3, 224, 224)


# Captions

In [14]:
def get_truncated_captions_from_batch(batch_nr,nr_instances,captions_dir=None):
    """Load one pickled caption batch and keep its first ``nr_instances`` entries.

    Args:
        batch_nr: number embedded in the file name
            ``val2014_captions_<batch_nr>.p``.
        nr_instances: how many captions to keep from the start of the batch.
        captions_dir: directory prefix (including its trailing separator) the
            file name is appended to. Defaults to the notebook-level
            ``captions_path``.

    Returns:
        the first ``nr_instances`` items of the unpickled object.
    """
    if captions_dir is None:
        captions_dir = captions_path

    # NOTE: unpickling can execute arbitrary code; only load caption files
    # that were produced locally.
    with open(captions_dir+"val2014_captions_"+str(batch_nr)+".p", "rb") as captions_file:
        captions = pickle.load(captions_file)

    # Honour the argument rather than the global NR_INSTANCES.
    return captions[:nr_instances]

def dump_captions_to_disk(image_data_arr, nr_captions=5, captions_dir=None):
    """Pickle the i-th caption of every image into one file per caption slot.

    For each ``i`` in ``range(nr_captions)`` a list is built holding the
    i-th caption of every image, wrapped as ``"START <caption> END"``, and
    written to ``val2014_captions_<i>.p``.

    Args:
        image_data_arr: iterable of objects exposing a ``captions`` list.
        nr_captions: number of caption slots to dump (5 by default, matching
            the previously hard-coded value).
        captions_dir: directory prefix (including its trailing separator) the
            file name is appended to. Defaults to the notebook-level
            ``captions_path``.

    Raises:
        IndexError: if an image has fewer than ``nr_captions`` captions.
    """
    if captions_dir is None:
        captions_dir = captions_path

    for i in tqdm(range(nr_captions)):
        captions = ["START "+image_data.captions[i]+" END" for image_data in image_data_arr]
        # The context manager guarantees the file is flushed and closed.
        with open(captions_dir+"val2014_captions_"+str(i)+".p", "wb") as captions_file:
            pickle.dump(captions, captions_file)

In [17]:
captions = get_truncated_captions_from_batch(batch_nr = 0, nr_instances = NR_INSTANCES )
len(captions)

1000

In [18]:
# Length of the longest loaded caption, counted in whitespace-separated tokens.
MAX_CAPTION_LEN = max([len(caption.split()) for caption in captions])

In [23]:
# Vocabulary = distinct tokens of the loaded captions. Its size is reused by later
# cells (one-hot width, embedding input size, softmax output size).
unique_words = get_unique_words(captions)
VOCAB_SIZE = len(unique_words)
# Forward (token -> id) and reverse (id -> token) lookup tables.
(word2index, index2word) = get_index_word_dicts(unique_words)

In [24]:
train_captions_indexed = get_train_captions_indexed(captions, word2index, MAX_CAPTION_LEN )
train_captions_indexed.shape

(1000, 30)

In [25]:
print("NR_INSTANCES = %s" % NR_INSTANCES)
print("MAX_CAPTION_LEN = %s"%MAX_CAPTION_LEN)
print("VOCAB_SIZE = %s"%VOCAB_SIZE)

NR_INSTANCES = 1000
MAX_CAPTION_LEN = 30
VOCAB_SIZE = 1826


In [32]:
STEP_SIZE = 100

In [26]:
def compute_partial_all_words_2_next_word(captions_indexed,step_size,np_save_path):
    """Write one-hot "next word" targets to disk, one file per window of captions.

    For every caption the target at position ``i`` is the one-hot encoding
    (width ``VOCAB_SIZE``) of the token at position ``i + 1``. The last
    position has no successor, so the index of ``"END"`` is used for it.

    Captions are processed in windows of ``step_size``. Each window is saved
    with ``save_array`` under
    ``np_save_path + 'all_words_2_next_word__<window_start>.bc'``, where
    ``<window_start>`` is formatted with ``format(window_start, "06")``.

    Relies on the notebook-level ``word2index`` and ``VOCAB_SIZE``.

    Args:
        captions_indexed: sequence of integer index sequences, one per caption.
        step_size: number of captions per saved file.
        np_save_path: path prefix the file names are appended to.
    """
    # Loop-invariant: index appended after the last token of every caption.
    end_index = word2index["END"]

    for window_start in tqdm(range(0,len(captions_indexed),step_size)):

        captions_indexed_batch = captions_indexed[window_start:window_start+step_size]

        all_words_2_next_word = []

        for caption_indexed in captions_indexed_batch:
            # Shift by one: entry i is the token following position i.
            next_word_indexes = np.append(caption_indexed,[end_index])[1:]

            # Fill all one-hot rows in a single indexed assignment.
            words_2_next_word = np.zeros((len(next_word_indexes), VOCAB_SIZE))
            words_2_next_word[np.arange(len(next_word_indexes)), next_word_indexes] = 1

            all_words_2_next_word.append(words_2_next_word)

        save_array(np_save_path+ 'all_words_2_next_word__'+format(window_start, "06")+'.bc', all_words_2_next_word)
    

In [28]:
def get_future_words(np_save_path):
    """Load the saved next-word target windows and stack them into one array.

    Only entries whose name starts with ``all_words_2_next_word__`` are read,
    so unrelated content of the directory is ignored. Entries are loaded in
    sorted name order and concatenated with ``np.vstack``.

    NOTE(review): windows left over from an earlier, larger run share the
    same prefix and would still be picked up — clear the directory between
    runs.

    Args:
        np_save_path: path prefix the entry names are appended to; it is also
            the directory that gets listed.

    Returns:
        the stacked array of all loaded windows.
    """
    window_names = sorted(name for name in listdir(np_save_path)
                          if name.startswith('all_words_2_next_word__'))

    all_words_2_next_word = [load_array(np_save_path + window_name)
                             for window_name in window_names]

    # The former transpose with axes (0, 1, 2) was an identity and is dropped.
    return np.vstack(all_words_2_next_word)

In [33]:
compute_partial_all_words_2_next_word(train_captions_indexed,STEP_SIZE,np_save_path)

100%|██████████| 10/10 [00:00<00:00,  9.84it/s]


In [34]:
future_words = get_future_words(np_save_path)
future_words.shape

(1000, 30, 1826)

In [None]:
# (40438, 259, 13601)

# Model Building

## VGG

In [35]:
# Image branch: take the model from the Vgg16 wrapper and pop its last two layers
# (per the original author's note, this leaves the 4096-D activations as output).
image_model = Vgg16().model
image_model.pop()
image_model.pop()
# Intended to keep the image branch weights fixed during training.
# NOTE(review): the flag is set on the model after its layers exist — confirm
# that this Keras version actually freezes the layers.
image_model.trainable = False
# Repeat the image vector MAX_CAPTION_LEN times, once per caption position.
image_model.add(RepeatVector(MAX_CAPTION_LEN))

## GRU

In [36]:
# Caption branch: embeds each of the MAX_CAPTION_LEN word indices into 256 dimensions.
# The recurrent layer itself is added to the merged model in a later cell.
language_model = Sequential()
language_model.add(Embedding(VOCAB_SIZE, 256, input_length=MAX_CAPTION_LEN))

In [37]:
# Combined model: concatenate the outputs of the image and caption branches,
# run a GRU that returns its full sequence, and apply a softmax over the
# vocabulary at every timestep.
model = Sequential()
model.add(Merge([image_model, language_model], mode='concat'))
model.add(GRU(256, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE, activation = 'softmax')))

# Targets are one-hot vectors of width VOCAB_SIZE, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer = Adam(0.001))


In [None]:
model.summary()

# Training the model

In [43]:
print("Images : "+str(train_images_concat_t.shape))
print("Partial captions : " + str(train_captions_indexed.shape))
print("Future words :" + str(future_words.shape))

Images : (1000, 3, 224, 224)
Partial captions : (1000, 30)
Future words :(1000, 30, 1826)


In [None]:
model.fit([train_images_concat_t, train_captions_indexed], future_words, batch_size=64, nb_epoch=5)

In [None]:
model.evaluate([train_images_concat_t, train_captions_indexed], future_words, batch_size=64)

In [None]:
# model.save_weights(model_path+'val_1000.h5')

In [None]:
model.load_weights(model_path+'val_1000.h5')

# Testing the model

In [None]:
def plot_predictions(ims, titles):
    """Show every image in its own figure, titled with the matching entry of ``titles``.

    Args:
        ims: sequence of images accepted by ``plt.imshow``.
        titles: sequence of titles, indexed in step with ``ims``.

    Raises:
        IndexError: if ``titles`` is shorter than ``ims``.
    """
    for i, image in enumerate(ims):
        # Open the figure BEFORE drawing. Otherwise the first image lands on
        # whatever figure is current and an empty figure is left at the end.
        plt.figure()
        plt.title(titles[i])
        plt.imshow(image)

    plt.show()
    
def make_prediction(random_number, images=None):
    """Greedily generate a caption for one image with the trained ``model``.

    Decoding starts from a caption holding only ``"START"`` followed by
    padding. At step ``i`` the model is run on the image and the caption so
    far, the highest-scoring word at output position ``i`` is written to
    caption position ``i + 1``, and decoding stops once ``"END"`` is produced
    or the caption is full.

    Relies on the notebook-level ``model``, ``word2index``, ``index2word``,
    ``MAX_CAPTION_LEN`` and, by default, ``test_images_concat_t``.

    Args:
        random_number: index of the image inside ``images``.
        images: array of images indexed on its first axis, in the layout the
            model was trained on. Defaults to ``test_images_concat_t``.

    Returns:
        tuple ``(image, caption)``: the image transposed with axes
        ``(1, 2, 0)`` for plotting, and the generated caption as a
        space-joined string that begins with ``"START"``.
    """
    if images is None:
        images = test_images_concat_t

    # Seed caption: START followed by padding up to MAX_CAPTION_LEN.
    start_captions = sequence.pad_sequences([[word2index["START"]]], maxlen=MAX_CAPTION_LEN,padding='post')

    # Both inputs get a leading batch axis of size 1.
    firstImage = np.expand_dims(images[random_number], axis=0)
    firstCaption = np.expand_dims(start_captions[0], axis=0)

    endGenerated = False
    i = 0
    # Stop one short of the end: each step writes to position i + 1.
    while (not endGenerated) and (i < MAX_CAPTION_LEN-1):
        predictions = model.predict([firstImage, firstCaption])[0]

        # Output position i is treated as the scores for the word after position i.
        max_index = np.argmax(predictions[i])

        firstCaption[0,i+1] = max_index
        i += 1

        endGenerated = (index2word[max_index] == "END")

    caption = ' '.join([index2word[x] for x in firstCaption[0][:i+1]])

    drawImageT = np.transpose(firstImage[0],(1,2,0))
    # Kept from the original: also draws the image on the current figure.
    plt.imshow(drawImageT)

    return (drawImageT,caption)

In [None]:
# Caption NO_TEST_IMAGES consecutive images, starting at index TEST_WINDOW_START.
NO_TEST_IMAGES = 10
TEST_WINDOW_START = 20

# Each element is the (image, caption) tuple returned by make_prediction.
images2Captions = [make_prediction(i) 
                   for i in range(TEST_WINDOW_START,TEST_WINDOW_START+NO_TEST_IMAGES)]
images = [image2Caption[0] for image2Caption in images2Captions]
# NOTE(review): this rebinds `captions`, which earlier cells use for the training
# captions; re-running those cells afterwards would see the predictions instead.
captions = [image2Caption[1] for image2Caption in images2Captions]


In [None]:
plot_predictions(images,titles = captions)