In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding,GRU,TimeDistributed,RepeatVector,Merge,BatchNormalization
from keras.preprocessing import sequence
from keras import callbacks
from keras.optimizers import SGD, RMSprop, Adam
import numpy as np
from vgg16 import Vgg16
import matplotlib.pyplot as plt
import PIL.Image

from tqdm import tqdm

from utils import *

import cPickle as pickle
import string

import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords


import os

import preprocessing as preproc


Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [2]:
def search_images_by(searched_word,images,predicted_captions):
    lmtzr = WordNetLemmatizer()
    lemm_word = lmtzr.lemmatize(searched_word)
    
    found_indexes = []
    for index,caption in enumerate(predicted_captions):
        lemm_caption_words = [lmtzr.lemmatize(word) for word in caption.split()]
        if lemm_word in lemm_caption_words:
            found_indexes.append(index)
    
    return ([images[i] for i in found_indexes],[predicted_captions[i] for i in found_indexes])

    
def make_prediction(random_number,images_concat_t,vgg_model):
    startIndex = word2index["START"]
    start_captions = [[startIndex]]
    start_captions = sequence.pad_sequences(start_captions, maxlen=MAX_CAPTION_LEN,padding='post')

    firstImage = np.expand_dims(images_concat_t[random_number], axis=0)

    first_image_vgg_features = vgg_model.predict(firstImage)
#     first_image_input = firstImage
    first_image_input = np.squeeze(first_image_vgg_features)[0].reshape(1,4096)
    
    firstCaption = np.expand_dims(start_captions[0], axis=0) 

    outputs = []

    endGenerated = False
    i = 0
    while ((not endGenerated) & (i < MAX_CAPTION_LEN-1)):

        predictions = model.predict([first_image_input, firstCaption])
        predictions = predictions[0]

        currentPred = predictions[i]

        max_index = np.argmax(currentPred)

        outputs.append(max_index)
        firstCaption[0,i+1] = max_index

        i+=1

        if(index2word[max_index] == "END"):
            endGenerated = True

    caption = ' '.join([index2word[x] for x in firstCaption[0][:i+1]]) 
    
    drawImage = firstImage[0]
    drawImageT = np.transpose(drawImage,(1,2,0))
    plt.imshow(drawImageT)
    
    return (drawImageT,caption)

def make_prediction_on_dataset(images_concat_t, window_start = None, no_images = None):
    
    if(window_start == None):
        window_start = 0
        
    if(no_images == None):
        no_images = len(images_concat_t)
    
    vgg_model = get_vgg_model()
    
    images2Captions = [make_prediction(i,images_concat_t,vgg_model) for i in tqdm(range(window_start,window_start+no_images))]
    images = [image2Caption[0] for image2Caption in images2Captions]
    predicted_captions = [image2Caption[1] for image2Caption in images2Captions]

    
    return (images,predicted_captions)
    
def generate_arrays_from_file(img_vgg_path,indexed_captions_path,future_words_path):
    while 1:
        img_vgg_elements = os.listdir(img_vgg_path)
        indexed_captions_elements = os.listdir(indexed_captions_path)
        future_words_elements = os.listdir(future_words_path)
        
        img_vgg_elements.sort()
        indexed_captions_elements.sort()
        future_words_elements.sort()

        nr_elem = len(img_vgg_elements)
        
        BATCH_SIZE = 1
        
        for index in range(nr_elem/BATCH_SIZE):
            
            img_vgg_batch_list = []
            indexed_caption_batch_list = []
            future_words_batch_list = []
            
            for elem_in_batch in range(BATCH_SIZE):
                
                img_vgg_el_name = img_vgg_elements[index*BATCH_SIZE + elem_in_batch]
                indexed_caption_name = indexed_captions_elements[index*BATCH_SIZE + elem_in_batch]
                future_words_el_name = future_words_elements[index*BATCH_SIZE + elem_in_batch]

                img_vgg = preproc.load_array(img_vgg_path+"/"+img_vgg_el_name)
                indexed_caption = preproc.load_array(indexed_captions_path+"/"+indexed_caption_name)
                future_words = preproc.load_array(future_words_path+"/"+future_words_el_name)
                
                img_vgg_batch_list.append(img_vgg)
                indexed_caption_batch_list.append(indexed_caption)
                future_words_batch_list.append(future_words)
                
            img_vgg_big = np.vstack(img_vgg_batch_list)
            indexed_caption_big = np.vstack(indexed_caption_batch_list)
            future_words_big = np.vstack(future_words_batch_list)
            
#             print(img_vgg_big.shape)
#             print(indexed_caption_big.shape)
#             print(future_words_big.shape)
    
            yield ([img_vgg_big,indexed_caption_big], future_words_big)

def get_test_data(img_vgg_path,indexed_captions_path,future_words_path):
    img_vgg_elements = os.listdir(img_vgg_path)
    indexed_captions_elements = os.listdir(indexed_captions_path)
    future_words_elements = os.listdir(future_words_path)

    img_vgg_elements.sort()
    indexed_captions_elements.sort()
    future_words_elements.sort()

    nr_batches = 1

    for index in tqdm(range(nr_batches)):

        img_vgg_batch_list = []
        indexed_caption_batch_list = []
        future_words_batch_list = []

        img_vgg_el_name = img_vgg_elements[index]
        indexed_caption_name = indexed_captions_elements[index]
        future_words_el_name = future_words_elements[index]

        img_vgg = preproc.load_array(img_vgg_path+"/"+img_vgg_el_name)
        indexed_caption = preproc.load_array(indexed_captions_path+"/"+indexed_caption_name)
        future_words = preproc.load_array(future_words_path+"/"+future_words_el_name)

        img_vgg_batch_list.append(img_vgg)
        indexed_caption_batch_list.append(indexed_caption)
        future_words_batch_list.append(future_words)

    img_vgg_big = np.vstack(img_vgg_batch_list)
    indexed_caption_big = np.vstack(indexed_caption_batch_list)
    future_words_big = np.vstack(future_words_batch_list)

    print(img_vgg_big.shape)
    print(indexed_caption_big.shape)
    print(future_words_big.shape)

    return img_vgg_big, indexed_caption_big, future_words_big
      
        

In [3]:
base_path = app_3_length_15_data_path

train_path = base_path + train_folder
val_path = base_path + val_folder

NR_TRAIN_EXAMPLES = 62735
NR_TEST_EXAMPLES = 28858

# Read Serialized Data - Images

In [15]:
test_images_concat_t = preproc.get_images_concat(val_path + images_concat_folder+ 'images_concat.bc',NR_TEST_EXAMPLES)
print(test_images_concat_t.shape)

(28858, 3, 224, 224)


# Load precomputed misc data structures

In [4]:
train_img_vgg_path = base_path + train_folder + batch_folder + images_vgg_4096_folder
train_indexed_captions_path = base_path + train_folder + batch_folder + indexed_captions_folder
train_raw_captions_path = base_path + train_folder + batch_folder + captions_folder
train_future_words_path = base_path + train_folder + batch_folder + indexed_future_words_folder

test_img_vgg_path = base_path + val_folder + batch_folder +images_vgg_4096_folder
test_indexed_captions_path = base_path + val_folder + batch_folder + indexed_captions_folder
test_raw_captions_path = base_path + val_folder + batch_folder+captions_folder
test_future_words_path = base_path + val_folder + batch_folder+indexed_future_words_folder



In [5]:
unique_words = preproc.load_obj(base_path + general_datastruct_folder+"unique_words")
word2index = preproc.load_obj(base_path+general_datastruct_folder+"word2index")
index2word = preproc.load_obj(base_path+general_datastruct_folder+"index2word")

(train_captions_raw,_) = preproc.get_captions_raw_and_indexed(train_raw_captions_path,train_indexed_captions_path)
(test_captions_raw,_) = preproc.get_captions_raw_and_indexed(test_raw_captions_path,test_indexed_captions_path)


100%|██████████| 30/30 [00:00<00:00, 170.56it/s]
100%|██████████| 14/14 [00:00<00:00, 644.24it/s]


In [6]:
VOCAB_SIZE = len(unique_words)
MAX_CAPTION_LEN = 15 # ATENTIE AICI

In [7]:
print("MAX_CAPTION_LEN = %s"%MAX_CAPTION_LEN)
print("VOCAB_SIZE = %s"%VOCAB_SIZE)


MAX_CAPTION_LEN = 15
VOCAB_SIZE = 7275


# Word Embeddings

In [8]:
EMB_SIZE = 200
vecs, words, wordidx = preproc.load_vectors(save_path+glove_folder+"6B."+str(EMB_SIZE)+"d")

emb = preproc.create_emb(vecs, words, wordidx,index2word,VOCAB_SIZE)

Found = 7025
Not found = 249


# Model Building

In [9]:
# VGG
def get_vgg_model():
    image_model = Vgg16().model
    image_model.pop()
    image_model.pop()
    image_model.trainable = False
    image_model.add(RepeatVector(MAX_CAPTION_LEN))
    return image_model

def get_precomputed_input_model():
    input_model = Sequential()
    input_model.add(RepeatVector(MAX_CAPTION_LEN,input_shape=(4096,)))
    return input_model


# GRU
def get_language_model(emb):
    language_model = Sequential()
    #language_model.add(Embedding(VOCAB_SIZE, 256, input_length=MAX_CAPTION_LEN))
    #language_model.add(Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_CAPTION_LEN,weights=[emb], trainable=False))
    language_model.add(Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_CAPTION_LEN,weights=[emb]))
    Dropout(0.2)
    language_model.add(BatchNormalization())
    return language_model


# Top level model
def build_model(image_model,language_model):
    model = Sequential()
    model.add(Merge([image_model, language_model], mode='concat'))
    model.add(GRU(1024, return_sequences=True))
    model.add(TimeDistributed(Dense(VOCAB_SIZE, activation = 'softmax')))

    model.compile(loss='categorical_crossentropy', optimizer = Adam(0.001))
    return model


In [10]:
image_model = get_precomputed_input_model()
language_model = get_language_model(emb)
model = build_model(image_model,language_model)

# Training the model

In [None]:
test_img_vgg, test_indexed_captions, test_future_words = get_test_data(test_img_vgg_path,
                                                                       test_indexed_captions_path,
                                                                       test_future_words_path)

In [None]:
history = model.fit_generator(
                    generate_arrays_from_file(train_img_vgg_path,train_indexed_captions_path,train_future_words_path),
                    samples_per_epoch=2000,
                    nb_epoch=3,
                    validation_data = ([test_img_vgg, test_indexed_captions], test_future_words),
                    callbacks=[callbacks.RemoteMonitor()]
                   )

In [None]:
preproc.plot_loss_from_history(history)

In [None]:
# model.save_weights(save_path + models_folder +'app_3_length_15_30_epoch.h5')

In [13]:
model.load_weights(save_path + models_folder +'app_3_length_15_30_epoch.h5')

In [12]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
repeatvector_1 (RepeatVector)    (None, 15, 4096)      0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 15, 200)       1455000                                      
____________________________________________________________________________________________________
batchnormalization_1 (BatchNorma (None, 15, 200)       800                                          
____________________________________________________________________________________________________
gru_1 (GRU)                      (None, 15, 1024)      16346112    merge_1[0][0]                    
___________________________________________________________________________________________

# Evaluate model

In [None]:
model.evaluate_generator(generate_arrays_from_file(train_img_vgg_path,
                                                   train_indexed_captions_path,
                                                   train_future_words_path),
                        val_samples = NR_TRAIN_EXAMPLES)

In [None]:

model.evaluate_generator(generate_arrays_from_file(test_img_vgg_path,
                                                   test_indexed_captions_path,
                                                   test_future_words_path),
                        val_samples = NR_TEST_EXAMPLES)

In [None]:
def make_test_predictions(test_images_concat_t):
    return make_prediction_on_dataset(test_images_concat_t,0,NR_TEST_EXAMPLES)

In [None]:
(_,all_test_predictions) = make_test_predictions(test_images_concat_t)

In [None]:
preproc.save_obj(all_test_predictions,val_path+"predictions/"+"app_3_length_15_30_epoch_predicted_captions")

# Testing the model

In [None]:
window_start = 512
nr_images = 128

# images_concat_t = train_images_concat_t
# real_captions = train_captions_raw

images_concat_t = test_images_concat_t
real_captions = test_captions_raw

(images,predicted_captions) = make_prediction_on_dataset(images_concat_t,window_start,nr_images)


100%|██████████| 128/128 [00:12<00:00, 10.32it/s]


In [1]:
preproc.plot_predictions(images,titles = predicted_captions)

In [None]:
common_words2app = preproc.most_common_words(predicted_captions,500)
common_words2app

In [None]:
common_words2app

In [None]:
searched_word = "teeth"
(found_images,found_captions) = search_images_by(searched_word,images,predicted_captions)
print("Number of results = %d"%len(found_images))

In [None]:
plot_predictions(found_images,found_captions)

# Make predictions on misc dataset

In [None]:
misc_images_path = save_path + misc_images_folder

In [None]:
misc_images = []
for img_path in os.listdir(misc_images_path):
    img = PIL.Image.open(misc_images_path+img_path)
    img = img.resize((224, 224), PIL.Image.NEAREST)
    img = np.asarray(img)
    img = np.transpose(img,(2,0,1))
    img = np.expand_dims(img,axis=0)
    
    misc_images.append(img)
    
stacked_images = np.vstack(misc_images)

In [None]:
# (misc_images,misc_predicted_captions) = make_prediction_on_dataset(stacked_images)
# preproc.plot_predictions(misc_images,misc_predicted_captions)