In [3]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.layers import Embedding,GRU,TimeDistributed,RepeatVector,Merge,BatchNormalization
from keras.preprocessing import sequence

import numpy as np
from vgg16 import Vgg16

from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import PIL.Image

import json
from tqdm import tqdm

from keras.optimizers import SGD, RMSprop, Adam

from utils import *
import cPickle as pickle
from matplotlib import pyplot as plt

from itertools import compress

import shutil
import string

import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

import re
from numpy.random import random, permutation, randn, normal 

import os
import preprocessing as preproc


Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [4]:
def get_unique_words(captions):
    """Return the distinct whitespace-separated words found in ``captions``.

    Args:
        captions: iterable of caption strings; each one is tokenised with
            ``str.split()`` (any run of whitespace separates words).

    Returns:
        A list containing every distinct word exactly once, in sorted order.
        Sorting makes the result (and therefore any word <-> index mapping
        derived from it) reproducible instead of depending on set iteration
        order.
    """
    unique_words = set()
    for caption in captions:
        unique_words.update(caption.split())

    return sorted(unique_words)

def get_index_word_dicts(unique_words):
    """Build the two lookup tables between words and their positions.

    Args:
        unique_words: iterable of words; a word's index is its position in
            this iterable, counting from 0.

    Returns:
        A tuple ``(word_index, index_word)`` where ``word_index`` maps each
        word to its position and ``index_word`` maps each position back to
        the word found there.
    """
    # Materialise once so a one-shot iterable can feed both dictionaries.
    numbered_words = list(enumerate(unique_words))

    word_index = {token: position for position, token in numbered_words}
    index_word = {position: token for position, token in numbered_words}

    return (word_index, index_word)


In [5]:
# Root directory for every COCO artifact used by this notebook.
# It defaults to the original location and can be overridden with the
# COCO_SAVE_PATH environment variable so the notebook is not tied to one machine.
# os.path.join(..., "") guarantees the trailing separator that the plain string
# concatenations below (and in later cells) rely on.
save_path = os.path.join(
    os.environ.get("COCO_SAVE_PATH", "/home/docker/fastai-courses/deeplearning1/nbs/persistent/coco/"),
    "",
)

# Raw inputs.
annotation_path = save_path + "raw_annotations/captions_val2014.json"
images_path = save_path + "raw_images/val2014"

# Derived-data directories directly under the root (all end with "/").
image_data_arr_path = save_path + "imageDataArr/"
images_concat_t_path = save_path + "imagesConcatT/"
captions_path = save_path + "captions/"
temp_save_path = save_path + "temp/"
model_path = save_path + "models/"
images_vgg_features_path = save_path + "images_vgg_features/"

# Split roots; the folder names below are appended to these.
train_path = save_path + "train/"
test_path = save_path + "test/"

# Sub-folder names (relative, each ending with "/") combined with a split root
# and, for batched data, with batch_folder.
images_concat_folder = "images_concat/"
images_vgg_4096_folder = "images_vgg_4096/"
captions_folder = "captions/"
indexed_captions_folder = "indexed-captions/"
indexed_future_words_folder = "indexed-future-words/"
glove_folder = "glove/"

misc_folder = "misc/"

batch_folder = "batches/"

# Load Data

In [6]:
# Number of instances written to each batch file.
BATCH_SIZE = 1024
# Number of training instances loaded: exactly 10 full batches (10240).
# The batching loops below only write complete batches, so keep this a multiple of BATCH_SIZE.
NR_TRAIN_INSTANCES = 10 * BATCH_SIZE
# Length every indexed caption is padded to.
MAX_CAPTION_LEN = 15

In [7]:
# Load the serialized VGG feature array for the training split, asking the helper for
# NR_TRAIN_INSTANCES instances.
# NOTE(review): presumably one feature row per image, in the same order as the captions
# loaded below — confirm against preproc.read_serialized_np_arr.
train_images_precomputed_vgg_features = preproc.read_serialized_np_arr(
    train_path + images_vgg_4096_folder + 'vgg_features.bc',
    nr_instances=NR_TRAIN_INSTANCES)

# Sanity check: show the shape of what was loaded.
print(train_images_precomputed_vgg_features.shape)

In [9]:
# Load the captions of batch 0 from the training captions folder, asking the helper for
# NR_TRAIN_INSTANCES instances.
# NOTE(review): later cells call .split() on each entry and look up the token "END",
# so entries are presumably whitespace-tokenised strings that include that marker — confirm.
train_captions = preproc.get_truncated_captions_from_batch(
    train_path + captions_folder,
    batch_nr=0,
    nr_instances=NR_TRAIN_INSTANCES)

# Sanity check: number of captions loaded.
print(len(train_captions))

# Save image VGG feature batches

In [12]:
# Write the precomputed VGG features to disk in consecutive chunks of BATCH_SIZE rows,
# one file per chunk.
# `//` keeps the batch count an integer on both Python 2 and Python 3 (plain `/` yields a
# float on Python 3, which range() rejects).
# Only complete batches are written: trailing rows that do not fill a batch are skipped.
for index in tqdm(range(len(train_images_precomputed_vgg_features) // BATCH_SIZE)):
    batch_start = index * BATCH_SIZE

    # Give every row a leading axis of length 1 so np.vstack stacks the rows along axis 0.
    img_vgg_feature_list = [
        np.expand_dims(train_images_precomputed_vgg_features[batch_start + elem_in_batch], axis=0)
        for elem_in_batch in range(BATCH_SIZE)
    ]
    img_vgg_batch = np.vstack(img_vgg_feature_list)

    # The file name carries the zero-padded batch number, e.g. img_vgg_feature_000000_.bc
    save_array(train_path + batch_folder + images_vgg_4096_folder + 'img_vgg_feature_' + format(index, "06") + '_' + '.bc',
               img_vgg_batch)

100%|██████████| 10/10 [00:00<00:00, 13.32it/s]


# Save raw captions batch

In [14]:
# Write the raw caption strings to disk in consecutive chunks of BATCH_SIZE captions,
# one file per chunk.
# `//` keeps the batch count an integer on both Python 2 and Python 3 (plain `/` yields a
# float on Python 3, which range() rejects).
# Only complete batches are written: trailing captions that do not fill a batch are skipped.
for index in tqdm(range(len(train_captions) // BATCH_SIZE)):
    batch_start = index * BATCH_SIZE

    caption_list = [train_captions[batch_start + elem_in_batch]
                    for elem_in_batch in range(BATCH_SIZE)]

    # np.vstack turns the list into an array with one caption per row.
    captions_batch = np.vstack(caption_list)

    # The file name carries the zero-padded batch number, e.g. caption_000000_.bc
    save_array(train_path + batch_folder + captions_folder + 'caption_' + format(index, "06") + '_' + '.bc',
               captions_batch)

100%|██████████| 10/10 [00:00<00:00, 57.59it/s]


# Save indexed captions batch

In [15]:
# Build the vocabulary from the training captions, plus the word <-> index lookups.
unique_words = get_unique_words(train_captions)
VOCAB_SIZE = len(unique_words)
(word2index, index2word) = get_index_word_dicts(unique_words)

# Convert every caption into a fixed-length vector of word indexes.
# NOTE(review): pad_sequences pads with 0 by default, and 0 is also the index assigned to
# unique_words[0], so padding cannot be told apart from that word downstream — confirm
# whether this is intended.
indexed_captions = []
# Wrap the collection itself (not enumerate(...)) so tqdm can read its length and show a
# total / ETA; the loop index was never used.
for train_caption in tqdm(train_captions):
    indexed_caption = [word2index[caption_word] for caption_word in train_caption.split()]
    # pad_sequences operates on a batch, so wrap the single caption in a list ...
    indexed_caption = sequence.pad_sequences([indexed_caption], maxlen=MAX_CAPTION_LEN, padding='post')
    # ... and take row 0 back out. Unlike np.squeeze this stays 1-D even when
    # MAX_CAPTION_LEN is 1.
    indexed_np_arr = np.asarray(indexed_caption[0])

    indexed_captions.append(indexed_np_arr)
    

10240it [00:00, 49993.92it/s]


In [16]:
# Write the indexed captions to disk in consecutive chunks of BATCH_SIZE captions,
# one file per chunk.
# `//` keeps the batch count an integer on both Python 2 and Python 3 (plain `/` yields a
# float on Python 3, which range() rejects).
# Only complete batches are written: trailing captions that do not fill a batch are skipped.
for index in tqdm(range(len(indexed_captions) // BATCH_SIZE)):
    batch_start = index * BATCH_SIZE

    # np.vstack promotes each 1-D index vector to a row, so the batch has one caption per row.
    indexed_captions_batch = np.vstack(indexed_captions[batch_start:batch_start + BATCH_SIZE])

    # The file name carries the zero-padded batch number, e.g. indexed_caption_000000_.bc
    save_array(train_path + batch_folder + indexed_captions_folder + 'indexed_caption_' + format(index, "06") + '_' + '.bc',
               indexed_captions_batch)

100%|██████████| 10/10 [00:00<00:00, 54.95it/s]


# Save future-word (next-word target) batches

In [18]:
# For every caption position, build a one-hot vector (length VOCAB_SIZE) marking the word
# that FOLLOWS that position, then save the targets in chunks of BATCH_SIZE captions.
# Each saved batch has shape (BATCH_SIZE, caption length, VOCAB_SIZE).
# `//` and range() (instead of `/` and xrange) keep the cell runnable on Python 2 and 3.
# Only complete batches are written: trailing captions that do not fill a batch are skipped.

# Index of the "END" token, used as the "next word" of the final caption position.
end_word_index = word2index["END"]

for index in range(len(indexed_captions) // BATCH_SIZE):

    indexed_future_word_list = []

    for elem_in_batch in tqdm(range(BATCH_SIZE)):

        caption_indexed = indexed_captions[index * BATCH_SIZE + elem_in_batch]
        caption_len = len(caption_indexed)

        # Append END so that the last position also has a successor (hacky).
        enhanced_caption_indexed = np.append(caption_indexed, [end_word_index])
        # Successor of position i is element i + 1 of the extended caption.
        future_word_indexes = enhanced_caption_indexed[1:]

        # Row i is the one-hot encoding of the word following position i.
        words_2_next_word = np.zeros((caption_len, VOCAB_SIZE))
        words_2_next_word[np.arange(caption_len), future_word_indexes] = 1

        # Leading axis of length 1 so np.vstack stacks captions along axis 0.
        words_2_next_word = np.expand_dims(words_2_next_word, axis=0)

        indexed_future_word_list.append(words_2_next_word)

    indexed_future_words_batch = np.vstack(indexed_future_word_list)

    # The file name carries the zero-padded batch number, e.g. indexed_future_word_000000_.bc
    save_array(train_path + batch_folder + indexed_future_words_folder + 'indexed_future_word_' + format(index, "06") + '_' + '.bc',
               indexed_future_words_batch)
    


100%|██████████| 1024/1024 [00:00<00:00, 2235.13it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3086.74it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3101.24it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3076.79it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3228.22it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3190.08it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3232.62it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3074.41it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3225.08it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3248.79it/s]


# Save miscellaneous data (vocabulary and index lookups)

In [20]:
# Persist the vocabulary list next to the batch files (under the misc folder).
preproc.save_obj(
    unique_words,
    "".join((train_path, batch_folder, misc_folder, "unique_words")))

In [21]:
# Persist the word -> index lookup next to the batch files (under the misc folder).
preproc.save_obj(
    word2index,
    "".join((train_path, batch_folder, misc_folder, "word2index")))

In [22]:
# Persist the index -> word lookup next to the batch files (under the misc folder).
preproc.save_obj(
    index2word,
    "".join((train_path, batch_folder, misc_folder, "index2word")))