In [1]:
import preprocessing as preproc
from utils import *
import numpy as np
import cPickle as pickle
import pandas as pd
import os

Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [2]:
def get_all_captions(image_data_arr):
    """Fetch the caption buckets for ``image_data_arr`` and report their sizes.

    Prints the length of the first bucket and the shape obtained by stacking
    all buckets into one array, then returns the buckets exactly as produced
    by ``preproc.get_captions_list``. The stacked array is used for reporting
    only and is not returned.
    """
    buckets = preproc.get_captions_list(image_data_arr)
    first_bucket_size = len(buckets[0])

    # Stacked purely so the overall shape can be printed below.
    stacked = np.stack(buckets)

    print("caption_bucket_length = %d" % first_bucket_size)
    print("captions.shape = %s" % str(stacked.shape))

    return buckets

def get_imgs_and_captions(base_images_path,base_annotation_path):
    """Load the image data for the given paths and return ``(images, captions)``.

    The image data array is read once via ``preproc.get_image_data_arr`` and
    then used both to collect the caption buckets (``get_all_captions``) and
    to build the images object (``preproc.construct_images_concat_t``).
    The shape of the images object is printed before returning.
    """
    data_arr = preproc.get_image_data_arr(base_images_path, base_annotation_path)

    caption_buckets = get_all_captions(data_arr)
    image_tensor = preproc.construct_images_concat_t(data_arr)

    print("images.shape = %s" % str(image_tensor.shape))
    return image_tensor, caption_buckets
    

In [3]:
def get_common_words_mask(captions, min_no_of_app=None):
    """Compute the common-words mask for ``captions`` and report how many pass.

    Args:
        captions: captions to evaluate; forwarded unchanged to
            ``preproc.compute_common_words_caption_mask``.
        min_no_of_app: value forwarded as the ``min_no_of_app`` keyword of
            ``preproc.compute_common_words_caption_mask``. When ``None``
            (the default) the module-level ``MIN_NO_OF_APP`` is used, which
            preserves the original behaviour for existing callers.

    Returns:
        The mask returned by ``preproc.compute_common_words_caption_mask``.
    """
    if min_no_of_app is None:
        # Looked up at call time, so the global may be defined in a later cell.
        min_no_of_app = MIN_NO_OF_APP

    common_words_caption_mask = preproc.compute_common_words_caption_mask(
        captions, min_no_of_app=min_no_of_app)
    print("[COMMON WORDS] After = %d" % np.sum(common_words_caption_mask))
    return common_words_caption_mask

def get_max_caption_length_mask(captions, max_length=None):
    """Compute the short-caption mask for ``captions`` and report how many pass.

    Args:
        captions: captions to evaluate; forwarded unchanged to
            ``preproc.get_short_caption_mask``.
        max_length: length limit forwarded to
            ``preproc.get_short_caption_mask``. When ``None`` (the default)
            the module-level ``MAX_LENGTH`` is used, which preserves the
            original behaviour for existing callers.

    Returns:
        The mask returned by ``preproc.get_short_caption_mask``.
    """
    if max_length is None:
        # Looked up at call time, so the global may be defined in a later cell.
        max_length = MAX_LENGTH

    max_length_mask = preproc.get_short_caption_mask(captions, max_length)
    # Log label previously read "[MAX LENGHT]" (typo).
    print("[MAX LENGTH] After = %d" % np.sum(max_length_mask))
    return max_length_mask

def construct_data_mask(captions):
    """Combine the common-words mask and the max-length mask for ``captions``.

    Both masks are computed (each helper prints its own count), then merged
    element by element with ``and``. The number of surviving entries is
    printed and the merged mask is returned as a plain list.
    """
    frequent_words_mask = get_common_words_mask(captions)
    short_caption_mask = get_max_caption_length_mask(captions)

    # An entry is kept only when both individual masks keep it.
    combined_masks = []
    for keep_common, keep_short in zip(frequent_words_mask, short_caption_mask):
        combined_masks.append(keep_common and keep_short)

    print("[COMBINED] After all = %d" % np.sum(combined_masks))
    return combined_masks

def filter_data_by_mask(images_concat_t,captions,combined_masks):
    """Apply ``combined_masks`` to both the captions and the images.

    Each input is filtered with ``preproc.filter_array_by_mask`` (captions
    first, then images). Returns ``(filtered_images, filtered_captions)``.
    """
    kept_captions = preproc.filter_array_by_mask(captions, combined_masks)
    kept_images = preproc.filter_array_by_mask(images_concat_t, combined_masks)

    return kept_images, kept_captions


def write_to_folder(images_concat_t_filtered,captions_filtered,write_images_path,write_captions_path,index):
    """Persist one filtered bucket of images and captions to disk.

    Args:
        images_concat_t_filtered: filtered images, saved through
            ``preproc.save_array_with_folder_create`` as
            ``images_concat_<index>.bc`` under ``write_images_path``.
        captions_filtered: filtered captions, pickled to
            ``captions_batch_<index>.p``.
        write_images_path: destination folder for the images array.
        write_captions_path: destination folder for the captions pickle. The
            file name is appended by plain string concatenation, so the path
            is expected to end with a path separator.
        index: bucket index used in both file names.
    """
    print("Writing...")

    images_file_name = "images_concat_" + str(index) + ".bc"
    preproc.save_array_with_folder_create(write_images_path, images_file_name, images_concat_t_filtered)

    if not os.path.exists(write_captions_path):
        os.makedirs(write_captions_path)

    captions_file_path = write_captions_path + "captions_batch_" + str(index) + ".p"
    # Use a context manager so the file is flushed and closed deterministically;
    # the original passed a bare open(...) to pickle.dump and never closed it.
    with open(captions_file_path, "wb") as captions_file:
        pickle.dump(captions_filtered, captions_file)

# Run once for each folder (train / val)

In [None]:
# Which dataset split this run preprocesses. The build/filter cells below are
# meant to be executed once per split: set this to "train", run them, then
# set it to "val" and run them again.
CURRENT_SPLIT = "train"

if CURRENT_SPLIT == "train":
    base_images_path = train_images_path
    base_annotation_path = train_annotation_path
    current_folder = train_folder
elif CURRENT_SPLIT == "val":
    base_images_path = val_images_path
    base_annotation_path = val_annotation_path
    current_folder = val_folder
else:
    raise ValueError("Unknown split %r (expected 'train' or 'val')" % CURRENT_SPLIT)

# Filtering thresholds read by get_common_words_mask (MIN_NO_OF_APP) and
# get_max_caption_length_mask (MAX_LENGTH).
MIN_NO_OF_APP = 10
MAX_LENGTH = 15

# The output folder name is derived from the thresholds so the two cannot
# drift apart; for the values above this yields "app-10-length-15/".
base_path = data_path + ("app-%d-length-%d/" % (MIN_NO_OF_APP, MAX_LENGTH))
write_path = base_path + current_folder
write_images_path = write_path + images_concat_folder
write_captions_path = write_path + captions_folder

## Build data

In [None]:
# Load the images and the caption buckets for the configured paths.
images, caption_bucket_list = get_imgs_and_captions(base_images_path, base_annotation_path)

100%|██████████| 82783/82783 [19:00<00:00, 72.57it/s] 
100%|██████████| 5/5 [00:00<00:00,  7.13it/s]


caption_bucket_length = 82612
captions.shape = (5, 82612)
images.shape = (82612, 3, 224, 224)


# Filter Data

In [None]:
# For every caption bucket: build the combined keep-mask, filter the images
# and the bucket's captions with it, and write the filtered pair to disk
# under the bucket's index.
for index, caption_bucket in enumerate(caption_bucket_list):
    caption_bucket = np.asarray(caption_bucket)
    print("------------Bucket %d------------" % index)

    combined_masks = construct_data_mask(caption_bucket)

    print("images.shape = %s" % str(images.shape))
    print("caption_bucket.shape = %s" % str(caption_bucket.shape))

    filtered_pair = filter_data_by_mask(images, caption_bucket, combined_masks)
    images_concat_t_filtered, caption_bucket_filtered = filtered_pair

    print("Images shape = %s" % str(images_concat_t_filtered.shape))
    print("Caption bucket shape = %s" % str(caption_bucket_filtered.shape))

    write_to_folder(
        images_concat_t_filtered,
        caption_bucket_filtered,
        write_images_path,
        write_captions_path,
        index,
    )

# Build General Purpose Data Structures (just once)

In [None]:
# Output roots of the two preprocessed splits.
train_path, val_path = base_path + train_folder, base_path + val_folder

# Accumulator for the raw captions of every batch of both splits.
all_raw_captions = list()

In [None]:
# Reset the accumulator here so that re-running this cell does not append the
# same captions a second time (the loop below only ever extends the list).
all_raw_captions = []

# Batches 0-4 are read for both splits; this matches the 5 caption buckets
# reported when the data was built.
for index in range(5):
    train_raw_captions = preproc.get_captions_from_batch(train_path + captions_folder, batch_nr = index)
    val_raw_captions = preproc.get_captions_from_batch(val_path  + captions_folder, batch_nr = index)

    print("train_raw_captions.shape = %s"%(str(train_raw_captions.shape)))
    print("val_raw_captions.shape = %s"%(str(val_raw_captions.shape)))

    # Train captions first, then val captions, for every batch.
    all_raw_captions.extend(train_raw_captions)
    all_raw_captions.extend(val_raw_captions)
    

In [None]:
# Total number of raw captions gathered from both splits.
total_raw_captions = len(all_raw_captions)
print("len(all_raw_captions) = %d" % total_raw_captions)

In [None]:
# Build the vocabulary from the combined captions, then the two lookup
# dictionaries returned by preproc.get_index_word_dicts.
unique_words = preproc.get_unique_words(all_raw_captions)
word2index, index2word = preproc.get_index_word_dicts(unique_words)

# Vocabulary size.
print(len(unique_words))

In [None]:
# Folder for the structures built from the combined train and val captions.
_datastruct_dir = base_path + general_datastruct_folder
if not os.path.exists(_datastruct_dir):
    os.makedirs(_datastruct_dir)

# Each object is saved under its own name, in the same order as before.
for _name, _obj in (("unique_words", unique_words),
                    ("word2index", word2index),
                    ("index2word", index2word)):
    preproc.save_obj(_obj, _datastruct_dir + _name)