In [1]:
from os.path import exists
from os import mkdir
from os.path import join
from PIL import Image
import json


import tensorflow as tf
import threading
import numpy as np
import h5py

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from scipy.misc import imread, imresize

from utils import load_vocab
from time import time
import datetime


# Map every VQA image id (int) to the path of its MSCOCO JPEG on disk.
image_paths = {}
root_path = "/srv/data/datasets/mscoco/images/"

for split in ('train', 'val'):
    image_ids_path = "datasets/vqa/"+split+"/img_ids.txt"
    # The context manager closes the id file deterministically; blank lines
    # (e.g. a trailing newline at end of file) are skipped instead of
    # crashing int('').
    with open(image_ids_path) as ids_file:
        image_ids = {int(line.strip()) for line in ids_file if line.strip()}
    print(split,len(image_ids))
    for x in image_ids:
        # File names follow COCO_<split>2014_<12-digit zero-padded id>.jpg
        name = 'COCO_'+split+'2014_'+format(x, '012')+'.jpg'
        path = join(root_path,split+"2014",name)
        image_paths[x] = path

train 82783
val 40504


In [None]:
def load_images(ps):
    """Read the images at paths `ps` and normalise them for the network.

    Every image is decoded as RGB, resized to 448x448 with nearest-neighbour
    interpolation and divided by 255.0. The time spent decoding and the time
    spent resizing/scaling are printed separately.

    Returns a pair `(treated_images, sizes)`: the list of processed images
    and the list of each image's first two shape entries before resizing.
    """
    read_start = time()
    raw_images = []
    for path in ps:
        raw_images.append(imread(path, mode='RGB'))
    read_end = time()
    print("imread = %1.3fs" % (read_end - read_start))

    # Record the original sizes before the arrays are resized.
    sizes = [raw.shape[:2] for raw in raw_images]
    treated_images = [imresize(raw, (448, 448), 'nearest') / 255.0
                      for raw in raw_images]
    resize_end = time()
    print("Resize / scaling = %1.3fs" % (resize_end - read_end))
    return treated_images, sizes


class Dataset(object):
    """VQA dataset loaded from an HDF5 file, served as mini-batches.

    The HDF5 file must contain the datasets 'image_ids', 'questions',
    'multiple_choice', 'answers', 'img_list' and 'bounding_boxes'; all of
    them are read fully into memory at construction time.

    Relies on the module-level answer vocabulary `a_w2i` for the id of the
    '</s>' token, which is used as the padding value.

    Parameters:
        h5_path: path of the HDF5 file to read.
        image_paths: mapping from image id to image file path.
        max_q: optional width to which the last axis of `questions` is
            truncated or padded.
        max_mc: optional width to which the last axis of `multiple_choice`
            is truncated or padded.
    """

    def __init__(self,h5_path,image_paths,max_q=None,max_mc=None):
        self.h5 = h5py.File(h5_path,mode='r')
        # `dataset[()]` reads the whole dataset into memory. It replaces the
        # deprecated `.value` attribute, which newer h5py releases removed.
        self.image_ids = self.h5['image_ids'][()]
        self.questions = self.h5['questions'][()]
        self.multiple_choice = self.h5['multiple_choice'][()]
        self.answers = self.h5['answers'][()]
        # One bounding-box array per entry of 'img_list'.
        self.bounding_boxes = dict(zip(self.h5['img_list'][()],
                                       self.h5['bounding_boxes'][()]))
        self.N = len(self.image_ids)
        if max_q:
            self.questions = self._fit_last_axis(self.questions, max_q,
                                                 a_w2i['</s>'])
        if max_mc:
            # The original code indexed `[:,:,max_mc]`, which dropped the
            # last axis instead of truncating it to `max_mc` entries.
            self.multiple_choice = self._fit_last_axis(self.multiple_choice,
                                                       max_mc, a_w2i['</s>'])
        self.max_q = self.questions.shape[1]
        self.indexes = np.arange(self.N)
        self.image_paths = image_paths

    @staticmethod
    def _fit_last_axis(array, width, pad_value):
        """Return `array` with its last axis truncated or right-padded to `width`."""
        current = array.shape[-1]
        if width < current:
            return array[..., :width]
        pad_widths = [(0, 0)] * (array.ndim - 1) + [(0, width - current)]
        return np.pad(array, pad_widths, 'constant', constant_values=pad_value)

    def __iter__(self):
        # Returning `self` made iter(dataset) fail with TypeError because the
        # class defines no __next__; iterate over default batches instead.
        return self.batch_gen()

    def batch_gen(self,batch_size=64,shuffle=True):
        """Yield one pass over the dataset in batches of `batch_size`.

        When `shuffle` is true, `self.indexes` is shuffled in place first.
        Only full batches are produced: the `N % batch_size` trailing
        examples are dropped.

        Each yielded tuple is
        `(images, questions, question_mask, answers, multiple_choice,
        bounding_boxes, sizes)` where
          * images: stacked RGB images resized to 448x448 and divided by 255.0;
          * question_mask: zeros of shape (max_q, batch_size) with a single 1
            per column at row `(number of non-'</s>' tokens) - 1`
            (presumably the last real token, assuming padding is trailing);
          * bounding_boxes: the per-image boxes flattened to 2-D, each row
            prefixed with the position of its image inside the batch;
          * sizes: the first two shape entries of every image before resizing.
        """
        def load_image(p):
            img = imread(p,mode='RGB')
            size = img.shape[:2]
            img = imresize(img,(448,448),'nearest') / 255.0
            return img,size

        if shuffle:
            np.random.shuffle(self.indexes)
        n_batches = self.N // batch_size
        batch_positions = np.arange(batch_size)
        for batch_id in range(n_batches):
            begin = batch_id*batch_size
            # n_batches only counts full batches, so this slice never runs
            # past the end of the dataset.
            idxs = self.indexes[begin:begin+batch_size]
            image_ids = self.image_ids[idxs]
            images,sizes = [],[]
            for i in image_ids:
                img,size = load_image(self.image_paths[i])
                images.append(img)
                sizes.append(size)
            images = np.stack(images)
            sizes = np.array(sizes)
            questions = self.questions[idxs]
            lengths = np.sum(np.not_equal(questions, a_w2i['</s>']), axis=1)
            question_mask = np.zeros((self.max_q,batch_size))
            # One entry per column, set in a single fancy-indexed assignment.
            question_mask[lengths-1,batch_positions] = 1
            answers = self.answers[idxs]
            multiple_choice = self.multiple_choice[idxs]
            bbs = np.array([self.bounding_boxes[k] for k in image_ids])
            # Prefix every box with the index of its image within the batch.
            # The number of boxes per image is taken from the data rather
            # than being hard-coded to 100.
            batch_column = np.broadcast_to(batch_positions[:,None,None],
                                           bbs.shape[:2]+(1,))
            bbs = np.concatenate((batch_column,bbs),axis=-1)
            bounding_boxes = np.reshape(bbs, (bbs.shape[0]*bbs.shape[1],bbs.shape[2]))
            yield (images,questions,question_mask,answers,multiple_choice,bounding_boxes,sizes)

            
# Question and answer vocabularies; each load_vocab call is unpacked into an
# (i2w, w2i) pair (presumably index->word and word->index lookups; verify
# against utils.load_vocab).
q_i2w, q_w2i = load_vocab('datasets/vqa/train/questions.vocab')
a_i2w, a_w2i = load_vocab('datasets/vqa/train/answers.vocab')
Nq, Na = len(q_i2w), len(a_i2w)

# The training set is loaded at its native widths; the validation set is then
# truncated/padded to the same question and multiple-choice widths.
train_set = Dataset('datasets/vqa/train/dataset.h5', image_paths)
max_q, max_mc = train_set.max_q, train_set.multiple_choice.shape[-1]
val_set = Dataset(
    'datasets/vqa/val/dataset.h5',
    image_paths,
    max_q=max_q,
    max_mc=max_mc,
)