In [1]:
from os import listdir
from os.path import join
from time import time
import json


import tensorflow as tf
import numpy as np


import skimage
from skimage.transform import resize
from skimage.io import imread


def load_image(path):
    # load image
    img = imread(path)
    img = img / 255.0
    assert (0 <= img).all() and (img <= 1.0).all()
    #print "Original Image Shape: ", img.shape
    # we crop image from center
    short_edge = min(img.shape[:2])
    yy = int((img.shape[0] - short_edge) / 2)
    xx = int((img.shape[1] - short_edge) / 2)
    crop_img = img[yy : yy + short_edge, xx : xx + short_edge]
    # resize to 224, 224
    resized_img = resize(crop_img, (224, 224))
    return resized_img

# Image model

This thing works

In [20]:
with open("vgg16.tfmodel", mode='rb') as f:
    fileContent = f.read()

with tf.device('/gpu:0'):
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(fileContent)

    images = tf.placeholder("float", [None, 224, 224, 3])

    tf.import_graph_def(graph_def, input_map={ "images": images })

    graph = tf.get_default_graph()
    out_tensor = graph.get_tensor_by_name("import/Relu_1:0")

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
init = tf.initialize_all_variables()
sess.run(init)

batch = []
path = '/home/hbenyounes/nshots/data_mostsim/unlabelized/'
for i in listdir(path):
    batch.append(load_image(join(path,i)))
    if len(batch) >=16:
        break
batch = np.stack(batch)

feed_dict = {images: batch}
tic = time()
with tf.device('/gpu:0'):
    features2 = sess.run(out_tensor, feed_dict=feed_dict)
toc = time() - tic
print(features.shape, 'Time = %1.3fs'%toc)

# Language model

## Pre-processing

In [26]:
from collections import Counter

def build_vocab(path):
    saving_folder = "/".join(path.split('/')[:-1])
    name = path.split('/')[-1].split('.')[0]
    file = open(path,'r',encoding='latin1')
    sentences = []
    for l in file:
        sentences.append(l.strip().split())
    ct = Counter(x for a in sentences for x in a)
    i2w = sorted(ct, key=ct.get, reverse=True)
    i2w = ['<unk>','<s>', '</s>'] + i2w
    w2i = dict((w,i) for i,w in enumerate(i2w))
    vocab_file = open(join(saving_folder, name+'.vocab'), 'w',encoding='latin1')
    for w in i2w:
        vocab_file.write(w+'\n')
    vocab_file.close()
    return 'done'

def integerify(text_path, vocab_path, pad=False):
    saving_folder = "/".join(text_path.split('/')[:-1])
    name = text_path.split('/')[-1].split('.')[0]
    w2i = {}
    for i,l in enumerate(open(vocab_path,'r',encoding='latin1')):
        l = l.strip()
        w2i[l] = i
    indexes_file = open(join(saving_folder, name+'.idxs'), 'w',encoding='latin1')
    for l in open(text_path, 'r',encoding='latin1'):
        l = l.strip().split() 
        if pad:
            l = ['<s>'] + l + ['</s>']
        idxs = []
        for w in l:
            if w in w2i:
                idxs.append(str(w2i[w]))
            else:
                idxs.append(str(w2i['<unk>']))
        indexes_file.write(' '.join(idxs) + '\n')
    return 'done'


In [18]:
build_vocab('/home/hbenyounes/vqa/datasets/coco/train/questions.txt')
build_vocab('/home/hbenyounes/vqa/datasets/coco/train/answers.txt')

'done'

In [28]:
integerify('/home/hbenyounes/vqa/datasets/coco/train/questions.txt', 
           '/home/hbenyounes/vqa/datasets/coco/train/questions.vocab', pad=True)
integerify('/home/hbenyounes/vqa/datasets/coco/train/answers.txt', 
           '/home/hbenyounes/vqa/datasets/coco/train/answers.vocab')

'done'

In [29]:
integerify('/home/hbenyounes/vqa/datasets/coco/test/questions.txt', 
           '/home/hbenyounes/vqa/datasets/coco/train/questions.vocab', pad=True)
integerify('/home/hbenyounes/vqa/datasets/coco/test/answers.txt', 
           '/home/hbenyounes/vqa/datasets/coco/train/answers.vocab')

'done'

## Model

In [1]:
from os import listdir
from os.path import join
from time import time
import json
from collections import Counter
from time import time

import tensorflow as tf
import numpy as np
from utils import load_dataset, load_vocab, get_batch

In [1]:
def load_vocab(vocab_path):
    i2w = []
    w2i = {}
    for i,l in enumerate(open(vocab_path,'r',encoding='latin1')):
        l = l.strip()
        i2w.append(l)
        w2i[l] = i
    return i2w, w2i


def load_dataset(idxs_path):
    dataset = []
    for l in open(idxs_path, 'r',encoding='latin1'):
        dataset.append([int(i) for i in l.strip().split()])
    return dataset

def get_batch(begin,end,X,Y,
              batch_size,max_q,Na):
    Q = np.zeros((batch_size, max_q), 'int32')
    mask = np.zeros((max_q,batch_size), 'int32')
    for i,s in enumerate(X[begin:end]):
        Q[i] = np.pad(s, (0,max_q-len(s)), 'constant')
        mask[len(s)-1,i] = 1
    ans = np.zeros((batch_size,Na),'int32')
    for i,a in enumerate(Y[begin:end]):
        ans[i,a] = 1
    mask = mask[:,:,None]
    return Q,mask,ans

In [2]:
q_train = load_dataset('datasets/coco/train/questions.idxs')
q_test = load_dataset('datasets/coco/test/questions.idxs')
a_train = load_dataset('datasets/coco/train/answers.idxs')
a_test = load_dataset('datasets/coco/test/answers.idxs')

In [3]:
q_i2w, q_w2i = load_vocab('datasets/coco/train/questions.vocab')
a_i2w, a_w2i = load_vocab('datasets/coco/train/answers.vocab')

In [4]:
class ImageQA(object):
    def __init__(self, dh, dq, da, max_q, Nq, Na, infer=False):
        self.dh = dh
        self.dq = dq
        self.da = da
        self.max_q = max_q
        self.Nq = Nq
        self.Na = Na
        self.infer = infer
        
        with tf.device('/cpu:0'):
            self.qemb_W = tf.get_variable('qemb_w',
                                          initializer=tf.random_uniform([self.Nq, self.dq], -0.1, 0.1))
        self.aemb_W = tf.get_variable(name='aemb_w',
                                      initializer=tf.random_uniform([self.dh, self.Na], -0.1, 0.1))
        self.aemb_b = tf.get_variable(name='aemb_b',
                                      initializer=tf.zeros([self.Na]))

        self.gru = tf.nn.rnn_cell.GRUCell(self.dh)
    def build_model(self,batch_size):
        
        p_question = tf.placeholder(tf.int32, 
                                    [None, self.max_q],
                                    name="p_question")
        p_answer = tf.placeholder(tf.float32, 
                                  [None,self.Na],
                                  name="p_answer")
        p_question_mask = tf.placeholder(tf.int32,
                                         [self.max_q, None, None],
                                         name="p_question_mask")
        
        state = tf.zeros([batch_size, self.gru.state_size])
        states = []
        outputs = []
        for j in range(self.max_q):
            with tf.device('/cpu:0'):
                question_emb = tf.nn.embedding_lookup(self.qemb_W, p_question[:,j])
            if j>0:
                tf.get_variable_scope().reuse_variables()
            output,state = self.gru(question_emb, state)
            states.append(state)
            outputs.append(output)

        output = tf.pack(outputs) # (max_words_q, batch_size, 4*dim_hidden)
        output_final = tf.reduce_sum(tf.mul(output, tf.to_float(p_question_mask)),0) # (batch_size, 2*dim_hidden)

        answer_logits = tf.nn.xw_plus_b(output_final,
                                        self.aemb_W,
                                        self.aemb_b) # (batch_size, num_answer)
        answer_pred = tf.argmax(answer_logits,1)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(answer_logits, p_answer) # (batch_size, )
        loss = tf.reduce_mean(cross_entropy)
        train_op = tf.train.AdamOptimizer().minimize(loss)
        loss_summary = tf.scalar_summary("loss",loss,name="loss")
        output = {'train_op':train_op,
                 'loss':loss,
                 'question':p_question,
                 'mask':p_question_mask,
                 'answer':p_answer,
                 'answer_pred':answer_pred,
                 'loss_summary':loss_summary}
        return output

In [5]:
def test(step,verbose=None):
    N_test = len(q_test)
    n_batches = N_test // batch_size
    acc = []
    for idx in range(n_batches):
        if verbose:
            if idx%20==0:
                print("%d/%d - accuracy = %1.3f"%(idx,n_batches, np.mean(acc)))
        begin = idx*batch_size
        end = min((idx+1)*batch_size, N_test)
        Q, mask, A = get_batch(begin,end,q_test,a_test,batch_size,max_q,Na)
        a_pred = sess.run(model_outputs['answer_pred'], 
                          feed_dict={model_outputs['question']:Q,
                                     model_outputs['mask']:mask, 
                                     model_outputs['answer']:A})
        equals = 1*np.equal(A.argmax(axis=1),a_pred)
        equals = list(equals[:end-begin])
        acc += equals
    acc = tf.reduce_mean(tf.to_float(acc))
    acc_s = tf.scalar_summary("acc_tf",acc,name="acc_tf")
    acc,acc_s = sess.run([acc,acc_s])
    writer.add_summary(acc_s,step)
    return acc

In [6]:
max_q = len(max(q_train, key=lambda x:len(x)))+1
Nq = len(q_i2w)
Na = len(a_i2w)

dh = 50 #LSTM hidden state dimension
dq = 75 #Question embedding dimension
da = 50 #Answer embedding dimension
batch_size = 64

print("Graph initialization")
tf.reset_default_graph()
model = ImageQA(dh,dq,da,max_q,Nq,Na,batch_size)
model_outputs = model.build_model(batch_size)

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)

sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

saver = tf.train.Saver(max_to_keep=100)
writer = tf.train.SummaryWriter('/home/hbenyounes/vqa/tf_log', sess.graph)

init = tf.initialize_all_variables()
sess.run(init)

In [7]:
#Train
q_train = np.array(q_train)
a_train = np.array(a_train)
with tf.device('/gpu:0'):
    n_epochs = 50
    N_train = len(q_train)
    n_batches = N_train // batch_size + 1
    for epoch in range(n_epochs):
        epoch_loss = []
        times = 0.
        indexes = np.arange(N_train)
        np.random.shuffle(indexes)
        q_train = q_train[indexes]
        a_train = a_train[indexes]
        for idx in range(n_batches):
            tic = time()
            if idx%(n_batches//10)==0:
                print("Epoch %d - %d/%d : loss = %1.4f - time = %1.3fs"%(epoch,idx,
                                                                         n_batches,np.mean(epoch_loss),
                                                                         times/((N_train//10)*batch_size)))
                times = 0.
            begin = idx*batch_size
            end = min((idx+1)*batch_size, N_train)
            Q, mask, A = get_batch(begin,end,q_train,a_train,batch_size,max_q,Na)
            _,l,l_s = sess.run([model_outputs['train_op'],
                                model_outputs['loss'],
                                model_outputs['loss_summary']], 
                               feed_dict={model_outputs['question']:Q,
                                          model_outputs['mask']:mask,
                                          model_outputs['answer']:A})
            epoch_loss.append(l)
            writer.add_summary(l_s,idx+epoch*n_batches)
            times += time() - tic
        with tf.device('/cpu:0'):
            test_acc = test((1+epoch)*n_batches)
            print("Epoch %d - Test accuracy = %1.3f" % (epoch+1, test_acc))
        saver.save(sess, join('/home/hbenyounes/vqa/saved_models/','model'), global_step=epoch)



Epoch 0 - 0/1231 : loss = nan - time = 0.000s
Epoch 0 - 123/1231 : loss = 5.4648 - time = 0.000s
Epoch 0 - 246/1231 : loss = 5.1740 - time = 0.000s
Epoch 0 - 369/1231 : loss = 5.0072 - time = 0.000s
Epoch 0 - 492/1231 : loss = 4.8797 - time = 0.000s
Epoch 0 - 615/1231 : loss = 4.7827 - time = 0.000s
Epoch 0 - 738/1231 : loss = 4.6957 - time = 0.000s
Epoch 0 - 861/1231 : loss = 4.6221 - time = 0.000s
Epoch 0 - 984/1231 : loss = 4.5568 - time = 0.000s
Epoch 0 - 1107/1231 : loss = 4.4951 - time = 0.000s
Epoch 0 - 1230/1231 : loss = 4.4421 - time = 0.000s
Epoch 1 - 0/1231 : loss = nan - time = 0.000s
Epoch 1 - 123/1231 : loss = 3.8649 - time = 0.000s
Epoch 1 - 246/1231 : loss = 3.8479 - time = 0.000s
Epoch 1 - 369/1231 : loss = 3.8141 - time = 0.000s
Epoch 1 - 492/1231 : loss = 3.7917 - time = 0.000s
Epoch 1 - 615/1231 : loss = 3.7608 - time = 0.000s
Epoch 1 - 738/1231 : loss = 3.7370 - time = 0.000s
Epoch 1 - 861/1231 : loss = 3.7123 - time = 0.000s
Epoch 1 - 984/1231 : loss = 3.6906 - ti

KeyboardInterrupt: 