In [1]:
from os import listdir, mkdir
from os.path import join, exists
from time import time
import json


import tensorflow as tf
import numpy as np


import skimage
from skimage.transform import resize
from scipy.misc import imread
from collections import Counter
from time import time

from utils import load_dataset, load_vocab,read_features

In [2]:
class Dataset(object):
    def __init__(self, f_path, i_path, q_path, a_path, n_max=np.Inf):
        self.f_path = f_path
        self.i_path = i_path
        self.q_path = q_path
        self.a_path = a_path
        print('Parse features file')
        self.images_lines = {}
        self.images_features = []
        for i,l in enumerate(open(self.f_path)):
            l = l.split(';')
            self.images_lines[l[0]] = i
            self.images_features.append([float(x) for x in l[1].split()])
        print('Parse questions file')
        
        q_data = load_dataset(self.q_path)
        self.max_q = len(max(q_data, key=lambda x:len(x)))
        print('Parse answers file')
        a_data = load_dataset(self.a_path)
        self.data = []
        for q_id,q,a in zip(open(i_path),q_data,a_data):
            q_id = q_id.strip()
            try:
                l_num = self.images_lines[q_id]
            except:
                continue
            datum = (l_num,q,a)
            self.data.append(datum)
        self.data = np.array(self.data, dtype=object)
        del q_data,a_data
        self.N = len(self.data)
        self.indexes = np.arange(self.N)
    
    def __iter__(self):
        return self
    
    def batch_gen(self,batch_size=64):
        np.random.shuffle(self.indexes)
        n_batches = self.N // batch_size
        for batch_id in range(n_batches):
            begin = batch_id*batch_size
            end = min((batch_id+1)*batch_size, self.N)
            B = self.data[self.indexes[begin:end]]
            B = [(self.images_features[b[0]],b[1],b[2]) for b in B]
#                 all_image_indexes = [b[0] for b in B]
#                 saved_order = dict((idx,order) for order,idx in enumerate(all_image_indexes))
#                 S = sorted(saved_order,reverse=True)
#                 current_line = S.pop()
#                 for i,l in enumerate(f):
#                     if i == current_line:
#                         l = l.split(';')
#                         vec = [float(x) for x in l[1].split()]
#                         B[saved_order[current_line]][0] = vec
#                         current_line = S.pop()
#                         if len(S) == 0:
#                             break
            yield B
        
#         batch = []
#         np.random.shuffle(self.indexes)
#         with open(self.f_path) as f:
#             for idx in self.indexes:
#                 datum = self.data[self.indexes[idx]]
#                 query = datum[0]
#                 vec = []
#                 print(query)
#                 for i,l in enumerate(f):
#                     if i==query:
#                         l = l.split(';')
#                         vec = [float(x) for x in l[1].split()]
#                         break
#                 batch.append((vec,datum[1],datum[2]))
#                 if len(batch) == batch_size:
#                     yield batch
#                     batch = []
#             if len(batch)>0:
#                 yield batch 

In [3]:
def load_emb_matrix(q_i2w, embeddings):
    out = []
    c = set()
    for w in q_i2w:
        if w in embeddings:
            out.append(embeddings[w])
        else:
            c.add(w)
            out.append(np.zeros((vector_size,)))
    return (np.array(out),c)

def create_feed_dict(batch,max_q,Na,batch_size):
    V = np.zeros((batch_size, len(batch[0][0])), 'float32')
    Q = np.zeros((batch_size, max_q), 'int32')
    mask = np.zeros((max_q+1,batch_size), 'int32')
    ans = np.zeros((batch_size,Na),'int32')
    
    for i,(im,s,a) in enumerate(batch):
        V[i] = im
        Q[i] = np.pad(s, (0,max_q-len(s)), 'constant')
        mask[len(s),i] = 1
        ans[i,a] = 1
    mask = mask[:,:,None]
    return V,Q,mask,ans

def test(step,verbose=None):
    acc = []
    test_batches = test_set.batch_gen(batch_size)
    for idx,batch in enumerate(test_batches):    
        if verbose:
            if idx%20==0:
                print("%d - accuracy = %1.3f"%(idx, np.mean(acc)))
        V,Q,mask,ans = create_feed_dict(batch,max_q,Na,batch_size)
        a_pred = sess.run(model_outputs['answer_pred'], 
                          feed_dict={model_outputs['question']:Q,
                                     model_outputs['mask']:mask, 
                                     model_outputs['answer']:ans,
                                     model_outputs['image']:V, 
                                     model_outputs['keep_prob']:keep_prob})
        equals = 1*np.equal(ans.argmax(axis=1),a_pred)
        equals = list(equals[:len(batch)])
        acc += equals
    acc = tf.reduce_mean(tf.to_float(acc))
    acc_s = tf.scalar_summary("acc_tf",acc,name="acc_tf")
    acc,acc_s = sess.run([acc,acc_s])
    writer.add_summary(acc_s,step)
    return acc

In [11]:
model_name = "model3"
root_path = "/home/hbenyounes/vqa/word2vec_fixed/"
embedding_path = '/home/hbenyounes/vqa/GoogleNews.model'
vector_size = 300
hyperparams = {"dh":2048, 
               "dq":vector_size,
               "da":200, 
               "di":4096,
               "batch_size":32,
               "keep_prob":0.8,
               "cell":"lstm"}

In [5]:
q_i2w, q_w2i = load_vocab('datasets/coco/train/questions.vocab')

print("Load word2Vec")
embeddings = {}
for n,l in enumerate(open(embedding_path,encoding='utf-8')):
    l = l.strip().split()
    w = l[0]
    vec = [float(x) for x in l[1:]]
    embeddings[w] = vec

emb,c = load_emb_matrix(q_i2w, embeddings)
del embeddings

Load word2Vec


In [6]:
train_set = Dataset("/home/hbenyounes/vqa/datasets/coco/train/images.feat",
                    "/home/hbenyounes/vqa/datasets/coco/train/img_ids.txt",
                    "/home/hbenyounes/vqa/datasets/coco/train/questions.idxs",
                    "/home/hbenyounes/vqa/datasets/coco/train/answers.idxs")


test_set = Dataset("/home/hbenyounes/vqa/datasets/coco/test/images.feat",
                    "/home/hbenyounes/vqa/datasets/coco/test/img_ids.txt",
                    "/home/hbenyounes/vqa/datasets/coco/test/questions.idxs",
                    "/home/hbenyounes/vqa/datasets/coco/test/answers.idxs")

Parse features file
Parse questions file
Parse answers file
Parse features file
Parse questions file
Parse answers file


In [7]:
class ImageQA(object):
    def __init__(self, dh, dq, da, di, max_q, Nq, Na, cell='rnn',trainable_embeddings=True):
        self.dh = dh
        self.dq = dq
        self.da = da
        self.di = di
        self.max_q = max_q
        self.Nq = Nq
        self.Na = Na
        self.cell = cell
        
        with tf.device('/cpu:0'):
            self.qemb_W = tf.get_variable('qemb_w',
                                          initializer=tf.random_uniform([self.Nq, self.dq], -0.1, 0.1),
                                          trainable = trainable_embeddings)
        self.aemb_W = tf.get_variable(name='aemb_w',
                                      initializer=tf.random_uniform([self.dh, self.Na], -0.1, 0.1))
        self.aemb_b = tf.get_variable(name='aemb_b',
                                      initializer=tf.zeros([self.Na]))
        self.Wi = tf.get_variable(name='Wi', shape=[self.di, self.dq],
                                  initializer=tf.contrib.layers.xavier_initializer())
        self.bi = tf.get_variable(name='bi',
                                      initializer=tf.zeros([self.dq]))
        
        if self.cell == 'rnn':
            self.recur = tf.nn.rnn_cell.RNNCell(self.dh)
        elif self.cell == 'lstm':
            self.recur = tf.nn.rnn_cell.LSTMCell(self.dh)
        elif self.cell == 'gru':
            self.recur = tf.nn.rnn_cell.GRUCell(self.dh)
        else:
            raise NotImplementedError
        
    def build_model(self,batch_size):
        
        p_image = tf.placeholder(tf.float32,
                                [None, self.di],
                                 name="p_image")
        
        p_keep_prob = tf.placeholder(tf.float32, name='p_keep_prob')
        
        p_question = tf.placeholder(tf.int32, 
                                    [None, self.max_q],
                                    name="p_question")
        p_answer = tf.placeholder(tf.float32, 
                                  [None,self.Na],
                                  name="p_answer")
        p_question_mask = tf.placeholder(tf.int32,
                                         [self.max_q+1, None, None],
                                         name="p_question_mask")
        
        image_proj = tf.nn.xw_plus_b(p_image,self.Wi,self.bi,name='image_proj')
        image_proj_drp = tf.nn.dropout(image_proj, p_keep_prob)
        
        state = self.recur.zero_state(batch_size, tf.float32)
        states = []
        outputs = []
        for j in range(self.max_q+1):
            if j==0:
                output,state = self.recur(image_proj_drp, state)
            else:
                with tf.device('/cpu:0'):
                    question_emb = tf.nn.embedding_lookup(self.qemb_W, p_question[:,j-1])
                    question_emb_drp = tf.nn.dropout(question_emb, p_keep_prob)
                tf.get_variable_scope().reuse_variables()
                output,state = self.recur(question_emb_drp, state)
            states.append(state)
            outputs.append(output)

        output = tf.pack(outputs) # (max_words_q, batch_size, 4*dim_hidden)
        output_final = tf.reduce_sum(tf.mul(output, tf.to_float(p_question_mask)),0) # (batch_size, 2*dim_hidden)

        answer_logits = tf.nn.xw_plus_b(output_final,
                                        self.aemb_W,
                                        self.aemb_b) # (batch_size, num_answer)
        answer_pred = tf.argmax(answer_logits,1)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(answer_logits, p_answer) # (batch_size, )
        loss = tf.reduce_mean(cross_entropy)
        train_op = tf.train.AdamOptimizer().minimize(loss)
        loss_summary = tf.scalar_summary("loss",loss,name="loss")
        output = {'train_op':train_op,
                 'loss':loss,
                 'question':p_question,
                 'mask':p_question_mask,
                 'answer':p_answer,
                  'keep_prob':p_keep_prob,
                 'answer_pred':answer_pred,
                 'loss_summary':loss_summary,
                 'image':p_image}
        return output

In [12]:
if not exists(join(root_path, model_name)):
    mkdir(join(root_path, model_name))

q_i2w, q_w2i = load_vocab('datasets/coco/train/questions.vocab')
a_i2w, a_w2i = load_vocab('datasets/coco/train/answers.vocab')
Nq = len(q_i2w)
Na = len(a_i2w)

max_q = train_set.max_q
Nq = len(q_i2w)
Na = len(a_i2w)


dh = hyperparams["dh"] #GRU hidden state dimension
dq = hyperparams["dq"] #Question embedding dimension
da = hyperparams["da"] #Answer embedding dimension
di = hyperparams["di"] #Image dimension
batch_size = hyperparams["batch_size"]
keep_prob = hyperparams["keep_prob"]
cell = hyperparams["cell"]

print("Graph initialization")
tf.reset_default_graph()
model = ImageQA(dh,dq,da,di,max_q,Nq,Na,cell,False)
model_outputs = model.build_model(batch_size)

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)

sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options, 
                                                   intra_op_parallelism_threads=4))

writer = tf.train.SummaryWriter(join(root_path,model_name,'tf_log'), sess.graph)

saver = tf.train.Saver(max_to_keep=100)


# saver.restore(sess, '/home/hbenyounes/vqa/model2/model-8')

init = tf.initialize_all_variables()
sess.run(init)


sess.run(model.qemb_W.assign(emb))

n_parameters = sum( [np.prod(v.get_shape(),dtype='int') for v in tf.trainable_variables()])
hyperparams['n_parameters'] = n_parameters



Graph initialization


Exception ignored in: <bound method InteractiveSession.__del__ of <tensorflow.python.client.session.InteractiveSession object at 0x7f17bca69f60>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 171, in __del__
    self.close()
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 976, in close
    self._default_session.__exit__(None, None, None)
  File "/usr/lib/python3.4/contextlib.py", line 66, in __exit__
    next(self.gen)
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 3378, in get_controller
    % type(default))
AssertionError: Nesting violated for default stack of <class 'weakref'> objects


In [None]:
#Train
break_all = False
with tf.device('/gpu:0'):
    n_epochs = 50
    max_test_acc = -np.Inf
    patience = 3
    for epoch in range(n_epochs):
        epoch_loss = []
        times = 0.
        n_batches = train_set.N // batch_size
        train_batches = train_set.batch_gen(batch_size)
        for idx,batch in enumerate(train_batches):
            tic = time()
            if idx%(n_batches//10)==0:
                print("Epoch %d - %d/%d : loss = %1.4f - time = %1.3fs"%(epoch,idx,
                                                                         n_batches,np.mean(epoch_loss),
                                                                         times))
            V,Q,mask,ans = create_feed_dict(batch,max_q,Na,batch_size)
            _,l,l_s = sess.run([model_outputs['train_op'],
                                model_outputs['loss'],
                                model_outputs['loss_summary']], 
                               feed_dict={model_outputs['question']:Q,
                                          model_outputs['mask']:mask,
                                          model_outputs['answer']:ans,
                                          model_outputs['image']:V,
                                          model_outputs['keep_prob']:keep_prob})
            if np.isnan(l):
                break_all = True
            epoch_loss.append(l)
            writer.add_summary(l_s,idx+epoch*n_batches)
            times += time() - tic
            if break_all:
                print("Loss is nan at iteration %d" % (idx+n_batches*epoch))
                break
        if break_all:
            break
        with tf.device('/cpu:0'):
            test_acc = test((1+epoch)*n_batches)
            print("Epoch %d - Test accuracy = %1.3f" % (epoch+1, test_acc))
        if test_acc > max_test_acc:
            patience += 3
            saver.save(sess, join(root_path,model_name,'model'), global_step=epoch)
        max_test_acc = max(test_acc, max_test_acc)
        if epoch == patience:
            print("EARLY STOPPING")
            break



Epoch 0 - 0/2460 : loss = nan - time = 0.000s
Epoch 0 - 246/2460 : loss = 4.0994 - time = 50.390s
Epoch 0 - 492/2460 : loss = 3.5805 - time = 100.483s
Epoch 0 - 738/2460 : loss = 3.2929 - time = 150.895s
Epoch 0 - 984/2460 : loss = 3.1152 - time = 201.302s
Epoch 0 - 1230/2460 : loss = 2.9812 - time = 251.694s
Epoch 0 - 1476/2460 : loss = 2.8801 - time = 302.139s
Epoch 0 - 1722/2460 : loss = 2.7983 - time = 352.710s
Epoch 0 - 1968/2460 : loss = 2.7308 - time = 403.260s
Epoch 0 - 2214/2460 : loss = 2.6725 - time = 453.217s
Epoch 1 - Test accuracy = 0.456
Epoch 1 - 0/2460 : loss = nan - time = 0.000s
Epoch 1 - 246/2460 : loss = 1.9718 - time = 50.220s
Epoch 1 - 492/2460 : loss = 1.9913 - time = 101.068s
Epoch 1 - 738/2460 : loss = 1.9889 - time = 152.059s
Epoch 1 - 984/2460 : loss = 1.9819 - time = 202.988s
Epoch 1 - 1230/2460 : loss = 1.9782 - time = 253.145s
Epoch 1 - 1476/2460 : loss = 1.9729 - time = 303.799s
Epoch 1 - 1722/2460 : loss = 1.9697 - time = 354.177s
Epoch 1 - 1968/2460 : 

In [None]:
hyperparams['max_test_acc'] = max_test_acc
with open(join(root_path, model_name, 'hyperparams'),'w') as f:
    for h in hyperparams:
        f.write("%s = %s\n" % (h, str(hyperparams[h])))
    f.write('\n\nMaximal test accuracy = %1.4f' % max_test_acc)