In [1]:
from os import listdir, mkdir
from os.path import join, exists
from time import time
import json


import tensorflow as tf
import numpy as np


import skimage
from skimage.transform import resize
from scipy.misc import imread
from collections import Counter
from time import time

from utils import load_dataset, load_vocab,read_features, Dataset, load_emb_matrix

In [15]:
class VisLSTM(object):
    """Image + question recurrent model for visual question answering.

    Written against the legacy TensorFlow 0.x API. The projected image
    feature is fed to the recurrent cell as the first input, followed by the
    embedded question tokens. A mask-weighted sum of the per-step outputs is
    mapped to answer logits by a linear layer.
    """

    # Supported values of hyperparams['cell'] -> class name in tf.nn.rnn_cell.
    # 'rnn' maps to BasicRNNCell: RNNCell itself is the abstract base class.
    _CELL_CLASSES = {'rnn': 'BasicRNNCell',
                     'lstm': 'LSTMCell',
                     'gru': 'GRUCell'}

    def __init__(self, hyperparams):
        """Create the model variables and the recurrent cell.

        Args:
            hyperparams: dict with keys 'dh' (recurrent units), 'dq' (question
                embedding size), 'da' (stored, unused in this class), 'di'
                (image feature size), 'max_q' (question length), 'Nq'
                (question vocabulary size), 'Na' (number of answers), 'cell'
                ('rnn', 'lstm' or 'gru'), 'batch_size' and
                'trainable_embeddings'.

        Raises:
            KeyError: if a required hyperparameter is missing.
            NotImplementedError: if 'cell' is not a supported cell type.
        """
        self.dh = hyperparams['dh']
        self.dq = hyperparams['dq']
        self.da = hyperparams['da']
        self.di = hyperparams['di']
        self.max_q = hyperparams['max_q']
        self.Nq = hyperparams['Nq']
        self.Na = hyperparams['Na']
        self.cell = hyperparams['cell']
        self.batch_size = hyperparams['batch_size']
        self.trainable_embeddings = hyperparams['trainable_embeddings']

        # Fail fast, before any variable is added to the default graph.
        if self.cell not in self._CELL_CLASSES:
            raise NotImplementedError(
                "Unsupported cell type %r (expected one of %s)"
                % (self.cell, sorted(self._CELL_CLASSES)))

        # Question embedding table, pinned to the CPU.
        with tf.device('/cpu:0'):
            self.qemb_W = tf.get_variable('qemb_w',
                                          initializer=tf.random_uniform([self.Nq, self.dq], -0.1, 0.1),
                                          trainable = self.trainable_embeddings)
        # Output layer: recurrent output (dh) -> answer logits (Na).
        self.aemb_W = tf.get_variable(name='aemb_w',
                                      initializer=tf.random_uniform([self.dh, self.Na], -0.1, 0.1))
        self.aemb_b = tf.get_variable(name='aemb_b',
                                      initializer=tf.zeros([self.Na]))
        # Image projection: image feature (di) -> question embedding space (dq).
        self.Wi = tf.get_variable(name='Wi', shape=[self.di, self.dq],
                                  initializer=tf.contrib.layers.xavier_initializer())
        self.bi = tf.get_variable(name='bi',
                                      initializer=tf.zeros([self.dq]))

        cell_class = getattr(tf.nn.rnn_cell, self._CELL_CLASSES[self.cell])
        self.recur = cell_class(self.dh)

    def build_model(self):
        """Build placeholders, forward pass, loss and training op.

        Returns:
            dict mapping 'image', 'question', 'mask', 'answer' and 'keep_prob'
            to the input placeholders, and 'train_op', 'loss', 'answer_pred'
            and 'loss_summary' to the corresponding graph nodes.
        """
        p_image = tf.placeholder(tf.float32,
                                [None, self.di],
                                 name="p_image")

        p_keep_prob = tf.placeholder(tf.float32, name='p_keep_prob')

        p_question = tf.placeholder(tf.int32,
                                    [None, self.max_q],
                                    name="p_question")
        p_answer = tf.placeholder(tf.float32,
                                  [None,self.Na],
                                  name="p_answer")
        p_question_mask = tf.placeholder(tf.int32,
                                         [self.max_q+1, None, None],
                                         name="p_question_mask")

        image_proj = tf.nn.xw_plus_b(p_image,self.Wi,self.bi,name='image_proj')
        image_proj_drp = tf.nn.dropout(image_proj, p_keep_prob)

        # The initial state fixes the batch dimension to self.batch_size.
        state = self.recur.zero_state(self.batch_size, tf.float32)
        outputs = []
        for j in range(self.max_q+1):
            if j==0:
                # Step 0 consumes the projected image.
                output,state = self.recur(image_proj_drp, state)
            else:
                # Step j consumes the embedding of question token j-1.
                with tf.device('/cpu:0'):
                    question_emb = tf.nn.embedding_lookup(self.qemb_W, p_question[:,j-1])
                    question_emb_drp = tf.nn.dropout(question_emb, p_keep_prob)
                # Share the cell weights created at step 0.
                tf.get_variable_scope().reuse_variables()
                output,state = self.recur(question_emb_drp, state)
            outputs.append(output)

        output = tf.pack(outputs) # one entry per step: (max_q+1, batch_size, ...)
        # Mask-weighted sum over the step axis -> one vector per example.
        output_final = tf.reduce_sum(tf.mul(output, tf.to_float(p_question_mask)),0)

        answer_logits = tf.nn.xw_plus_b(output_final,
                                        self.aemb_W,
                                        self.aemb_b) # (batch_size, Na)
        answer_pred = tf.argmax(answer_logits,1)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(answer_logits, p_answer) # (batch_size, )
        loss = tf.reduce_mean(cross_entropy)
        train_op = tf.train.AdamOptimizer().minimize(loss)
        loss_summary = tf.scalar_summary("loss",loss,name="loss")
        output = {'train_op':train_op,
                 'loss':loss,
                 'question':p_question,
                 'mask':p_question_mask,
                 'answer':p_answer,
                  'keep_prob':p_keep_prob,
                 'answer_pred':answer_pred,
                 'loss_summary':loss_summary,
                 'image':p_image}
        return output

In [33]:
def create_feed_dict(batch,max_q,Na,batch_size):
    """Pack a list of (image, question, answer) examples into dense arrays.

    Rows beyond ``len(batch)`` stay all-zero, so a short batch is padded up to
    ``batch_size``. Questions longer than ``max_q`` are truncated to their
    first ``max_q`` tokens instead of raising.

    Args:
        batch: non-empty sequence of ``(im, s, a)`` tuples: image feature
            vector, question token indices, answer index.
        max_q: number of question token slots per example.
        Na: number of answer classes (width of the answer array).
        batch_size: number of rows in every returned array.

    Returns:
        V: float32 array ``(batch_size, len(im))`` of image features.
        Q: int32 array ``(batch_size, max_q)`` of zero-padded token indices.
        mask: int32 array ``(max_q+1, batch_size, 1)`` holding a single 1 per
            example at row ``len(question)``.
        ans: int32 array ``(batch_size, Na)`` with 1 at the answer index.
    """
    V = np.zeros((batch_size, len(batch[0][0])), 'float32')
    Q = np.zeros((batch_size, max_q), 'int32')
    mask = np.zeros((max_q+1,batch_size), 'int32')
    ans = np.zeros((batch_size,Na),'int32')

    for i,(im,s,a) in enumerate(batch):
        V[i] = im
        # Clip overlong questions; Q is already zero-filled so no padding is needed.
        tokens = s[:max_q]
        Q[i,:len(tokens)] = tokens
        # Row index = number of tokens kept for this example.
        mask[len(tokens),i] = 1
        ans[i,a] = 1
    # Trailing singleton axis lets the mask broadcast over the feature axis.
    mask = mask[:,:,None]
    return V,Q,mask,ans

def test(step,verbose=None):
    """Compute accuracy on ``test_set`` and log it to the summary writer.

    Relies on the notebook globals ``test_set``, ``batch_size``, ``max_q``,
    ``Na``, ``sess``, ``M`` and ``writer``.

    Args:
        step: global step under which the accuracy summary is recorded.
        verbose: when truthy, print the running accuracy every 20 batches.

    Returns:
        Fraction of test examples whose predicted answer index equals the
        argmax of the target row (NaN if the test set yields no batches).
    """
    acc = []
    test_batches = test_set.batch_gen(batch_size)
    for idx,batch in enumerate(test_batches):
        # Skip the running report until at least one batch has been scored.
        if verbose and idx%20==0 and acc:
            print("%d - accuracy = %1.3f"%(idx, np.mean(acc)))
        V,Q,mask,ans = create_feed_dict(batch,max_q,Na,batch_size)
        a_pred = sess.run(M['answer_pred'],
                          feed_dict={M['question']:Q,
                                     M['mask']:mask,
                                     M['answer']:ans,
                                     M['image']:V,
                                     # Evaluation must not drop units: keep everything.
                                     M['keep_prob']:1.0})
        equals = np.equal(ans.argmax(axis=1),a_pred)
        # Only the first len(batch) rows are real examples; the rest is padding.
        acc.extend(int(e) for e in equals[:len(batch)])
    accuracy = float(np.mean(acc)) if acc else float('nan')
    # Build the summary protobuf directly so that no new op is added to the
    # graph each time the function is called.
    acc_s = tf.Summary(value=[tf.Summary.Value(tag="acc_tf", simple_value=accuracy)])
    writer.add_summary(acc_s,step)
    return accuracy

In [None]:
# Root directory of the dataset; every data file below is resolved against it
# so the cell does not depend on the kernel's working directory.
dataset_rootpath = "/home/hbenyounes/vqa/datasets/spatialgenome/"
train_dir = join(dataset_rootpath, 'train')
test_dir = join(dataset_rootpath, 'test')

# Each split is described by four files: image features, image ids,
# question token indices and answer indices.
train_set = Dataset(join(train_dir, 'images.feat'),
                    join(train_dir, 'img_ids.txt'),
                    join(train_dir, 'questions.idxs'),
                    join(train_dir, 'answers.idxs'))
test_set = Dataset(join(test_dir, 'images.feat'),
                   join(test_dir, 'img_ids.txt'),
                   join(test_dir, 'questions.idxs'),
                   join(test_dir, 'answers.idxs'))

# Vocabularies come from the training split only.
q_i2w, q_w2i = load_vocab(join(train_dir, 'questions.vocab'))
a_i2w, a_w2i = load_vocab(join(train_dir, 'answers.vocab'))

In [48]:
# Raw training answers, loaded separately to inspect how many are distinct.
dataset = load_dataset(
    join(dataset_rootpath, 'train', 'answers.idxs')
)

In [53]:
# (total number of entries, number of distinct entries by string form)
len(dataset), len({str(d) for d in dataset})

(105874, 30206)

In [35]:
print("Graph initialization")
model_name = "model1"
root_path = "/home/hbenyounes/vqa/results/spatialgenome/"
embedding_path = '/home/hbenyounes/vqa/GoogleNews.model'
# Checkpoints and TensorBoard logs for this run go under root_path/model_name.
if not exists(join(root_path, model_name)):
    mkdir(join(root_path, model_name))

Nq = len(q_i2w)
Na = len(a_i2w)
max_q = train_set.max_q
vector_size = 200

H = {"dq":vector_size,
     "da":vector_size,
     'dh':300,
     "di":4096,
     "Nq":Nq,
     "max_q":max_q,
     "Na":Na,
     "batch_size":64,
     "cell":"lstm",
     "trainable_embeddings":True,
     "keep_prob":0.5,
     "word2vec":False}

# When the cell is re-run, close the session of the previous run before the
# graph is reset; replacing a live InteractiveSession otherwise triggers the
# "Nesting violated for default stack" error when the old one is collected.
if 'sess' in globals():
    sess.close()

tf.reset_default_graph()
model = VisLSTM(H)
M = model.build_model()
if H['word2vec']:
    q_i2w, q_w2i = load_vocab(join(dataset_rootpath,'train/questions.vocab'))
    print("Load word2Vec")
    embeddings = {}
    # Text format: one word per line followed by its vector components.
    with open(embedding_path,encoding='utf-8') as emb_file:
        for line in emb_file:
            fields = line.strip().split()
            embeddings[fields[0]] = [float(x) for x in fields[1:]]
    emb,c = load_emb_matrix(q_i2w, embeddings, vector_size)
    # The raw dictionary is no longer needed once the matrix is built.
    del embeddings

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options,
                                                   intra_op_parallelism_threads=1))

saver = tf.train.Saver(max_to_keep=100)
writer = tf.train.SummaryWriter(join(root_path,model_name,'tf_log'), sess.graph)

init = tf.initialize_all_variables()
sess.run(init)

if H['word2vec']:
    # Overwrite the randomly initialized embedding table with the loaded vectors.
    sess.run(model.qemb_W.assign(emb))
# Total number of trainable scalars, recorded alongside the hyperparameters.
n_parameters = sum(int(np.prod(v.get_shape().as_list())) for v in tf.trainable_variables())
H['n_parameters'] = n_parameters



Graph initialization


Exception ignored in: <bound method InteractiveSession.__del__ of <tensorflow.python.client.session.InteractiveSession object at 0x7fd55b5303c8>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 171, in __del__
    self.close()
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py", line 976, in close
    self._default_session.__exit__(None, None, None)
  File "/usr/lib/python3.4/contextlib.py", line 66, in __exit__
    next(self.gen)
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 3378, in get_controller
    % type(default))
AssertionError: Nesting violated for default stack of <class 'weakref'> objects


In [36]:
#Train
# Runs up to n_epochs passes over train_set, evaluates on the test set after
# every epoch, checkpoints on improvement and stops early once the epoch
# index reaches `patience`. Training aborts as soon as the loss becomes NaN.
break_all = False
batch_size = H['batch_size']
keep_prob = H['keep_prob']
with tf.device('/gpu:0'):
    n_epochs = 50
    max_test_acc = -np.Inf
    patience = 3
    for epoch in range(n_epochs):
        epoch_loss = []
        times = 0.
        n_batches = train_set.N // batch_size
        # Report roughly ten times per epoch; never 0, which would make the
        # modulo below raise ZeroDivisionError on small datasets.
        log_every = max(1, n_batches // 10)
        train_batches = train_set.batch_gen(batch_size)
        for idx,batch in enumerate(train_batches):
            tic = time()
            if idx%log_every==0:
                # No loss has been recorded yet at the first batch of an epoch.
                running_loss = np.mean(epoch_loss) if epoch_loss else float('nan')
                print("Epoch %d - %d/%d : loss = %1.4f - time = %1.3fs"%(epoch,idx,
                                                                         n_batches,running_loss,
                                                                         times))
            V,Q,mask,ans = create_feed_dict(batch,max_q,Na,batch_size)
            _,l,l_s = sess.run([M['train_op'],
                                M['loss'],
                                M['loss_summary']],
                               feed_dict={M['question']:Q,
                                          M['mask']:mask,
                                          M['answer']:ans,
                                          M['image']:V,
                                          M['keep_prob']:keep_prob})
            if np.isnan(l):
                break_all = True
            epoch_loss.append(l)
            writer.add_summary(l_s,idx+epoch*n_batches)
            times += time() - tic
            if break_all:
                print("Loss is nan at iteration %d" % (idx+n_batches*epoch))
                break
        if break_all:
            break
        with tf.device('/cpu:0'):
            test_acc = test((1+epoch)*n_batches)
            print("Epoch %d - Test accuracy = %1.3f" % (epoch+1, test_acc))
        if test_acc > max_test_acc:
            # NOTE(review): patience grows by 3 on every improvement rather
            # than being reset to epoch + 3 -- confirm this is intended.
            patience += 3
            saver.save(sess, join(root_path,model_name,'model'), global_step=epoch)
        max_test_acc = max(test_acc, max_test_acc)
        if epoch >= patience:
            print("EARLY STOPPING")
            break



Epoch 0 - 0/715 : loss = nan - time = 0.000s
Epoch 0 - 71/715 : loss = 5.5153 - time = 8.599s
Epoch 0 - 142/715 : loss = 6.0173 - time = 15.734s
Epoch 0 - 213/715 : loss = 6.9102 - time = 22.977s
Epoch 0 - 284/715 : loss = 7.9457 - time = 30.405s
Epoch 0 - 355/715 : loss = 8.9517 - time = 37.800s
Epoch 0 - 426/715 : loss = 9.9843 - time = 44.910s


KeyboardInterrupt: 

In [None]:
# Write the run configuration and best test accuracy next to the checkpoints.
# The hyperparameter dict of this notebook is `H`.
H['max_test_acc'] = max_test_acc
with open(join(root_path, model_name, 'hyperparams'),'w') as f:
    # Sorted keys give the same file layout on every run.
    for h in sorted(H):
        f.write("%s = %s\n" % (h, str(H[h])))
    f.write('\n\nMaximal test accuracy = %1.4f' % max_test_acc)