In [1]:
from os.path import exists
from os import mkdir
from os.path import join
from PIL import Image
import json


import tensorflow as tf
import roi_pooling_op_grad
module = tf.load_op_library('/Programs/tensorflow/roi_pooling.so')
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from utils import load_vocab
from time import time
import datetime

In [2]:
def load_image(p):
    img = Image.open(p)
    if img.mode is not 'RGB':
        img = img.convert('RGB')
    treated_img = img.resize((448,448))
    treated_img = np.array(treated_img) / 255.0
    assert (0 <= treated_img).all() and (treated_img <= 1.0).all()
    return treated_img,img


def load_dataset(idxs_path,mc=False):
    dataset = []
    for l in open(idxs_path, 'r',encoding='latin1'):
        l = l.lower().strip().strip('.').strip('?')
        if mc:
            l = [[int(i) for i in ans.strip().split()] for ans in l.strip().split('|')]
        else:
            l = [int(i) for i in l.strip().split()]
        dataset.append(l)
    return dataset

class Dataset(object):
    def __init__(self, i_path, q_path, mc_path, a_path, bb_path, n_max=np.Inf):
        self.i_path = i_path
        self.q_path = q_path
        self.a_path = a_path
        self.mc_path = mc_path
        self.bb_path = bb_path
        print('Parse questions file')
        q_data = load_dataset(self.q_path)
        self.max_q = len(max(q_data, key=lambda x:len(x)))
        print('Parse answers file')
        a_data = load_dataset(self.a_path)
        print('Parse MC file')
        mc_data = load_dataset(self.mc_path,mc=True)
        self.max_mc = 0
        for mc in mc_data:
            for x in mc:
                self.max_mc = max(self.max_mc,len(x))
        print('Parse BB file')
        self.bboxes = {}
        for l in open(self.bb_path):
            if l.endswith('\n'):
                l = json.loads(l)
                self.bboxes.update(l)
        print("Construct data array")
        self.data = []
        self.max_bb = -np.Inf
        for i_id,q,mc,a in zip(open(i_path),q_data,mc_data,a_data):
            i_id = i_id.strip()
            if i_id not in self.bboxes:
                continue
            bb = self.bboxes[i_id]
            self.max_bb = max(self.max_bb,len(bb))
            datum = (i_id,bb,q,mc,a)
            self.data.append(datum)
        self.data = np.array(self.data, dtype=object)
        del q_data,a_data
        self.N = len(self.data)
        self.indexes = np.arange(self.N)
    
    def __iter__(self):
        return self
    
    def batch_gen(self,batch_size=64,shuffle=True):
        if shuffle:
            np.random.shuffle(self.indexes)
        n_batches = self.N // batch_size
        for batch_id in range(n_batches):
            begin = batch_id*batch_size
            end = min((batch_id+1)*batch_size, self.N)
            B = self.data[self.indexes[begin:end]]
            yield B

print("Loading train set")
train_set = Dataset("datasets/vqa/train/img_ids.txt",
                   "datasets/vqa/train/questions.idxs", 
                   "datasets/vqa/train/mcs.idxs", 
                   "datasets/vqa/train/answers.idxs",
                   "datasets/vqa/train/bounding_boxes.json")
val_set = Dataset("datasets/vqa/val/img_ids.txt",
                   "datasets/vqa/val/questions.idxs", 
                   "datasets/vqa/val/mcs.idxs", 
                   "datasets/vqa/val/answers.idxs",
                   "datasets/vqa/val/bounding_boxes.json")
max_bb = train_set.max_bb
max_q = train_set.max_q
max_mc = train_set.max_mc

Loading train set
Parse questions file
Parse answers file
Parse MC file
Parse BB file
Construct data array
Parse questions file
Parse answers file
Parse MC file
Parse BB file
Construct data array


In [3]:
D = 0
C = 0
train_gen = train_set.batch_gen()
for batch in train_gen:
    for batch_id,(i_id,bb,q,mc,a) in enumerate(batch):
        if sum([x==a for x in mc]) == 0:
            C += 1
        if min(len(x) for x in mc) == 0:
            D += 1
print(C,D)

0 3


In [3]:
q_i2w, q_w2i = load_vocab('datasets/vqa/train/questions.vocab')
a_i2w, a_w2i = load_vocab('datasets/vqa/train/answers.vocab')
Nq = len(q_i2w)
Na = len(a_i2w)

In [4]:
image_paths = {}
root_path = "/srv/data/datasets/mscoco/images/"

for split in 'train val'.split():
    image_ids_path = "datasets/vqa/"+split+"/img_ids.txt"
    image_ids = set([int(x.strip()) for x in open(image_ids_path).readlines()])
    print(split,len(image_ids))
    for x in image_ids:
        name = 'COCO_'+split+'2014_'+format(x, '012')+'.jpg'
        path = join(root_path,split+"2014",name)
        image_paths[str(x)] = path

train 82783
val 40504


In [5]:
tf.reset_default_graph()
# Read the model
with open("tensorflow-vgg16/vgg16.tfmodel",
          mode='rb') as f:
    fileContent = f.read()
graph_def = tf.GraphDef()
# Put it into my graph_def
graph_def.ParseFromString(fileContent)
graph = tf.get_default_graph()

weights_names = ["import/fc6/weight:0", 
                 "import/fc7/weight:0",
                 "import/fc8/weight:0"]
biases_names = ["import/fc6/bias:0", 
                "import/fc7/bias:0",
                "import/fc8/bias:0"]
fc_shapes = [4096,4096,1000]
layer_number = 2
#di = graph.get_tensor_by_name(weights_names[layer_number-1]).get_shape()[-1].value
def pool5_tofcX(input_tensor, layer_number=layer_number):
    flatten=tf.reshape(input_tensor,(-1,7*7*512))
    tmp=flatten
    for i in range(layer_number):
        tmp=tf.matmul(tmp, graph.get_tensor_by_name(weights_names[i]))
        tmp=tf.nn.bias_add(tmp, graph.get_tensor_by_name(biases_names[i]))
        tmp = tf.nn.relu(tmp)
    return tmp

In [6]:
batch_size = 16
di = fc_shapes[layer_number-1]
dv = 300
dq = 200
dh = 200
datt = 200
Nq = train_set.N
with tf.variable_scope('image'):
    tf.get_variable('W', shape=[di, dv],
                    initializer=tf.contrib.layers.xavier_initializer())
    tf.get_variable(name='b',
                    initializer=tf.zeros([dv]))

with tf.variable_scope('question'):
    tf.get_variable('W',
                    initializer=tf.random_uniform([Nq, dq], -0.1, 0.1))
    
with tf.variable_scope('attention'):
    tf.get_variable('Wimg',shape=[dv,datt],
                    initializer=tf.contrib.layers.xavier_initializer())
    tf.get_variable('Wstate',shape=[dh,datt],
                    initializer=tf.contrib.layers.xavier_initializer())
    
with tf.variable_scope('multiple_choice'):
    tf.get_variable('W',
                    initializer=tf.random_uniform([Na, dh], -0.1, 0.1))

rnn = tf.nn.rnn_cell.GRUCell(dh)

Pl = {}
Pl['images'] = tf.placeholder(tf.float32, 
                              [batch_size, 448, 448, 3],
                              name="images") #batch x width x height x channels
Pl['boxes'] = tf.placeholder(tf.float32, 
                             [None,5],
                             name = "boxes")
Pl['n_boxes'] = tf.placeholder(tf.float32,
                               [None],
                               name="n_boxes")
Pl['questions'] = tf.placeholder(tf.int32, 
                                 [batch_size, max_q],
                                 name="question")
Pl['question_mask'] = tf.placeholder(tf.int32,
                                     [max_q, None],
                                     name="question_mask")

Pl['mc'] = tf.placeholder(tf.int32,
                          [batch_size, 18,None], 
                          name="mc")
Pl['answers'] = tf.placeholder(tf.float32, 
                               [batch_size,18], 
                               name="answers")


def compute_attention(V,state):
    with tf.variable_scope('attention',reuse=True):
        Wimg = tf.get_variable('Wimg')
        Wstate = tf.get_variable('Wstate')
        Vatt = tf.transpose(tf.tanh(tf.reshape(tf.matmul(tf.reshape(V, 
                                                       (batch_size*max_bb,dv)),
                                             Wimg),
                                   (max_bb,batch_size,datt))),(1,0,2))
        Hatt = tf.expand_dims(tf.matmul(state,Wstate),1)
        att = tf.batch_matmul(Vatt,Hatt,adj_y=True)
        patt = tf.nn.softmax(att[:,:,0])
        Vpond = tf.mul(V,tf.expand_dims(patt,-1))
        Vt = tf.reduce_sum(Vpond,reduction_indices=1)
        return Vt

tf.import_graph_def(graph_def, 
                    input_map={'images':Pl['images']})

out_tensor = graph.get_tensor_by_name("import/conv5_3/Relu:0")
# Don't do your max pooling, but the roi_pooling
[out_pool,argmax] = module.roi_pool(out_tensor,
                                    Pl['boxes'],
                                    7,7,1.0/1) # out_pool.shape = N_Boxes x 7 x 7 x 512
boxes_emb = pool5_tofcX(out_pool,layer_number=layer_number)
with tf.variable_scope('image',reuse=True):
    W = tf.get_variable("W")
    b = tf.get_variable("b")
V = tf.tanh(tf.matmul(boxes_emb,W) + b)
V = tf.reshape(V,(batch_size,100,dv))

state = rnn.zero_state(batch_size, tf.float32)
states = []
q_out = []
with tf.variable_scope('question',reuse=True):
    W = tf.get_variable('W')
for j in range(max_q):
    question_emb = tf.nn.embedding_lookup(W, Pl['questions'][:,j])
    if j>0:
        tf.get_variable_scope().reuse_variables()
    Vt = compute_attention(V,state)
    output,state = rnn(tf.concat(1,[question_emb,Vt]), state)
    states.append(state)
    q_out.append(output)
q_out = tf.pack(q_out)
q_out = tf.reduce_sum(tf.mul(q_out, 
                             tf.to_float(tf.expand_dims(Pl['question_mask'],-1))),0)

mc_mask = tf.to_float(tf.not_equal(Pl['mc'],a_w2i['</s>']))
norm_mask = tf.expand_dims(tf.reduce_sum(mc_mask,reduction_indices=2),-1)
with tf.variable_scope('multiple_choice'):
    W = tf.get_variable('W')
    mc_emb = tf.nn.embedding_lookup(W, Pl['mc'])
    masked_mc_out = tf.mul(tf.expand_dims(mc_mask,-1),mc_emb)
    mc_out = tf.reduce_sum(masked_mc_out,reduction_indices=2)/norm_mask
    
out_scores = tf.batch_matmul(mc_out,tf.expand_dims(q_out,1),adj_y=True)[:,:,0]
out_probas = tf.nn.softmax(out_scores)

normalized_ans = Pl['answers'] / tf.expand_dims(tf.reduce_sum(Pl['answers'],reduction_indices=1),-1)
cross_entropy = normalized_ans*tf.log(out_probas) + (1.-normalized_ans)*tf.log(1.-out_probas)
cost = -tf.reduce_sum(cross_entropy)/batch_size

#optimizer = tf.train.AdamOptimizer()
optimizer = tf.train.GradientDescentOptimizer(0.01)
gvs = optimizer.compute_gradients(cost)
# with tf.device('/cpu:0'):
cost_s = tf.scalar_summary('train loss', cost, name='train_loss')
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad,var in gvs]
train_op = optimizer.apply_gradients(capped_gvs)

boxes:0 (?, 5)


In [7]:
model_name = "model1"
model_rootpath = "/hhome/hbenyounes/vqa/results/vqa/"
model_path = join(model_rootpath,model_name)
if not exists(model_path):
    mkdir(model_path)

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True,
                                        gpu_options=gpu_options))

writer = tf.train.SummaryWriter(join(model_path,'tf_log'), sess.graph)
saver = tf.train.Saver(max_to_keep=100)
init = tf.initialize_all_variables()
sess.run(init)

In [8]:
def create_feeddict(batch):
    boxes = np.zeros((batch_size*max_bb,5),dtype='int')
    images = np.zeros((batch_size,448,448,3))
    question_mask = np.zeros((max_q,batch_size),dtype="float")
    questions = np.zeros((batch_size,max_q), dtype='int')
    answers = np.zeros((batch_size,18),dtype="float")
    multiple_choice = []
    mc_len = 0
    for batch_id,(i_id,bb,q,mc,a) in enumerate(batch):
        # b = (i_id,bb,q,mc,a)
        p = image_paths[i_id]
        im_l,im = load_image(p)
        images[batch_id] = im_l
        box = np.array([[batch_id] + bb[i % len(bb)] for i in range(100)])
        box[:,[1,3]] = (box[:,[1,3]]*im_l.shape[1]/im.size[1])/16
        box[:,[2,4]] = (box[:,[2,4]]*im_l.shape[0]/im.size[0])/16 + box[:,[1,3]]
        boxes[batch_id*max_bb:(batch_id+1)*max_bb] = box
        questions[batch_id] = np.pad(q,(0,max_q-len(q)),'constant')
        question_mask[len(q),batch_id] = 1
        multiple_choice.append(mc)
        for i,x in enumerate(mc):
            if len(x) == 0:
                x = [a_w2i["<unk>"]]
                mc[i] = [a_w2i["<unk>"]]
            answers[batch_id,i] = 1*(a==x)
        mc_len = max(mc_len,len(max([x for x in mc], key=len)))
        
    multiple_choice = np.array([[np.pad(m,(0,mc_len+1-len(m)),
                                        'constant',
                                        constant_values=a_w2i['</s>']) for m in mc] for mc in multiple_choice])
    feed_dict = {Pl['images']: images,
                 Pl['boxes']:boxes, 
                 Pl['questions']:questions,
                 Pl['question_mask']:question_mask, 
                 Pl['answers']:answers,
                 Pl['mc']:multiple_choice}
    
    return feed_dict

def test():
    accuracy = []
    val_gen = val_set.batch_gen(batch_size)
    for idx,batch in enumerate(val_gen):
        print("\tTest: %d/%d" % (idx,val_set.N))
        feed_dict = create_feeddict(batch)
        y_pred = sess.run(out_probas,feed_dict=feed_dict).argmax(axis=1)
        accuracy.append(feed_dict[Pl['answers']][np.arange(batch_size),y_pred])
        if idx>10:
            break
    return np.mean(accuracy)

In [9]:
break_all = False
n_epochs = 100
n_batches = train_set.N//batch_size + 1
loss_value = 0
total_time = 0.
output_file = open(join(model_path,"output.txt"),'w')
for epoch in range(1,1+n_epochs):
    train_gen = train_set.batch_gen(batch_size,shuffle=True)
    epoch_loss = []
    for idx,batch in enumerate(train_gen):
        tic = time()
        step = idx + (epoch-1)*n_batches
        feed_dict = create_feeddict(batch)
        _,loss_value,loss_s = sess.run([train_op,cost,cost_s],feed_dict=feed_dict)
        writer.add_summary(loss_s,step)
        step_time = time() - tic
        total_time += step_time
        eta = total_time*(n_batches-idx)/(idx+1)
        print("Epoch %d/%d - batch %d/%d - loss = %1.3f - " \
        "step time = %1.1fs - ETA = %s" % (epoch,n_epochs,
                                          idx,n_batches,
                                          loss_value,step_time,
                                          str(datetime.timedelta(seconds=int(eta)))))
        epoch_loss.append(loss_value)
        if np.isnan(loss_value):
            print("Loss is nan, i get out")
            break_all = True
        if break_all:
            break
    if break_all:
        break
    print("test")
    test_acc = test()
    train_loss = np.mean(epoch_loss)
    output_file.write("Epoch %d - train loss = %1.3f - test accuracy = %1.3f\n" % (epoch,train_loss,test_acc))
    output_file.flush()
    saver.save(sess, join(model_path,'model'), global_step=epoch)
output_file.close()

Epoch 1/100 - batch 0/15522 - loss = 3.788 - step time = 5.5s - ETA = 23:42:04
Epoch 1/100 - batch 1/15522 - loss = 3.697 - step time = 1.5s - ETA = 15:11:23
Epoch 1/100 - batch 2/15522 - loss = 3.678 - step time = 1.5s - ETA = 12:17:44
Epoch 1/100 - batch 3/15522 - loss = 3.655 - step time = 1.5s - ETA = 10:51:53
Epoch 1/100 - batch 4/15522 - loss = 3.744 - step time = 1.4s - ETA = 9:56:16
Epoch 1/100 - batch 5/15522 - loss = 3.672 - step time = 1.5s - ETA = 9:19:54
Epoch 1/100 - batch 6/15522 - loss = 3.493 - step time = 1.4s - ETA = 8:50:37
Epoch 1/100 - batch 7/15522 - loss = 3.740 - step time = 1.4s - ETA = 8:29:59
Epoch 1/100 - batch 8/15522 - loss = 3.401 - step time = 1.4s - ETA = 8:14:51
Epoch 1/100 - batch 9/15522 - loss = 3.670 - step time = 1.5s - ETA = 8:05:18
Epoch 1/100 - batch 10/15522 - loss = 3.564 - step time = 1.5s - ETA = 7:56:35
Epoch 1/100 - batch 11/15522 - loss = 3.523 - step time = 1.4s - ETA = 7:47:46
Epoch 1/100 - batch 12/15522 - loss = 3.091 - step time = 

KeyboardInterrupt: 