# 1 - Create the json

We want an easy-to-read json file for the dataset containing the following keys:

* image_id : the id of the image (type:int)
* question : sentence (type:str)
* ground_truth : a list of true answers (type:[str])
* multiple_choice : a list of possible answers (type:[str])

Note that both questions and answers are to be expressed in the SAME id2word table ! We can then have the freedom to tie/untie the word embedding matrices in our model

In [1]:
import json
from os.path import join
from collections import Counter

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np


In [27]:
split = 'train'
for split in 'train val'.split():
    print(split)
    D = json.loads(open('datasets/vqa/%s/MultipleChoice_mscoco_%s2014_questions.json' %(split,split)).read())
    A = json.loads(open('datasets/vqa/%s/mscoco_%s2014_annotations.json' %(split,split)).read())
    dataset = open('datasets/vqa/%s/dataset.json'%split,'w')
    for a,d in zip(A['annotations'],D['questions']):
        datapoint = {}
        datapoint['image_id'] = a['image_id']
        datapoint['ground_truth'] = [x['answer'] for x in a['answers']]
        datapoint['multiple_choices'] = d['multiple_choices']
        datapoint['question'] = d['question']
        dataset.write(json.dumps(datapoint)+'\n')
    dataset.close()
    del A,D

train
val


# 2 - Create the vocabulary

We are going to take all the words appearing in the training set (question, ground truth answers and multiple choices), lowercase them and build a vocabulary out of it

In [2]:
from nltk.tokenize import wordpunct_tokenize
vocabulary = Counter()
N = len(open('datasets/vqa/train/dataset.json').read().split('\n'))
for i,l in enumerate(open('datasets/vqa/train/dataset.json')):
    if not i % (N//20):
        print("%d/%d" % (i,N))
    l = json.loads(l)
    sent = wordpunct_tokenize(l['question'].lower())
    for ans in l['multiple_choices'] + l['ground_truth']:
        sent += wordpunct_tokenize(ans)
    vocabulary.update(Counter(sent))
vocab_file = open('datasets/vqa/vocabulary.txt','w')
vocab_file.write('<unk>\n<s>\n</s>\n')
for w,c in vocabulary.most_common():
    try:
        vocab_file.write(w + '\n')
    except:
        print("Couldn't write %s" % w)
vocab_file.close()

0/248350
12417/248350
24834/248350
37251/248350
49668/248350
62085/248350
74502/248350
86919/248350
99336/248350
111753/248350
124170/248350
136587/248350
149004/248350
161421/248350
173838/248350
186255/248350
198672/248350
211089/248350
223506/248350
235923/248350
248340/248350
Couldn't write ’


# 3 - Integerify the text

Here, we will take the vocabulary created in #2 and the json dataset from #1 to make a new dataset_idxs.json where all the words are replaced by their indexes in the vocabulary

In [4]:
def integerify(sent,vocab):
    tokenized = wordpunct_tokenize(sent)
    output = []
    for w in tokenized:
        if w in vocab:
            output.append(vocab[w])
        else:
            output.append(vocab['<unk>'])
    return output
w2i = {}
for i,l in enumerate(open('datasets/vqa/vocabulary.txt','r')):
    w2i[l.strip()] = i

for split in 'train val'.split():
    print(split)
    dataset_idxs = open('datasets/vqa/%s/dataset_idxs.json'%split,'w')
    dataset = open('datasets/vqa/%s/dataset.json' % split,'r')
    for l in dataset:
        l = json.loads(l)
        l['question'] = integerify(l['question'], w2i)
        for key in "multiple_choices ground_truth".split():
            X = []
            for x in l[key]:
                integ = integerify(x,w2i)
                if len(integ)==0:
                    print("PROBLEM WITH LINE %d" % i)
                X.append(integ)
            l[key] = X
        dataset_idxs.write(json.dumps(l)+'\n')

    dataset.close()
    dataset_idxs.close()

train
val


In [23]:
def extract_triplets(D,A):
    questions, image_ids, answers, mcs =[],[],[],[]
    for d,a in zip(D['questions'],A['annotations']):
        assert d['image_id'] == a['image_id']
        questions.append(d['question'])
        image_ids.append(d['image_id'])
        answers.append(a['multiple_choice_answer'])
        mcs.append(' | '.join(d['multiple_choices']))
    print("%d questions (%d different)" % (len(questions), len(set(questions))))
    print("%d images (%d different)" % (len(image_ids), len(set(image_ids))))
    return questions, image_ids, answers, mcs

def extract_spatial_triplets(questions,image_ids,answers,mcs):
    words = open('spatial_words.txt','r').read().split()

    IQA = set()
    for q,i,a,mc in zip(questions, image_ids,answers,mcs):
        qwords = set(q.lower().strip('?').split())
        if len(qwords.intersection(words))>=1:
            IQA.add((i,q,a,mc))
    spatial_questions = []
    spatial_image_ids = []
    spatial_answers = []
    spatial_mcs = []
    for i,q,a,mc in IQA:
        spatial_questions.append(q)
        spatial_image_ids.append(i)
        spatial_answers.append(a)
        spatial_mcs.append(mc)
    print("%d questions (%d different)" % (len(spatial_questions), len(set(spatial_questions))))
    print("%d images (%d different)" % (len(spatial_image_ids), len(set(spatial_image_ids))))
    return spatial_questions, spatial_image_ids, spatial_answers, spatial_mcs

def write_triplets(spatial_questions,spatial_image_ids,spatial_answers,spatial_mcs,split):
    image_file = open('datasets/vqa/'+split+'/img_ids.txt','w',encoding='utf-8')
    question_file = open('datasets/vqa/'+split+'/questions.txt','w',encoding='utf-8')
    answer_file = open('datasets/vqa/'+split+'/answers.txt','w',encoding='utf-8')
    mc_file = open('datasets/vqa/'+split+'/mcs.txt','w',encoding='utf-8')
    for i,q,a,mc in zip(spatial_image_ids, spatial_questions, spatial_answers,spatial_mcs):
        image_file.write(str(i) + '\n')
        question_file.write(q.lower().strip('?') + '\n')
        mc_file.write(mc+'\n')
        answer_file.write(a +'\n')
    image_file.close()
    question_file.close()
    answer_file.close()
    mc_file.close()
    return 'done'

In [24]:
for split in 'train val'.split():
    D = json.loads(open('datasets/vqa/'+split+'/MultipleChoice_mscoco_'+split+'2014_questions.json').read())
    A = json.loads(open('datasets/vqa/'+split+'/mscoco_'+split+'2014_annotations.json').read())
    O = json.loads(open('datasets/vqa/'+split+'/OpenEnded_mscoco_'+split+'2014_questions.json').read())
    questions,image_ids,answers,mcs = extract_triplets(D,A)
    write_triplets(questions,image_ids,answers,mcs,split)

248349 questions (152050 different)
248349 images (82783 different)
121512 questions (81565 different)
121512 images (40504 different)
