In [None]:
# Prerequisite: download the pre-extracted image feature archives.

# Features with a fixed 36 boxes per image
# train and val splits
!wget https://storage.googleapis.com/up-down-attention/trainval_36.zip
# test split
!wget https://storage.googleapis.com/up-down-attention/test2014_36.zip

# Unpack every downloaded archive (the glob is quoted so unzip expands it, not the shell).
!unzip "*.zip"

In [2]:
# import

from easydict import EasyDict as edict
import json
import h5py
import sys
import csv
import base64
import numpy as np
from tqdm import tqdm
import re
import itertools
from collections import Counter

In [22]:
# config 

# Notebook-wide settings, exposed with attribute access through EasyDict.
config = edict(dict(
    # Layout of the per-image region features stored in the HDF5 files.
    output_size=36,  # max number of object proposals per image
    output_features=2048,  # number of features in each object proposal

    # HDF5 files the preprocessed features of each split are saved to and loaded from.
    preprocessed_train_path='image-train.h5',
    preprocessed_val_path='image-val.h5',
    preprocessed_test_path='image-test.h5',

    # JSON file the question/answer vocabularies are written to.
    vocabulary_path='vocab.json',

    # Row count allocated for each split's HDF5 datasets.
    train_num=44375,
    val_num=21435,
    test_num=21435,

    # Row count of each source feature TSV; only used as the progress-bar total.
    origin_train_num=82783,
    origin_val_num=40504,
    origin_test_num=40775,

    # Number of most frequent answers kept in the answer vocabulary.
    max_answers=3129,
))

In [3]:
# for subset adapt

def get_needed_imageid(dataset="test"):
    """Return the distinct image ids referenced by one split's question file.

    Reads ``./questions/<dataset>.json``, prints the file path and the number of
    questions it holds, and collects the ``image_id`` of every question.

    Args:
        dataset: split name used to build the question file path.

    Returns:
        A list of the unique image ids (in set iteration order, i.e. unsorted).
    """
    question_file = './questions/{}.json'.format(dataset)
    print(question_file)
    with open(question_file, 'r') as handle:
        entries = json.load(handle)['questions']
        print(len(entries))
        unique_ids = {entry['image_id'] for entry in entries}
    return list(unique_ids)

# Sanity check: for each split, show how many distinct images it references
# together with a small sample of their ids.
for split_name in ('test', 'val', 'train'):
    ss = get_needed_imageid(split_name)
    print(len(ss), ss[:5])

./questions/test.json
21435
15718 [229385, 393226, 229387, 229391, 458768]
./questions/val.json
21435
15682 [360449, 557059, 229387, 229388, 262161]
./questions/train.json
44375
32077 [131074, 131075, 393223, 393227, 393230]


In [None]:
# process image

# Raise the csv module's per-field size limit as far as it goes: every TSV row
# stores a whole image's base64-encoded feature matrix in a single field.
# NOTE(review): sys.maxsize may be rejected with OverflowError on platforms
# where a C long is 32 bits - confirm if this notebook must run there.
csv.field_size_limit(sys.maxsize)

def _decode_b64_matrix(encoded, row_width):
    """Decode a base64 text field into a float32 array of shape (row_width, n_rows)."""
    # base64.decodestring was removed in Python 3.9; b64decode is the supported call.
    raw = base64.b64decode(encoded.encode('utf8'))
    flat = np.frombuffer(raw, dtype='float32')
    return flat.reshape((-1, row_width)).transpose()


def process_image_feature(dataset="test"):
    """Copy the region features of the images a split needs into an HDF5 file.

    The needed image ids come from ``get_needed_imageid(dataset)``. The three
    feature TSV files are scanned in turn and every row whose ``image_id`` is
    needed is written to the next free row of the output datasets
    (``features``, ``boxes``, ``ids``, ``widths``, ``heights``).

    Args:
        dataset: split name; selects the config entries
            ``preprocessed_<dataset>_path`` (output file, overwritten) and
            ``<dataset>_num`` (number of rows allocated).

    Returns:
        None. Rows past the last matched image are never written, so they keep
        whatever fill value h5py gives a fresh dataset.

    Raises:
        KeyError: if the config has no entries for ``dataset``.
        OSError: if a TSV file or the output file cannot be opened.
    """
    h5FilePath = config["preprocessed_%s_path" % dataset]
    print("h5 file path:", h5FilePath)

    # NOTE(review): this value sizes the image axis of every dataset - confirm it
    # is meant to be an image count and not a question count.
    num = config['%s_num' % dataset]
    print('item num:', num)

    features_shape = (num, config.output_features, config.output_size)
    # 4 coordinates per box; the original comment lists them as
    # top, bottom, left, right - TODO confirm the order against the TSV producer.
    boxes_shape = (num, 4, config.output_size)

    FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes', 'features']

    # (name used in the TSV file name, name used in the "origin_<split>_num" key).
    # The config defines origin_train_num / origin_val_num / origin_test_num, so
    # building the key from 'eval' raised KeyError.
    tsv_splits = (('train', 'train'), ('eval', 'val'), ('test', 'test'))

    with h5py.File(h5FilePath, 'w', libver='latest') as fd:
        features = fd.create_dataset('features', shape=features_shape, dtype='float32')
        boxes = fd.create_dataset('boxes', shape=boxes_shape, dtype='float32')
        coco_ids = fd.create_dataset('ids', shape=(num,), dtype='int32')
        widths = fd.create_dataset('widths', shape=(num,), dtype='int32')
        heights = fd.create_dataset('heights', shape=(num,), dtype='int32')

        # A set keeps the per-row membership test constant time; the TSVs hold
        # far more rows than the subset needs.
        needed_imageids = set(get_needed_imageid(dataset))
        i = 0  # next free row in the output datasets

        for Tdataset, config_split in tsv_splits:
            tsvFilePath = "%s2014_resnet101_faster_rcnn_genome_36.tsv" % (Tdataset)
            print("tsv file path:", tsvFilePath)
            origin_num = config['origin_%s_num' % config_split]
            with open(tsvFilePath, "r") as tsvF:
                reader = csv.DictReader(tsvF, delimiter='\t', fieldnames=FIELDNAMES)
                for item in tqdm(reader, total=origin_num):
                    cur_id = int(item['image_id'])
                    if cur_id not in needed_imageids:
                        continue

                    coco_ids[i] = cur_id
                    widths[i] = int(item['image_w'])
                    heights[i] = int(item['image_h'])

                    # (n_boxes, 2048) -> transposed to (2048, n_boxes)
                    array = _decode_b64_matrix(item['features'], config.output_features)
                    features[i, :, :array.shape[1]] = array

                    # (n_boxes, 4) -> transposed to (4, n_boxes)
                    array = _decode_b64_matrix(item['boxes'], 4)
                    boxes[i, :, :array.shape[1]] = array

                    i += 1
    

In [23]:
# process vocab

# Runs (possibly empty) of characters other than lowercase letters, digits and spaces.
_special_chars = re.compile('[^a-z0-9 ]*')
# A period that is not followed by a digit.
# NOTE(review): `(?!<=\d)` is a negative lookahead for the literal text "<=" plus a
# digit, not a lookbehind, so it never rejects a match. It looks like a typo for
# `(?<!\d)`; changing it would alter answer normalization, so confirm before fixing.
_period_strip = re.compile(r'(?!<=\d)(\.)(?!\d)')
# A comma sitting between two digits (e.g. a thousands separator).
_comma_strip = re.compile(r'(\d)(,)(\d)')
# Punctuation characters, escaped so they can be placed inside a character class.
_punctuation_chars = re.escape(r';/[]"{}()=+\_-><@`,?!')
# Any single punctuation character.
# NOTE(review): the characters are escaped a second time here; inside `[...]` this
# appears to select the same set (backslash is already listed) - confirm.
_punctuation = re.compile(r'([{}])'.format(re.escape(_punctuation_chars)))
# A punctuation character directly preceded by a space or directly followed by one.
_punctuation_with_a_space = re.compile(r'(?<= )([{0}])|([{0}])(?= )'.format(_punctuation_chars))

def prepare_questions(questions_json):
    """Yield each question as a list of normalized tokens.

    Every question is lower-cased, loses its final character (the trailing
    question mark is assumed), has all characters matched by ``_special_chars``
    removed, and is split on single spaces.

    Args:
        questions_json: mapping with a ``'questions'`` list whose items carry
            a ``'question'`` string.

    Yields:
        list[str]: the tokens of one question, in file order.
    """
    texts = [entry['question'] for entry in questions_json['questions']]
    for text in texts:
        without_last_char = text.lower()[:-1]
        cleaned = _special_chars.sub('', without_last_char)
        yield cleaned.split(' ')

def prepare_answers(answers_json):
    """Yield, for every annotation, its list of punctuation-normalized answers.

    Args:
        answers_json: mapping with an ``'annotations'`` list; each annotation
            holds an ``'answers'`` list of dicts with an ``'answer'`` string.

    Yields:
        list[str]: the normalized answers of one annotation, in file order.
    """
    def _normalize(text):
        # Answers without any punctuation character are returned untouched
        # (not even stripped).
        if not _punctuation.search(text):
            return text
        # Drop punctuation that touches a space.
        cleaned = _punctuation_with_a_space.sub('', text)
        # If a digit,digit comma is present, remove every comma in the answer.
        if _comma_strip.search(cleaned):
            cleaned = cleaned.replace(',', '')
        # Remaining punctuation becomes a space, then matched periods are dropped.
        cleaned = _period_strip.sub('', _punctuation.sub(' ', cleaned))
        return cleaned.strip()

    per_annotation = [
        [vote['answer'] for vote in annotation['answers']]
        for annotation in answers_json['annotations']
    ]
    for votes in per_annotation:
        yield [_normalize(vote) for vote in votes]
        
def extract_vocab(iterable, top_k=None, start=0):
    """Build a token -> index vocabulary from an iterable of token lists.

    Args:
        iterable: iterable of token sequences.
        top_k: when truthy, keep only the ``top_k`` most frequent tokens;
            otherwise keep every token.
        start: index assigned to the first token.

    Returns:
        dict mapping each kept token to consecutive indices beginning at
        ``start``. Tokens are ordered by descending count; tokens with equal
        counts are ordered by descending token value, because the reversed
        sort applies to the whole ``(count, token)`` key.
    """
    frequencies = Counter(itertools.chain.from_iterable(iterable))
    if top_k:
        kept = [token for token, _ in frequencies.most_common(top_k)]
    else:
        kept = list(frequencies)
    kept.sort(key=lambda token: (frequencies[token], token), reverse=True)
    return dict(zip(kept, itertools.count(start)))
        
def process_questions(dataset="test"):
    """Build the question vocabulary of one split.

    Loads ``./questions/<dataset>.json``, tokenizes every question with
    ``prepare_questions``, prints the path and the first tokenized question,
    and indexes the tokens starting at 1.

    Args:
        dataset: split name used to build the question file path.

    Returns:
        dict mapping each question token to its index.
    """
    question_file = './questions/%s.json' % dataset
    print(question_file)
    with open(question_file, 'r') as handle:
        payload = json.load(handle)
    tokenized = list(prepare_questions(payload))
    print(tokenized[:1])
    return extract_vocab(tokenized, start=1)

def process_answers(dataset="test"):
    """Build the answer vocabulary of one split.

    Loads ``./annotations/<dataset>.json``, normalizes every answer with
    ``prepare_answers``, prints the path and the first annotation's answers,
    and keeps the ``config.max_answers`` most frequent answers.

    Args:
        dataset: split name used to build the annotation file path.

    Returns:
        dict mapping each kept answer to its index (starting at 0).
    """
    annotation_file = './annotations/%s.json' % dataset
    print(annotation_file)
    with open(annotation_file, 'r') as handle:
        payload = json.load(handle)
    normalized = list(prepare_answers(payload))
    print(normalized[:1])
    return extract_vocab(normalized, top_k=config.max_answers)
        
        
def process_vocab(dataset="test"):
    """Build the question and answer vocabularies of a split and save them as JSON.

    The result is written to ``config.vocabulary_path`` as an object with the
    keys ``'question'`` and ``'answer'``; an existing file is overwritten.

    Args:
        dataset: split name forwarded to ``process_questions`` and
            ``process_answers``.
    """
    # Dict entries are evaluated top to bottom: questions first, then answers.
    vocabs = {
        'question': process_questions(dataset),
        'answer': process_answers(dataset),
    }
    with open(config.vocabulary_path, 'w') as out:
        out.write(json.dumps(vocabs))

    
# Build and save the vocabularies using the function's default split ("test").
# NOTE(review): vocabularies are usually derived from the training split - confirm
# that building them from the test questions/annotations is intended.
process_vocab()

./questions/test.json
[['what', 'does', 'the', 'truck', 'on', 'the', 'left', 'sell']]
./annotations/test.json
[['green', 'green', 'green', 'silver', 'green', 'green', 'green', 'green', 'green', 'green']]
