In [None]:
# Download the pre-extracted image features (prerequisite for the cells below).

# Features with a fixed 36 boxes per image.
# train and val
!wget https://storage.googleapis.com/up-down-attention/trainval_36.zip
# test
!wget https://storage.googleapis.com/up-down-attention/test2014_36.zip

# Extract every downloaded archive in the current directory.
!unzip "*.zip"

In [None]:
# import

from easydict import EasyDict as edict
import json
import h5py
import sys
import csv
import base64
import random
import numpy as np
from tqdm import tqdm
import re
import itertools
from collections import Counter
import os
import mindspore.dataset as ds
from mindspore.mindrecord import FileWriter


In [None]:
# config 

# Central settings shared by every preprocessing cell in this notebook.
config = edict({
    "output_size": 36,  # max number of object proposals per image
    "output_features": 2048,  # number of features in each object proposal
    "preprocessed_train_path": 'image-train.h5',  # path where preprocessed features from the train split are saved to and loaded from
    "preprocessed_val_path": 'image-val.h5',  # path where preprocessed features from the val split are saved to and loaded from
    "preprocessed_test_path": 'image-test.h5',  # path where preprocessed features from the test split are saved to and loaded from
    "vocabulary_path": 'vocab/vocab.json',  # JSON file written by process_vocab() and read back by load_vocab()
    "train_num": 44375,  # number of rows allocated in the train HDF5 file by process_image_feature()
    "val_num": 21435,  # number of rows allocated in the val HDF5 file
    "test_num": 21435,  # number of rows allocated in the test HDF5 file
    # "origin_<split>_num" values are only used as progress-bar totals while
    # scanning the feature TSV files; presumably the row counts of those files
    # -- TODO confirm.
    "origin_train_num": 82783,
    "origin_val_num": 40504,
    "origin_test_num": 40775,
    "max_answers": 6000,  # top_k passed to extract_vocab() when building the answer vocabulary
    "max_q_length": 666,  # upper bound on the encoded question length; the effective length is min(this, vocab['q_max_len'])
    # training
    "epoch_size": 15,  # repeat count applied to the 'train' dataset in gen_mindspore_dataset()
})

In [None]:
# for subset adapt

def get_needed_imageid(dataset="test"):
    """Return the distinct image ids referenced by one split's question file.

    Reads ``./questions/<dataset>.json``, prints the path and the number of
    question entries, and collects the ``image_id`` of every entry.

    Args:
        dataset: split name used to build the question file path.

    Returns:
        A list of unique image ids (order is whatever the underlying set yields).
    """
    path = './questions/' + dataset + '.json'
    print(path)
    with open(path, 'r') as f:
        entries = json.load(f)['questions']
    print(len(entries))
    # A set comprehension removes the duplicates produced by images that
    # have several questions.
    return list({entry['image_id'] for entry in entries})

# Sanity check: for each split, show how many distinct images it needs and a
# few sample ids.
for split_name in ('test', 'val', 'train'):
    ss = get_needed_imageid(split_name)
    print(len(ss), ss[:5])

In [None]:
# process image

# Raise the csv module's per-field size cap to sys.maxsize. The TSV rows read
# by process_image_feature() carry base64-encoded 'features' and 'boxes'
# columns, which presumably exceed the default limit -- TODO confirm.
csv.field_size_limit(sys.maxsize)

def process_image_feature(dataset="test"):
    """Copy the image features needed by one split from the TSV dumps into HDF5.

    The output file ``config["preprocessed_<dataset>_path"]`` receives five
    datasets: ``features`` (num, output_features, output_size), ``boxes``
    (num, 4, output_size) and ``ids`` / ``widths`` / ``heights`` (num,), where
    ``num`` is ``config["<dataset>_num"]``. Every TSV file in the hard-coded
    list is scanned and only rows whose image id is returned by
    ``get_needed_imageid(dataset)`` are copied, in the order they are met.

    Args:
        dataset: split name selecting the config entries and the question file.

    Raises:
        KeyError: if the config has no path/num entry for ``dataset``.
        OSError: if a TSV file or the question file cannot be opened.
    """
    h5FilePath = config["preprocessed_%s_path" % (dataset)]
    print("h5 file path:", h5FilePath)

    num = config[('%s_num' % (dataset))]  # number of images in train or in val or in test
    print('item num:', num)

    features_shape = (
        num,
        config.output_features,
        config.output_size,
    )
    boxes_shape = (
        num,
        4,  # top, bottom, left, right
        config.output_size,
    )

    FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes', 'features']
    # A set gives O(1) membership tests; the check runs once per TSV row.
    needed_imageids = set(get_needed_imageid(dataset))
    # The TSV prefix is 'eval' but the config keys use 'val'
    # ("origin_val_num"), so translate before looking the total up.
    config_split = {'eval': 'val'}

    with h5py.File(h5FilePath, 'w', libver='latest') as fd:
        features = fd.create_dataset('features', shape=features_shape, dtype='float32')
        boxes = fd.create_dataset('boxes', shape=boxes_shape, dtype='float32')
        coco_ids = fd.create_dataset('ids', shape=(num,), dtype='int32')
        widths = fd.create_dataset('widths', shape=(num,), dtype='int32')
        heights = fd.create_dataset('heights', shape=(num,), dtype='int32')

        i = 0  # next free row in the HDF5 datasets

        for Tdataset in ['train', 'eval', 'test']:
            tsvFilePath = "%s2014_resnet101_faster_rcnn_genome_36.tsv" % (Tdataset)
            print("tsv file path:", tsvFilePath)
            # Only used as the progress-bar total; None (unknown) is acceptable.
            origin_num = config.get('origin_%s_num' % config_split.get(Tdataset, Tdataset))
            with open(tsvFilePath, "r") as tsvF:
                reader = csv.DictReader(tsvF, delimiter='\t', fieldnames=FIELDNAMES)
                for item in tqdm(reader, total=origin_num):
                    cur_id = int(item['image_id'])
                    if cur_id not in needed_imageids:
                        continue

                    coco_ids[i] = cur_id
                    widths[i] = int(item['image_w'])
                    heights[i] = int(item['image_h'])

                    # base64.decodestring was removed in Python 3.9;
                    # b64decode performs the same non-strict decoding.
                    buf = base64.b64decode(item['features'].encode('utf8'))
                    array = np.frombuffer(buf, dtype='float32')
                    array = array.reshape((-1, config.output_features)).transpose()  # 36*2048 -> T -> 2048*36
                    features[i, :, :array.shape[1]] = array

                    buf = base64.b64decode(item['boxes'].encode('utf8'))
                    array = np.frombuffer(buf, dtype='float32')
                    array = array.reshape((-1, 4)).transpose()  # 36*4 -> T -> 4*36
                    boxes[i, :, :array.shape[1]] = array

                    i = i + 1

        if i != num:
            # Rows that were never written keep their initial fill value.
            print('warning: wrote %d rows but %d were allocated' % (i, num))
    

In [None]:
# process vocab

# Runs of characters other than lowercase letters, digits and spaces;
# prepare_question() deletes them.
_special_chars = re.compile('[^a-z0-9 ]*')
# NOTE(review): `(?!<=\d)` is a negative LOOKAHEAD for the literal text "<="
# followed by a digit, not a lookbehind. It always succeeds right before a
# '.', so the pattern effectively matches "a period not followed by a digit".
# It looks like a typo for `(?<!\d)`; confirm before changing, because a fix
# would alter how answers are normalized.
_period_strip = re.compile(r'(?!<=\d)(\.)(?!\d)')
# A comma sitting between two digits (e.g. a thousands separator).
_comma_strip = re.compile(r'(\d)(,)(\d)')
# Punctuation characters, escaped so they can be placed in a character class.
_punctuation_chars = re.escape(r';/[]"{}()=+\_-><@`,?!')
# Any single punctuation character. NOTE(review): the already-escaped string is
# escaped a second time here; this appears redundant inside a character class
# -- verify before simplifying.
_punctuation = re.compile(r'([{}])'.format(re.escape(_punctuation_chars)))
# A punctuation character that is preceded by a space or followed by a space.
_punctuation_with_a_space = re.compile(r'(?<= )([{0}])|([{0}])(?= )'.format(_punctuation_chars))

def prepare_question(q):
    """Normalize one question string and split it into tokens.

    The text is lowercased, its final character is dropped (whatever it is),
    every character matched by ``_special_chars`` is removed, and the result is
    split on single spaces.

    Args:
        q: raw question text.

    Returns:
        A list of token strings (may contain empty strings when the text has
        consecutive spaces).
    """
    lowered = q.lower()
    without_last_char = lowered[:-1]
    cleaned = _special_chars.sub('', without_last_char)
    return cleaned.split(' ')

def prepare_questions(questions_json):
    """Yield the token list of every question in a loaded question file.

    Args:
        questions_json: mapping with a ``'questions'`` list whose entries each
            have a ``'question'`` string.

    Yields:
        The result of ``prepare_question`` for each entry, in file order.
    """
    # Tokenize and normalize questions
    texts = [entry['question'] for entry in questions_json['questions']]
    yield from map(prepare_question, texts)

def process_punctuation(s):
    """Normalize the punctuation of one answer string.

    Strings without any punctuation character are returned untouched.
    Otherwise the steps are, in order: drop punctuation adjacent to a space,
    drop all commas if a digit,digit comma is present, turn the remaining
    punctuation into spaces, remove periods matched by ``_period_strip`` and
    strip surrounding whitespace.

    Args:
        s: raw answer text.

    Returns:
        The normalized answer text.
    """
    if not _punctuation.search(s):
        return s

    cleaned = _punctuation_with_a_space.sub('', s)
    if _comma_strip.search(cleaned):
        cleaned = cleaned.replace(',', '')
    cleaned = _period_strip.sub('', _punctuation.sub(' ', cleaned))
    return cleaned.strip()
    
def prepare_answer(answer_list):
    """Apply ``process_punctuation`` to every answer string of one question.

    Args:
        answer_list: iterable of raw answer strings.

    Returns:
        A list with the normalized answers, in the same order.
    """
    return [process_punctuation(answer) for answer in answer_list]
    
def prepare_answers(answers_json):
    """Yield the normalized answer list of every annotation in a loaded file.

    Args:
        answers_json: mapping with an ``'annotations'`` list; each annotation
            has an ``'answers'`` list of dicts carrying an ``'answer'`` string.

    Yields:
        The result of ``prepare_answer`` for each annotation, in file order.
    """
    # Normalize answers
    per_question = [
        [entry['answer'] for entry in annotation['answers']]
        for annotation in answers_json['annotations']
    ]
    yield from map(prepare_answer, per_question)
        
def extract_vocab(iterable, top_k=None, start=0):
    """Turn an iterable of token lists into a token -> index vocabulary.

    Tokens are ranked by descending count. Because the whole sort key is
    reversed, tokens with equal counts are ordered in descending
    lexicographical order.

    Args:
        iterable: iterable of token lists.
        top_k: keep only the ``top_k`` most common tokens; any falsy value
            (``None`` or ``0``) keeps every token.
        start: index assigned to the highest-ranked token.

    Returns:
        A dict mapping each kept token to consecutive indices from ``start``.
    """
    counts = Counter(itertools.chain.from_iterable(iterable))
    if top_k:
        kept = [token for token, _ in counts.most_common(top_k)]
    else:
        kept = list(counts)
    kept.sort(key=lambda token: (counts[token], token), reverse=True)
    return {token: index for index, token in enumerate(kept, start=start)}
        
def process_questions():
    """Build the question vocabulary from the train, val and test question files.

    Returns:
        A ``(question_vocab, data_max_length)`` tuple: the vocabulary produced
        by ``extract_vocab`` with indices starting at 1, and the token count of
        the longest question.
    """
    questions = []
    for split in ('train', 'val', 'test'):
        path = './questions/' + split + '.json'
        print(path)
        with open(path, 'r') as f:
            questions += prepare_questions(json.load(f))

    question_vocab = extract_vocab(questions, start=1)
    data_max_length = max(len(tokens) for tokens in questions)
    return question_vocab, data_max_length

def process_answers():
    """Build the answer vocabulary from the train, val and test annotation files.

    Returns:
        The vocabulary produced by ``extract_vocab``, limited to the
        ``config.max_answers`` most common answers.
    """
    answers = []
    for split in ('train', 'val', 'test'):
        path = './annotations/' + split + '.json'
        print(path)
        with open(path, 'r') as f:
            answers += prepare_answers(json.load(f))

    return extract_vocab(answers, top_k=config.max_answers)
        
        
def process_vocab():
    """Build the question and answer vocabularies and save them as JSON.

    The result is written to ``config.vocabulary_path``; the parent directory
    is created first so the function works on a fresh checkout.

    Returns:
        A dict with keys ``'question'`` (token -> index), ``'answer'``
        (answer -> index) and ``'q_max_len'`` (token count of the longest
        question).
    """
    save_path = config.vocabulary_path
    question_vocab, max_len = process_questions()
    answer_vocab = process_answers()
    vocabs = {
        'question': question_vocab,
        'answer': answer_vocab,
        'q_max_len': max_len,
    }

    # open(..., 'w') does not create missing directories.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    with open(save_path, 'w') as fd:
        json.dump(vocabs, fd)

    return vocabs

# Build the vocabularies from the question/annotation files and write them to
# config.vocabulary_path.
process_vocab()

In [None]:
def encode_question(q, vocab, max_question_length):
    """Encode a question as a fixed-length vector of vocabulary indices.

    The question is tokenized with ``prepare_question``; tokens beyond
    ``max_question_length`` are dropped. Unused trailing positions hold
    ``len(vocab)`` and tokens missing from ``vocab`` map to ``len(vocab) - 1``.

    NOTE(review): process_questions() builds the question vocabulary with
    ``start=1``, so real indices run from 1 to ``len(vocab)``; both the fill
    value and the fallback therefore coincide with real token indices --
    confirm this is intended.

    Args:
        q: raw question text.
        vocab: mapping from token to index.
        max_question_length: length of the returned vector.

    Returns:
        A 1-D integer numpy array of length ``max_question_length``.
    """
    num_tokens = len(vocab)
    vec = np.full(max_question_length, num_tokens, dtype=int)
    tokens = prepare_question(q)[:max_question_length]
    for position, token in enumerate(tokens):
        vec[position] = vocab.get(token, num_tokens - 1)
    return vec


In [None]:
def encode_answer(answer_list, vocab):
    """Encode the answers given to one question.

    Each answer string is normalized with ``prepare_answer`` and mapped to its
    vocabulary index; answers missing from ``vocab`` map to ``len(vocab) - 1``.

    Args:
        answer_list: list of dicts, each with an ``'answer'`` string.
        vocab: mapping from normalized answer to index.

    Returns:
        A ``(a_vecs, a_counter, a_label)`` tuple: the index of every answer,
        a ``Counter`` of those indices, and the most common index.

    Raises:
        IndexError: if ``answer_list`` is empty.
    """
    fallback_index = len(vocab) - 1
    raw_answers = [entry['answer'] for entry in answer_list]
    a_vecs = [vocab.get(text, fallback_index) for text in prepare_answer(raw_answers)]
    a_counter = Counter(a_vecs)
    most_common_index, _count = a_counter.most_common(1)[0]
    a_label = most_common_index
    return a_vecs, a_counter, a_label

In [None]:
def load_vocab():
    """Load the vocabulary JSON, building it first when the file is missing.

    Returns:
        The object stored at ``config.vocabulary_path``.

    Raises:
        Exception: if the file decodes to ``None``.
    """
    vocab_file_path = config.vocabulary_path
    if not os.path.isfile(vocab_file_path):
        process_vocab()

    with open(vocab_file_path, 'r') as f:
        vocab = json.load(f)

    if vocab is None:
        raise Exception("error in loading vocab file")
    return vocab

vocab = load_vocab()
# Show a size summary instead of dumping every vocabulary entry into the
# notebook output: mappings are reported by their length, scalars as-is.
print({key: (len(value) if isinstance(value, dict) else value) for key, value in vocab.items()})

In [None]:
# provide generator callable
# refers to https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#Dataset
# mindspore docs is quite weird

class Generator():
    """Random-access source that exposes a list of records to GeneratorDataset.

    Each record is a dict; only its ``'question'`` and ``'answer_counter'``
    entries are returned, both wrapped with ``np.array``.

    NOTE(review): gen_mindspore_dataset() stores ``'answer_counter'`` as a
    dict, which ``np.array`` turns into a 0-d object array -- check that the
    consumer of the ``label`` column accepts that.
    """

    def __init__(self, input_list):
        # Kept by reference: later changes to the list are visible here.
        self.input_list = input_list

    def __getitem__(self, index):  # modify according to the network struct
        record = self.input_list[index]
        question = np.array(record['question'])
        answer_counter = np.array(record['answer_counter'])
        return question, answer_counter

    def __len__(self):
        return len(self.input_list)

In [None]:
# build train, val and test record

def _to_base64_json(value):
    """Serialize a JSON-compatible value to base64 text (a ``str``)."""
    return base64.b64encode(json.dumps(value).encode('utf-8')).decode('ascii')


def gen_mindspore_dataset(dataset="test", batch_size=64, save_mindrecord=False):
    """Build a batched MindSpore dataset of encoded questions and answers.

    Reads ``questions/<dataset>.json`` and ``annotations/<dataset>.json``,
    joins them on ``question_id``, encodes every pair with the saved
    vocabulary, shuffles the records and wraps them in a ``GeneratorDataset``.
    Questions without a matching annotation are reported and skipped.

    Sample record::

        {
            'question_id': 393226002,
            'image_id': 393226,
            'question': array([3, 14, 1, 113, 7, 1, 68, 1192, 4877, ...]),
            'answer': [489, 489, 489, 489, 489, 489, 489, 489, 489, 489],
            'answer_counter': {489: 10},
            'answer_label': 489
        }

    Args:
        dataset: split name used to locate the question/annotation files.
        batch_size: batch size; an incomplete final batch is dropped.
        save_mindrecord: when True, also write the records to
            ``mindrecord/<dataset>.mindrecord``. The ``question``, ``answer``
            and ``answer_counter`` fields are stored as base64-encoded JSON
            text; decode with ``json.loads(base64.b64decode(text))`` (the
            counter's keys come back as strings).

    Returns:
        The batched dataset; for ``dataset == 'train'`` it is repeated
        ``config.epoch_size`` times.
    """
    q_file_path = "questions/%s.json" % (dataset)
    a_file_path = "annotations/%s.json" % (dataset)

    # load questions and answers, keyed by question id
    with open(q_file_path, 'r') as f:
        q_dict = {item['question_id']: item for item in json.load(f)['questions']}

    with open(a_file_path, 'r') as f:
        a_dict = {item['question_id']: item for item in json.load(f)['annotations']}

    # load vocab
    vocab = load_vocab()
    max_question_length = min(config.max_q_length, vocab['q_max_len'])

    final_sources = []
    for qid, item in q_dict.items():
        q_vec = encode_question(item['question'], vocab['question'], max_question_length)

        # search relevant answer
        a_item = a_dict.get(qid)
        if a_item is None:
            print('cannot find qid %s in answer dict' % qid)
            continue

        a_vecs, a_counter, a_label = encode_answer(a_item['answers'], vocab['answer'])
        final_sources.append({
            'question_id': qid,
            'image_id': item['image_id'],
            'question': q_vec,
            'answer': a_vecs,
            'answer_counter': dict(a_counter),
            'answer_label': a_label,
        })

    # create mindspore dataset
    random.shuffle(final_sources)
    gen_dataset = ds.GeneratorDataset(
        source=Generator(input_list=final_sources),
        column_names=["data", "label"],
        shuffle=False
    )
    gen_dataset = gen_dataset.batch(batch_size=batch_size, drop_remainder=True)
    if dataset == 'train':
        gen_dataset = gen_dataset.repeat(config.epoch_size)

    # save to a mindrecord file
    if save_mindrecord:
        schema_json = {
            "question_id": {"type": "int32"},
            "image_id": {"type": "int32"},
            "question": {"type": "string"},
            "answer": {"type": "string"},
            "answer_counter": {"type": "string"},
            "answer_label": {"type": "int32"},
        }
        # Build separate records: the dataset above reads final_sources
        # lazily, so the originals must not be overwritten. b64encode only
        # accepts bytes-like input and returns bytes, hence the JSON step and
        # the decode to str required by the "string" schema type.
        records = [
            {
                'question_id': source['question_id'],
                'image_id': source['image_id'],
                'question': _to_base64_json(np.asarray(source['question']).tolist()),
                'answer': _to_base64_json(source['answer']),
                'answer_counter': _to_base64_json(source['answer_counter']),
                'answer_label': source['answer_label'],
            }
            for source in final_sources
        ]

        # index for accelerate data load
        indexes = ["question_id", "image_id", "answer_label"]

        os.makedirs('mindrecord', exist_ok=True)
        mr_file_path = 'mindrecord/%s.mindrecord' % (dataset)
        writer = FileWriter(file_name=mr_file_path, shard_num=4)
        writer.add_schema(schema_json, "data_schema")
        writer.add_index(indexes)
        writer.write_raw_data(records)
        writer.commit()

    return gen_dataset

In [None]:
# Smoke test of the dataset builder.
# WARNING: do not run this cell -- it will crash.
vocab_dir = 'vocab'
if not os.path.exists(vocab_dir):
    os.makedirs(vocab_dir)
dd = gen_mindspore_dataset()
print(dd)