In [1]:
import sys
import os
import json
import pickle as pkl
import re
from collections import Counter
import numpy as np

In [2]:
def process_sentence(sentence):
    periodStrip  = re.compile("(?!<=\d)(\.)(?!\d)")
    commaStrip   = re.compile("(\d)(\,)(\d)")
    punct        = [';', r"/", '[', ']', '"', '{', '}',
                    '(', ')', '=', '+', '\\', '_', '-',
                    '>', '<', '@', '`', ',', '?', '!']
    contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't", \
                    "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't", \
                    "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've", \
                    "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've", \
                    "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's", \
                    "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've", \
                    "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't", \
                    "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've", \
                    "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've", \
                    "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll", \
                    "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've", \
                    "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've", \
                    "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've", \
                    "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've", \
                    "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't", \
                    "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're", \
                    "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've", \
                    "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll", \
                    "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've", \
                    "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've", \
                    "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've", \
                    "youll": "you'll", "youre": "you're", "youve": "you've"}

    inText = sentence.replace('\n', ' ')
    inText = inText.replace('\t', ' ')
    inText = inText.strip()
    outText = inText
    for p in punct:
        if (p + ' ' in inText or ' ' + p in inText) or \
           (re.search(commaStrip, inText) != None):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    outText = periodStrip.sub("", outText, re.UNICODE)
    outText = outText.lower().split()
    for wordId, word in enumerate(outText):
        if word in contractions:
            outText[wordId] = contractions[word]
    outText = ' '.join(outText)
    return outText

def process_answer(answer):
    articles = ['a', 'an', 'the']
    manualMap = { 'none': '0', 'zero': '0', 'one': '1', 'two': '2', 'three':
                  '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7',
                  'eight': '8', 'nine': '9', 'ten': '10' }
    new_answer = process_sentence(answer)
    outText = []
    for word in new_answer.split():
        if word not in articles:
            word = manualMap.setdefault(word, word)
            outText.append(word)
    return ' '.join(outText)

In [3]:
f = open("./questions/train.json", "r")
f1 = open("/Users/kim/GitHub/data/annotations/train.json", "r")
file = json.load(f)
file1 = json.load(f1)
annotations = file1['annotations']
train_question_ids = []
train_image_ids = []
train_questions = []
train_answers = []
question_dict_count = dict()
answer_dict_count = dict()

# 形成qa：一个字典，整理出对应question_id的annotation
qa = {ann['question_id']: [] for ann in annotations}
for ann in annotations:
    qa[ann['question_id']] = ann

#获取image_id question_id
for idx, item in enumerate(file['questions']):
    train_question_ids.append(item['question_id'])
    train_image_ids.append(item['image_id'])
    
    #process question
    question = item['question']
    question = process_sentence(question)
    question = question.split()
    for word in question:
        question_dict_count[word] = question_dict_count.get(word, 0) + 1
    train_questions.append(question)
    answer = qa[item['question_id']]['answers']
    answer_new = [process_answer(ans['answer']) for ans in answer]
    ans_array = []
    for ans in answer:
        ans_array.append(ans['answer'])
    for word in answer_new:
        answer_dict_count[word] = answer_dict_count.get(word, 0) + 1
    train_answers.append(ans_array)
    if idx % 10000 == 0:
        print ('finished processing %d in train' %(idx))

finished processing 0 in train
finished processing 10000 in train
finished processing 20000 in train
finished processing 30000 in train
finished processing 40000 in train


In [4]:
# sort question dict
question_count = question_dict_count.values()
sorted_index = [count[0] for count in
                sorted(enumerate(question_count),
                       key = lambda x : x[1],
                       reverse=True)]
sorted_count = sorted(question_count, reverse=True)
question_key = list(question_dict_count.keys())
# 对question_key重新排序
question_key = [question_key[idx] for idx in sorted_index]
# add '<unk>' to the begining
question_key.insert(0, '<unk>')
# '<unk>' begins at 1, 0 is reserved for empty words
question_key = dict((key, idx + 1) for idx, key in enumerate(question_key))

In [5]:
k = 1000
# sort answer dict and get top k answers
del answer_dict_count['']
answer_count = answer_dict_count.values()
sorted_index = [count[0] for count in
                sorted(enumerate(answer_count),
                       key = lambda x : x[1],
                       reverse=True)]
sorted_count = sorted(answer_count, reverse=True)
answer_key = list(answer_dict_count.keys())
answer_key = [answer_key[idx] for idx in sorted_index]
answer_top_k = answer_key[:k]
answer_top_k = dict((key, idx) for idx, key in enumerate(answer_top_k))

In [6]:
# convert words to idx and remove some
train_question_idx = []
train_answer_idx = []
train_answer_counter = []
idx_to_remove = []
for idx, answer in enumerate(train_answers):
    question_idx = [question_key[word] for word in train_questions[idx]]
    train_question_idx.append(question_idx)
    answer_idx = [answer_top_k[ans] for ans in answer
                 if ans in answer_top_k]
    answer_counter = Counter(answer_idx)
    train_answer_counter.append(answer_counter)
    train_answer_idx.append(answer_idx)
    if not answer_idx:
        idx_to_remove.append(idx)
print ('%d out of %d, %f of the question in train are removed'\
    %(len(idx_to_remove), len(train_question_ids),
      len(idx_to_remove) / float(len(train_question_ids))))

2105 out of 44375, 0.047437 of the question in train are removed


In [7]:
# transform to array and delete all the empty answer
train_question_ids = np.array(train_question_ids)
train_image_ids = np.array(train_image_ids)
train_question_idx = np.array(train_question_idx)
train_answer_idx = np.array(train_answer_idx)
train_answer_counter = np.array(train_answer_counter)

train_question_ids = np.delete(train_question_ids, idx_to_remove)
train_image_ids = np.delete(train_image_ids, idx_to_remove)
train_question_idx = np.delete(train_question_idx, idx_to_remove)
train_answer_idx = np.delete(train_answer_idx, idx_to_remove)
train_answer_counter = np.delete(train_answer_counter, idx_to_remove)

  train_question_idx = np.array(train_question_idx)
  train_answer_idx = np.array(train_answer_idx)


In [8]:
import random
# reshuffle the train data
idx_shuffle = list(range(train_question_ids.shape[0]))
random.shuffle(idx_shuffle)
train_question_ids = train_question_ids[idx_shuffle]
train_image_ids = train_image_ids[idx_shuffle]
train_question_idx = train_question_idx[idx_shuffle]
train_answer_idx = train_answer_idx[idx_shuffle]
train_answer_counter = train_answer_counter[idx_shuffle]

# the most frequent as label
train_answer_label = [counter.most_common(1)[0][0]
                      for counter in train_answer_counter]
train_answer_label = np.array(train_answer_label)

# transform from counter to dict
train_answer_counter = [dict(counter) for counter in train_answer_counter]
train_answer_counter = np.array(train_answer_counter)

print ('finished processing train')

[Counter({16: 10}) Counter({0: 10}) Counter({12: 7, 51: 2, 940: 1}) ...
 Counter({2: 10}) Counter({1: 1, 613: 1}) Counter({7: 10})]
finished processing train
