In [1]:
import numpy as np
import pandas as pd
import pickle
import json
import keras
from keras import layers, models
from keras.layers.core import Activation
from keras.utils import np_utils
from keras.models import Model, Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten
from keras.layers import Input, LSTM, Embedding, Dense, Add, Concatenate, Reshape, Dropout


import re
from collections import defaultdict
import importlib

Using TensorFlow backend.


In [2]:
# Load the pre-computed image feature vectors from disk.
image_features = np.load('Files/img_vectors_sample.npy')
# Preview only the first five rows rather than dumping the whole matrix.
image_features[:5]

array([[0.04802181, 0.90796024, 1.1933948 , ..., 1.867591  , 0.4990241 ,
        1.9825053 ],
       [0.787118  , 0.24431148, 0.06212842, ..., 1.5899594 , 1.5548197 ,
        2.0951674 ],
       [1.1351358 , 0.643366  , 1.2675045 , ..., 0.44118398, 3.6098127 ,
        0.21706353],
       [2.510353  , 2.1102235 , 0.42487723, ..., 0.2930224 , 0.46952274,
        0.3113997 ],
       [1.4251724 , 0.42584297, 0.9633813 , ..., 1.9000303 , 1.1129313 ,
        0.12448684]], dtype=float32)

In [3]:
image_features.shape

(1913, 512)

In [4]:
# Load the question table (image_id, question, question_id and a per-question
# embedding tensor in the 'vec' column).
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open("Files/bert_small.pkl", "rb") as pkl_file:
    questionsdf = pickle.load(pkl_file)
questionsdf.head()

Unnamed: 0,image_id,question,question_id,vec
0,458752,What is this photo taken looking through?,458752000,"(((tf.Tensor(-0.1294253, shape=(), dtype=float..."
1,458752,What position is this man playing?,458752001,"(((tf.Tensor(-0.11989767, shape=(), dtype=floa..."
2,458752,What color is the players shirt?,458752002,"(((tf.Tensor(-0.085942656, shape=(), dtype=flo..."
3,458752,Is this man a professional baseball player?,458752003,"(((tf.Tensor(-0.11925976, shape=(), dtype=floa..."
4,262146,What color is the snow?,262146000,"(((tf.Tensor(-0.04078256, shape=(), dtype=floa..."


In [5]:
# Pull the per-question embedding column out as a NumPy array.
# NOTE(review): this is a 1-D array with one tensor per question (the first
# element has shape (1, 10, 768)), not a single numeric (n_samples, dim) matrix.
question_features = questionsdf['vec'].values
# Inspect the first element to see what a single entry looks like.
question_features[0]

<tf.Tensor: shape=(1, 10, 768), dtype=float32, numpy=
array([[[-0.1294253 ,  0.09955358,  0.0038021 , ..., -0.34734687,
          0.16523981,  0.5111018 ],
        [ 0.25323898, -0.28621358,  0.04786473, ...,  0.32535607,
          0.22321571, -0.29962608],
        [ 0.0961668 , -0.5219595 ,  0.7254022 , ..., -0.3481344 ,
          0.28285167,  0.5112381 ],
        ...,
        [ 0.9590938 , -0.0078491 ,  0.4185476 , ...,  0.02356663,
          0.11205805, -0.59412855],
        [-0.05230472, -0.22541082, -0.6083301 , ..., -0.06796081,
          0.21848793, -0.18448268],
        [ 0.71103114,  0.07298414, -0.27923203, ...,  0.22893938,
         -0.55665475, -0.21895313]]], dtype=float32)>

In [6]:
question_features.shape

(9935,)

In [7]:
# Length of every question string (number of characters), as a NumPy array.
lens = [len(quest) for quest in questionsdf['question']]
question_length = np.array(lens)
question_length

array([41, 34, 32, ..., 37, 26, 59])

In [8]:
question_length.shape

(9935,)

In [9]:
# Distinct image ids in order of first appearance.
# Series.unique() is hash-based and order-preserving, so it replaces the
# quadratic `x not in list` scan with a single linear pass.
# NOTE(review): the ids in questionsdf look like dataset image ids rather than
# 1-based positions, so the "- 1" shift (kept from the original) may be
# unintended -- confirm before using image_ids as row indices.
image_ids = questionsdf['image_id'].unique() - 1
sorted(image_ids)[:5]

[8, 24, 29, 33, 35]

In [10]:
image_ids.shape

(1913,)

In [11]:
# Read the raw text of the VQA v2 training annotations file.
file=r"v2_Annotations_Train_mscoco/v2_mscoco_train2014_annotations.json"
# JSON text is UTF-8; naming the encoding avoids depending on the platform's
# locale default when decoding the file.
with open(file, 'r', encoding='utf-8') as myfile:
    data = myfile.read()

In [12]:
# Parse the raw JSON text and keep only the list of annotation records.
# `data` is rebound from str to list here, so the parse is guarded: re-running
# this cell after it has already executed would otherwise hand a list to
# json.loads and fail.
if isinstance(data, str):
    data = json.loads(data)['annotations']
data[0]

{'question_type': 'what is this',
 'multiple_choice_answer': 'net',
 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
  {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
  {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
 'image_id': 458752,
 'answer_type': 'other',
 'question_id': 458752000}

In [13]:
def make_vocab_answers(annotations, n_answers, vocab_path='vocab_answers.txt'):
    """Write the most frequent answers to a one-word-per-line vocabulary file.

    Every entry of ``annotation['answers']`` is counted for each annotation.
    Answers containing any character that is neither a word character nor
    whitespace are skipped. The file starts with ``'<unk>'`` followed by the
    ``n_answers - 1`` most frequent answers, most frequent first (ties keep
    first-seen order).

    Args:
        annotations: iterable of dicts, each with an ``'answers'`` list whose
            items are dicts holding the answer text under ``'answer'``.
        n_answers: total vocabulary size including ``'<unk>'``; must be >= 1.
        vocab_path: where the vocabulary file is written. Defaults to
            ``'vocab_answers.txt'`` in the current working directory.

    Raises:
        ValueError: if ``n_answers`` is smaller than 1. Previously such values
            produced a negative slice bound and silently wrote almost every
            answer to the file.
    """
    if n_answers < 1:
        raise ValueError('n_answers must be at least 1 (got %r)' % (n_answers,))

    counts = defaultdict(int)
    for annotation in annotations:
        for answer in annotation['answers']:
            word = answer['answer']
            # Drop answers with punctuation or other non-word, non-space characters.
            if re.search(r"[^\w\s]", word):
                continue
            counts[word] += 1

    # sorted() is stable, so equally frequent answers keep first-seen order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    assert('<unk>' not in ranked)
    top_answers = ['<unk>'] + ranked[:n_answers-1] # '-1' is due to '<unk>'

    with open(vocab_path, 'w') as f:
        f.writelines([w+'\n' for w in top_answers])

    print('Make vocabulary for answers')
    print('The number of total words of answers: %d' % len(ranked))
    print('Keep top %d answers into vocab' % n_answers)

In [14]:
make_vocab_answers(data,500) ## this saves a text file in the same folder

Make vocabulary for answers
The number of total words of answers: 135203
Keep top 500 answers into vocab


In [15]:
# Splits on runs of non-word characters. The capture group keeps those runs in
# the split result, so punctuation survives as tokens of its own.
# (The original line bound this pattern to the mangled name
# `question_featuresSENTENCE_SPLIT_REGEX`, leaving the name used by tokenize()
# undefined on a fresh kernel.)
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')


def tokenize(sentence):
    """Lower-case ``sentence`` and split it into word and punctuation tokens.

    Whitespace-only pieces are discarded and each remaining piece is stripped
    of surrounding whitespace. An empty string yields an empty list.
    """
    pieces = SENTENCE_SPLIT_REGEX.split(sentence.lower())
    return [piece.strip() for piece in pieces if len(piece.strip()) > 0]
def load_str_list(fname):
    """Return the lines of text file ``fname`` with surrounding whitespace stripped."""
    with open(fname) as handle:
        return [raw.strip() for raw in handle]


class VocabDict:
    """Word <-> index lookup built from a one-word-per-line vocabulary file."""

    def __init__(self, vocab_file):
        """Load ``vocab_file``; a word's index is its line position in the file."""
        words = load_str_list(vocab_file)
        self.word_list = words
        self.word2idx_dict = dict(zip(words, range(len(words))))
        self.vocab_size = len(words)
        # Index returned for unknown words, or None when the file has no '<unk>' line.
        self.unk2idx = self.word2idx_dict.get('<unk>')

    def idx2word(self, n_w):
        """Return the word stored at index ``n_w``."""
        return self.word_list[n_w]

    def word2idx(self, w):
        """Return the index of ``w``, falling back to the '<unk>' index.

        Raises:
            ValueError: if ``w`` is unknown and the vocabulary has no '<unk>'.
        """
        if w in self.word2idx_dict:
            return self.word2idx_dict[w]
        if self.unk2idx is None:
            raise ValueError('word %s not in dictionary (while dictionary does not contain <unk>)' % w)
        return self.unk2idx

    def tokenize_and_index(self, sentence):
        """Tokenize ``sentence`` and map every token to its vocabulary index."""
        return list(map(self.word2idx, tokenize(sentence)))
        

In [16]:
# Only the word -> index mapping is used from here on, so keep just that dict.
ans_vocab = VocabDict('vocab_answers.txt').word2idx_dict
ans_vocab

{'<unk>': 0,
 'no': 1,
 'yes': 2,
 '2': 3,
 '1': 4,
 'white': 5,
 '3': 6,
 'red': 7,
 'black': 8,
 'blue': 9,
 '0': 10,
 '4': 11,
 'green': 12,
 'brown': 13,
 'yellow': 14,
 '5': 15,
 'gray': 16,
 '6': 17,
 'baseball': 18,
 'nothing': 19,
 'frisbee': 20,
 'tennis': 21,
 'right': 22,
 'left': 23,
 'orange': 24,
 'wood': 25,
 'bathroom': 26,
 'pizza': 27,
 'none': 28,
 'pink': 29,
 'kitchen': 30,
 '7': 31,
 '8': 32,
 'cat': 33,
 'dog': 34,
 'skiing': 35,
 'grass': 36,
 'water': 37,
 'man': 38,
 'skateboarding': 39,
 'silver': 40,
 '10': 41,
 'kite': 42,
 'horse': 43,
 'black and white': 44,
 'skateboard': 45,
 'surfing': 46,
 'snow': 47,
 'giraffe': 48,
 'tan': 49,
 '9': 50,
 'wii': 51,
 'surfboard': 52,
 'living room': 53,
 'phone': 54,
 'cake': 55,
 'elephant': 56,
 'broccoli': 57,
 'apple': 58,
 'purple': 59,
 'stop': 60,
 '12': 61,
 'table': 62,
 'sunny': 63,
 'eating': 64,
 'woman': 65,
 'banana': 66,
 'soccer': 67,
 'food': 68,
 'unknown': 69,
 'hat': 70,
 'sheep': 71,
 'train': 72

In [17]:
# Collect the vocabulary indices (the values of the word -> index dict) into an array.
# NOTE(review): this holds one entry per vocabulary word, not one label per
# question, yet it is later stored as train_data['answers'] and one-hot encoded
# as the training target -- confirm this is the intended label source.
answers = np.array(list(ans_vocab.values()))
answers

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [18]:
# test_image_features = np.load("Files/test_img_vectors.npy")
# test_image_features[:5]

In [19]:
# test_questions = pd.read_csv('Files/questions_test.csv')
# test_questions.head()

In [20]:
# test_questions.shape

In [21]:
# lens = []
# for quest in test_questions['question']:
#     lens.append(len(quest))
# test_question_length = np.array(lens)
# test_question_length

In [22]:
def VQA_MODEL(image_feature_size=4096, word_feature_size=300, number_of_classes=50,
              number_of_dense_layers=3, number_of_hidden_units=1024,
              activation_function='tanh', dropout_pct=0.5):
    """Build the (uncompiled) two-input MLP used for visual question answering.

    A fixed-size question vector and a fixed-size image vector are concatenated,
    passed through a stack of Dense + Dropout layers, and classified with a
    softmax layer. There are no recurrent layers: each question must already be
    a single vector of length ``word_feature_size``.

    Every default reproduces the value that used to be hard-coded in the body,
    so ``VQA_MODEL()`` builds the same network as before. The sizes are now
    parameters because inputs and targets must match them exactly; for example,
    the image features loaded in this notebook are 512 wide and the targets are
    one-hot encoded over ``nb_classes`` columns.

    Args:
        image_feature_size: length of one image feature vector.
        word_feature_size: length of one question feature vector.
        number_of_classes: width of the softmax output layer.
        number_of_dense_layers: number of hidden Dense + Dropout pairs.
        number_of_hidden_units: units in each hidden Dense layer.
        activation_function: activation of the hidden Dense layers.
        dropout_pct: dropout rate applied after each hidden Dense layer.

    Returns:
        A Keras ``Model`` whose inputs are ordered ``[question, image]``.
    """
    # Image branch: an identity Reshape whose only job is to declare the input.
    model_image = Sequential()
    model_image.add(Reshape((image_feature_size,), input_shape=(image_feature_size,)))

    # Language branch: likewise an identity Reshape over the question vector.
    model_language = Sequential()
    model_language.add(Reshape((word_feature_size,), input_shape=(word_feature_size,)))

    # Combined model: concatenate question and image vectors, then the MLP head.
    x = Concatenate()([model_language.output, model_image.output])

    for _ in range(number_of_dense_layers):
        x = Dense(number_of_hidden_units, kernel_initializer='uniform', activation=activation_function)(x)
        x = Dropout(dropout_pct)(x)

    x = Dense(number_of_classes, activation='softmax')(x)

    return Model(inputs=[model_language.input, model_image.input], outputs=x)

In [23]:
# Hyper-parameters and run settings.
# NOTE(review): among the visible cells only optimizer, nb_epoch, batch_size,
# nb_classes, img_normalize and img_vec_dim are actually read; the remaining
# names are not referenced by any live code shown here.
# NOTE(review): `model` is later rebound to the Keras model returned by VQA_MODEL().
model = 'simple_mlp'
num_hidden_units_mlp = 1024
num_hidden_units_lstm = 512
num_hidden_layers_mlp = 3
num_hidden_layers_lstm = 1
dropout = 0.5
activation_1 = 'tanh'
activation_2 = 'relu'
seed = 1337
optimizer = 'rmsprop'
nb_epoch = 300
nb_iter = 200000
model_save_interval = 19
batch_size = 128
word_vector = 'glove'
word_emb_dim = 300
vocabulary_size = 12603
max_ques_length = 26
data_type = 'TRAIN'
# NOTE(review): the image feature matrix loaded earlier is 512 columns wide, not 2048.
img_vec_dim = 2048
img_features = 'resnet'
# Falsy value disables image-feature normalisation in get_train_data().
img_normalize = 0
# Number of columns used when one-hot encoding the training targets.
nb_classes = 500
class_activation = 'softmax'
loss = 'categorical_crossentropy'
save_folder = ''

In [24]:
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Occurrences are tallied in a single pass instead of calling ``lst.count``
    once per distinct element. When several elements share the highest count,
    the one that appears first in ``lst`` is returned, so the result no longer
    depends on set iteration order.

    Raises:
        ValueError: if ``lst`` is empty.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # max() returns the first maximal key; dicts iterate in insertion order.
    return max(counts, key=counts.get)

In [25]:
def get_train_data():
    """Load the image features and bundle the notebook-level training arrays.

    Reads the module-level names ``question_features``, ``question_length``,
    ``image_ids``, ``answers`` and ``img_normalize``.

    Returns:
        A tuple ``(img_feature, train_data)``: the image feature matrix loaded
        from ``Files/img_vectors_sample.npy`` (each row scaled to unit L2 norm
        when ``img_normalize`` is truthy) and a dict with the keys
        ``'question'``, ``'length_q'``, ``'img_list'`` and ``'answers'``.
    """
    train_data = {}
    print('loading image feature...')
    img_feature = np.load('Files/img_vectors_sample.npy')

    train_data['question'] = question_features
    train_data['length_q'] = question_length
    train_data['img_list'] = image_ids
    train_data['answers'] = answers

    if img_normalize:
        # Announce normalisation only when it actually runs.
        print('Normalizing image feature')
        # Per-row L2 normalisation. The previous code took a single norm over
        # the whole matrix and tiled it to img_vec_dim columns, which cannot be
        # broadcast against features of any other width.
        # Assumes img_feature is 2-D (n_images, feature_dim).
        norms = np.linalg.norm(img_feature, axis=1, keepdims=True)
        norms[norms == 0] = 1  # leave all-zero rows as zeros instead of NaN
        img_feature = img_feature / norms

    return img_feature, train_data

In [26]:
# def get_data_test(args):
#     dataset = {}
#     test_data = {}
#     # load json file
#     print('loading json file...')
#     with open(args.input_json) as data_file:
#         data = json.load(data_file)
#     for key in data.keys():
#         dataset[key] = data[key]

#     # load image feature
#     print('loading image feature...')
#     img_feature = np.load('img_vectors_sample.npy')
    
#     # load h5 file
#     print('loading h5 file...')
#     with h5py.File(args.input_ques_h5,'r') as hf:
#         # total number of training data is 215375
#         # question is (26, )
#         tem = hf.get('ques_test')
#         test_data['question'] = test_questions
#         # max length is 23
#         tem = hf.get('ques_length_test')
#         test_data['length_q'] = np.array(tem)
#         # total 82460 img
#         # -----1~82460-----
#         tem = hf.get('img_pos_test')
#         # convert into 0~82459
#         test_data['img_list'] = np.array(tem)-1
#         # quiestion id
#         tem = hf.get('question_id_test')
#         test_data['ques_id'] = np.array(tem)
#     # MC_answer_test
#     tem = hf.get('MC_ans_test')
#     test_data['MC_ans_test'] = np.array(tem)

#     print('Normalizing image feature')
#     if img_norm:
#         tem =  np.sqrt(np.sum(np.multiply(img_feature, img_feature)))
#         img_feature = np.divide(img_feature, np.tile(tem,(1,args.img_vec_dim)))


#     # make sure the ans_file is provided
#     nb_data_test = len(test_data[u'question'])
#     val_all_answers_dict = json.load(open(args.ans_file))
#     val_answers = np.zeros(nb_data_test, dtype=np.int32)

#     ans_to_ix = {v: k for k, v in dataset[u'ix_to_ans'].items()}
#     count_of_not_found = 0
#     for i in xrange(nb_data_test):
#         qid = test_data[u'ques_id'][i]
#         try : 
#             val_ans_ix =int(ans_to_ix[most_common(val_all_answers_dict[str(qid)])]) -1
#         except KeyError:
#             count_of_not_found += 1
#             val_ans_ix = 480
#         val_answers[i] = val_ans_ix
#     print("Beware: " + str(count_of_not_found) + " number of val answers are not really correct")

#     return img_feature, test_data

In [27]:
# Assemble the training inputs/targets, then build and compile the model.
train_img_feature, train_data = get_train_data()
# test_img_feature,  test_data, val_answers = get_test_data(args)

# Model inputs in the order VQA_MODEL declares them: [question, image].
# NOTE(review): the two inputs have different lengths here (9935 question
# entries vs. 1913 image rows) and the question entries are individual tensors
# rather than rows of one matrix -- confirm how samples are meant to be aligned.
train_X = [train_data[u'question'], train_img_feature]
# train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))

# One-hot encode the targets over nb_classes columns.
train_Y = np_utils.to_categorical(train_data[u'answers'], nb_classes)

# test_X = [test_data[u'question'], test_img_feature]
# test_Y = np_utils.to_categorical(val_answers, args.nb_classes)


# model_name = importlib.import_module("models."+model)

# NOTE(review): VQA_MODEL() with its defaults declares 300-wide question and
# 4096-wide image inputs and a 50-way output, while the data above is 512-wide
# image features and nb_classes-wide targets -- these need to agree.
model = VQA_MODEL()
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# model.build(train_img_feature.shape)
# model.summary() # prints model layers with weights

loading image feature...
Normalizing image feature


In [28]:
# Train the model on the assembled inputs.
# NOTE(review): as recorded in this cell's output, this call currently raises a
# ValueError because the question input does not have the (300,) shape the
# model declares.
history = model.fit(train_X, train_Y, batch_size = batch_size, epochs=nb_epoch)
# history = model.fit(train_X, train_Y, batch_size = args.batch_size, nb_epoch=args.nb_epoch, validation_data=(test_X, test_Y))

ValueError: Error when checking input: expected reshape_2_input to have shape (300,) but got array with shape (1,)

In [None]:
model.summary()

In [None]:
model.save_weights("./VQA_MODEL_WEIGHTS.hdf5")

In [None]:
VQA_weights_file_name   = "./VQA_MODEL_WEIGHTS.hdf5"

In [None]:
def get_VQA_model(VQA_weights_file_name):
    ''' Rebuild the VQA network, load saved weights into it, and return it compiled.

    Args:
        VQA_weights_file_name: path of a weights file saved from a model built
            by this notebook's ``VQA_MODEL``.

    Returns:
        The Keras model with weights loaded, compiled with categorical
        cross-entropy loss and the rmsprop optimizer.
    '''
    # Use the VQA_MODEL defined in this notebook: the weights file is written
    # from a model built by it, so pulling a separate definition from
    # `models.VQA.VQA` inside the function risked an architecture mismatch and
    # hid a dependency away from the import cell.
    vqa_model = VQA_MODEL()
    vqa_model.load_weights(VQA_weights_file_name)

    vqa_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    return vqa_model

In [None]:
vqa_model = get_VQA_model(VQA_weights_file_name)

In [None]:
vqa_model.predict([question_features, image_features])