In [1]:
import importlib
import pickle
from collections import Counter
from types import SimpleNamespace

import numpy as np
from keras.models import Sequential
from keras.models import Model
from keras.layers.core import Reshape, Activation, Dropout
from keras.layers import LSTM, Dense, Add, Concatenate
from keras.layers import Input

Using TensorFlow backend.


In [2]:
# Load the precomputed image feature vectors and preview the first five rows.
image_features = np.load('Files/img_vectors_sample.npy')
image_features[:5]

array([[0.04802181, 0.90796024, 1.1933948 , ..., 1.867591  , 0.4990241 ,
        1.9825053 ],
       [0.787118  , 0.24431148, 0.06212842, ..., 1.5899594 , 1.5548197 ,
        2.0951674 ],
       [1.1351358 , 0.643366  , 1.2675045 , ..., 0.44118398, 3.6098127 ,
        0.21706353],
       [2.510353  , 2.1102235 , 0.42487723, ..., 0.2930224 , 0.46952274,
        0.3113997 ],
       [1.4251724 , 0.42584297, 0.9633813 , ..., 1.9000303 , 1.1129313 ,
        0.12448684]], dtype=float32)

In [3]:
# Dimensions of the image feature array.
image_features.shape

(1913, 512)

In [4]:
# Load the pickled question table and preview it.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("Files/bert_small.pkl", "rb") as pickle_file:
    question_features = pickle.load(pickle_file)
question_features.head()

Unnamed: 0,image_id,question,question_id,vec
0,458752,What is this photo taken looking through?,458752000,"(((tf.Tensor(-0.1294253, shape=(), dtype=float..."
1,458752,What position is this man playing?,458752001,"(((tf.Tensor(-0.11989767, shape=(), dtype=floa..."
2,458752,What color is the players shirt?,458752002,"(((tf.Tensor(-0.085942656, shape=(), dtype=flo..."
3,458752,Is this man a professional baseball player?,458752003,"(((tf.Tensor(-0.11925976, shape=(), dtype=floa..."
4,262146,What color is the snow?,262146000,"(((tf.Tensor(-0.04078256, shape=(), dtype=floa..."


In [5]:
# Preview the first five entries of the 'vec' column.
question_features['vec'][:5]

0    (((tf.Tensor(-0.1294253, shape=(), dtype=float...
1    (((tf.Tensor(-0.11989767, shape=(), dtype=floa...
2    (((tf.Tensor(-0.085942656, shape=(), dtype=flo...
3    (((tf.Tensor(-0.11925976, shape=(), dtype=floa...
4    (((tf.Tensor(-0.04078256, shape=(), dtype=floa...
Name: vec, dtype: object

In [6]:
# Shape of the tensor stored at entry 0 of the 'vec' column.
question_features['vec'][0].shape

TensorShape([1, 10, 768])

In [7]:
# Length (number of characters) of every question string, as a NumPy array.
lens = list(map(len, question_features['question']))
question_length = np.array(lens)
question_length

array([41, 34, 32, ..., 37, 26, 59])

In [8]:
# Number of question lengths collected (one per question row).
question_length.shape

(9935,)

In [9]:
# Unique image ids in order of first appearance, each shifted down by one.
# dict.fromkeys de-duplicates in a single pass while preserving insertion order,
# replacing the quadratic "x not in list" scan.
# NOTE(review): the "- 1" shift is kept from the original cell; confirm it is
# meant for these ids rather than for 1-based row positions.
image_ids = np.array(list(dict.fromkeys(question_features['image_id']))) - 1
sorted(image_ids)

[8,
 24,
 29,
 33,
 35,
 48,
 60,
 63,
 70,
 71,
 76,
 77,
 80,
 88,
 91,
 93,
 108,
 109,
 112,
 126,
 137,
 141,
 143,
 148,
 153,
 164,
 193,
 200,
 246,
 249,
 259,
 306,
 307,
 308,
 311,
 314,
 320,
 321,
 325,
 331,
 367,
 369,
 381,
 383,
 388,
 393,
 403,
 418,
 430,
 435,
 437,
 442,
 449,
 470,
 507,
 509,
 513,
 528,
 530,
 531,
 539,
 541,
 561,
 571,
 574,
 580,
 583,
 594,
 596,
 604,
 611,
 619,
 624,
 628,
 633,
 642,
 649,
 655,
 658,
 672,
 680,
 689,
 713,
 715,
 721,
 722,
 734,
 753,
 761,
 780,
 789,
 794,
 796,
 824,
 827,
 852,
 881,
 896,
 901,
 907,
 908,
 912,
 924,
 926,
 933,
 940,
 954,
 964,
 981,
 983,
 995,
 1005,
 1013,
 1024,
 1035,
 1058,
 1071,
 1083,
 1089,
 1098,
 1101,
 1106,
 1107,
 1110,
 1121,
 1138,
 1154,
 1165,
 1167,
 1182,
 1199,
 1203,
 1215,
 1223,
 1231,
 1263,
 1270,
 1281,
 1294,
 1305,
 1306,
 1307,
 1310,
 1314,
 1329,
 1331,
 1354,
 1359,
 1365,
 1374,
 1380,
 1385,
 1389,
 1391,
 1396,
 1400,
 1402,
 1406,
 1407,
 1430,
 1452,
 

In [10]:
# Number of distinct image ids found in the question table.
image_ids.shape

(1913,)

In [11]:
def VQA_MODEL(image_feature_size=4096,
              word_feature_size=300,
              number_of_LSTM=3,
              number_of_hidden_units_LSTM=512,
              max_length_questions=30,
              number_of_dense_layers=3,
              number_of_hidden_units=1024,
              activation_function='tanh',
              dropout_pct=0.5,
              number_of_classes=1000):
    """Build the (uncompiled) two-branch VQA network.

    A stack of LSTMs encodes the question, the image feature vector is passed
    through unchanged, the two are concatenated and fed to a stack of
    Dense -> Activation -> Dropout blocks followed by a softmax classifier.

    The network is built with the Keras functional API because a Keras 2
    ``Sequential`` cannot merge two sub-models (``Concatenate`` is a layer that
    is called on a list of tensors and only accepts ``axis``).

    Every parameter defaults to the value that was previously hard-coded, so
    ``VQA_MODEL()`` builds the same architecture as before. The features loaded
    earlier in this notebook are 512-wide image vectors and (10, 768) question
    tensors; pass matching sizes when feeding them to the model.

    Parameters
    ----------
    image_feature_size : int
        Length of one image feature vector.
    word_feature_size : int
        Length of one word/token embedding.
    number_of_LSTM : int
        Number of stacked LSTM layers (at least 1).
    number_of_hidden_units_LSTM : int
        Units in each LSTM layer.
    max_length_questions : int
        Number of time steps in a question.
    number_of_dense_layers : int
        Number of Dense/Activation/Dropout blocks after the merge.
    number_of_hidden_units : int
        Units in each of those Dense layers.
    activation_function : str
        Activation applied after each hidden Dense layer.
    dropout_pct : float
        Dropout rate applied after each hidden activation.
    number_of_classes : int
        Size of the softmax output.

    Returns
    -------
    keras.models.Model
        Model taking ``[questions, image_features]`` as inputs.

    Raises
    ------
    ValueError
        If ``number_of_LSTM`` is smaller than 1.
    """
    if number_of_LSTM < 1:
        raise ValueError("number_of_LSTM must be at least 1")

    # Image branch: the Reshape is an identity pass-through kept from the original design.
    image_input = Input(shape=(image_feature_size,))
    image_branch = Reshape((image_feature_size,))(image_input)

    # Language branch: every LSTM but the last returns the full sequence so the
    # next LSTM receives 3-D input; the last one returns only its final state.
    question_input = Input(shape=(max_length_questions, word_feature_size))
    language_branch = question_input
    for layer_index in range(number_of_LSTM):
        is_last_lstm = layer_index == number_of_LSTM - 1
        language_branch = LSTM(number_of_hidden_units_LSTM,
                               return_sequences=not is_last_lstm)(language_branch)

    # Combined model: question encoding first, image features second.
    hidden = Concatenate(axis=1)([language_branch, image_branch])

    for _ in range(number_of_dense_layers):
        hidden = Dense(number_of_hidden_units, kernel_initializer='uniform')(hidden)
        hidden = Activation(activation_function)(hidden)
        hidden = Dropout(dropout_pct)(hidden)

    scores = Dense(number_of_classes)(hidden)
    probabilities = Activation('softmax')(scores)

    return Model(inputs=[question_input, image_input], outputs=probabilities)

In [12]:
# import argparse

# def get_arguments():

#     parser = argparse.ArgumentParser()
#     # model
#     parser.add_argument('-model'                  , type=str   , default='simple_mlp')
#     parser.add_argument('-num_hidden_units_mlp'   , type=int   , default=1024)
#     parser.add_argument('-num_hidden_units_lstm'  , type=int   , default=512)
#     parser.add_argument('-num_hidden_layers_mlp'  , type=int   , default=3)
#     parser.add_argument('-num_hidden_layers_lstm' , type=int   , default=1)
#     parser.add_argument('-dropout'                , type=float , default=0.5)
#     parser.add_argument('-activation_1'           , type=str   , default='tanh')
#     parser.add_argument('-activation_2'           , type=str   , default='relu')

#     # training
#     parser.add_argument('-seed'                   , type=int   , default=1337)
#     parser.add_argument('-optimizer'              , type=str   , default='rmsprop')
#     parser.add_argument('-nb_epoch'               , type=int   , default=300)
#     parser.add_argument('-nb_iter'                , type=int   , default=200000)
#     parser.add_argument('-model_save_interval'    , type=int   , default=19)
#     parser.add_argument('-batch_size'             , type=int   , default=128)

    # language features
#     parser.add_argument('-word_vector'            , type=str   , default='glove')
#     parser.add_argument('-word_emb_dim'           , type=int   , default=300)
#     parser.add_argument('-vocabulary_size'        , type=int   , default=12603)
#     parser.add_argument('-max_ques_length'        , type=int   , default=26)
#     parser.add_argument('-data_type'              , type=str   , default='TRAIN')

    # image features
#     parser.add_argument('-img_vec_dim'            , type=int   , default=2048)
#     parser.add_argument('-img_features'           , type=str   , default='resnet')
#     parser.add_argument('-img_normalize'          , type=int   , default=0)

    # evaluations
#     parser.add_argument('-nb_classes'             , type=int   , default=1000)
#     parser.add_argument('-class_activation'       , type=str   , default='softmax')
#     parser.add_argument('-loss'                   , type=str   , default='categorical_crossentropy')
#     parser.add_argument('-save_folder'            , type=str   , default='')

#     # data
#     parser.add_argument('-ans_file'               , type=str   , default='data/val_all_answers_dict.json')
#     parser.add_argument('-input_json'             , type=str   , default='data/data_prepro.json')
#     parser.add_argument('-input_img_h5'           , type=str   , default='data/data_img.h5')
#     parser.add_argument('-input_ques_h5'          , type=str   , default='data/data_prepro.h5')


#     return parser.parse_args()

In [14]:
# Notebook configuration. The names and values mirror the options of the
# commented-out get_arguments() parser in the previous cell.

# model
model = 'simple_mlp'
num_hidden_units_mlp = 1024
num_hidden_units_lstm = 512
num_hidden_layers_mlp = 3
num_hidden_layers_lstm = 1
dropout = 0.5
activation_1 = 'tanh'
activation_2 = 'relu'

# training
seed = 1337
optimizer = 'rmsprop'
nb_epoch = 300
nb_iter = 200000
model_save_interval = 19
batch_size = 128

# language features
word_vector = 'glove'
word_emb_dim = 300
vocabulary_size = 12603
max_ques_length = 26
data_type = 'TRAIN'

# image features
img_vec_dim = 2048
img_features = 'resnet'
img_normalize = 0

# evaluations
nb_classes = 1000
class_activation = 'softmax'
loss = 'categorical_crossentropy'
save_folder = ''

# data (disabled: these input paths are currently commented out)
# ans_file = 'data/val_all_answers_dict.json'
# input_json = 'data/data_prepro.json'
# input_img_h5 = 'data/data_img.h5'
# input_ques_h5 ='data/data_prepro.h5'

In [None]:
import numpy as np
# import h5py  as hf
import json

def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Counting is done in a single pass. When several elements share the highest
    count, the one that appears first in ``lst`` is returned, so the result is
    deterministic.

    Parameters
    ----------
    lst : iterable of hashable
        Items to count.

    Returns
    -------
    The most frequent item.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    counts = Counter(lst)
    # Counter keeps first-seen order and max() returns the first maximal key,
    # which gives the first-occurrence tie-break described above.
    return max(counts, key=counts.__getitem__)

def get_train_data(args, answers=None):
    """Assemble the training inputs from the features loaded earlier in the notebook.

    Reads the notebook-level ``question_features``, ``question_length`` and
    ``image_ids`` variables and loads the image feature matrix from disk.

    Parameters
    ----------
    args : namespace
        Only ``args.img_normalize`` is read; a truthy value L2-normalises every
        image feature row.
    answers : array-like of int, optional
        1-based answer class ids. The HDF5 source the answers used to come from
        is disabled in this notebook, so labels have to be supplied by the
        caller. When omitted, the returned dict has no ``'answers'`` key.

    Returns
    -------
    img_feature : numpy.ndarray
        Image feature matrix (normalised if requested).
    train_data : dict
        ``'question'``, ``'length_q'``, ``'img_list'`` and, when ``answers`` is
        given, ``'answers'`` (shifted to 0-based).
    """
    train_data = {}

    # load image feature (same file the notebook loads near the top)
    print('loading image feature...')
    img_feature = np.load('Files/img_vectors_sample.npy')

    # The 'vec' column holds one tensor per question; a pandas Series has no
    # .numpy(), so convert entry by entry and stack.
    # NOTE(review): assumes every tensor has the same shape - confirm.
    train_data['question'] = np.stack([vec.numpy() for vec in question_features['vec']])
    train_data['length_q'] = question_length
    # NOTE(review): image_ids holds one entry per distinct image, not one per
    # question - confirm this is the alignment the model inputs need.
    train_data['img_list'] = image_ids

    if answers is not None:
        # answers are 1-based; convert to 0-based class indices
        train_data['answers'] = np.asarray(answers) - 1

    if args.img_normalize:
        print('Normalizing image feature')
        # Per-row L2 norm; keepdims lets the division broadcast for any feature width.
        row_norms = np.sqrt(np.sum(np.multiply(img_feature, img_feature), axis=1, keepdims=True))
        row_norms[row_norms == 0] = 1  # leave all-zero rows unchanged instead of producing NaN
        img_feature = np.divide(img_feature, row_norms)

    return img_feature, train_data

def _load_test_data(args):
    """Load the test questions, image features and ground-truth answer indices.

    Shared implementation behind get_data_test and get_test_data.

    Returns
    -------
    img_feature : numpy.ndarray
    test_data : dict
    val_answers : numpy.ndarray of int32
        0-based answer index per test question.
    """
    # load json file
    print('loading json file...')
    with open(args.input_json) as data_file:
        dataset = dict(json.load(data_file))

    # load image feature (same file the notebook loads near the top)
    print('loading image feature...')
    img_feature = np.load('Files/img_vectors_sample.npy')

    # load h5 file
    # NOTE(review): h5py is not imported in this notebook (the import at the top
    # of this cell is commented out); it must be imported before calling this.
    print('loading h5 file...')
    test_data = {}
    with h5py.File(args.input_ques_h5, 'r') as hf:
        test_data['question'] = np.array(hf.get('ques_test'))
        test_data['length_q'] = np.array(hf.get('ques_length_test'))
        # stored image positions are 1-based; convert to 0-based
        test_data['img_list'] = np.array(hf.get('img_pos_test')) - 1
        # question id
        test_data['ques_id'] = np.array(hf.get('question_id_test'))
        # Read while the file is still open; previously this ran after the
        # `with` block had closed it.
        test_data['MC_ans_test'] = np.array(hf.get('MC_ans_test'))

    if args.img_normalize:
        print('Normalizing image feature')
        # Per-row L2 norm; keepdims lets the division broadcast for any feature width.
        row_norms = np.sqrt(np.sum(np.multiply(img_feature, img_feature), axis=1, keepdims=True))
        row_norms[row_norms == 0] = 1  # leave all-zero rows unchanged instead of producing NaN
        img_feature = np.divide(img_feature, row_norms)

    # make sure the ans_file is provided
    nb_data_test = len(test_data['question'])
    with open(args.ans_file) as answers_file:
        val_all_answers_dict = json.load(answers_file)
    val_answers = np.zeros(nb_data_test, dtype=np.int32)

    ans_to_ix = {v: k for k, v in dataset['ix_to_ans'].items()}
    count_of_not_found = 0
    for i in range(nb_data_test):
        qid = test_data['ques_id'][i]
        try:
            # vocabulary indices are 1-based; convert to 0-based
            val_ans_ix = int(ans_to_ix[most_common(val_all_answers_dict[str(qid)])]) - 1
        except KeyError:
            # Question id or its majority answer is not known: count it and use
            # the placeholder index 480 (value kept from the original code).
            count_of_not_found += 1
            val_ans_ix = 480
        val_answers[i] = val_ans_ix
    print("Beware: " + str(count_of_not_found) + " number of val answers are not really correct")

    return img_feature, test_data, val_answers


def get_data_test(args):
    """Return ``(img_feature, test_data)`` for the test split.

    ``args`` must provide ``input_json``, ``input_ques_h5``, ``ans_file`` and
    ``img_normalize``. The ground-truth answer indices are computed as well
    (with the same progress messages as before) but not returned here; use
    get_test_data to obtain them.
    """
    img_feature, test_data, _ = _load_test_data(args)
    return img_feature, test_data


def get_test_data(args):
    """Return ``(img_feature, test_data, val_answers)`` for the test split.

    Same loading as get_data_test, additionally returning the 0-based
    ground-truth answer index of every test question.
    """
    return _load_test_data(args)

In [None]:
import json
import numpy as np
from keras.utils import np_utils

# Collect the settings from the configuration cell into one namespace so the
# helpers below can read them as attributes (args.seed, args.batch_size, ...).
# The data-path options (ans_file, input_json, input_img_h5, input_ques_h5) are
# disabled in the configuration cell and therefore not included here.
# NOTE: `model` is rebound to the network further down, so re-run the
# configuration cell before re-running this one.
args = SimpleNamespace(
    model=model,
    num_hidden_units_mlp=num_hidden_units_mlp,
    num_hidden_units_lstm=num_hidden_units_lstm,
    num_hidden_layers_mlp=num_hidden_layers_mlp,
    num_hidden_layers_lstm=num_hidden_layers_lstm,
    dropout=dropout,
    activation_1=activation_1,
    activation_2=activation_2,
    seed=seed,
    optimizer=optimizer,
    nb_epoch=nb_epoch,
    nb_iter=nb_iter,
    model_save_interval=model_save_interval,
    batch_size=batch_size,
    word_vector=word_vector,
    word_emb_dim=word_emb_dim,
    vocabulary_size=vocabulary_size,
    max_ques_length=max_ques_length,
    data_type=data_type,
    img_vec_dim=img_vec_dim,
    img_features=img_features,
    img_normalize=img_normalize,
    nb_classes=nb_classes,
    class_activation=class_activation,
    loss=loss,
    save_folder=save_folder,
)
print(args)
np.random.seed(args.seed)

# Load the train and test inputs.
train_img_feature, train_data = get_train_data(args)
test_img_feature, test_data, val_answers = get_test_data(args)

# Inputs are ordered [question, image]; targets are one-hot answer classes.
train_X = [train_data[u'question'], train_img_feature]
train_Y = np_utils.to_categorical(train_data[u'answers'], args.nb_classes)

test_X = [test_data[u'question'], test_img_feature]
test_Y = np_utils.to_categorical(val_answers, args.nb_classes)

# Build the network from the module named by args.model and compile it.
model_name = importlib.import_module("models."+args.model)
model = model_name.model(args)
model.compile(loss='categorical_crossentropy', optimizer=args.optimizer, metrics=['accuracy'])
model.summary() # prints model layers with weights

# Keras 2 names this argument `epochs`; `nb_epoch` is the Keras 1 spelling.
history = model.fit(train_X, train_Y, batch_size=args.batch_size, epochs=args.nb_epoch, validation_data=(test_X, test_Y))

In [None]:
# Path of the saved weights file that get_VQA_model loads into the network.
VQA_weights_file_name   = 'models/VQA/VQA_MODEL_WEIGHTS.hdf5'

In [None]:
def get_VQA_model(VQA_weights_file_name):
    '''Build the VQA network, restore its weights from the given file, compile it and return it.'''

    # Imported here so the project module is only required when a model is requested.
    from models.VQA.VQA import VQA_MODEL as build_vqa_network

    network = build_vqa_network()
    network.load_weights(VQA_weights_file_name)
    network.compile(optimizer='rmsprop', loss='categorical_crossentropy')

    return network

In [None]:
# Build the compiled VQA model with the weights stored at VQA_weights_file_name.
vqa_model = get_VQA_model(VQA_weights_file_name)

In [None]:
# Run the model on the question and image inputs.
# NOTE(review): question_features is the table loaded from the pickle file
# (columns image_id, question, question_id, vec), not an array of vectors -
# confirm that predict() receives the intended inputs.
vqa_model.predict([question_features, image_features])