In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import os, argparse
import cv2, spacy, numpy as np
from keras.models import model_from_json
from keras.optimizers import SGD
from sklearn.externals import joblib
from keras import backend as K
#import keras.backend.tensorflow_backend as K
# Use channels-first tensors: get_image_features transposes each image to
# (channels, height, width) before handing it to the VGG model.
K.set_image_data_format('channels_first')

from keras.utils.vis_utils import plot_model
#K.set_image_dim_ordering('th')

Using TensorFlow backend.


In [2]:
# File names of the pre-trained artifacts this notebook loads. They are bare
# names, so they are resolved relative to the current working directory.
VQA_model_file_name      = 'VQA_MODEL.json'  # not referenced by the cells visible in this notebook
VQA_weights_file_name   = 'VQA_MODEL_WEIGHTS.hdf5'  # passed to get_VQA_model
label_encoder_file_name  = 'FULL_labelencoder_trainval.pkl'  # loaded with joblib.load
CNN_weights_file_name   = 'vgg16_weights.h5'  # passed to get_image_model


In [3]:
def get_image_model(CNN_weights_file_name):
    ''' Builds the VGG 16 image network from a weights file and compiles it.

    Args:
        CNN_weights_file_name: path of the VGG 16 weights file; it is
            forwarded unchanged to VGG_16.

    Returns:
        The model produced by VGG_16, compiled with an SGD optimizer and a
        categorical cross-entropy loss.

    The VGG module must be importable (the original note places VGG.py
    inside models/CNN). '''
    from VGG import VGG_16

    # Per the original note: a standard VGG 16 without its last two layers.
    vgg = VGG_16(CNN_weights_file_name)

    # The loss is the usual choice for this kind of task; "adam" is a
    # possible alternative optimizer to experiment with.
    optimizer = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    vgg.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return vgg

In [4]:
# Build the VGG model once at module level; get_image_features reads this
# global name when it calls model_vgg.predict.
model_vgg = get_image_model(CNN_weights_file_name)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [5]:
def get_image_features(image_file_name):
    ''' Runs the given image file through the VGG 16 model and returns the
    model output (activations, not weights) as a (1, 4096) array.

    Args:
        image_file_name: path of the image file to read with OpenCV.

    Returns:
        A numpy array of shape (1, 4096) holding the output of
        model_vgg.predict for this single image.

    Raises:
        OSError: if OpenCV cannot read or decode the file.

    Relies on the module-level model_vgg created by get_image_model. '''
    # 4096 comes from the last layer of the VGG model used here
    image_features = np.zeros((1, 4096))

    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an opaque error inside cv2.resize.
    im = cv2.imread(image_file_name)
    if im is None:
        raise OSError("Could not read image file: %s" % (image_file_name,))

    # Since VGG was trained on images of 224x224, every new image
    # is required to go through the same transformation
    im = cv2.resize(im, (224, 224))

    # Reorder the axes from (height, width, channels) to
    # (channels, height, width); no colour conversion happens here.
    im = im.transpose((2, 0, 1))

    # Add the leading batch axis -> (1, 3, 224, 224). Even though we are
    # using only one image, the dimensions have to stay consistent.
    im = np.expand_dims(im, axis=0)

    image_features[0, :] = model_vgg.predict(im)[0]
    return image_features

In [6]:
def get_question_features(question):
    ''' For a given question, a unicode string, returns the time series
    tensor with each word (token) transformed into a 300 dimension
    representation (Glove vectors, per the original note).

    Args:
        question: the question text; it is passed to the module-level
            word_embeddings callable to obtain the tokens.

    Returns:
        A numpy array of shape (1, 30, 300). Row j holds the first 300
        components of token j's vector; rows past the last token stay zero.
        Questions with more than 30 tokens are truncated to the first 30
        instead of raising IndexError. '''
    max_tokens, embedding_dim = 30, 300
    question_tensor = np.zeros((1, max_tokens, embedding_dim))

    tokens = word_embeddings(question)
    # zip() stops after max_tokens items, so tokens that would not fit into
    # the fixed-size tensor are dropped.
    for j, token in zip(range(max_tokens), tokens):
        question_tensor[0, j, :] = token.vector[:embedding_dim]

    return question_tensor
# print(get_question_features("how are you").shape)

In [7]:
# Load the spaCy vectors package. get_question_features calls this object on
# the question text and reads the .vector of every resulting token.
word_embeddings = spacy.load('en_vectors_web_lg')

In [7]:
#run....
#import en_core_web_sm

#word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')


In [8]:
def get_VQA_model(VQA_weights_file_name):
    ''' Builds the VQA model, loads its weights and compiles it.

    Args:
        VQA_weights_file_name: path of the weights file handed to
            load_weights.

    Returns:
        The model created by VQA_MODEL(), with the weights loaded and
        compiled with the 'rmsprop' optimizer and a categorical
        cross-entropy loss.

    The VQA module must be importable. '''
    from VQA import VQA_MODEL

    model = VQA_MODEL()
    model.load_weights(VQA_weights_file_name)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    return model
    
    

In [9]:
    #return vqa_model
# Build the VQA model once; the prediction cell below calls model_vqa.predict.
model_vqa = get_VQA_model(VQA_weights_file_name)


In [10]:
# Inputs for this demo run: the image to read and the question to ask about it.
image_file_name = 'cycle.jpeg'
question = "what is in the picture."

#how many circles are there?

In [11]:
# Get the image features: a (1, 4096) array produced by the VGG model.
image_features = get_image_features(image_file_name)

In [12]:
# Get the question features: a (1, 30, 300) tensor of per-token vectors.
question_features = get_question_features((question))

In [13]:
# The VQA model takes the question tensor first and the image features second.
y_output = model_vqa.predict([question_features, image_features])

# The task is framed as a classification over the 1000 most frequent answers.
# Answers outside that set were not part of training and therefore can never
# show up in the result.
# These 1000 answers are stored in the sklearn label encoder loaded below.

warnings.filterwarnings("ignore", category=DeprecationWarning)
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code,
# so only load label-encoder files that come from a trusted source.
labelencoder = joblib.load(label_encoder_file_name)

In [16]:
# Print the five highest-scoring answers, best first: the class index on one
# line, then the score as a zero-padded percentage with the decoded answer.
top5_labels = np.argsort(y_output[0])[-5:][::-1]
for label in top5_labels:
    print(label)
    # inverse_transform is given a one-element list because newer
    # scikit-learn versions reject a bare scalar; [0] unwraps the answer.
    answer = labelencoder.inverse_transform([label])[0]
    print (str(round(y_output[0,label]*100,2)).zfill(5), "% ", answer)



118
29.63 %  bicycle
347
22.62 %  fish
125
13.65 %  bird
243
12.04 %  clock
344
03.35 %  fire hydrant
