In [3]:
#load all libraries
import cv2, spacy, numpy as np

from PIL import Image
%matplotlib inline
import os, argparse

import keras.backend as K

from sklearn.externals import joblib
from keras.models import model_from_json,Model,Sequential
from keras.layers import Convolution2D, ZeroPadding2D, MaxPooling2D,Dense, Flatten, Dropout
from keras.optimizers import SGD

In [4]:
# Paths to the model artifacts used by this notebook. Everything except the
# CNN (VGG16) weights ships with the repo; see models/CNN/README.md for how to
# download the VGG weights.

# Architecture of the VQA network, stored as JSON (read by get_VQA_model).
VQA_model_file_name      = 'models/VQA/VQA_MODEL.json'
# Trained weights for the VQA network (HDF5).
VQA_weights_file_name   = 'models/VQA/VQA_MODEL_WEIGHTS.hdf5'
# Label encoder that maps predicted class indices back to answer strings.
label_encoder_file_name  = 'models/VQA/FULL_labelencoder_trainval.pkl'
# Pre-trained VGG16 weights.
# NOTE(review): this path is relative to the working directory, not
# models/CNN/ -- confirm where the downloaded file is expected to live.
CNN_weights_file_name   = 'vgg16_weights.h5'


In [2]:
 """Build the VGG16 model.

    # Arguments
        weight_path: path to the pre-trained VGG16 weights file.
                     If None, the weights are left at their default initialization.
    # Returns
       The VGG16 model
"""
def vgg_16(weight_path=None):
    """Build the VGG16 model.

    The network is five convolutional stages followed by a classifier head.
    Every 3x3 convolution is preceded by one pixel of zero padding, and each
    stage ends with 2x2 max pooling (stride 2). The head is Flatten, two
    4096-unit ReLU Dense layers each followed by Dropout(0.5), and a
    1000-way softmax.

    Side effect: switches Keras to the 'th' (channels-first) image dimension
    ordering for the whole session, which is why the input shape is
    (3, 224, 224).

    # Arguments
        weight_path: path to the pre-trained VGG16 weights file.
                     If None (or otherwise falsy), no weights are loaded and
                     the layers keep their default initialization.
    # Returns
        The VGG16 `Sequential` model (not compiled).
    """
    # The pre-trained weights we download are stored in Theano ordering,
    # not TensorFlow ordering.
    K.set_image_dim_ordering('th')

    # (number of filters, number of convolutions) for each stage, in order.
    conv_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    model = Sequential()
    is_first_layer = True
    for n_filters, n_convs in conv_stages:
        for _ in range(n_convs):
            if is_first_layer:
                # Only the very first layer declares the input shape.
                model.add(ZeroPadding2D((1, 1), input_shape=(3, 224, 224)))
                is_first_layer = False
            else:
                model.add(ZeroPadding2D((1, 1)))
            model.add(Convolution2D(n_filters, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Classifier head.
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    # Layers are added in exactly the original order, so a weights file saved
    # for the hand-written layer stack still lines up with this one.
    if weight_path:
        model.load_weights(weight_path)

    return model

In [6]:
# Build VGG16 and load the pre-trained weights from disk.
image_model = vgg_16(CNN_weights_file_name)

# Keep the network's input and tap the third layer from the end, which is the
# second 4096-unit Dense layer; the final Dropout and the 1000-way softmax
# after it are cut off.
new_input, hidden_layer = image_model.input, image_model.layers[-3].output

# Re-wrap as a feature extractor: image in, 4096-d activation out.
image_model = Model(new_input, hidden_layer)

# Compile the truncated model. Only predict() is called on it in the cells
# shown here, so the optimizer and loss are not exercised for training.
sgd = SGD(nesterov=True, momentum=0.9, decay=1e-6, lr=0.1)
image_model.compile(loss='categorical_crossentropy', optimizer=sgd)

image_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
zeropadding2d_input_2 (InputLaye (None, 3, 224, 224)   0                                            
____________________________________________________________________________________________________
zeropadding2d_14 (ZeroPadding2D) (None, 3, 226, 226)   0           zeropadding2d_input_2[0][0]      
____________________________________________________________________________________________________
convolution2d_14 (Convolution2D) (None, 64, 224, 224)  1792        zeropadding2d_14[0][0]           
____________________________________________________________________________________________________
zeropadding2d_15 (ZeroPadding2D) (None, 64, 226, 226)  0           convolution2d_14[0][0]           
___________________________________________________________________________________________

In [34]:
# Demo inputs: the image to look at and the question to ask about it.
image_file_name = 'cricket.jpg'
# Unicode literal: get_question_features documents its input as a unicode string.
question = u"What are they playing?"

In [13]:
def get_image_feature(image_file_name):
    ''' Given the path of an image file, returns the activation that the
    truncated VGG16 network (the notebook-level `image_model`) produces for it.

    The image is read with OpenCV, resized to 224x224, reordered from
    (height, width, channel) to (channel, height, width) and wrapped in a
    batch of one before being passed to `image_model.predict`.

    Raises IOError if the image cannot be read.

    NOTE(review): no mean-pixel subtraction is applied before predict --
    confirm this matches how the VQA model's image features were produced. '''

    raw_image = cv2.imread(image_file_name)
    if raw_image is None:
        # cv2.imread signals a missing/unreadable file by returning None rather
        # than raising; fail here with the offending path instead of letting
        # cv2.resize die with an opaque assertion error.
        raise IOError("Could not read image file: %r" % (image_file_name,))

    im = cv2.resize(raw_image, (224, 224))

    # Channels first, to match the 'th' ordering the network was built with.
    im = im.transpose((2, 0, 1))

    # Add the leading batch axis: (3, 224, 224) -> (1, 3, 224, 224).
    im = np.expand_dims(im, axis=0)
    print(im.shape)

    return image_model.predict(im)

In [38]:
# Push the demo image through the truncated VGG16 and report the shape of the
# resulting feature array.
image_features = get_image_feature(image_file_name)
print(image_features.shape)

(1, 3, 224, 224)
(1, 4096)


In [15]:
max_len = 30

# spaCy pipeline shared by every call to get_question_features; loaded on first
# use because loading the vectors is slow.
_word_embeddings = None


def get_question_features(question):

    ''' For a given question, a unicode string, returns the time series vector
    with each word (token) transformed into a 300 dimension representation
    calculated using Glove Vector.

    The result has shape (1, max_len, 300). Questions shorter than max_len
    tokens leave the remaining rows at zero; questions longer than max_len
    tokens are truncated to their first max_len tokens. '''

    global _word_embeddings
    if _word_embeddings is None:
        # Load once and reuse: reloading the pipeline on every call dominated
        # the cost of featurizing a question.
        _word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')

    tokens = _word_embeddings(question)
    question_tensor = np.zeros((1, max_len, 300))
    for j, token in enumerate(tokens):
        if j >= max_len:
            # The tensor only has max_len rows; writing past them would
            # raise IndexError.
            break
        question_tensor[0, j, :] = token.vector
    return question_tensor

In [12]:
# Load the GloVe-backed spaCy pipeline at notebook scope. get_question_features
# keeps its own pipeline private, so without this line `word_embeddings` is
# undefined on a fresh kernel and the cell fails with NameError.
word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')

# A few single-word documents for eyeballing embedding similarity.
obama = word_embeddings(u"obama")
# NOTE: despite its name, `america` holds the document for "president".
america = word_embeddings(u"president")
banana = word_embeddings(u"banana")
monkey = word_embeddings(u"monkey")
orange = word_embeddings(u"orange")

In [13]:
# Similarity score between the spaCy documents for "banana" and "orange".
banana.similarity(orange)

0.56299402173767421

In [16]:
def get_VQA_model(VQA_model_file_name, VQA_weights_file_name):

    ''' Given the VQA model and its weights, compiles and returns the model.

    VQA_model_file_name   -- path to the JSON description of the architecture
    VQA_weights_file_name -- path to the weights file loaded into that model

    The model is compiled with categorical cross-entropy loss and the
    'rmsprop' optimizer. '''

    # Read the architecture inside a context manager so the file handle is
    # closed as soon as the JSON text has been read.
    with open(VQA_model_file_name) as model_file:
        vqa_model = model_from_json(model_file.read())

    vqa_model.load_weights(VQA_weights_file_name)
    vqa_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    return vqa_model

In [17]:
# Rebuild the VQA network from its JSON architecture and weights file, then
# print a layer-by-layer summary.
model_vqa = get_VQA_model(VQA_model_file_name, VQA_weights_file_name)
model_vqa.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_4 (LSTM)                    (None, 30, 512)       1665024                                      
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 30, 512)       2099200                                      
____________________________________________________________________________________________________
lstm_6 (LSTM)                    (None, 512)           2099200                                      
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 4096)          0                                            
___________________________________________________________________________________________

# <center> What are they playing ? </center>

<img src="cricket.jpg">

In [40]:
# Lower-cased restatement of the question; this rebinds the `question` that was
# set earlier in the notebook.
question=u'what are they playing?'

In [41]:
# Turn the question into its per-token embedding tensor and report the shape.
question_features = get_question_features(question)
print(question_features.shape)

(1, 30, 300)


In [42]:
# Score every candidate answer for this (question, image) pair.
y_output = model_vqa.predict([question_features, image_features])

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load a label encoder that comes from a source you trust.
labelencoder = joblib.load(label_encoder_file_name)

# Indices of the five highest-scoring answers, best first.
top_labels = np.argsort(y_output[0])[::-1][:5]
for label in top_labels:
    percent = round(y_output[0, label] * 100, 2)
    # Pass a one-element list: inverse_transform is defined on arrays of
    # labels, and a bare scalar is rejected by newer scikit-learn releases.
    answer = labelencoder.inverse_transform([label])[0]
    # Fixed-width "NN.NN" formatting; str(...).zfill(5) used to render a
    # probability of 0.0 as "000.0".
    print("%05.2f %%  %s" % (percent, answer))

99.98 %  baseball
00.01 %  bat
00.01 %  frisbee
000.0 %  soccer
000.0 %  tennis


  This is separate from the ipykernel package so we can avoid doing imports until
