In [1]:
#load all libraries
import cv2, spacy, numpy as np

from PIL import Image
%matplotlib inline
import os, argparse

import keras.backend as K

from sklearn.externals import joblib
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers import Convolution2D, ZeroPadding2D, MaxPooling2D
from keras.layers import Dense, Flatten, Dropout
from keras.optimizers import SGD

Using Theano backend.


## Load the models and weights files
This does not load the models yet, but we are providing the files


In [36]:
# File paths for the model, all of these except the CNN Weights are 
# provided in the repo, See the models/CNN/README.md to download VGG weights
VQA_model_file_name      = 'models/VQA/VQA_MODEL.json'
VQA_weights_file_name   = 'models/VQA/VQA_MODEL_WEIGHTS.hdf5'
label_encoder_file_name  = 'models/VQA/FULL_labelencoder_trainval.pkl'
CNN_weights_file_name   = 'vgg16_weights.h5'


## Compile the model

In [4]:
def vgg_16(weight_path=None):
    """Build the VGG16 model.

    # Arguments
        weight_path: path of the pre_train vgg16 weights
                     If None, weights will be initalized by default 
    # Output shape
       The VGG16 model
    """
    
    K.set_image_dim_ordering('th')  #Note that the pre_train weight we download is based on thenoa,not tensorflow
    
    model = Sequential()
    model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
    model.add(Convolution2D(64,3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))
#     model.summary()
    if weight_path:
        model.load_weights(weight_path)


    return model

In [5]:
from keras.models import Model
image_model = vgg_16(CNN_weights_file_name)
new_input = image_model.input
hidden_layer = image_model.layers[-3].output
image_model = Model(new_input, hidden_layer)

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)

image_model.compile(optimizer=sgd, loss='categorical_crossentropy')
image_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
zeropadding2d_input_1 (InputLaye (None, 3, 224, 224)   0                                            
____________________________________________________________________________________________________
zeropadding2d_1 (ZeroPadding2D)  (None, 3, 226, 226)   0           zeropadding2d_input_1[0][0]      
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 64, 224, 224)  1792        zeropadding2d_1[0][0]            
____________________________________________________________________________________________________
zeropadding2d_2 (ZeroPadding2D)  (None, 64, 226, 226)  0           convolution2d_1[0][0]            
___________________________________________________________________________________________

In [24]:
image_features = np.zeros((1, 4096))
im = cv2.resize(cv2.imread(image_file_name), (224, 224))
print im.shape
im = im.transpose((2,0,1))
print im.shape
im = np.expand_dims(im, axis=0)
print im.shape
image_features = image_model.predict(im)
print image_features

(224, 224, 3)
(3, 224, 224)
(1, 3, 224, 224)
[[ 0.          3.97431898  0.         ...,  0.          5.50633287
   1.58049059]]


In [25]:
image_feature = np.zeros((1, 4096))
image_feature[0,:] = image_features[0]
image_feature=image_features

In [10]:
def get_question_features(question):
    ''' For a given question, a unicode string, returns the time series vector
    with each word (token) transformed into a 300 dimension representation
    calculated using Glove Vector '''
    word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    tokens = word_embeddings(question)
    question_tensor = np.zeros((1, len(tokens), 300))
    for j in xrange(len(tokens)):
            question_tensor[0,j,:] = tokens[j].vector
    return question_tensor

In [11]:
word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
print word_embeddings

<spacy.en.English object at 0x1046d0910>


In [12]:
obama = word_embeddings(u"obama")
america = word_embeddings(u"president")
banana = word_embeddings(u"banana")
monkey = word_embeddings(u"monkey")
orange=word_embeddings(u"orange")

In [13]:
banana.similarity(orange)

0.56299402173767421

In [14]:
def get_VQA_model(VQA_model_file_name, VQA_weights_file_name):
    ''' Given the VQA model and its weights, compiles and returns the model '''

    # thanks the keras function for loading a model from JSON, this becomes
    # very easy to understand and work. Alternative would be to load model
    # from binary like cPickle but then model would be obfuscated to users
    vqa_model = model_from_json(open(VQA_model_file_name).read())
    vqa_model.load_weights(VQA_weights_file_name)
    vqa_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    return vqa_model

In [15]:
model_vqa = get_VQA_model(VQA_model_file_name, VQA_weights_file_name)
model_vqa.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_4 (LSTM)                    (None, 30, 512)       1665024                                      
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 30, 512)       2099200                                      
____________________________________________________________________________________________________
lstm_6 (LSTM)                    (None, 512)           2099200                                      
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 4096)          0                                            
___________________________________________________________________________________________

In [43]:
image_file_name = 'soccer.jpg'
question = u"What are they playing?"

# <center> What vehicle is in the picture ? </center>

<img src="test.jpg">

In [44]:
word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
tokens = word_embeddings(question)
print tokens
question_tensor = np.zeros((1, 30, 300))
print question_tensor.shape
for j in xrange(len(tokens)):
    question_tensor[0,j,:] = tokens[j].vector
print question_tensor.shape
# question_features = get_question_features(question)

What are they playing?
(1, 30, 300)
(1, 30, 300)


In [45]:
y_output = model_vqa.predict([question_tensor, image_features])

labelencoder = joblib.load(label_encoder_file_name)
for label in reversed(np.argsort(y_output)[0,-5:]):
    print str(round(y_output[0,label]*100,2)).zfill(5), "% ", labelencoder.inverse_transform(label)

85.77 %  soccer
13.54 %  frisbee
00.43 %  soccer ball
00.11 %  tennis
00.06 %  baseball


  This is separate from the ipykernel package so we can avoid doing imports until


## Results
I am copying the output of the previous command, so that you can validate if your results are same as mine.

**78.32 %  train** <br />
01.11 %  truck <br />
00.98 %  passenger <br />
00.95 %  fire truck <br />
00.68 %  bus <br />

# Demo with image URL

Since cv2.imread cannot read an image from URL we will have to change our function `get_image_features`

In [20]:
def get_image_features(image_file_name, CNN_weights_file_name):
    ''' Runs the given image_file to VGG 16 model and returns the 
    weights (filters) as a 1, 4096 dimension vector '''
    image_features = np.zeros((1, 4096))
        
    from skimage import io
    # if you would rather not install skimage, then use cv2.VideoCapture which surprisingly can read from url
    # see this SO answer http://answers.opencv.org/question/16385/cv2imread-a-url/?answer=16389#post-id-16389
    im = cv2.resize(io.imread(image_file_name), (224, 224))
    im = im.transpose((2,0,1)) # convert the image to RGBA

    
    # this axis dimension is required because VGG was trained on a dimension
    # of 1, 3, 224, 224 (first axis is for the batch size
    # even though we are using only one image, we have to keep the dimensions consistent
    im = np.expand_dims(im, axis=0) 

    image_features[0,:] = get_image_model(CNN_weights_file_name).predict(im)[0]
    return image_features

In [21]:
image_file_name = "http://www.newarkhistory.com/indparksoccerkids.jpg"
# get the image features
image_features = get_image_features(image_file_name, CNN_weights_file_name)

NameError: global name 'get_image_model' is not defined

Feel free to change that url to any valid image, it can be any image format. Also try to use websites which have higher bandwidth

<img src="http://www.newarkhistory.com/indparksoccerkids.jpg">
# <center> What are they playing? </center>

In [None]:
question = u"What are they playing?"

# get the question features
question_features = get_question_features(question)

In [None]:
y_output = model_vqa.predict([question_features, image_features])

labelencoder = joblib.load(label_encoder_file_name)
for label in reversed(np.argsort(y_output)[0,-5:]):
    print str(round(y_output[0,label]*100,2)).zfill(5), "% ", labelencoder.inverse_transform(label)

## Result
Copying the result to validate your output.

**40.52 %  tennis **<br />
28.45 %  soccer <br />
17.88 %  baseball <br />
11.67 %  frisbee <br />
00.15 %  football <br />

As you can see, it got this wrong, but you can see why it could be harder to guess soccer and easier to guess tennis, lack of soccer ball and double lines at the edge.

Let's ask another question for the same image.

In [None]:
question = u"Are they playing soccer?"

# get the question features
question_features = get_question_features(question)

<img src="http://www.newarkhistory.com/indparksoccerkids.jpg">
# <center> Are they playing soccer? </center>

In [None]:
y_output = model_vqa.predict([question_features, image_features])

labelencoder = joblib.load(label_encoder_file_name)
for label in reversed(np.argsort(y_output)[0,-5:]):
    print str(round(y_output[0,label]*100,2)).zfill(5), "% ", labelencoder.inverse_transform(label)

## Result

**93.15 %  yes **<br />
06.42 %  no <br />
00.02 %  right <br />
00.01 %  left <br />
000.0 %  man <br />

As you can see, similar information about a Yes/No question elicits different response, or should I say correct response. This is an impertinent problem with `classification` tasks.

Feel free to experiment with different types of questions, `count`, `color`, `location`.

More interesting results are obtained when one takes a different crop of a image, instead of just scaling it to 224x224. This is again because we extract only the top level features of CNN model which was trained to classify one object in the image.