In [None]:
# Mount Google Drive in the Colab runtime. Every path used later in this
# notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/VQA_Demo-master')

In [None]:
%matplotlib inline
import os, argparse
import cv2, spacy, numpy as np
from keras.models import model_from_json
from keras.optimizers import SGD
from sklearn.externals import joblib

In [None]:
# Tell Keras to use the channels-first image layout. The image feature
# functions in this notebook transpose each image so that the channel axis
# comes first before calling predict.
from keras import backend as K
K.set_image_data_format('channels_first')

In [None]:
# Locations of the model artifacts on the mounted Drive. Everything except the
# CNN weights is provided in the repo; see models/CNN/README.md for how to
# download the VGG weights.
_REPO_ROOT = '/content/VQA_Demo-master/My Drive/VQA_Demo-master/VQA_Demo-master'
_VQA_DIR = _REPO_ROOT + '/models/VQA'
_CNN_DIR = _REPO_ROOT + '/models/CNN'

VQA_model_file_name = _VQA_DIR + '/VQA_MODEL.json'
VQA_weights_file_name = _VQA_DIR + '/VQA_MODEL_WEIGHTS.hdf5'
label_encoder_file_name = _VQA_DIR + '/FULL_labelencoder_trainval.pkl'
# NOTE(review): the space before ".h5" is kept exactly as in the original
# path -- confirm it matches the file name on Drive.
CNN_weights_file_name = _CNN_DIR + '/vgg16_weights .h5'

In [None]:
# Make the repository root the working directory -- presumably so that the
# `models` package imported by get_image_model and the relative file names
# used later (e.g. model_vgg.png) resolve against the repo; confirm.
import os
os.chdir('/content/VQA_Demo-master/My Drive/VQA_Demo-master/VQA_Demo-master')

## Model Idea
This uses a classical CNN-LSTM model like the one shown below: image features and language features are computed separately and then combined, and a multi-layer perceptron is trained on the combined features.

<img src="http://i.imgur.com/Za5P1ZZ.png">
[Source](http://arxiv.org/pdf/1505.00468v4.pdf)

## Pretrained VGG Net (VGG-16)

VGG Net is not the best CNN model for image features — GoogLeNet (winner 2014) and ResNet (winner 2015) have superior classification scores — but it is very versatile, simple, relatively small and, more importantly, portable to use.

<img src="http://www.robots.ox.ac.uk/~vgg/research/very_deep/images/table_ILSVRC.png">

In [None]:
def get_image_model(CNN_weights_file_name):
    ''' Build the VGG-16 image model from a weights file and compile it.

    CNN_weights_file_name: path of the VGG weights file, handed straight to
    the VGG_16 constructor.

    Returns the compiled model. Requires the file VGG.py inside models/CNN.
    '''
    from models.CNN.VGG import VGG_16

    # Per the original author's note, this is the standard VGG 16 without
    # its last two layers.
    vgg = VGG_16(CNN_weights_file_name)

    # One may experiment with the "adam" optimizer instead; the loss function
    # for this kind of task is pretty standard.
    vgg.compile(
        optimizer=SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True),
        loss='categorical_crossentropy',
    )
    return vgg

In [None]:
from keras.utils.vis_utils import plot_model
import tensorflow as tf
import keras.backend.tensorflow_backend as tfback


# Report the TensorFlow version and the version of its bundled Keras.
print("tf.__version__ is", tf.__version__)
print("tf.keras.__version__ is:", tf.keras.__version__)

def _get_available_gpus():
    """List the GPU devices visible to TensorFlow.

    The device names are read from tf.config.list_logical_devices() the first
    time this runs and cached on the Keras TensorFlow backend module
    (tfback._LOCAL_DEVICES); later calls reuse that cached list.

    # Returns
        A list of device-name strings containing 'device:gpu'
        (case-insensitive).
    """
    if tfback._LOCAL_DEVICES is None:
        tfback._LOCAL_DEVICES = [
            device.name for device in tf.config.list_logical_devices()
        ]

    gpu_names = []
    for device_name in tfback._LOCAL_DEVICES:
        if 'device:gpu' in device_name.lower():
            gpu_names.append(device_name)
    return gpu_names

# Replace the Keras TensorFlow-backend helper with the function defined above.
tfback._get_available_gpus = _get_available_gpus


# Build the VGG-16 image model once and write a diagram of it to model_vgg.png.
model_vgg = get_image_model(CNN_weights_file_name)
plot_model(model_vgg, to_file='model_vgg.png')

In [None]:
def get_image_features(image_file_name, CNN_weights_file_name):
    ''' Runs the given image file through the VGG 16 model and returns the
    model output as a (1, 4096) feature vector.

    image_file_name: path of an image file readable by cv2.imread.
    CNN_weights_file_name: VGG weights file, passed on to get_image_model.

    Raises IOError if the image file cannot be read.

    Note: a new image model is built from the weights file on every call.
    '''
    # Magic_Number = 4096  > Comes from last layer of VGG Model
    image_features = np.zeros((1, 4096))

    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an obscure error inside cv2.resize.
    raw_image = cv2.imread(image_file_name)
    if raw_image is None:
        raise IOError("Could not read image file: %r" % (image_file_name,))

    # Since VGG was trained on images of 224x224, every new image
    # is required to go through the same transformation
    im = cv2.resize(raw_image, (224, 224))
    # move the channel axis to the front: (height, width, channels) ->
    # (channels, height, width)
    im = im.transpose((2,0,1))

    # this axis dimension is required because VGG was trained on a dimension
    # of 1, 3, 224, 224 (first axis is for the batch size);
    # even though we are using only one image, we have to keep the dimensions consistent
    im = np.expand_dims(im, axis=0)

    image_features[0,:] = get_image_model(CNN_weights_file_name).predict(im)[0]
    return image_features

In [None]:
def get_question_features(question):
    ''' For a given question, a unicode string, returns the time series
    tensor with each word (token) replaced by its spaCy word vector.

    Returns an array of shape (1, number_of_tokens, vector_size). The vector
    size is taken from the vectors the loaded pipeline actually produces; for
    a question with no tokens the width falls back to 96.

    The spaCy pipeline is loaded on the first call and cached on the function
    object so that later calls do not reload it.
    '''
    word_embeddings = getattr(get_question_features, '_word_embeddings', None)
    if word_embeddings is None:
        word_embeddings = spacy.load('en', vectors='en_vectors_web_lg')
        get_question_features._word_embeddings = word_embeddings

    tokens = word_embeddings(question)

    # Size the last axis from the real vectors instead of hard-coding it, so
    # the assignment below cannot fail on a width mismatch.
    vector_size = len(tokens[0].vector) if len(tokens) else 96
    question_tensor = np.zeros((1, len(tokens), vector_size))
    for j, token in enumerate(tokens):
        question_tensor[0,j,:] = token.vector
    return question_tensor

In [None]:
# Load the spaCy pipeline at notebook level for the word-similarity examples
# in the next cells.
word_embeddings = spacy.load('en', vectors='en_vectors_web_lg')

In [None]:
# Run four sample words through the spaCy pipeline, in this order, so their
# similarity can be compared in the next cells.
obama, putin, banana, monkey = [
    word_embeddings(word)
    for word in (u"obama", u"putin", u"banana", u"monkey")
]

In [None]:
# Similarity score spaCy reports between the "obama" and "putin" documents.
obama.similarity(putin)

  "__main__", mod_spec)


0.6526194124802338

In [None]:
# Similarity score spaCy reports between the "obama" and "banana" documents.
obama.similarity(banana)

  "__main__", mod_spec)


0.4615127295688744

In [None]:
def get_VQA_model(VQA_model_file_name, VQA_weights_file_name):
    ''' Given the VQA model and its weights, compiles and returns the model.

    VQA_model_file_name: path of the JSON file describing the model.
    VQA_weights_file_name: path of the weights file loaded into the model.

    Returns the model compiled with categorical cross-entropy loss and the
    rmsprop optimizer.
    '''

    # thanks to the keras function for loading a model from JSON, this becomes
    # very easy to understand and work with. The alternative would be to load
    # the model from a binary like cPickle, but then the model would be
    # obfuscated to users

    # read through a context manager so the file handle is always closed
    with open(VQA_model_file_name) as model_file:
        model_json = model_file.read()

    vqa_model = model_from_json(model_json)
    vqa_model.load_weights(VQA_weights_file_name)
    vqa_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    return vqa_model

In [None]:
# Inputs for the first demo: an image file on the mounted Drive and the
# question to ask about it.
image_file_name = '/content/VQA_Demo-master/My Drive/VQA_Demo-master/VQA_Demo-master/test.jpg'
question = u"What vehicle is in the picture?"

# <center> What vehicle is in the picture ? </center>
<img src="test.jpg">


In [None]:
# Run the test image through the VGG model to get its (1, 4096) feature vector
image_features = get_image_features(image_file_name, CNN_weights_file_name)

In [None]:
# Turn the question into a tensor holding one word vector per token
question_features = get_question_features(question)

In [None]:
# Load and compile the VQA model, then write a diagram of it to model_vqa.png.
model_vqa = get_VQA_model(VQA_model_file_name, VQA_weights_file_name)
plot_model(model_vqa, to_file='model_vqa.png')

In [None]:
y_output = model_vqa.predict([question_features, image_features])

# This task is represented as a classification over the 1000 top answers;
# this means some answers were not part of training and thus will
# not show up in the result.
# These 1000 answers are stored in the sklearn Encoder class.
# NOTE: joblib.load unpickles the file -- only load encoder files you trust.
labelencoder = joblib.load(label_encoder_file_name)

# Print the five highest-scoring answers, best first.
for label in reversed(np.argsort(y_output)[0,-5:]):
    # inverse_transform works on an array-like of labels, so wrap the single
    # class index in a list and unwrap the single result.
    answer = labelencoder.inverse_transform([label])[0]
    print(str(round(y_output[0,label]*100,2)).zfill(5), "% ", answer)

51.87 % train <br>
031.5 % bicycle <br>
03.81 % bike <br>
02.91 % bus <br>
02.54 % scooter <br>

# Demo with image URL

In [5]:
def get_image_features(image_file_name):
    ''' Runs the given image (a file path or URL readable by skimage.io.imread)
    through the VGG 16 model and returns the model output as a (1, 4096)
    feature vector.

    Note: this redefines the two-argument get_image_features from earlier in
    the notebook; this version reuses the already-built global model_vgg
    instead of building a model from a weights file.
    '''
    from skimage import io

    image_features = np.zeros((1, 4096))

    # if you would rather not install skimage, then use cv2.VideoCapture which surprisingly can read from url
    # see this SO answer http://answers.opencv.org/question/16385/cv2imread-a-url/?answer=16389#post-id-16389
    im = cv2.resize(io.imread(image_file_name), (224, 224))

    # The model input has exactly 3 channels (see the 1, 3, 224, 224 note
    # below), but an arbitrary image may decode as grayscale (no channel
    # axis) or with an extra alpha channel.
    if im.ndim == 2:
        im = np.stack((im,) * 3, axis=-1)
    elif im.shape[2] == 4:
        im = im[:, :, :3]

    # NOTE(review): skimage decodes to RGB while cv2.imread, used by the
    # earlier file-based version, decodes to BGR -- confirm which channel
    # order the VGG weights expect.

    # move the channel axis to the front: (height, width, channels) ->
    # (channels, height, width)
    im = im.transpose((2,0,1))

    # this axis dimension is required because VGG was trained on a dimension
    # of 1, 3, 224, 224 (first axis is for the batch size);
    # even though we are using only one image, we have to keep the dimensions consistent
    im = np.expand_dims(im, axis=0)

    image_features[0,:] = model_vgg.predict(im)[0]
    return image_features

In [7]:
image_file_name = "http://www.newarkhistory.com/indparksoccerkids.jpg"
# Fetch the image from the URL and compute its VGG feature vector
image_features = get_image_features(image_file_name)

<img src="http://www.newarkhistory.com/indparksoccerkids.jpg">
 <center> What are they playing? </center>

In [None]:
question = u"What are they playing?"

# Turn the question into a tensor holding one word vector per token
question_features = get_question_features(question)

In [None]:
import warnings

y_output = model_vqa.predict([question_features, image_features])
# Keep DeprecationWarning messages out of the printed answer list.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Print the five highest-scoring answers, best first.
for label in reversed(np.argsort(y_output)[0,-5:]):
    # inverse_transform works on an array-like of labels, so wrap the single
    # class index in a list and unwrap the single result.
    answer = labelencoder.inverse_transform([label])[0]
    print(str(round(y_output[0,label]*100,2)).zfill(5), "% ", answer)

55.44 % frisbee <br>
18.91 % tennis <br>
16.95 % baseball <br>
08.31 % soccer <br>
00.07 % ball

## Asking another question!

In [None]:
question = u"Are they playing Frisbee?"

# Turn the new question into a tensor holding one word vector per token
question_features = get_question_features(question)

<img src="http://www.newarkhistory.com/indparksoccerkids.jpg">
<center> Are they playing Frisbee? </center>

In [None]:
import warnings

y_output = model_vqa.predict([question_features, image_features])

# Keep DeprecationWarning messages out of the printed answer list.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Print the five highest-scoring answers, best first.
for label in reversed(np.argsort(y_output)[0,-5:]):
    # inverse_transform works on an array-like of labels, so wrap the single
    # class index in a list and unwrap the single result.
    answer = labelencoder.inverse_transform([label])[0]
    print(str(round(y_output[0,label]*100,2)).zfill(5), "% ", answer)

78.72 % yes <br>
21.28 % no <br>
000.0 % girl <br>
000.0 % halloween <br>
000.0 % left <br>