In [404]:
import json
import pandas as pd
from tqdm import tqdm
from vgg16 import Vgg16
import numpy as np
import PIL.Image
from utils import *
from matplotlib import pyplot as plt
import cv2
import os
import cPickle as pickle
from keras.preprocessing import sequence

import shutil
import string

import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [326]:
def save_array_with_folder_create(folder_path,arr_name,arr):
    if not os.path.exists(folder_path):
        print("Creating folder: "+folder_path)
        os.makedirs(folder_path)
        
    save_array(folder_path + arr_name, arr)
    
def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))   




def create_emb(vecs,words,wordidx,index2word,vocab_size):
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    found = 0
    not_found = 0
    
    exclude = set(string.punctuation)
    for i in range(1,len(emb)):
        word = index2word[i]
        word = ''.join(ch for ch in word if ch not in exclude).lower()
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
            found +=1
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))
            not_found+=1
#             print(word)

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    
    print("Found = %d"%found)
    print("Not found = %d"%not_found)
        
    return emb

## Pairing

In [55]:
pairing_path = "../../vqa/raw_questions/train/pairing.json"
with open(pairing_path) as data_file:    
        data = json.load(data_file)
        

In [58]:
imgId_qId_question_list = []
for questionJson in tqdm(data["questions"]):
    imgId_qId_question_list.append((questionJson["image_id"],questionJson["question_id"],questionJson["question"]))
#     print("Image id = %s" % questionJson["image_id"])
#     print("Question  = %s" % questionJson["question"])
#     print("Question id = %s" % questionJson["question_id"])
#     print("------------------")

100%|██████████| 443757/443757 [00:00<00:00, 786727.85it/s]


In [59]:
imgId_qId_question_list[:10]

[(458752, 458752000, u'What is this photo taken looking through?'),
 (458752, 458752001, u'What position is this man playing?'),
 (458752, 458752002, u'What color is the players shirt?'),
 (458752, 458752003, u'Is this man a professional baseball player?'),
 (262146, 262146000, u'What color is the snow?'),
 (262146, 262146001, u'What is the person doing?'),
 (262146, 262146002, u'What color is the persons headwear?'),
 (524291, 524291000, u"What is in the person's hand?"),
 (524291, 524291001, u'Is the dog waiting?'),
 (524291, 524291002, u'Is the dog looking at a tennis ball or frisbee?')]

In [60]:
key2Question_df = pd.DataFrame(imgId_qId_question_list,columns=['image_id', 'question_id', 'question'])

In [61]:
key2Question_df

Unnamed: 0,image_id,question_id,question
0,458752,458752000,What is this photo taken looking through?
1,458752,458752001,What position is this man playing?
2,458752,458752002,What color is the players shirt?
3,458752,458752003,Is this man a professional baseball player?
4,262146,262146000,What color is the snow?
5,262146,262146001,What is the person doing?
6,262146,262146002,What color is the persons headwear?
7,524291,524291000,What is in the person's hand?
8,524291,524291001,Is the dog waiting?
9,524291,524291002,Is the dog looking at a tennis ball or frisbee?


# Answers

In [62]:
answers_path = "../../vqa/raw_questions/train/answers.json"
with open(answers_path) as data_file:    
        answer_data = json.load(data_file)
        

In [12]:
for answerJson in answer_data["annotations"][:1]:
    for key, value in answerJson.items():
        print("Key:")
        print(key)
#     print("Answer id = %s" % answerJson["answer_id"])
#     print("Answer  = %s" % answerJson["answer"])
#     print("Answer conf = %s" % answerJson["answer_confidence"])
#     print("------------------")

Key:
question_type
Key:
multiple_choice_answer
Key:
answers
Key:
image_id
Key:
answer_type
Key:
question_id


In [90]:
imgId_qId_answers_list = []

for answerJson in tqdm(answer_data["annotations"]):
    question_id = answerJson["question_id"]
    image_id = answerJson["image_id"]
    answer = answerJson["multiple_choice_answer"]
#     answers_arr = []
#     for indAnswerJson in answerJson["answers"]:
#         answers_arr.append(indAnswerJson["answer"])
            
    imgId_qId_answers_list.append((image_id,question_id,answer))

100%|██████████| 443757/443757 [00:01<00:00, 434277.51it/s]


In [91]:
key2Answers_df = pd.DataFrame(imgId_qId_answers_list,columns=['image_id','question_id','answer'])

In [92]:
key2Answers_df

Unnamed: 0,image_id,question_id,answer
0,458752,458752000,net
1,458752,458752001,pitcher
2,458752,458752002,orange
3,458752,458752003,yes
4,262146,262146000,white
5,262146,262146001,skiing
6,262146,262146002,red
7,524291,524291000,frisbee
8,524291,524291001,yes
9,524291,524291002,frisbee


In [93]:
key2Question_df

Unnamed: 0,image_id,question_id,question
0,458752,458752000,What is this photo taken looking through?
1,458752,458752001,What position is this man playing?
2,458752,458752002,What color is the players shirt?
3,458752,458752003,Is this man a professional baseball player?
4,262146,262146000,What color is the snow?
5,262146,262146001,What is the person doing?
6,262146,262146002,What color is the persons headwear?
7,524291,524291000,What is in the person's hand?
8,524291,524291001,Is the dog waiting?
9,524291,524291002,Is the dog looking at a tennis ball or frisbee?


In [94]:
print(key2Question_df.shape)
print(key2Answers_df.shape)

(443757, 3)
(443757, 3)


In [103]:
data_df = pd.merge(key2Answers_df, key2Question_df, left_on=['image_id','question_id'], right_on = ['image_id','question_id'])
data_df = data_df[["image_id","question","answer"]]
data_df = data_df.sort_values("image_id")

In [104]:
data_df

Unnamed: 0,image_id,question,answer
903,9,What is the green stuff?,broccoli
902,9,What color are the dishes?,pink and yellow
901,9,How many cookies can be seen?,2
95,25,Is the giraffe in the shade?,no
94,25,Are any of the animals eating?,yes
93,25,Are some of the trees dead?,yes
92,25,What is on the ground next to the giraffe on t...,log
91,25,Are they at a zoo?,yes
90,25,Are both giraffes standing?,no
88,25,What is the giraffe standing behind?,tree


# Most popular answers

In [110]:
NR_TOP_ANSWERS = 1000

In [111]:
answer_counts = data_df["answer"].value_counts()[:NR_TOP_ANSWERS]

In [113]:
answer_counts[:10]

yes      84978
no       82516
1        12540
2        12215
white     8916
3         6536
blue      5455
red       5201
black     5066
0         4977
Name: answer, dtype: int64

In [466]:
answers_arr = answer_counts.index.tolist()

In [468]:
answer2Index = {}
index2Answer = {}

In [470]:
for index in range(NR_TOP_ANSWERS):
    answer = answers_arr[index]
    answer2Index[answer] = index
    index2Answer[index] = answer

# Language Data Structures

In [471]:
def get_index_word_dicts(unique_words):
    word_index = {}
    index_word = {}
    for i,word in enumerate(unique_words):
        word_index[word] = i
        index_word[i] = word
        
    return (word_index,index_word)

def get_unique_words_helper(captions):
    unique_words = []
    words = [caption.split() for caption in captions]
   
    for word in words:
        unique_words.extend(word)
        
    unique_words = list(set(unique_words))
    
    return unique_words

def get_unique_words(data_df):
    question_counts = data_df["question"].value_counts()
    questions = [str(x)[:-1] for x in question_counts.index.tolist()]
    return get_unique_words_helper(questions)

In [392]:
unique_words = get_unique_words(data_df)
(word2index, index2word) = get_index_word_dicts(unique_words)

In [394]:
VOCAB_SIZE = len(unique_words)
VOCAB_SIZE

23907

In [396]:
MAX_QUESTION_LENGTH  = 10

# Make batches

In [318]:
save_path = "/home/docker/fastai-courses/deeplearning1/nbs/persistent/vqa/data/"

path = save_path+"first-try/"

current_folder = train_folder
# current_folder = val_folder

base_path = path + current_folder

In [319]:
#Remove the last two layers to get the 4096D activations    
image_model = Vgg16().model
image_model.pop()
image_model.pop()

In [298]:
BATCH_SIZE = 128

In [299]:
nr_batches = test_data_df.shape[0] / BATCH_SIZE
nr_batches

64

In [309]:
def get_np_vgg_features(img_id_list,base_path):
    imgId_2_img = {}
    
    images = []
    for img_id in tqdm(img_id_list):
        img = None
        if img_id in imgId_2_img:
            img = imgId_2_img[img_id]
        else:
            img = cv2.imread(base_path+"/"+"COCO_train2014_"+str(format(img_id, "012"))+".jpg")
            img = cv2.resize(img,(224,224))
            img = np.asarray(img)
            imgId_2_img[img_id] = img
        
        images.append(img)
        
    
    np_images = np.stack(images)
    np_images = np.transpose(np_images,(0,3,1,2))
    
    vgg_features = image_model.predict(np_images)
    
    return vgg_features

def write_batches(np_vgg_features, np_questions, np_answers, batch_index):
    
    save_array_with_folder_create(base_path + batch_folder + images_folder,'img_vgg_batch_'
                                  + str(format(batch_index, "06"))+ '.bc',np_vgg_features)
    
    save_array_with_folder_create(base_path + batch_folder + questions_folder,'questions_batch_'
                                  + str(format(batch_index, "06"))+ '.bc',np_questions)
    
    save_array_with_folder_create(base_path + batch_folder + answers_folder,'answers_batch_'
                                  + str(format(batch_index, "06"))+ '.bc',np_answers)
    
    

In [320]:
batch_folder = "batches/"
images_folder = "images/"
questions_folder = "questions/"
answers_folder = "answers/"

train_folder = "train/"


In [399]:
def get_indexed_qustions(raw_captions):
    indexed_captions = []
    for raw_caption in tqdm(raw_captions):
        raw_caption = raw_caption[:-1] #delete the final question mark
        indexed_caption = [word2index[caption_word] for caption_word in raw_caption.split()]
        indexed_caption = sequence.pad_sequences([indexed_caption], maxlen=MAX_QUESTION_LENGTH,padding='post')
        indexed_np_arr = np.asarray(np.squeeze(indexed_caption))

        indexed_captions.append(indexed_np_arr)
    
    return indexed_captions

In [472]:
# for batch_index in range(nr_batches - 1):
for batch_index in range(1):
    print("Batch = %d"%batch_index)
    current_data_df = test_data_df.iloc[batch_index*BATCH_SIZE:(batch_index+1)*BATCH_SIZE,:]
    print("current_data_df.shape = %s"%str(current_data_df.shape))
    img_id_list = current_data_df["image_id"].tolist()
    question_list = current_data_df["question"].tolist()
    answer_list = current_data_df["answer"].tolist()
    
#     np_vgg_features = get_np_vgg_features(img_id_list,train_images_path)
#     np_questions = get_indexed_qustions(question_list)
    np_answers = np.asarray(answer_list)
    
    random_answer = np_answers[1]

    print(answer2Index[random_answer])
    
#     write_batches(np_vgg_features, np_questions, np_answers,batch_index)

        

Batch = 0
current_data_df.shape = (128, 3)


KeyError: u'pink and yellow'

# Train Model

In [446]:
import re
from numpy.random import random, permutation, randn, normal
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten,Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers import Embedding,GRU,LSTM,TimeDistributed,RepeatVector,Merge
from keras.optimizers import SGD, RMSprop, Adam

In [416]:
EMB_SIZE = 300
vecs, words, wordidx = load_vectors("/home/docker/fastai-courses/deeplearning1/nbs/persistent/coco/glove/"+"6B."+str(EMB_SIZE)+"d")
emb = create_emb(vecs, words, wordidx,index2word,VOCAB_SIZE)

Found = 21907
Not found = 1999


In [456]:
image_model = Sequential()
image_model.add(Dropout(0.1,input_shape=(4096,)))

In [457]:
language_model = Sequential()
language_model.add(Embedding(VOCAB_SIZE, 300, input_length=MAX_QUESTION_LENGTH,weights=[emb]))
language_model.add(GRU(output_dim = 512))

In [458]:
model = Sequential()
model.add(Merge([image_model,language_model], mode='concat', concat_axis=1))
model.add(Dense(1024, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(1024, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(NR_TOP_ANSWERS))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [460]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dropout_17 (Dropout)             (None, 4096)          0                                            
____________________________________________________________________________________________________
embedding_5 (Embedding)          (None, 10, 300)       7172100                                      
____________________________________________________________________________________________________
gru_3 (GRU)                      (None, 512)           1248768                                      
____________________________________________________________________________________________________
dense_22 (Dense)                 (None, 1024)          4719616     merge_6[0][0]                    
___________________________________________________________________________________________

In [461]:
def generate_from_path():
    
    images_dir = base_path + batch_folder + images_folder    
    questions_dir = base_path + batch_folder + questions_folder
    answers_dir = base_path + batch_folder + answers_folder

    images_paths = os.listdir(images_dir)
    questions_paths = os.listdir(questions_dir)
    answers_paths = os.listdir(answers_dir)
    
    images_paths.sort()
    questions_paths.sort()
    answers_paths.sort()
    
    nr_batches = len(images_paths)
    
    while(1):
    
        for batch_index in range(nr_batches):
            images_batch = load_array(images_dir + images_paths[batch_index])
            questions_batch = load_array(questions_dir + questions_paths[batch_index])
            answers_batch = load_array(answers_dir + answers_paths[batch_index])
        
            yield([images_batch,questions_batch],answers_batch)

In [462]:
model.fit_generator(generate_from_path(),
                    samples_per_epoch = 2048,
                    nb_epoch = 10)

Epoch 1/10


ValueError: Error when checking model target: expected activation_5 to have shape (None, 1000) but got array with shape (128, 1)