In [1]:
import json
import pandas as pd
from tqdm import tqdm
from vgg16 import Vgg16
import numpy as np
import PIL.Image
from utils import *
from matplotlib import pyplot as plt
import cv2
import os
import cPickle as pickle
from keras.preprocessing import sequence

import shutil
import string

import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

import re
from numpy.random import random, permutation, randn, normal
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten,Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers import Embedding,GRU,LSTM,TimeDistributed,RepeatVector,Merge
from keras.optimizers import SGD, RMSprop, Adam

import preprocessor as preproc

Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [2]:
def save_array_with_folder_create(folder_path,arr_name,arr):
    """Save `arr` with `save_array`, creating `folder_path` first if it is missing.

    The target name is the plain concatenation `folder_path + arr_name`;
    no path separator is inserted between the two.
    """
    folder_missing = not os.path.exists(folder_path)
    if folder_missing:
        print("Creating folder: "+folder_path)
        os.makedirs(folder_path)

    target = folder_path + arr_name
    save_array(target, arr)
    
def load_vectors(loc):
    """Load a vector array and its two pickled companions from the prefix `loc`.

    Returns a 3-tuple:
      * the result of `load_array(loc + '.dat')`,
      * the object unpickled from `loc + '_words.pkl'`,
      * the object unpickled from `loc + '_idx.pkl'`.

    Both pickle files are opened in context managers so their handles are
    closed even if unpickling fails.
    """
    vecs = load_array(loc+'.dat')
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(loc+'_words.pkl','rb') as words_file:
        words = pickle.load(words_file)
    with open(loc+'_idx.pkl','rb') as idx_file:
        wordidx = pickle.load(idx_file)
    return (vecs, words, wordidx)




def create_emb(vecs,words,wordidx,index2word,vocab_size):
    """Build a (vocab_size, n_fact) embedding matrix from pretrained vectors.

    Row 0 is left as zeros. For every other row i, the token index2word[i] is
    stripped of punctuation and lower-cased; if the result is non-empty,
    consists only of [a-zA-Z0-9-] and is a key of `wordidx`, the row is copied
    from vecs[wordidx[token]]. Otherwise the row is drawn from a normal
    distribution with scale 0.6. The last row is always re-drawn randomly, and
    the whole matrix is divided by 3 before it is returned.

    `words` is accepted but not used. The found / not-found counts are printed.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    punctuation = set(string.punctuation)
    token_pattern = r"^[a-zA-Z0-9\-]*$"

    hits = 0
    misses = 0
    for row in range(1, emb.shape[0]):
        raw_word = index2word[row]
        token = ''.join(ch for ch in raw_word if ch not in punctuation).lower()
        known = bool(token) and re.match(token_pattern, token) is not None and token in wordidx
        if not known:
            # No pretrained vector for this token: initialize it randomly.
            emb[row] = normal(scale=0.6, size=(n_fact,))
            misses += 1
            continue
        emb[row] = vecs[wordidx[token]]
        hits += 1

    # The last row is the "rare word" id and is always randomly initialized.
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb /= 3

    print("Found = %d"%hits)
    print("Not found = %d"%misses)

    return emb

## Pairing

In [39]:
# Split processed by this run; swap the two lines below for the training pass.
# NOTE(review): train_folder / val_folder are not assigned in the visible cells - confirm where they are defined.
# current_folder = train_folder
current_folder = val_folder

save_path = "/home/docker/fastai-courses/deeplearning1/nbs/persistent/vqa/data/"
path = save_path+"first-try/"
base_path = path + current_folder

# Folder (under `path`) holding the answer / word lookup tables. Defined here
# because the answer-selection cells below use it before the later cell that
# assigns it, which fails on a fresh kernel run from top to bottom.
general_datastruct_folder = "general-data-structures/"

In [40]:
# Load the raw question file of the current split and keep one
# (image_id, question_id, question) tuple per entry.
pairing_path = "../../vqa/raw_questions/"+current_folder+"pairing.json"
with open(pairing_path) as data_file:
    data = json.load(data_file)

question_fields = ("image_id", "question_id", "question")
imgId_qId_question_list = []
for questionJson in tqdm(data["questions"]):
    imgId_qId_question_list.append(tuple(questionJson[field] for field in question_fields))

100%|██████████| 214354/214354 [00:00<00:00, 791324.62it/s]


In [41]:
# One row per question of the current split.
key2Question_df = pd.DataFrame(imgId_qId_question_list,columns=['image_id', 'question_id', 'question'])
# Preview the first rows only instead of rendering the whole frame.
key2Question_df.head(10)

Unnamed: 0,image_id,question_id,question
0,262148,262148000,Where is he looking?
1,262148,262148001,What are the people in the background doing?
2,262148,262148002,What is he on top of?
3,393225,393225000,What website copyrighted the picture?
4,393225,393225001,Is this a creamy soup?
5,393225,393225002,Is this rice noodle soup?
6,393225,393225003,What is to the right of the soup?
7,393226,393226000,What is the man doing in the street?
8,393226,393226001,How many photo's can you see?
9,393226,393226002,What does the truck on the left sell?


# Answers

In [42]:
# Load the raw annotation file of the current split and collect one
# (image_id, question_id, answer) tuple per annotation.
answers_path = "../../vqa/raw_questions/"+current_folder+"answers.json"
with open(answers_path) as data_file:
    answer_data = json.load(data_file)

imgId_qId_answers_list = []
for answerJson in tqdm(answer_data["annotations"]):
    question_id = answerJson["question_id"]
    image_id = answerJson["image_id"]
    answer = answerJson["multiple_choice_answer"]
    answer_row = (image_id, question_id, answer)
    imgId_qId_answers_list.append(answer_row)

100%|██████████| 214354/214354 [00:00<00:00, 358001.30it/s]


In [43]:
# Answers of the current split, one row per annotation.
key2Answers_df = pd.DataFrame(imgId_qId_answers_list,columns=['image_id','question_id','answer'])

In [44]:
# Preview the first rows only instead of rendering the whole frame.
key2Answers_df.head(10)

Unnamed: 0,image_id,question_id,answer
0,262148,262148000,down
1,262148,262148001,watching
2,262148,262148002,picnic table
3,393225,393225000,foodiebakercom
4,393225,393225001,no
5,393225,393225002,yes
6,393225,393225003,chopsticks
7,393226,393226000,walking
8,393226,393226001,1
9,393226,393226002,ice cream


In [45]:
# Preview the first rows only instead of rendering the whole frame.
key2Question_df.head(10)

Unnamed: 0,image_id,question_id,question
0,262148,262148000,Where is he looking?
1,262148,262148001,What are the people in the background doing?
2,262148,262148002,What is he on top of?
3,393225,393225000,What website copyrighted the picture?
4,393225,393225001,Is this a creamy soup?
5,393225,393225002,Is this rice noodle soup?
6,393225,393225003,What is to the right of the soup?
7,393226,393226000,What is the man doing in the street?
8,393226,393226001,How many photo's can you see?
9,393226,393226002,What does the truck on the left sell?


In [46]:
# Row/column counts of both frames, to compare them before they are merged.
print(key2Question_df.shape)
print(key2Answers_df.shape)

(214354, 3)
(214354, 3)


In [47]:
# Join every answer to its question on the (image_id, question_id) key, keep
# the three columns used downstream and order the rows by image.
merged_df = pd.merge(key2Answers_df, key2Question_df, on=['image_id','question_id'])
all_data_df = merged_df[["image_id","question","answer"]]
all_data_df = all_data_df.sort_values("image_id")

In [48]:
# Preview the first rows only instead of rendering the whole frame.
all_data_df.head(10)

Unnamed: 0,image_id,question,answer
62,42,What color are the gym shoes?,white
64,42,What color is the flip flop?,red
63,42,Is there a red sandal here?,yes
915,73,What is the license number?,sv-6260
916,73,Is this a motorcycle or bike?,motorcycle
917,73,What color is the bike?,black
918,73,What letter and 3 numbers are on the tag?,sv-6260
160,74,Does this dog have a collar?,no
161,74,Where is the dog laying?,sidewalk
162,74,What is the dog doing?,sleeping


# Select only most popular answers

In [49]:
# Upper bound on how many of the most frequent answers form the answer vocabulary.
NR_TOP_ANSWERS = 1000

## For Train (compute the top answers from the training split and save them)

In [32]:
# Keep at most NR_TOP_ANSWERS answers, taken from the head of value_counts().
answer_counts = all_data_df["answer"].value_counts()[:NR_TOP_ANSWERS]
answers_arr = answer_counts.index.tolist()

# Build both lookup directions from the answers that are actually present.
# Iterating over range(NR_TOP_ANSWERS) would raise IndexError whenever the
# split has fewer distinct answers than the cap.
answer2Index = {}
index2Answer = {}
for index, answer in enumerate(answers_arr):
    answer2Index[answer] = index
    index2Answer[index] = answer

# NOTE: relies on general_datastruct_folder being set before this cell runs.
if not os.path.exists(path+general_datastruct_folder):
    os.makedirs(path+general_datastruct_folder)

preproc.save_obj(answers_arr,path+general_datastruct_folder+"answers_arr")
preproc.save_obj(answer2Index,path+general_datastruct_folder+"answer2Index")
preproc.save_obj(index2Answer,path+general_datastruct_folder+"index2Answer")

## For Valid (reload the answer vocabulary saved by the train run)

In [50]:
# Reload the answer vocabulary and both lookup tables from the shared folder.
answer_struct_dir = path+general_datastruct_folder
answers_arr = preproc.load_obj(answer_struct_dir+"answers_arr")
answer2Index = preproc.load_obj(answer_struct_dir+"answer2Index")
index2Answer = preproc.load_obj(answer_struct_dir+"index2Answer")

## Filtering

In [51]:
# Single-column frame of the kept answers, used as the right side of the filter merge.
popular_answers_df = pd.DataFrame(answers_arr,columns = ["answer"])
# Preview the first rows only instead of rendering the whole frame.
popular_answers_df.head(10)

Unnamed: 0,answer
0,yes
1,no
2,1
3,2
4,white
5,3
6,blue
7,red
8,black
9,0


In [52]:
# Keep only the rows whose answer is one of the popular answers (inner join on
# "answer"), then order the rows by image again.
data_df = pd.merge(all_data_df, popular_answers_df, how='inner', on=['answer'])
data_df = data_df.sort_values("image_id")
# Preview the first rows only instead of rendering the whole frame.
data_df.head(10)

Unnamed: 0,image_id,question,answer
0,42,What color are the gym shoes?,white
6794,42,Is there a red sandal here?,yes
4311,42,What color is the flip flop?,red
47774,73,What color is the bike?,black
47522,73,Is this a motorcycle or bike?,motorcycle
50286,74,Does this dog have a collar?,no
90481,74,What is the dog doing?,sleeping
90368,74,Where is the dog laying?,sidewalk
6795,133,Is this a child room?,yes
93309,133,What size mattress would you need for this bed?,twin


In [53]:
# Shapes before and after keeping only the rows whose answer is in answers_arr.
print(all_data_df.shape)
print(data_df.shape)

(214354, 3)
(186503, 3)


In [54]:
# Cap the split at a fixed number of rows, taken from the head of the
# image_id-sorted frame. NOTE(review): the reason for 8192 is not visible here.
MAX_ROWS_PER_SPLIT = 8192
data_df = data_df[:MAX_ROWS_PER_SPLIT]
print(data_df.shape)

(8192, 3)


In [55]:
# Persist the filtered frame of the current split under base_path.
base_path_missing = not os.path.exists(base_path)
if base_path_missing:
    os.makedirs(base_path)

data_df_csv_path = base_path+"data_df.csv"
data_df.to_csv(data_df_csv_path)

# Language Data Structures (run after data_df.csv has been written for both train and val)

In [56]:
def get_index_word_dicts(unique_words):
    """Return `(word_index, index_word)` for the given sequence of words.

    `index_word` maps each position to the word found there and `word_index`
    maps each word to its position (the last position wins for repeated words).
    """
    numbered = list(enumerate(unique_words))
    index_word = dict(numbered)
    word_index = {word: position for position, word in numbered}
    return (word_index, index_word)

def get_unique_words_helper(captions):
    """Return the distinct whitespace-separated tokens found in `captions`.

    captions: iterable of strings; each one is split with `str.split()`.
    Returns a list in sorted order, so the indices later assigned to the words
    are reproducible from run to run (a plain `list(set(...))` has no defined
    order).
    """
    unique_words = set()
    for caption in captions:
        unique_words.update(caption.split())
    return sorted(unique_words)

def get_unique_words(data_df):
    """Return the distinct words used in the "question" column of `data_df`.

    Every distinct question is converted with str() and loses its final
    character (presumably the trailing "?" - TODO confirm every question ends
    with one) before being tokenized by get_unique_words_helper.
    """
    distinct_questions = data_df["question"].value_counts().index.tolist()
    trimmed_questions = []
    for question in distinct_questions:
        trimmed_questions.append(str(question)[:-1])
    return get_unique_words_helper(trimmed_questions)

In [57]:
# Reload the per-split frames saved as data_df.csv under each split folder.
# NOTE(review): train_folder / val_folder are not assigned in the visible cells - confirm where they are defined.
train_df = pd.read_csv(path+train_folder+"data_df.csv")
val_df = pd.read_csv(path+val_folder+"data_df.csv")

In [58]:
# Shapes of the reloaded frames. They show one column more than the frame that
# was saved - presumably the index written by to_csv; verify if it matters downstream.
print(train_df.shape)
print(val_df.shape)

(8192, 4)
(8192, 4)


In [59]:
# Stack the train and val rows so the vocabulary below covers both splits.
joined_df = pd.concat([train_df,val_df])
joined_df.shape

(16384, 4)

In [60]:
# Vocabulary of all question words, plus the word <-> index lookup tables.
unique_words = get_unique_words(joined_df)
(word2index, index2word) = get_index_word_dicts(unique_words)

In [61]:
# Number of distinct question words across train and val.
VOCAB_SIZE = len(unique_words)
VOCAB_SIZE

4469

In [62]:
# Folder (under `path`) where the lookup tables are stored.
# NOTE(review): cells further up already use this name, so on a fresh kernel it has to be set before they run.
general_datastruct_folder = "general-data-structures/"

In [63]:
# Persist the vocabulary and both word lookup tables in the shared folder.
datastruct_dir = path+general_datastruct_folder
if not os.path.exists(datastruct_dir):
    os.makedirs(datastruct_dir)

word_structures = (
    ("unique_words", unique_words),
    ("word2index", word2index),
    ("index2word", index2word),
)
for struct_name, struct_obj in word_structures:
    preproc.save_obj(struct_obj, datastruct_dir+struct_name)