In [1]:
import json
import pandas as pd
from tqdm import tqdm
from vgg16 import Vgg16
import numpy as np
import PIL.Image
from utils import *
from matplotlib import pyplot as plt
import cv2
import os
import cPickle as pickle
from keras.preprocessing import sequence

import shutil
import string

import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

import re
from numpy.random import random, permutation, randn, normal
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten,Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers import Embedding,GRU,LSTM,TimeDistributed,RepeatVector,Merge
from keras.optimizers import SGD, RMSprop, Adam

import preprocessor as preproc

Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [2]:
def save_array_with_folder_create(folder_path,arr_name,arr):
    """Save `arr` under the name `arr_name` inside `folder_path`.

    The folder (including missing parents) is created first when it does not
    exist yet.

    Args:
        folder_path: Directory that should contain the array. A trailing path
            separator is optional.
        arr_name: Name of the stored array inside `folder_path`.
        arr: Object handed to `save_array` unchanged.
    """
    if not os.path.exists(folder_path):
        print("Creating folder: "+folder_path)
        os.makedirs(folder_path)

    # os.path.join only inserts a separator when one is missing, so a
    # folder_path without a trailing "/" no longer ends up being written
    # next to (instead of inside) the folder created above.
    save_array(os.path.join(folder_path, arr_name), arr)
    
def load_vectors(loc):
    """Load an embedding matrix together with its word list and word index.

    Three sibling files derived from the prefix `loc` are read:
    `loc + '.dat'` (through `load_array`), `loc + '_words.pkl'` and
    `loc + '_idx.pkl'` (both through pickle).

    Args:
        loc: Path prefix shared by the three files (without extension).

    Returns:
        Tuple `(vectors, words, word_index)`, in that order.
    """
    vecs = load_array(loc+'.dat')

    # NOTE(security): unpickling can execute arbitrary code; only use this
    # on files from a trusted source.
    # The context managers close the file handles, which the previous
    # bare open() calls left to the garbage collector.
    with open(loc+'_words.pkl','rb') as words_file:
        words = pickle.load(words_file)
    with open(loc+'_idx.pkl','rb') as idx_file:
        wordidx = pickle.load(idx_file)

    return (vecs, words, wordidx)




def create_emb(vecs,words,wordidx,index2word,vocab_size):
    """Build an embedding matrix for the vocabulary from pre-trained vectors.

    Row 0 is left as zeros. For every other row the vocabulary word is
    stripped of punctuation and lower-cased; if the cleaned word is present
    in `wordidx` its pre-trained vector is copied, otherwise the row is drawn
    from a normal distribution (scale 0.6). The last row is always replaced
    by a fresh random draw, and finally the whole matrix is divided by 3.

    Args:
        vecs: 2-D array of pre-trained vectors; its second dimension sets the
            embedding width.
        words: Unused by this function.
        wordidx: Mapping from word to its row in `vecs`.
        index2word: Mapping from vocabulary index to word.
        vocab_size: Number of rows of the returned matrix.

    Returns:
        Array of shape `(vocab_size, vecs.shape[1])`.
    """
    dim = vecs.shape[1]
    table = np.zeros((vocab_size, dim))
    punctuation = set(string.punctuation)

    hits = 0
    misses = 0

    for row in range(1, table.shape[0]):
        kept_chars = [c for c in index2word[row] if c not in punctuation]
        token = ''.join(kept_chars).lower()

        usable = bool(token) and re.match(r"^[a-zA-Z0-9\-]*$", token) is not None and token in wordidx
        if not usable:
            # No pre-trained vector for this word: start from random values.
            table[row] = normal(scale=0.6, size=(dim,))
            misses += 1
            continue

        table[row] = vecs[wordidx[token]]
        hits += 1

    # The last index is the "rare word" id and always gets a random vector,
    # regardless of what the loop above stored there.
    table[-1] = normal(scale=0.6, size=(dim,))
    table /= 3

    print("Found = %d"%hits)
    print("Not found = %d"%misses)

    return table

In [3]:
# Dataset split processed by this notebook. To build the training batches
# instead, assign train_folder / train_images_path here.
# NOTE(review): val_folder and val_images_path are not defined in any visible
# cell; presumably they come from `from utils import *` — confirm.
current_folder, images_path = val_folder, val_images_path

# Root directory of the generated data, the "first-try/" sub-directory below
# it, and the directory of the selected split inside that.
save_path = "/home/docker/fastai-courses/deeplearning1/nbs/persistent/vqa/data/"
path = save_path + "first-try/"
base_path = path + current_folder

In [4]:
# Question/answer table of the selected split (columns used later:
# image_id, question, answer).
data_df = pd.read_csv(base_path+"data_df.csv")
# Preview only the first rows instead of rendering the whole frame.
data_df.head(10)

Unnamed: 0.1,Unnamed: 0,image_id,question,answer
0,0,42,What color are the gym shoes?,white
1,6794,42,Is there a red sandal here?,yes
2,4311,42,What color is the flip flop?,red
3,47774,73,What color is the bike?,black
4,47522,73,Is this a motorcycle or bike?,motorcycle
5,50286,74,Does this dog have a collar?,no
6,90481,74,What is the dog doing?,sleeping
7,90368,74,Where is the dog laying?,sidewalk
8,6795,133,Is this a child room?,yes
9,93309,133,What size mattress would you need for this bed?,twin


In [5]:
# Vocabulary and answer lookup tables are all stored under the same folder.
# NOTE(review): general_datastruct_folder is not defined in any visible cell;
# presumably it comes from `from utils import *` — confirm.
_datastruct_prefix = path+general_datastruct_folder

# Question vocabulary and its two lookup directions.
unique_words = preproc.load_obj(_datastruct_prefix+"unique_words")
word2index = preproc.load_obj(_datastruct_prefix+"word2index")
index2word = preproc.load_obj(_datastruct_prefix+"index2word")

# Answer lookup tables in both directions.
answer2Index = preproc.load_obj(_datastruct_prefix+"answer2Index")
index2Answer = preproc.load_obj(_datastruct_prefix+"index2Answer")

VOCAB_SIZE = len(unique_words)
print(VOCAB_SIZE)

4469


In [6]:
# Length of every index-encoded question; passed as `maxlen` to
# pad_sequences in get_indexed_qustions.
MAX_QUESTION_LENGTH  = 10

In [7]:
# Number of data_df rows that make up one saved batch.
BATCH_SIZE = 2048

In [8]:
# Length of each one-hot answer vector built by get_np_answers.
NR_TOP_ANSWERS = 1000

# Make batches

In [9]:
# Drop the final two layers of the VGG16 model so that predict() returns
# the 4096-D activations instead of the network's original output.
image_model = Vgg16().model
for _drop in range(2):
    image_model.pop()

In [10]:
# Number of complete batches in data_df; a trailing partial batch is not
# counted. Floor division keeps the result an int on Python 2 and Python 3
# alike (plain "/" yields a float under Python 3 or with
# `from __future__ import division`).
NR_BATCHES = data_df.shape[0] // BATCH_SIZE
NR_BATCHES

4

In [11]:
def get_np_vgg_features(img_id_list,base_path):
    """Load the images of one batch and run them through `image_model`.

    Uses the notebook-level globals `current_folder` (its value minus the
    last character is embedded in the image file name) and `image_model`.

    Args:
        img_id_list: Image ids, one per sample; ids may repeat. Each id is
            zero-padded to 12 characters in the file name.
        base_path: Directory that contains the image files.

    Returns:
        Tuple `(np_images, vgg_features)`: the images resized to 224x224,
        stacked and transposed with axes (0, 3, 1, 2), and the result of
        `image_model.predict(np_images)`.

    Raises:
        IOError: If an image file cannot be read.
    """
    # Ids can repeat within a batch, so each file is decoded only once.
    loaded = {}

    images = []
    for img_id in tqdm(img_id_list):
        if img_id not in loaded:
            img_path = base_path+"/"+"COCO_"+current_folder[:-1]+"2014_"+format(img_id, "012")+".jpg"
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals a missing/unreadable file by returning
                # None; fail here with the path instead of inside cv2.resize.
                raise IOError("Could not read image: "+img_path)
            # NOTE(review): cv2.imread yields BGR channel order — confirm this
            # matches what the Vgg16 model's preprocessing expects.
            loaded[img_id] = np.asarray(cv2.resize(img,(224,224)))

        images.append(loaded[img_id])

    np_images = np.stack(images)
    # Move the channel axis in front of the two spatial axes.
    np_images = np.transpose(np_images,(0,3,1,2))

    vgg_features = image_model.predict(np_images)

    return (np_images,vgg_features)

def write_batches(np_images, np_vgg_features, np_indexed_questions, np_questions, np_answers, batch_index):
    """Store the five arrays of one batch below `base_path + batch_folder`.

    Every array is written to its own sub-folder (taken from the notebook
    globals `images_folder`, `vgg_folder`, `indexed_questions_folder`,
    `questions_folder`, `answers_folder`) under a name made of a fixed
    prefix, `batch_index` zero-padded to 6 characters, and '.bc'.

    Args:
        np_images: Image array of the batch.
        np_vgg_features: Model features of the batch.
        np_indexed_questions: Index-encoded questions of the batch.
        np_questions: Raw question strings of the batch.
        np_answers: Encoded answers of the batch.
        batch_index: Number of the batch, used in the file names.
    """
    batch_root = base_path + batch_folder
    suffix = format(batch_index, "06") + '.bc'

    # (sub-folder, file-name prefix, array), written in this order.
    outputs = (
        (images_folder, 'img_batch_', np_images),
        (vgg_folder, 'img_vgg_batch_', np_vgg_features),
        (indexed_questions_folder, 'indexed_questions_batch_', np_indexed_questions),
        (questions_folder, 'questions_batch_', np_questions),
        (answers_folder, 'answers_batch_', np_answers),
    )

    for sub_folder, prefix, arr in outputs:
        save_array_with_folder_create(batch_root + sub_folder, prefix + suffix, arr)
    
    

In [12]:
def get_indexed_qustions(raw_captions):
    """Encode question strings as fixed-length arrays of word indices.

    For each question the last character is removed, the remainder is split
    on whitespace, every token is looked up in the global `word2index`, and
    the index list is passed through `pad_sequences` with
    `maxlen=MAX_QUESTION_LENGTH` and 'post' padding.

    Args:
        raw_captions: Iterable of question strings.

    Returns:
        List with one squeezed index array per question.

    Raises:
        KeyError: If a token is missing from `word2index`.
    """
    encoded = []
    for question in tqdm(raw_captions):
        # The last character is expected to be the question mark.
        # NOTE(review): a question that does not end in '?' loses a real
        # character here — confirm the inputs always end with one.
        tokens = question[:-1].split()
        word_ids = [word2index[token] for token in tokens]

        padded = sequence.pad_sequences([word_ids], maxlen=MAX_QUESTION_LENGTH, padding='post')
        # pad_sequences returns one row per input sequence; drop that axis.
        encoded.append(np.squeeze(padded))

    return encoded

def get_np_answers(answer_list):
    """One-hot encode answers using the global `answer2Index` mapping.

    Args:
        answer_list: Iterable of answers; each must be a key of
            `answer2Index`.

    Returns:
        Array of shape `(number of answers, NR_TOP_ANSWERS)` holding a single
        1 per row at the answer's index and 0 elsewhere. An empty input gives
        an array with zero rows.

    Raises:
        KeyError: If an answer is missing from `answer2Index`.
    """
    answer_list = list(answer_list)

    # Allocating the matrix up front (instead of stacking per-answer vectors)
    # also works for an empty batch, where np.stack would raise.
    one_hot = np.zeros((len(answer_list), NR_TOP_ANSWERS))
    for row, answer in enumerate(answer_list):
        one_hot[row, answer2Index[answer]] = 1

    return one_hot
    

In [13]:
# Sub-folder names used by write_batches (appended to base_path + batch_folder)
# for the VGG features and for the index-encoded questions.
vgg_folder = "img-vgg/"
indexed_questions_folder = "indexed-questions/"

In [14]:
# Build and store the batches one at a time. Only the first two batches are
# generated here.
# NOTE(review): NR_BATCHES is computed in an earlier cell but not used by
# this loop — confirm whether range(2) is a deliberate cap.
for batch_index in range(2):

    print("Batch = %d"%batch_index)

    # Rows of data_df that belong to this batch.
    row_start = batch_index*BATCH_SIZE
    row_stop = row_start+BATCH_SIZE
    current_data_df = data_df.iloc[row_start:row_stop]
    print("current_data_df.shape = %s"%str(current_data_df.shape))

    img_id_list = current_data_df["image_id"].tolist()
    question_list = current_data_df["question"].tolist()
    answer_list = current_data_df["answer"].tolist()

    # Images and model features first, then the question/answer encodings.
    (np_images,np_vgg_features) = get_np_vgg_features(img_id_list,images_path)
    np_indexed_questions = get_indexed_qustions(question_list)
    np_questions = np.asarray(question_list)
    np_answers = get_np_answers(answer_list)

    write_batches(np_images, np_vgg_features, np_indexed_questions, np_questions, np_answers,batch_index)


  2%|▏         | 51/2048 [00:00<00:04, 475.51it/s]

Batch = 0
current_data_df.shape = (2048, 4)


100%|██████████| 2048/2048 [00:02<00:00, 945.70it/s]
100%|██████████| 2048/2048 [00:00<00:00, 54981.56it/s]
  5%|▍         | 100/2048 [00:00<00:02, 972.02it/s]

Batch = 1
current_data_df.shape = (2048, 4)


100%|██████████| 2048/2048 [00:02<00:00, 902.67it/s] 
100%|██████████| 2048/2048 [00:00<00:00, 63340.59it/s]


## QA

In [15]:
# Folders that write_batches fills with the image, raw-question and answer
# arrays of each batch.
image_dir = base_path + batch_folder + images_folder
questions_dir = base_path + batch_folder + questions_folder
answers_dir = base_path + batch_folder + answers_folder

# Directory listings, sorted by name.
image_paths = sorted(os.listdir(image_dir))
questions_paths = sorted(os.listdir(questions_dir))
answers_paths = sorted(os.listdir(answers_dir))

In [16]:
# Parameters for the QA section.
# NOTE(review): the code that reads these is not visible in this part of the
# notebook — presumably a batch selector and the number of instances to
# inspect; confirm against the following cells.
nr_batches = 0
nr_instances = 20