In [None]:
# Mount Google Drive at /content/drive (Colab runtime only); the data paths
# configured further down in this notebook are all rooted under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ! ls drive/MyDrive/BOWIE

In [None]:
import gzip
import json
import numpy as np
import h5py

In [None]:
# Project root on the mounted drive and the folders derived from it.
root = '/content/drive/MyDrive/BOWIE/'
subset_path = f'{root}Data/Subset/'
aux_path = f'{root}Data/Additional/'
weights_path = f'{root}Weights/'

# ---- Question files (gzip-compressed JSON), one per split ----
path_qdata_train = f'{subset_path}qdata_train.gzip'
path_qdata_test = f'{subset_path}qdata_test.gzip'
path_qdata_valid = f'{subset_path}qdata_valid.gzip'

# ---- Annotation (answer) files, one per split ----
path_adata_train = f'{subset_path}adata_train.gzip'
path_adata_test = f'{subset_path}adata_test.gzip'
path_adata_valid = f'{subset_path}adata_valid.gzip'

# ---- Auxiliary image files ----

# ResNet image features for all images (train, val, test), stored as one array.
PATH_TO_H5_FILE = f'{aux_path}VQA_image_features.h5'

# Mapping from image_id to the row index inside the .h5 feature array.
PATH_TO_FEAT2ID_FILE = f'{aux_path}VQA_img_features2id.json'

# All image IDs that occur in our data subset.
PATH_TO_IDS_FILE = f'{aux_path}image_ids_vqa.json'

# Flickr url (and more image information) for the MSCOCO training and
# validation images.
PATH_TO_ID2INFO_FILE = f'{aux_path}imgid2imginfo.json'


In [None]:
def read_image_data():
    """Load the image-related inputs from disk.

    Reads the files named by PATH_TO_H5_FILE, PATH_TO_IDS_FILE,
    PATH_TO_FEAT2ID_FILE and PATH_TO_ID2INFO_FILE.

    Returns:
        tuple: ``(img_ids, img_features, visual_feat_mapping, imgid2info)``
            - img_ids: value stored under ``'image_ids'`` in the IDs file.
            - img_features: numpy array built from the ``'img_features'``
              dataset of the HDF5 file.
            - visual_feat_mapping: value stored under ``'VQA_imgid2id'`` in
              the feature-mapping file.
            - imgid2info: full parsed content of the info-mapping file.
    """
    # Copy the features into memory inside a context manager so the HDF5
    # handle is closed instead of being left open for the kernel's lifetime.
    with h5py.File(PATH_TO_H5_FILE, 'r') as h5_file:
        img_features = np.asarray(h5_file['img_features'])

    # load IDs file
    with open(PATH_TO_IDS_FILE, 'r') as f:
        img_ids = json.load(f)['image_ids']

    # load feature mapping file
    with open(PATH_TO_FEAT2ID_FILE, 'r') as f:
        visual_feat_mapping = json.load(f)['VQA_imgid2id']

    # load info mapping file
    with open(PATH_TO_ID2INFO_FILE, 'r') as f:
        imgid2info = json.load(f)

    return img_ids, img_features, visual_feat_mapping, imgid2info


In [None]:
def read_textual_data():
    """Load the question and answer records of every split.

    Returns:
        list: six lists, in this order: questions for train, test, valid,
        followed by answers for train, test, valid. Question entries are
        ``[question, image_id]``; answer entries are
        ``[multiple_choice_answer, image_id]``.
    """

    def load(path, section, text_field):
        # Announce the file, decompress and parse it, keep two fields per record.
        print('Reading', path, '...')
        with gzip.open(path, 'r') as archive:
            records = json.loads(archive.read().decode('utf-8'))[section]
            return [[record[text_field], record['image_id']] for record in records]

    collected = []

    for question_file in (path_qdata_train, path_qdata_test, path_qdata_valid):
        collected.append(load(question_file, 'questions', 'question'))

    for answer_file in (path_adata_train, path_adata_test, path_adata_valid):
        collected.append(load(answer_file, 'annotations', 'multiple_choice_answer'))

    return collected

In [None]:
# map img_id to list of visual features corresponding to that image
def img_id_to_feat(img_id, visual_feat_mapping, img_features):
    """Return the entry of ``img_features`` that belongs to ``img_id``.

    ``visual_feat_mapping`` is keyed by the string form of the image id and
    yields the index into ``img_features``. Raises ``KeyError`` when the id
    has no mapping.
    """
    return img_features[visual_feat_mapping[str(img_id)]]

In [None]:
def _collect_split(questions, answers, limit, visual_feat_mapping, img_features):
    """Build the sample list and feature list for the first ``limit`` records of one split."""
    samples, features = [], []
    for i in range(limit):
        tokens = questions[i][0].split()
        answer, img_id = answers[i][0], answers[i][1]

        # Single lookup: records whose image has no feature index are skipped.
        h5_id = visual_feat_mapping.get(str(img_id))
        if h5_id is None:
            continue

        samples.append((tokens, answer, img_id))
        features.append(img_features[h5_id])
    return samples, features


def train_valid_test_data(
    q_train, q_valid, q_test, a_train, a_valid, a_test,
    visual_feat_mapping, img_features,
    TRAIN_LEN, VALID_LEN, TEST_LEN
):
    """Pair questions with answers and image features for each split.

    For each split the first ``*_LEN`` question/answer records are paired by
    position. The question text is split on whitespace; the answer and image
    id are taken from the answer record. Records whose image id (as a string)
    is missing from ``visual_feat_mapping`` are dropped.

    Args:
        q_train, q_valid, q_test: question records, each ``[question, image_id]``.
        a_train, a_valid, a_test: answer records, each ``[answer, image_id]``.
        visual_feat_mapping: dict from image id string to index in ``img_features``.
        img_features: indexable collection of per-image features.
        TRAIN_LEN, VALID_LEN, TEST_LEN: number of records to consider per split.

    Returns:
        dict: keys ``'<split>_data'`` (list of ``(tokens, answer, img_id)``
        tuples) and ``'<split>_visual'`` (list of matching feature entries)
        for ``train``, ``valid`` and ``test``.

    Prints a before/after filtering summary for every split.
    """
    splits = (
        ('train', q_train, a_train, TRAIN_LEN),
        ('valid', q_valid, a_valid, VALID_LEN),
        ('test', q_test, a_test, TEST_LEN),
    )

    bag = {}
    for name, questions, answers, limit in splits:
        bag[f'{name}_data'], bag[f'{name}_visual'] = _collect_split(
            questions, answers, limit, visual_feat_mapping, img_features
        )

    for name, _, _, before in splits:
        part = f'{name}_data'
        after = len(bag[part])
        # An empty split has nothing to filter; avoid dividing by zero.
        filtered = round((1 - after / before) * 100, 1) if before else 0.0
        print( f'\n{part}' )
        print( f'Before filtering: {before}' )
        print( f'After filtering: {after}' )
        print( f'Filtered out: { filtered }\n' )

    return bag

In [None]:
def clean(word):
    """Normalise a single question token.

    Steps, in order: drop one trailing ``?``, keep only the part before the
    first ``/`` and before the first backslash, drop a trailing ``'s``, and
    lowercase the result. An empty string is returned unchanged.
    """
    # endswith() is safe on an empty string, unlike indexing word[-1].
    if word.endswith('?'):
        word = word[:-1]

    # split() returns the whole word when the separator is absent.
    word = word.split('/')[0]
    word = word.split('\\')[0]

    # Only strip the suffix when it really is at the end; a mid-word "'s"
    # must not cause the last two characters to be cut off.
    if word.endswith("'s"):
        word = word[:-2]

    return word.lower()

In [None]:
def vocabulary(data):
    """Index every question word and every answer found in the three splits.

    Walks ``train_data``, ``valid_data`` and ``test_data`` in that order.
    Question tokens are passed through ``clean`` before being indexed; answers
    are indexed as-is. Each new entry receives the next free integer id.

    Returns:
        tuple: ``(questions_vocabulary, answers_vocabulary)``, both dicts
        mapping an entry to its integer id.
    """
    questions_vocabulary, answers_vocabulary = {}, {}

    samples = data['train_data'] + data['valid_data'] + data['test_data']
    for tokens, answer, _img_id in samples:
        for token in tokens:
            # setdefault keeps the id of a known word and assigns the current
            # vocabulary size to an unseen one.
            questions_vocabulary.setdefault(clean(token), len(questions_vocabulary))
        answers_vocabulary.setdefault(answer, len(answers_vocabulary))

    return questions_vocabulary, answers_vocabulary

In [None]:
def stats( prepared ):
    """Print a summary of the prepared dataset.

    Reads ``prepared['data']`` (dict of lists), ``prepared['VOCAB_SIZE']`` and
    ``prepared['NUM_LABELS']``. For every list in ``data`` it prints the key,
    the list length, the length of the first element and the first element
    itself; an empty list is reported with placeholders instead of failing.
    """

    pad = '=============='
    def title(text):
        print( f'\n{pad} {text} {pad}\n' )

    print('\nSTATS FOR NERDS')

    title('DATA LENGTH')
    for k, v in prepared.get('data').items():
        if len(v) == 0:
            # A split can end up empty (e.g. everything filtered out);
            # there is no first element to preview in that case.
            print( f'{k}\t0\t-\t(empty)\n' )
            continue
        print( f'{k}\t{len(v)}\t{len(v[0])}\t{v[0]}\n' )

    title('QUESTIONS VOCABULARY')
    print( f'Number of unqiue words: {prepared.get("VOCAB_SIZE")}' )

    title('ANSWERS VOCABULARY')
    print( f'Total number of possible answers: {prepared.get("NUM_LABELS")}' )
    


In [None]:
def prepare_data(fraction=1):
    """Load all inputs and assemble everything needed downstream.

    Args:
        fraction: share of each split's question list to use, in ``(0, 1]``.
            Defaults to 1 (use everything).

    Returns:
        dict with the keys ``data``, ``questions_vocabulary``,
        ``answers_vocabulary``, ``lookup`` (list of answers in vocabulary
        order), ``VOCAB_SIZE``, ``NUM_LABELS``, ``IMG_FEAT_SIZE``,
        ``TRAIN_LEN``, ``VALID_LEN``, ``TEST_LEN`` (sizes after filtering),
        ``imgid2info``, ``visual_feat_mapping`` and ``img_features``.

    Raises:
        ValueError: if ``fraction`` is not in ``(0, 1]``.

    Also prints a summary via ``stats``.
    """
    if not 0 < fraction <= 1:
        raise ValueError(f'fraction must be in (0, 1], got {fraction!r}')

    # read in textual (question-answer) data
    # NOTE: read_textual_data() returns the splits in train, test, valid order
    # (questions first, then answers). Unpack in that same order so the
    # validation and test splits are not swapped.
    q_train, q_test, q_valid, a_train, a_test, a_valid = read_textual_data()

    # read in visual feature data
    img_ids, img_features, visual_feat_mapping, imgid2info = read_image_data()

    # length of a single image feature entry
    IMG_FEAT_SIZE = len(img_features[0])

    TRAIN_LEN = int(fraction * len(q_train))
    VALID_LEN = int(fraction * len(q_valid))
    TEST_LEN =  int(fraction * len(q_test))

    data = train_valid_test_data(
        q_train, q_valid, q_test, a_train, a_valid, a_test,
        visual_feat_mapping, img_features,
        TRAIN_LEN, VALID_LEN, TEST_LEN
    )

    questions_vocabulary, answers_vocabulary = vocabulary( data )
    # answer id -> answer, relying on dict insertion order
    lookup = list(answers_vocabulary.keys())

    VOCAB_SIZE = len(questions_vocabulary) # amount of unique words in questions
    NUM_LABELS = len(answers_vocabulary) # amount of unique answers

    prepared = {
        'data': data,
        'questions_vocabulary': questions_vocabulary,
        'answers_vocabulary': answers_vocabulary,
        'lookup': lookup,
        'VOCAB_SIZE': VOCAB_SIZE,
        'NUM_LABELS': NUM_LABELS,
        'IMG_FEAT_SIZE': IMG_FEAT_SIZE,
        # sizes after samples without image features were dropped
        'TRAIN_LEN': len( data["train_data"] ),
        'VALID_LEN': len( data["valid_data"] ),
        'TEST_LEN': len( data["test_data"] ),
        'imgid2info': imgid2info,
        'visual_feat_mapping': visual_feat_mapping,
        'img_features': img_features
    }

    stats( prepared )

    return prepared


In [None]:
# data = read_textual_data()
# img_ids, img_features, visual_feat_mapping, imgid2info = read_image_data()

In [None]:
# p = prepare_data()

Reading /content/drive/MyDrive/BOWIE/Data/Subset/qdata_train.gzip ...
Reading /content/drive/MyDrive/BOWIE/Data/Subset/qdata_test.gzip ...
Reading /content/drive/MyDrive/BOWIE/Data/Subset/qdata_valid.gzip ...
Reading /content/drive/MyDrive/BOWIE/Data/Subset/adata_train.gzip ...
Reading /content/drive/MyDrive/BOWIE/Data/Subset/adata_test.gzip ...
Reading /content/drive/MyDrive/BOWIE/Data/Subset/adata_valid.gzip ...

train_data
Before filtering: 136721
After filtering: 96882
Filtered out: 29.1


valid_data
Before filtering: 8430
After filtering: 6008
Filtered out: 28.7


test_data
Before filtering: 25849
After filtering: 18217
Filtered out: 29.5


STATS FOR NERDS


train_data	96882	3	(['What', 'English', 'meal', 'is', 'this', 'likely', 'for?'], 'tea', 228478)

train_visual	96882	2048	[0.21652411 0.4433445  1.0575547  ... 0.43506446 0.16850366 0.24101193]

valid_data	6008	3	(['Is', 'there', 'a', 'bell', 'on', 'the', 'train?'], 'yes', 540769)

valid_visual	6008	2048	[0.02937265 0.24076903 

In [None]:
# %cd drive/MyDrive/BOWIE/Notebooks
# !jupyter nbconvert --to python DataPreparation.ipynb