In [3]:
! ln -s /usr/local/courses/lt2318/data/ data

ln: failed to create symbolic link 'data/data': File exists


In [4]:
! ls data/visual_genome/

image_data.json      objects.pickle	       VG_100K
image_data.json.zip  read_objects_resnet50.py  VG_100K_2
images2.zip	     relationships.json
images.zip	     relationships.json.zip


In [5]:
import json
from collections import Counter, defaultdict
import numpy as np
import pickle

%matplotlib inline
from matplotlib import pyplot as plt

In [6]:
# read from file
relationships_from_file = json.load(open('data/visual_genome/relationships.json'))

# name/names correction for reading content of nodes in the dataset
name_extract = lambda x: x['names'][0].lower() if 'names' in x and len(x['names']) else x['name'].lower() if 'name' in x else '' 

In [7]:
print('number of images:', len(relationships_from_file))

number of images: 108077


In [8]:
image_data_from_file = json.load(open('data/visual_genome/image_data.json'))
image_id_to_size = {
    image_data['image_id']: (image_data['width'], image_data['height'])
    for image_data in image_data_from_file
}

In [9]:
# Functions for getting descriptions (relations) and bounding boxes (which are normalised by image size in the process).
def get_description(relations_data):
    triplet = (
        name_extract(relation_data['subject']),
        relation_data['predicate'].lower(), # synset?
        name_extract(relation_data['object']),
    )
    
    return triplet

def get_bboxes(relations_data, size):
    w0, h0 = size
    def normalize(bbox):
        x, y, w, h = bbox
        return (x/w0, y/h0, w/w0, h/h0, )
    
    bboxes = (
        normalize([relation_data['subject'][d] for d in ['x', 'y', 'w', 'h', ]]),
        normalize([relation_data['object'][d] for d in ['x', 'y', 'w', 'h', ]]),
    )
    
    return bboxes

In [10]:
prepositions = ['over', 'above', 'below', 'under', 'on', 'in', 'right of', 'left of']

In [11]:
# Loading visual data.
with open('/usr/local/courses/lt2318/data/visual_genome/objects.pickle', 'rb') as f:
    visual_features = pickle.load(f)

In [12]:
# Function for getting the visual feature vectors from the previous loaded file based on the object_id of the subject and object in a relation.
def get_vis_vecs(relations_data, vis_features):
    subid = relations_data['subject']['object_id']
    objid = relations_data['object']['object_id']
    
    try:
        target_vf = vis_features[subid]
    except KeyError:
        target_vf = None
    try:
        landmark_vf = vis_features[objid]
    except KeyError:
        landmark_vf = None
                
    return target_vf, landmark_vf

In [13]:
# Creating a dictionary which contains another dictionary for each preposition, which contains two lists - one for targets and one for landmarks.
prep_vis_feat = {
    p: {}
    for p in prepositions
}

for p in prep_vis_feat:
    prep_vis_feat[p]['target'] = []
    prep_vis_feat[p]['landmark'] = []

# Gets visual feature vectors for relations if their predicate is one of the prepositions in question.
# Also checks that there are both target and landmark for each relation, and only keeps the ones that have the full pairs.
# This was to deal with the fact that I was initially getting different numbers of targets and landmarks.
for img_relations in relationships_from_file:
    for relation_data in img_relations['relationships']:
        _, p, _ = get_description(relation_data)
        
        if p in prep_vis_feat:
            target_vf, landmark_vf = get_vis_vecs(relation_data, visual_features)
            
            if target_vf is not None and landmark_vf is not None:
                prep_vis_feat[p]['target'].append(target_vf)
                prep_vis_feat[p]['landmark'].append(landmark_vf)
            else:
                pass

In [14]:
for p in prep_vis_feat:
    print(p, len(prep_vis_feat[p]["target"]), len(prep_vis_feat[p]["landmark"]))

over 8200 8200
above 12583 12583
below 3155 3155
under 16985 16985
on 544509 544509
in 183951 183951
right of 220 220
left of 365 365


In [15]:
# Saves the dictionary containing all of the visual feature vectors divided by targets and landmark by preposition to file.
with open('pvis_vecs.pkl', 'wb') as sf:
    pickle.dump(prep_vis_feat, sf)

In [16]:
type(prep_vis_feat[p]["target"])
prep_vis_feat[p]["target"][0].shape

(2048,)

In [35]:
# Takes all target and landmark vectors from prep_vis_feat. Sets a variable to the first vector of each set
# and then adds them together (one addition version and one append version currently). 
prep_sum_dict = {
    p: {}
    for p in prepositions
}

for p in prep_vis_feat:
    targ_vecs = prep_vis_feat[p]['target']
    landm_vecs = prep_vis_feat[p]['landmark']
    
    targets = targ_vecs[0]
    landmarks = landm_vecs[0]
    
    print(targets, targets.shape)
    print(landmarks, landmarks.shape)
    
    # Addition version, where dimension (2048) remains the same.
    # Change indexing to [1:] when running fully.
    for tvec in targ_vecs[1:2]:
        print(tvec)
        targets = targets + tvec
        
    for lvec in landm_vecs[1:2]:
        print(lvec)
        landmarks = landmarks + lvec
        
    print(targets, targets.shape)
    print(landmarks, landmarks.shape)
    
    # Append version, where vectors are just appended on each other and become super long (2048, 4096, 6144...).
    # Change indexing to [1:] when running fully.
    '''for tvec in targ_vecs[1:2]:
        targets = np.append(targets, tvec)
        
    for lvec in landm_vecs[1:2]:
        landmarks = np.append(landmarks, lvec)
        
    print(targets, targets.shape)
    print(landmarks, landmarks.shape)'''
 
    #Save summed vectors to a dictionary that has a dict entry for each preposition. Put the target one under target and the landmark one under landmark.
    prep_sum_dict[p]['target'] = targets
    prep_sum_dict[p]['landmark'] = landmarks

    '''#Might not need to save to file, this stuff is not exponential like the previous work.
    lst_filename = p + "_what_dict.pkl"

    with open(lst_filename, 'wb') as outfile:
        pickle.dump(prep_cos_dict, outfile, pickle.HIGHEST_PROTOCOL)'''

    # Progress indication print.
    print("Done with:", p)

[0.53191054 0.53470176 0.30310673 ... 1.0023298  0.         1.2908782 ] (2048,)
[0.127356   0.16918519 0.20135735 ... 0.8276626  0.         0.3897869 ] (2048,)
[0.         0.37064752 0.         ... 0.8056022  0.00893039 0.05825645]
[1.0312723e-01 6.0286057e-01 0.0000000e+00 ... 7.3783174e-02 4.8108550e-04
 0.0000000e+00]
[0.53191054 0.90534925 0.30310673 ... 1.807932   0.00893039 1.3491347 ] (2048,)
[2.3048323e-01 7.7204573e-01 2.0135735e-01 ... 9.0144575e-01 4.8108550e-04
 3.8978690e-01] (2048,)
Done with: over
[0.00050545 0.45223147 0.4421402  ... 0.29909486 0.39411917 0.29974324] (2048,)
[0.         0.52970886 0.22222234 ... 0.15910697 0.41383415 0.12658107] (2048,)
[1.3598702e-03 1.8180501e+00 2.2492766e-01 ... 1.4745821e-01 3.4083164e-01
 4.9154139e-01]
[0.07877858 1.6007465  0.         ... 0.15136574 0.6895241  0.13171735]
[1.8653248e-03 2.2702816e+00 6.6706789e-01 ... 4.4655305e-01 7.3495078e-01
 7.9128462e-01] (2048,)
[0.07877858 2.1304555  0.22222234 ... 0.31047273 1.1033583  

In [22]:
from scipy.spatial.distance import cosine

# Takes summed vectors and compares every single vector to the summed vector (cosine distance, targets and landmarks separately).
# Saves the lists of cosine distances to a dictionary.
prep_cos_dict = {
    p: {}
    for p in prepositions
}

for p in prep_vis_feat:
    targ_vecs = prep_vis_feat[p]['target']
    landm_vecs = prep_vis_feat[p]['landmark']
    
    targets = prep_sum_dict[p]['target']
    landmarks = prep_sum_dict[p]['landmark']
    
    targ_cos_list = [cosine(targ_vec, targets) for targ_vec in targ_vecs]
    prep_cos_dict[p]['target'] = targ_cos_list
    
    landm_cos_list = [cosine(landm_vec, landmarks) for landm_vec in landm_vecs]
    prep_cos_dict[p]['landmark'] = landm_cos_list
    
    print("Done with", p)

array([ 0,  2,  4,  6,  8, 10, 12, 14])

In [None]:
for p in prep_cos_dict:
    avg_targ_dist = np.mean(prep_cos_dict[p]['target'])
    avg_landm_dist = np.mean(prep_cos_dict[p]['landmark'])
    
    print(p + ":")
    print("Avg Cosine Distance:")
    print('Targets:', avg_targ_dist)
    print('Landmarks:', avg_landm_dist)