In [2]:
! ln -s /usr/local/courses/lt2318/data/ data

ln: failed to create symbolic link 'data/data': File exists


In [3]:
# List the files under data/visual_genome/.
! ls data/visual_genome/

image_data.json      objects.pickle	       relationships.json.zip
image_data.json.zip  read_objects_resnet50.py  VG_100K
images2.zip	     region_descriptions.json  VG_100K_2
images.zip	     relationships.json


In [4]:
import json
from collections import Counter, defaultdict
import numpy as np
import pickle

%matplotlib inline
from matplotlib import pyplot as plt

In [5]:
# Read the relationships JSON from file. The context manager closes the file
# handle as soon as parsing is finished instead of leaving it open.
with open('data/visual_genome/relationships.json') as relationships_file:
    relationships_from_file = json.load(relationships_file)

# name/names correction for reading content of nodes in the dataset
def name_extract(x):
    """Return the lower-cased name stored in a node dict.

    The first entry of ``x['names']`` is used when that key is present and
    non-empty; otherwise ``x['name']`` is used when present; otherwise the
    empty string is returned.
    """
    if 'names' in x and len(x['names']):
        return x['names'][0].lower()
    if 'name' in x:
        return x['name'].lower()
    return ''

In [6]:
# Print the length of the loaded relationships data, labelled as the number of images.
print('number of images:', len(relationships_from_file))

number of images: 108077


In [7]:
# Read the per-image metadata. The context manager closes the file handle
# once parsing is finished instead of leaving it open.
with open('data/visual_genome/image_data.json') as image_data_file:
    image_data_from_file = json.load(image_data_file)

# Map every image id to its (width, height) pair.
image_id_to_size = {
    image_data['image_id']: (image_data['width'], image_data['height'])
    for image_data in image_data_from_file
}

In [8]:
# Functions for getting descriptions (relations) and bounding boxes (which are normalised by image size in the process).
def get_description(relations_data):
    """Return the ``(subject, predicate, object)`` triplet of one relation.

    Parameters
    ----------
    relations_data : dict
        A single relation with ``'subject'``, ``'predicate'`` and ``'object'``
        entries.

    Returns
    -------
    tuple of str
        Lower-cased subject name, predicate and object name. The names are
        obtained with ``name_extract``.
    """
    # Read from the argument. The body previously referenced a differently
    # spelled name (`relation_data`), so it ignored its parameter and only
    # worked when a global of that name happened to exist.
    triplet = (
        name_extract(relations_data['subject']),
        relations_data['predicate'].lower(),  # synset?
        name_extract(relations_data['object']),
    )

    return triplet

def get_bboxes(relations_data, size):
    """Return the size-normalised bounding boxes of a relation's two nodes.

    Parameters
    ----------
    relations_data : dict
        A single relation whose ``'subject'`` and ``'object'`` entries each
        carry ``'x'``, ``'y'``, ``'w'`` and ``'h'`` values.
    size : tuple
        ``(width, height)`` used as the divisors for normalisation.

    Returns
    -------
    tuple
        ``(subject_bbox, object_bbox)``, each an ``(x, y, w, h)`` tuple with
        x and w divided by the width and y and h divided by the height.
    """
    w0, h0 = size

    def normalize(bbox):
        x, y, w, h = bbox
        return (x/w0, y/h0, w/w0, h/h0, )

    # Read from the argument. The body previously referenced a differently
    # spelled name (`relation_data`), so it ignored its parameter and only
    # worked when a global of that name happened to exist.
    bboxes = (
        normalize([relations_data['subject'][d] for d in ['x', 'y', 'w', 'h', ]]),
        normalize([relations_data['object'][d] for d in ['x', 'y', 'w', 'h', ]]),
    )

    return bboxes

In [17]:
# Spatial prepositions whose relations are analysed below.
# Each entry appears once ('between' used to be listed twice).
prepositions = [
    'over', 'above', 'below', 'under', 'on', 'in', 'right of', 'left of',
    'at', 'by', 'nearby', 'with', 'within', 'near', 'underneath', 'on top of',
    'against', 'around', 'behind', 'in front of', 'amid', 'amidst', 'between',
    'beneath', 'beside', 'inside', 'alongside', 'upon',
]

In [18]:
# Loading visual data.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
# NOTE(review): this cell uses the absolute course path while the JSON files are read through the
# relative `data/` path; presumably both point at the same directory, but confirm.
with open('/usr/local/courses/lt2318/data/visual_genome/objects.pickle', 'rb') as f:
    visual_features = pickle.load(f)

In [19]:
# Look up the visual feature vectors of a relation's subject and object by their object_id.
def get_vis_vecs(relations_data, vis_features):
    """Return ``(target_vf, landmark_vf)`` for one relation.

    The subject's ``object_id`` selects the target entry of ``vis_features``
    and the object's ``object_id`` selects the landmark entry. An id that is
    missing from ``vis_features`` yields ``None`` in its position.
    """
    def lookup(object_id):
        # A missing id is reported as None rather than raised.
        try:
            return vis_features[object_id]
        except KeyError:
            return None

    # Both ids are read before any lookup takes place.
    target_id = relations_data['subject']['object_id']
    landmark_id = relations_data['object']['object_id']

    return lookup(target_id), lookup(landmark_id)

In [20]:
# One entry per preposition, each holding two parallel lists:
# 'target' feature vectors and 'landmark' feature vectors.
prep_vis_feat = {
    p: {'target': [], 'landmark': []}
    for p in prepositions
}

# Walk over every relation of every image and keep the feature vectors of those
# whose predicate is one of the prepositions. A relation is stored only when both
# its target and its landmark vector are available, so the two lists of a
# preposition always stay the same length.
for img_relations in relationships_from_file:
    for relation_data in img_relations['relationships']:
        predicate = get_description(relation_data)[1]
        if predicate not in prep_vis_feat:
            continue

        target_vf, landmark_vf = get_vis_vecs(relation_data, visual_features)
        if target_vf is None or landmark_vf is None:
            continue

        collected = prep_vis_feat[predicate]
        collected['target'].append(target_vf)
        collected['landmark'].append(landmark_vf)

In [21]:
# Print, per preposition, how many target and landmark vectors were collected.
for p, collected in prep_vis_feat.items():
    print(p, len(collected["target"]), len(collected["landmark"]))

over 8200 8200
above 12583 12583
below 3155 3155
under 16985 16985
on 544509 544509
in 183951 183951
right of 220 220
left of 365 365
at 8248 8248
by 14801 14801
nearby 0 0
with 55352 55352
within 107 107
near 24637 24637
underneath 1347 1347
on top of 27906 27906
against 2751 2751
around 6846 6846
behind 36336 36336
in front of 11849 11849
amid 33 33
amidst 19 19
between 2875 2875
beneath 1266 1266
beside 6914 6914
inside 5672 5672
alongside 386 386
upon 15 15


In [22]:
# Save prep_vis_feat (the per-preposition target and landmark feature vectors)
# to 'pvis_vecs.pkl' in the working directory; an existing file is overwritten.
with open('pvis_vecs.pkl', 'wb') as sf:
    pickle.dump(prep_vis_feat, sf)

In [23]:
# Inspect one stored feature vector. The preposition is chosen explicitly
# (the first one that has any vectors) rather than through a loop variable left
# over from an earlier cell, and the container type is printed because a bare
# expression that is not the last line of a cell is never displayed.
sample_target_vecs = next(
    collected["target"] for collected in prep_vis_feat.values() if collected["target"]
)
print(type(sample_target_vecs))
sample_target_vecs[0].shape

(2048,)

In [24]:
# For every preposition, measure how spread out its target and landmark feature
# vectors are: the mean cosine distance between each vector and the sum of all
# vectors of that set. The sum points in the same direction as the mean, and
# cosine distance ignores magnitude, so the sum serves as the centroid.
from scipy.spatial.distance import cosine


def _mean_cosine_distance_to_centroid(vecs):
    """Return the mean cosine distance of ``vecs`` to their summed centroid.

    Zero-norm vectors are left out of the average because cosine distance is
    undefined for them. NaN is returned when ``vecs`` is empty or when the
    centroid itself has zero norm, so that case is handled explicitly instead
    of by taking the mean of an empty array (which emits a RuntimeWarning).
    """
    if len(vecs) == 0:
        return float('nan')

    centroid = np.array(vecs).sum(0)
    if np.linalg.norm(centroid) == 0:
        return float('nan')

    distances = [cosine(u, centroid) for u in vecs if np.linalg.norm(u) != 0]
    return float(np.mean(distances))


for p in prep_vis_feat:
    # average cosine distances (reported as "variance"):
    targ_var = _mean_cosine_distance_to_centroid(prep_vis_feat[p]['target'])
    landm_var = _mean_cosine_distance_to_centroid(prep_vis_feat[p]['landmark'])

    # print or save these numbers
    print(p, "\n", "Target variance:",  targ_var, "Landmark variance:", landm_var)

over 
 Target variance: 0.4377860509985831 Landmark variance: 0.43583857570843
above 
 Target variance: 0.4416074931721027 Landmark variance: 0.4337125514527151
below 
 Target variance: 0.4496400695075702 Landmark variance: 0.45063913436680325
under 
 Target variance: 0.44614292008054934 Landmark variance: 0.44368264179195344
on 
 Target variance: 0.44018997637074353 Landmark variance: 0.4381616170161583
in 
 Target variance: 0.4511215118310238 Landmark variance: 0.44347445626513315
right of 
 Target variance: 0.4397678258744153 Landmark variance: 0.4418984124606306
left of 
 Target variance: 0.4399039305664681 Landmark variance: 0.4394029849669436
at 
 Target variance: 0.4149755984219559 Landmark variance: 0.40214101678784847
by 
 Target variance: 0.4326867802861043 Landmark variance: 0.42981732624659275
nearby 
 Target variance: nan Landmark variance: nan


  ret = ret.dtype.type(ret / rcount)


with 
 Target variance: 0.4315445705944343 Landmark variance: 0.43508339927591966
within 
 Target variance: 0.45050542237602664 Landmark variance: 0.43340833666168643
near 
 Target variance: 0.43485046399439503 Landmark variance: 0.43220760487269505
underneath 
 Target variance: 0.4429341913066091 Landmark variance: 0.44331571003996006
on top of 
 Target variance: 0.4362727468801956 Landmark variance: 0.42958939257289125
against 
 Target variance: 0.43067744498857796 Landmark variance: 0.4141680103449421
around 
 Target variance: 0.4437486544303681 Landmark variance: 0.4433920195654335
behind 
 Target variance: 0.43375089205801487 Landmark variance: 0.42392210322233326
in front of 
 Target variance: 0.428566336933596 Landmark variance: 0.41663956024297855
amid 
 Target variance: 0.4064665337403615 Landmark variance: 0.3808368697310939
amidst 
 Target variance: 0.422967614311921 Landmark variance: 0.4420478045940399
between 
 Target variance: 0.4483775362657464 Landmark variance: 0.4435