In [1]:
from factories.vgkr_v2.config_v2 import *
import mmcv
import numpy as np
from collections import defaultdict
import pandas as pd
from pandas import Series, DataFrame
import h5py
import os.path as osp
import string

In [2]:
# Optional clean-ups for unusual characters.  They are NOT applied unless the
# caller passes this mapping (or another one) as `replacements`.
UNICODE_REPLACEMENTS = {
    '½': 'half',
    '—': '-',
    '™': '',
    '¢': 'cent',
    'ç': 'c',
    'û': 'u',
    'é': 'e',
    '°': ' degree',
    'è': 'e',
    '…': '',
}

# Punctuation deleted from every phrase; built once rather than on each call.
_PUNCTUATION_TABLE = str.maketrans("", "", '!\'*+-.,:;?^`{|}~')


def sentence_preprocess(phrase, replacements=None):
    """Normalise a sentence: lowercase, strip punctuation, collapse whitespace.

    Args:
        phrase: The text to normalise. Non-string values are converted with
            ``str()`` first.
        replacements: Optional mapping ``{substring: replacement}`` applied
            before lowercasing (e.g. ``UNICODE_REPLACEMENTS``). With the
            default ``None`` no substitution is performed, which is the
            behaviour this function had before the parameter existed.

    Returns:
        The lowercased text with the characters ``!'*+-.,:;?^`{|}~`` removed,
        leading/trailing whitespace dropped and every run of whitespace
        replaced by a single space.
    """
    text = str(phrase)
    if replacements:
        for old, new in replacements.items():
            text = text.replace(old, new)
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # split()/join also trims both ends, so no separate strip is needed.
    return ' '.join(text.split())

### Re-organize the data with pandas
1. With pandas, we write the record of each image into a DataFrame.
2. While joining VG and COCO, we found some duplicated images. We previously thought that there were 51,498 overlapping images. However, 290 of them are duplicates, so there are actually 51,208 overlapping images.

In [3]:
# Load the per-image metadata and drop the records whose image file is listed
# as corrupted.
image_meta = mmcv.load(meta_file)
corrupted_ims = ['1592.jpg', '1722.jpg', '4616.jpg', '4617.jpg']
image_meta = [
    item for item in image_meta
    if str(item['image_id']) + '.jpg' not in corrupted_ims
]

# Load the train/test flag from the annotations.  The dataset is copied into
# memory with [:] so the HDF5 file can be closed as soon as it has been read.
with h5py.File(roidb_file, 'r') as roi_h5:
    data_split = roi_h5['split'][:]

In [4]:
# Flatten the metadata records into parallel columns, one entry per image.
path_vgids, meta_vgids, meta_cocoids, meta_flickrids = [], [], [], []
meta_paths, meta_heights, meta_widths = [], [], []
for record in image_meta:
    url_parts = record['url'].split('/')
    path_vgids.append(url_parts[-1][:-4])         # file name without its last 4 characters
    meta_vgids.append(str(record['image_id']))
    meta_cocoids.append(str(record['coco_id']))   # a None id becomes the string 'None'
    meta_flickrids.append(str(record['flickr_id']))
    meta_paths.append('/'.join(url_parts[-2:]))   # "<parent dir>/<file name>"
    meta_heights.append(record['height'])
    meta_widths.append(record['width'])

# Position of each record inside the filtered image_meta list.
meta_dbidx = list(range(len(image_meta)))

meta_columns = {
    "meta_dbidx": meta_dbidx,
    "path_vgids": path_vgids,
    "meta_vgids": meta_vgids,
    "meta_cocoids": meta_cocoids,
    "meta_flickr_ids": meta_flickrids,
    "meta_paths": meta_paths,
    "meta_heights": meta_heights,
    "meta_widths": meta_widths,
    "vg150_split": data_split,
}
meta_infos = DataFrame(meta_columns)


In [5]:
# Show the assembled per-image table.
meta_infos

Unnamed: 0,meta_dbidx,path_vgids,meta_vgids,meta_cocoids,meta_flickr_ids,meta_paths,meta_heights,meta_widths,vg150_split
0,0,1,1,,,VG_100K_2/1.jpg,600,800,0
1,1,2,2,,,VG_100K/2.jpg,600,800,0
2,2,3,3,,,VG_100K/3.jpg,480,640,0
3,3,4,4,,,VG_100K/4.jpg,480,640,0
4,4,5,5,,,VG_100K/5.jpg,600,800,0
...,...,...,...,...,...,...,...,...,...
108068,108068,2417992,2417992,41116,9669763691,VG_100K_2/2417992.jpg,372,500,2
108069,108069,2417993,2417993,109761,9680535991,VG_100K_2/2417993.jpg,500,500,2
108070,108070,2417994,2417994,27438,9684712191,VG_100K_2/2417994.jpg,246,500,2
108071,108071,2417995,2417995,,9721821291,VG_100K_2/2417995.jpg,294,500,2


### Join: VG and COCO
1. While joining VG and COCO, we found some duplicated images. When constructing VGKR_v1 (ECCV version), we thought that there were 51,498 overlapping images. However, 290 of them are duplicates, so there are actually 51,208 overlapping images.

In [6]:
# Count how many Visual Genome records map to each COCO id.
unique, counts = np.unique(list(meta_infos["meta_cocoids"]), return_counts=True)
# Records without a COCO id carry the string 'None'.  Remove that entry by
# value instead of assuming it is the last element of the sorted result, so a
# real id is never discarded when no 'None' entry exists.
has_coco_id = unique != 'None'
unique = unique[has_coco_id]
counts = counts[has_coco_id]
print(unique, counts)

['100000' '100001' '100006' ... '99984' '99995' '99996'] [1 1 1 ... 1 1 1]


In [7]:
# Break the duplicated COCO ids (290 in the recorded run) down by the VG150
# split flags of their first two records.
# NOTE(review): despite its name, `num_repeat_train` counts the ids whose first
# record has split 0 and whose second record has split 2; every other
# combination is added to `num_repeat_train_test`.  Presumably 0 = train and
# 2 = test -- confirm, and consider renaming the two counters.
num_repeat_train = 0
num_repeat_train_test = 0
for pos in np.where(counts > 1)[0]:
    same_coco_id = meta_infos['meta_cocoids'] == unique[pos]
    flags = meta_infos.loc[same_coco_id, 'vg150_split']
    first_is_0_second_is_2 = flags.iloc[0] == 0 and flags.iloc[1] == 2
    if first_is_0_second_is_2:
        num_repeat_train += 1
    else:
        num_repeat_train_test += 1
print(num_repeat_train, num_repeat_train_test)

105 185


In [8]:
# COCO ids that appear on more than one Visual Genome record.
# i.e. 51,498 - 290 = 51,208 unique images are covered by both COCO and VG.
is_repeated = counts > 1
repeat_cocoids = unique[is_repeated]
print(repeat_cocoids.shape[0])

290


In [9]:
# For every duplicated COCO id, remember the database index of its first
# record; that record is the copy kept later on.
duplicated_keep_idxes = {}
for cid in repeat_cocoids:
    same_coco_id = meta_infos['meta_cocoids'].isin([cid])
    # The first record of each duplicated id is expected to have split flag 0.
    assert meta_infos.loc[same_coco_id, 'vg150_split'].iloc[0] == 0
    duplicated_keep_idxes[cid] = meta_infos.loc[same_coco_id, 'meta_dbidx'].iloc[0]

In [10]:
# Sanity check: one kept index per duplicated COCO id.
len(duplicated_keep_idxes)

290

In [11]:
# Derive the VG-COCO split flag of every record:
#   * no COCO id ('None')                 -> -1 (not part of VG-COCO)
#   * duplicated COCO id, kept record     ->  0 (the first one, used as train split)
#   * duplicated COCO id, any other record -> -1 (dropped)
#   * everything else                     -> its VG150 split flag
vgcoco_split = []
for idx, cid, s in zip(meta_dbidx, meta_cocoids, data_split):
    if cid == 'None':
        vgcoco_split.append(-1)
    elif cid in duplicated_keep_idxes:
        # duplicated_keep_idxes has one key per duplicated COCO id, so this
        # constant-time lookup replaces a scan of the id array for each record.
        vgcoco_split.append(0 if idx == duplicated_keep_idxes[cid] else -1)
    else:
        vgcoco_split.append(s)

In [12]:
# Number of records with VG-COCO split flag 0 and with flag 2.
split_flags = np.array(vgcoco_split)
print(int((split_flags == 0).sum()), int((split_flags == 2).sum()))

35155 16053


In [13]:
# Note: this is the split for VG-COCO. The COCO ids are used as a bridge to
# find the images shared by both datasets (51,208), which are then split
# according to the VG150 split. Records flagged -1 are not part of VG-COCO.
# This adds the column to meta_infos in place.
meta_infos['vgcoco_split'] = vgcoco_split

In [14]:
# Show the table again, now including the vgcoco_split column.
meta_infos

Unnamed: 0,meta_dbidx,path_vgids,meta_vgids,meta_cocoids,meta_flickr_ids,meta_paths,meta_heights,meta_widths,vg150_split,vgcoco_split
0,0,1,1,,,VG_100K_2/1.jpg,600,800,0,-1
1,1,2,2,,,VG_100K/2.jpg,600,800,0,-1
2,2,3,3,,,VG_100K/3.jpg,480,640,0,-1
3,3,4,4,,,VG_100K/4.jpg,480,640,0,-1
4,4,5,5,,,VG_100K/5.jpg,600,800,0,-1
...,...,...,...,...,...,...,...,...,...,...
108068,108068,2417992,2417992,41116,9669763691,VG_100K_2/2417992.jpg,372,500,2,2
108069,108069,2417993,2417993,109761,9680535991,VG_100K_2/2417993.jpg,500,500,2,2
108070,108070,2417994,2417994,27438,9684712191,VG_100K_2/2417994.jpg,246,500,2,2
108071,108071,2417995,2417995,,9721821291,VG_100K_2/2417995.jpg,294,500,2,-1


### Constructing VGKR_v2: grounding the relations from captions to images

In [15]:
# Grounded caption entities, keyed by COCO id (123,169 entries according to
# the original note).
coco_entities = mmcv.load(coco_caption_entities_file)

In [16]:
# Build (or reload from cache) the caption scene graphs with the source
# caption stored on each graph.
if osp.isfile(cap_to_sg_file_v2):
    # A previous run already wrote the merged file.
    capsg = mmcv.load(cap_to_sg_file_v2)
else:
    # Generated with the previous indexing, so 51,498 items remain; keyed by vg_id.
    capsg_v1 = mmcv.load(cap_to_sg_file_v1)
    # 51,498 items, keyed by vg_id.
    captions_vg = mmcv.load(captions_vg_file)
    capsg = {}
    for vg_id, graphs in capsg_v1.items():
        graphs_with_caption = []
        # zip() stops at the shorter list, so unpaired graphs or captions are
        # left out of the result.
        for graph, caption in zip(graphs, captions_vg[vg_id]):
            graph['caption'] = caption  # stored on the loaded dict itself
            graphs_with_caption.append(graph)
        capsg[vg_id] = graphs_with_caption
    mmcv.dump(capsg, cap_to_sg_file_v2)

In [17]:
# Keep only the VG-COCO records, i.e. those whose split flag is above -1.
in_vgcoco = np.array(vgcoco_split) > -1
vgcoco_sets = meta_infos.loc[in_vgcoco]  # 51,208 images
vgcoco_sets

Unnamed: 0,meta_dbidx,path_vgids,meta_vgids,meta_cocoids,meta_flickr_ids,meta_paths,meta_heights,meta_widths,vg150_split,vgcoco_split
4996,4996,2415074,2415074,33554,128827281,VG_100K_2/2415074.jpg,375,500,0,0
4997,4997,2415075,2415075,290942,163614481,VG_100K_2/2415075.jpg,375,500,0,0
4999,4999,2415077,2415077,111842,177587881,VG_100K_2/2415077.jpg,375,500,0,0
5002,5002,2415080,2415080,100318,393790581,VG_100K_2/2415080.jpg,426,500,0,0
5004,5004,2415082,2415082,127751,452973481,VG_100K_2/2415082.jpg,375,500,0,0
...,...,...,...,...,...,...,...,...,...,...
108064,108064,2417988,2417988,15751,9419515091,VG_100K_2/2417988.jpg,333,500,2,2
108067,108067,2417991,2417991,560819,9650692891,VG_100K_2/2417991.jpg,406,500,2,2
108068,108068,2417992,2417992,41116,9669763691,VG_100K_2/2417992.jpg,372,500,2,2
108069,108069,2417993,2417993,109761,9680535991,VG_100K_2/2417993.jpg,500,500,2,2


In [18]:
# Some VG-COCO images (16 in the recorded run) have no entry in
# coco_entities; collect their COCO ids and database indexes so they can be
# removed.
missing_entities = [
    (coco_id, db_index)
    for coco_id, db_index in zip(vgcoco_sets['meta_cocoids'], vgcoco_sets['meta_dbidx'])
    if coco_id not in coco_entities
]
no_entities_cids = [coco_id for coco_id, _ in missing_entities]
no_entities_idxes = [db_index for _, db_index in missing_entities]
# Show the affected rows.
without_entities = vgcoco_sets['meta_cocoids'].isin(no_entities_cids)
vgcoco_sets.loc[without_entities]

Unnamed: 0,meta_dbidx,path_vgids,meta_vgids,meta_cocoids,meta_flickr_ids,meta_paths,meta_heights,meta_widths,vg150_split,vgcoco_split
7224,7224,713568,713568,390520,5996311408,VG_100K/713568.jpg,603,1024,0,0
23165,23165,2398666,2398666,260977,4272927117,VG_100K_2/2398666.jpg,333,500,0,0
32542,32542,2388883,2388883,11223,329037602,VG_100K_2/2388883.jpg,371,500,0,0
32942,32942,2388466,2388466,476113,2046220199,VG_100K_2/2388466.jpg,332,500,0,0
35992,35992,2385261,2385261,362351,6223746372,VG_100K_2/2385261.jpg,310,500,0,0
38425,38425,2382722,2382722,531831,5342815955,VG_100K_2/2382722.jpg,332,500,0,0
61458,61458,2358603,2358603,465211,2686767442,VG_100K/2358603.jpg,281,500,0,0
62781,62781,2357213,2357213,330652,7612513828,VG_100K/2357213.jpg,374,500,0,0
64852,64852,2355044,2355044,515820,6413178765,VG_100K/2355044.jpg,500,500,0,0
72722,72722,2346805,2346805,100407,4374150810,VG_100K/2346805.jpg,333,500,0,0


In [19]:
# Same flags as vgcoco_split, except that records without COCO entities are
# set to -1.  The database indexes are used as list positions here.
vgcoco_entity_split = list(vgcoco_split)
for dropped_idx in no_entities_idxes:
    vgcoco_entity_split[dropped_idx] = -1
# Number of records that remain in the split.
kept_mask = np.array(vgcoco_entity_split) > -1
int(kept_mask.sum())

51192

In [20]:
# Add the entity-filtered split as a new column (in place) and show the table.
meta_infos['vgcoco_entity_split'] = vgcoco_entity_split
meta_infos

Unnamed: 0,meta_dbidx,path_vgids,meta_vgids,meta_cocoids,meta_flickr_ids,meta_paths,meta_heights,meta_widths,vg150_split,vgcoco_split,vgcoco_entity_split
0,0,1,1,,,VG_100K_2/1.jpg,600,800,0,-1,-1
1,1,2,2,,,VG_100K/2.jpg,600,800,0,-1,-1
2,2,3,3,,,VG_100K/3.jpg,480,640,0,-1,-1
3,3,4,4,,,VG_100K/4.jpg,480,640,0,-1,-1
4,4,5,5,,,VG_100K/5.jpg,600,800,0,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...
108068,108068,2417992,2417992,41116,9669763691,VG_100K_2/2417992.jpg,372,500,2,2,2
108069,108069,2417993,2417993,109761,9680535991,VG_100K_2/2417993.jpg,500,500,2,2,2
108070,108070,2417994,2417994,27438,9684712191,VG_100K_2/2417994.jpg,246,500,2,2,2
108071,108071,2417995,2417995,,9721821291,VG_100K_2/2417995.jpg,294,500,2,-1,-1


In [21]:
# Keep only the records whose entity-filtered split flag is above -1.
has_entities = np.array(vgcoco_entity_split) > -1
vgcoco_entity_sets = meta_infos.loc[has_entities]  # 51,192 images
vgcoco_entity_sets

Unnamed: 0,meta_dbidx,path_vgids,meta_vgids,meta_cocoids,meta_flickr_ids,meta_paths,meta_heights,meta_widths,vg150_split,vgcoco_split,vgcoco_entity_split
4996,4996,2415074,2415074,33554,128827281,VG_100K_2/2415074.jpg,375,500,0,0,0
4997,4997,2415075,2415075,290942,163614481,VG_100K_2/2415075.jpg,375,500,0,0,0
4999,4999,2415077,2415077,111842,177587881,VG_100K_2/2415077.jpg,375,500,0,0,0
5002,5002,2415080,2415080,100318,393790581,VG_100K_2/2415080.jpg,426,500,0,0,0
5004,5004,2415082,2415082,127751,452973481,VG_100K_2/2415082.jpg,375,500,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
108064,108064,2417988,2417988,15751,9419515091,VG_100K_2/2417988.jpg,333,500,2,2,2
108067,108067,2417991,2417991,560819,9650692891,VG_100K_2/2417991.jpg,406,500,2,2,2
108068,108068,2417992,2417992,41116,9669763691,VG_100K_2/2417992.jpg,372,500,2,2,2
108069,108069,2417993,2417993,109761,9680535991,VG_100K_2/2417993.jpg,500,500,2,2,2


In [22]:
# Extract the scene graphs and grounded captions for the 51,192 images.
cap_total_cnt = 0            # captions visited
cap_without_entity_cnt = 0   # captions with no matching key in coco_entities
cap_sgentities_vgcoco = {}
entity_rows = zip(
    vgcoco_entity_sets['meta_vgids'],
    vgcoco_entity_sets['meta_cocoids'],
    vgcoco_entity_sets['meta_paths'],
    vgcoco_entity_sets['meta_dbidx'],
)
for vg_id, coco_id, rel_path, db_index in entity_rows:
    graphs = capsg[vg_id]
    entities_by_caption = coco_entities[coco_id]
    cap_total_cnt += len(graphs)
    for graph in graphs:
        # The normalised caption is the lookup key; it also overwrites the
        # caption stored on the graph (the dicts inside capsg are modified).
        caption = sentence_preprocess(graph['caption'])
        graph['caption'] = caption
        if caption not in entities_by_caption:
            cap_without_entity_cnt += 1
            graph['entities'] = None
        else:
            graph['entities'] = entities_by_caption[caption]
    cap_sgentities_vgcoco[vg_id] = {
        'coco_id': coco_id,
        'path': rel_path,
        'dbidx': db_index,
        'sg': graphs,
    }

In [23]:
# (captions visited, captions without a match in coco_entities)
cap_total_cnt, cap_without_entity_cnt

(256103, 8564)

In [24]:
# Spot-check the entry stored under VG id '2415074'.
cap_sgentities_vgcoco['2415074']

{'coco_id': '33554',
 'path': 'VG_100K_2/2415074.jpg',
 'dbidx': 4996,
 'sg': [{'node': ['tourist-1', "tourist-1'", 'sign-5', "sign-5'"],
   'edge': [['tourist-1', 'find', 'sign-5'],
    ["tourist-1'", 'find', "sign-5'"]],
   'caption': 'tourists will find blue signs like this in great britain',
   'entities': {'det_sequences': ['_',
     None,
     None,
     'sign',
     'sign',
     None,
     None,
     None,
     '_',
     '_'],
    'noun_chunks': [['tourists', '_'],
     ['blue signs', 'sign'],
     ['great britain', '_']],
    'detections': {'sign': [[1,
       [124.60584259033203,
        8.291168212890625,
        449.1304626464844,
        303.53790283203125]],
      [2,
       [177.33352661132812,
        164.6728515625,
        450.95135498046875,
        259.86029052734375]],
      [9,
       [162.57180786132812,
        38.54546356201172,
        482.7727355957031,
        139.89659118652344]],
      [16,
       [396.06201171875,
        318.5375061035156,
        453.985

In [26]:
# Save the per-image table as CSV, without writing the DataFrame index.
meta_infos.to_csv(meta_form_file, index=False)


In [27]:
# Save the grounded scene graphs / captions, keyed by VG id.
mmcv.dump(cap_sgentities_vgcoco, cap_sgentities_vgcoco_file)

#### Some statistical results given here
1. COCO and VG have **51,208** overlapping images (`vgcoco_split` in `meta_infos`).
2. When joining with the `coco_entities` data, we found that 16 images do not have grounded entities. We removed them, resulting in **51,192** images (`vgcoco_entity_split` in `meta_infos`).
3. Some images have 6 captions, so the 51,192 images have **256,103** captions in total. We did our best to post-process the captions so that they match `coco_entities` (by adjusting the `sentence_preprocess` function); **8,564** captions are still not found in `coco_entities`. **8,452** of them are not found because `coco_entities` has fewer than 5 captions for the image. The remaining **112** are not found because they do not match any of the 5 captions.

#### We now have the two most useful pieces of data:
1. `meta_infos`: it contains the consolidated information for each image.
2. `cap_sgentities_vgcoco`:

```
JSON Format:
{
"XXXXX"(str, vg_id):{
    "coco_id": str,
    "path": str('VG_100K_2/2415074.jpg'),
    "dbidx": int(4996),
    "sg": [
        {
            "node": [str],
            "edge": [[str]],
            "caption": str,
            "entities": None, or {
                "det_sequences": [str or None],
                "noun_chunks": [[str]],
                "detections": {
                    str: [
                        [int, [float]]
                    ]
                }
            },
            "split": str (do not use this)
        }
    ]
}
...
	
}
```