This notebook fixes the issue with our generated fprefcocog. 

We generated from the splitBy=google but we should have used splitBy=umd. Since the google and umd versions are identical except for the train/val/test splits, we can just copy the split value from umd -> google

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%pip list | grep json

fastjsonschema                    2.18.0
json5                             0.9.14
jsonpointer                       2.4
jsons                             1.6.3
jsonschema                        4.19.1
jsonschema-specifications         2023.7.1
pysimdjson                        5.0.2
python-json-logger                2.0.7
python-lsp-jsonrpc                1.1.1
ujson                             5.8.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
import argparse
import copy
import csv
import decimal
import pickle
import typing
from collections import Counter, defaultdict
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import simdjson as json
from IPython.display import display
from pycocotools.coco import COCO
from pycocotools.helpers import CocoClassDistHelper
from tqdm.auto import tqdm

In [4]:
%ls -lah "/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/refcoco/"
%ls -lah "/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/fprefcoco_v002/"

total 93M
drwxr-xr-x  2 gbiamby users    5 Sep 22 22:33  [0m[01;34m.[0m/
drwxr-xr-x 23 gbiamby users   28 Nov  9 04:57  [01;34m..[0m/
-rw-r--r--  1 gbiamby users 115M Feb  3  2016  instances.json
-rw-r--r--  1 gbiamby users  31M Feb  3  2016 'refs(google).p'
-rw-r--r--  1 gbiamby users  31M Feb  3  2016 'refs(unc).p'
total 126M
drwxr-xr-x  2 gbiamby users    5 Nov  8 18:20  [0m[01;34m.[0m/
drwxr-xr-x 23 gbiamby users   28 Nov  9 04:57  [01;34m..[0m/
-rw-r--r--  1 gbiamby users 115M Nov  7 20:31  instances.json
-rw-r--r--  1 gbiamby users  77M Nov  8 18:40 'refs(berkeley).p'
-rw-r--r--  1 gbiamby users  77M Nov  8 18:20 'refs(google_berkeley).p'


In [5]:
# Root directory of the referring-segmentation datasets. Each dataset lives in its own
# sub-directory holding `instances.json` plus one `refs(<split_by>).p` pickle per split scheme.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
REFSEG_DIR = Path("/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/")
# dataset_json = Path(REFSEG_DIR / "R-refcocog/instances.json")
# coco_dist = CocoClassDistHelper(dataset_json)

In [170]:
# Maps each dataset directory name to the `split_by` values that may be loaded for it.
# The first entry of each list is the default that `build_refcoco` falls back to.
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    # "refclef": ["berkeley", "unc"],
    "refcoco": ["unc", "google"],
    "refcoco+": ["unc"],
    "refcocog": [
        "umd",
        "google",
    ],  # google and umd seem to be identical apart from the splits: they have the same aggregate stats.
    # "coconegref": ["berkeley"],
    "fprefcoco_v002": ["berkeley"],
    "fprefcoco+_v002": ["berkeley"],
    "fprefcocog_v002": ["berkeley"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, split_by: str = None) -> COCO:
    """Load one referring-expression dataset as a `COCO` object.

    Args:
        refseg_path: Directory containing one sub-directory per dataset.
        dataset_name: Dataset directory name; must be a key of `VALID_SPLITS`.
        split_by: Split scheme to load. When `None`, the first scheme listed
            for the dataset in `VALID_SPLITS` is used.

    Returns:
        The `COCO` object built from `<refseg_path>/<dataset_name>/instances.json`.

    Raises:
        AssertionError: If `dataset_name` is not in `VALID_SPLITS`, or `split_by`
            is given but not listed for that dataset.
    """
    assert dataset_name in VALID_SPLITS, dataset_name
    allowed_split_bys = VALID_SPLITS[dataset_name]
    if split_by is not None:
        assert split_by in allowed_split_bys
    else:
        # No scheme requested: fall back to the dataset's first listed scheme.
        split_by = allowed_split_bys[0]
    instances_json = refseg_path / dataset_name / "instances.json"
    return COCO(
        instances_json,
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )

In [107]:
# NOTE(review): the variable names say "refg" / "umd" (refcocog naming), but this cell loads
# refcoco with its "google" and "unc" split schemes, plus fprefcoco_v002. The names no longer
# describe the data they hold; they are left unchanged because later cells refer to them.
refg_google = build_refcoco(REFSEG_DIR, "refcoco", "google")
refg_umd = build_refcoco(REFSEG_DIR, "refcoco", "unc")
fprefg_google = build_refcoco(REFSEG_DIR, "fprefcoco_v002", "berkeley")

Loading refs from '/shared/gbiamby/data/refer_seg/refcoco/refs(google).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=1.19s)
creating index...
index created!
Loading refs from '/shared/gbiamby/data/refer_seg/refcoco/refs(unc).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=1.24s)
creating index...
index created!
Loading refs from '/shared/gbiamby/data/refer_seg/fprefcoco_v002/refs(berkeley).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=9.24s)
creating index...
index created!


In [108]:
def compare_refcocos(ref_a: COCO, ref_b: COCO):
    """Print whether two datasets share the same image, annotation and ref ids.

    For each of the `imgs`, `anns` and `refs` indexes one line is printed with:
    whether the two key sets are equal, the size of each key set, and the size
    of their intersection.
    """
    for label, index_name in (
        ("img_ids same: ", "imgs"),
        ("ann_ids same: ", "anns"),
        ("ref_ids same: ", "refs"),
    ):
        keys_a = set(getattr(ref_a, index_name).keys())
        keys_b = set(getattr(ref_b, index_name).keys())
        print(label, keys_a == keys_b, len(keys_a), len(keys_b), len(keys_a & keys_b))


# Check that the id sets line up, first between the two original split schemes and then
# between the generated dataset and the dataset the splits will be copied from.
compare_refcocos(refg_google, refg_umd)
compare_refcocos(fprefg_google, refg_umd)

img_ids same:  True 19994 19994 19994
ann_ids same:  True 196771 196771 196771
ref_ids same:  True 50000 50000 50000
img_ids same:  True 19994 19994 19994
ann_ids same:  True 196771 196771 196771
ref_ids same:  True 50000 50000 50000


Although both datasets contain the same *set* of ref_ids (see the check above), a given ref_id does not identify the same referring expression in both: in the example below, refs with identical `sent_ids` and sentences carry different ref_ids. So we cannot use ref_id as a unique identifier if we are comparing across different ref_seg datasets.

In [41]:
from pycocotools.coco import Ann, Image, Ref

# Pair up the first (image_id, refs) item of each dataset after sorting by image_id.
google_img, umd_img = next(
    zip(sorted(refg_google.img_to_refs.items()), sorted(refg_umd.img_to_refs.items()))
)
google_img[0], umd_img[0]
google_refs, umd_refs = google_img[1], umd_img[1]

# Per-ref summary for each dataset: (ref_id, number of sentences, sent_ids).
print([(ref["ref_id"], len(ref["sentences"]), ref["sent_ids"]) for ref in google_refs])
print([(ref["ref_id"], len(ref["sentences"]), ref["sent_ids"]) for ref in umd_refs])

# Every sentence string of the image, printed sorted so the two datasets can be compared by eye.
sents = [s["sent"] for ref in google_refs for s in ref["sentences"]]
print(sorted(sents))

print("")
sents = [s["sent"] for ref in umd_refs for s in ref["sentences"]]
print(sorted(sents))

[(17861, 1, [30065]), (20840, 2, [37026, 37027]), (44673, 2, [92713, 92714])]
[(16534, 1, [30065]), (19208, 2, [37026, 37027]), (40652, 2, [92713, 92714])]
['a pink plastic box which is rectangle', 'a yellow rectangle bowl with broccoli and a brown food in it', 'container holding fruit', 'pink container of tangerines and pineapple', 'the tray that holds the broccoli']

['a pink plastic box which is rectangle', 'a yellow rectangle bowl with broccoli and a brown food in it', 'container holding fruit', 'pink container of tangerines and pineapple', 'the tray that holds the broccoli']


Another finding: refs are not stored in the same order within an image. Even if we ignore the differing ref_ids, the set of sentences across an image's refs is the same in both datasets, but the order of the refs (and therefore of the sentences) is not the same between two different datasets.

In [112]:
from collections import defaultdict


def are_refs_equal(ref_a: "Ref", ref_b: "Ref") -> bool:
    """Return whether two refs carry the same set of sentence strings.

    Only the `"sent"` text of each entry in `ref["sentences"]` is compared;
    sentence order and duplicates are ignored. When the sets differ, both
    sets and their sizes are printed to help debugging.

    Args:
        ref_a: Ref mapping with a `"sentences"` list of `{"sent": ...}` items.
        ref_b: Ref mapping of the same shape.

    Returns:
        `True` if both refs have the same set of sentence strings, else `False`.
    """
    # Annotations are quoted so defining this function does not depend on the
    # cell that imports `Ref` having been run first.
    sents_a = {s["sent"] for s in ref_a["sentences"]}
    sents_b = {s["sent"] for s in ref_b["sentences"]}
    if sents_a == sents_b:
        return True
    print("sents_a: ", len(sents_a))
    print("sents_b: ", len(sents_b))
    print("sents_a: ", sents_a)
    print("sents_b: ", sents_b)
    # Previously this branch fell off the end and returned None instead of False.
    return False


def inspect_image(img_id: int, refseg_a: COCO, refseg_b: COCO):
    """Print every sentence of one image, with its ref's split, for both datasets.

    Args:
        img_id: Image id used to index `img_to_refs` in both datasets.
        refseg_a: First dataset; its sentences are printed under "refseg_a".
        refseg_b: Second dataset; its sentences are printed under "refseg_b".
    """
    # Look both ref lists up before printing anything.
    sections = (
        ("\nrefseg_a: ", refseg_a.img_to_refs[img_id]),
        ("\nrefseg_b: ", refseg_b.img_to_refs[img_id]),
    )
    for header, img_refs in sections:
        print(header)
        for ref in img_refs:
            for sentence in ref["sentences"]:
                print(sentence["sent"], ref["split"])


def _group_refs_by_true_sentences(refs):
    """Index one image's refs by their non-false-premise sentences.

    Returns:
        A pair `(sent_to_ref, key_to_refs)`. `sent_to_ref` maps each kept sentence
        string to `(ref, sentence)`. `key_to_refs` maps the sorted tuple of a ref's
        kept sentence strings to the list of refs having exactly those sentences.
    """
    sent_to_ref = {}
    key_to_refs = defaultdict(list)
    for ref in refs:
        kept = set()
        for s in ref["sentences"]:
            if ("is_false_premise" not in s) or (not s["is_false_premise"]):
                sent_to_ref[s["sent"]] = (ref, s)
                kept.add(s["sent"])
        # Sort so the key depends only on the sentences, not on set iteration order.
        key_to_refs[tuple(sorted(kept))].append(ref)
    return sent_to_ref, key_to_refs


def copy_splits(refseg_a: COCO, refseg_b: COCO):
    """Copy each ref's `split` from `refseg_a` onto the matching ref of a copy of `refseg_b`.

    Refs are matched per image by their set of non-false-premise sentence strings,
    because neither ref ids nor the order of refs within an image can be relied on
    to agree between datasets. `refseg_b` itself is not modified; a deep copy is.

    Args:
        refseg_a: Dataset providing the splits; read through `img_to_refs`.
        refseg_b: Dataset receiving the splits; must hold the same images and the
            same non-false-premise sentences per image.

    Returns:
        `(updated_copy_of_refseg_b, errors)`. `errors` is a list of dicts describing
        images where several refs share the same sentence set (those refs are paired
        in listing order, which is ambiguous) or where group sizes disagree.

    Raises:
        AssertionError: If the datasets differ in images, refs per image, or sentences.
    """
    refseg_b = deepcopy(refseg_b)
    errors = []
    # Sort on image_id only, so ties never fall through to comparing the ref lists.
    imgs_a = sorted(refseg_a.img_to_refs.items(), key=lambda item: item[0])
    imgs_b = sorted(refseg_b.img_to_refs.items(), key=lambda item: item[0])
    # zip() would silently truncate, so require the same number of images up front.
    assert len(imgs_a) == len(imgs_b), (len(imgs_a), len(imgs_b))
    for idx, ((img_id_a, refs_a), (img_id_b, refs_b)) in enumerate(
        tqdm(zip(imgs_a, imgs_b), total=len(imgs_b))
    ):
        assert img_id_a == img_id_b, (img_id_a, img_id_b)
        assert len(refs_a) == len(refs_b)
        sents_a, sents_to_ref_a = _group_refs_by_true_sentences(refs_a)
        sents_b, sents_to_ref_b = _group_refs_by_true_sentences(refs_b)
        assert len(sents_a) == len(sents_b)
        # Ensure sentences are identical for the image in both datasets:
        assert set(sents_a.keys()) == set(sents_b.keys())
        assert set(sents_to_ref_a.keys()) == set(sents_to_ref_b.keys())
        if len(sents_to_ref_a) != len(refs_a):
            errors.append(
                {
                    "err": f"len(sents_to_ref_a) != len(refs_a) [{len(sents_to_ref_a)} != {len(refs_a)}]",
                    "sents_to_ref_a": sents_to_ref_a,
                    "sents_to_ref_b": sents_to_ref_b,
                    "refs_a": refs_a,
                    "refs_b": refs_b,
                    "img_id": img_id_a,
                }
            )
        if len(sents_to_ref_b) != len(refs_b):
            errors.append(
                {
                    "err": f"len(sents_to_ref_b) != len(refs_b) [{len(sents_to_ref_b)} != {len(refs_b)}]",
                    "sents_to_ref_a": sents_to_ref_a,
                    "sents_to_ref_b": sents_to_ref_b,
                    "refs_a": refs_a,
                    "refs_b": refs_b,
                    "img_id": img_id_a,
                }
            )
        # Pair refs by sentence key. Zipping the two dicts' items would pair them by
        # each dataset's own ref order, which is not guaranteed to agree.
        for sents_key, group_a in sents_to_ref_a.items():
            group_b = sents_to_ref_b[sents_key]
            if len(group_a) != len(group_b):
                errors.append(
                    {
                        "err": f"group size mismatch for {sents_key} [{len(group_a)} != {len(group_b)}]",
                        "sents_to_ref_a": sents_to_ref_a,
                        "sents_to_ref_b": sents_to_ref_b,
                        "refs_a": refs_a,
                        "refs_b": refs_b,
                        "img_id": img_id_a,
                    }
                )
            for _ref_a, _ref_b in zip(group_a, group_b):
                if idx <= 0:
                    # Show the first image's pairings as a quick visual check.
                    print(_ref_a["split"], ", ", sents_key)
                    print(_ref_b["split"], ", ", sents_key)
                _ref_b["split"] = _ref_a["split"]
    return refseg_b, errors


# Splits are read from refg_umd and written onto a deep copy of fprefg_google
# (copy_splits clones its second argument, so fprefg_google itself is left as loaded).
fprefg_google_new, errors = copy_splits(refg_umd, fprefg_google)
print(len(errors))
# inspect_image(529, refg_google, refg_umd)

 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 11281/19994 [00:00<00:00, 56461.10it/s]

train ,  ('giraffe on left', 'left giraffe')
train ,  ('giraffe on left', 'left giraffe')
train ,  ('right girafe', 'right giraffe')
val ,  ('right girafe', 'right giraffe')


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 55544.64it/s]

2





In [117]:
# We expect refcocog(UMD) to produce two errors, caused by two of its refs having the same three sentences. This is not really an error.
# display(errors)

In [113]:
def show_train_val_test(refcoco: COCO):
    """Display, per split, the number of distinct refs and distinct images.

    Args:
        refcoco: Dataset whose `refs` values each provide `ref_id`, `split`
            and `image_id`.
    """
    records = [
        {"ref_id": ref["ref_id"], "split": ref["split"], "image_id": ref["image_id"]}
        for ref in refcoco.refs.values()
    ]
    per_split = (
        pd.DataFrame(records)
        .groupby("split")
        .agg(refs=("ref_id", "nunique"), images=("image_id", "nunique"))
    )
    display(per_split)


# Before/after check: if the copy worked, fprefg_google_new should report the same
# per-split counts as refg_umd, while fprefg_google still shows its original splits.
show_train_val_test(refg_umd)
show_train_val_test(refg_google)
show_train_val_test(fprefg_google)
show_train_val_test(fprefg_google_new)

Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,42404,16994
val,3811,1500


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5000,4527
train,40000,19213
val,5000,4559


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5000,4527
train,40000,19213
val,5000,4559


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,42404,16994
val,3811,1500


In [116]:
# Now save the new refs, overwriting `refs(berkeley).p` in every directory listed below.
# refs_path = REFSEG_DIR / "fprefcoco_v002"
# refs_path = Path("/home/gbiamby/proj/geo-llm-ret/output/refcoco_google_enhanced-gb006_remove_guidelines-gpt-3.5-turbo/refer_seg/fprefcoco_v002") # refcocog
refs_paths = [
    REFSEG_DIR / "fprefcoco_v002",
    Path(
        "/home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo/refer_seg/fprefcoco_v002"
    ),  # refcoco
]
# Fail before writing anything if any target directory is missing.
for refs_path in refs_paths:
    assert refs_path.exists(), str(refs_path)

for refs_dir in refs_paths:
    refs_path = refs_dir / "refs(berkeley).p"
    print(refs_path)
    # Context manager so the file is flushed and closed even if pickling fails.
    with open(refs_path, "wb") as refs_file:
        pickle.dump(fprefg_google_new.refs_data, refs_file)

/home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo/refer_seg/fprefcoco_v002/refs(berkeley).p


---

---


# Generate _exclusion Versions of refcocog

Addresses the contamination that would be caused by training on refcocog's train set and then evaluating on the val/test splits of refcoco/refcoco+.

In [7]:
# Redefines VALID_SPLITS from the earlier cell with a single split scheme per dataset.
# The loading loop below loads every scheme listed here.
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    # "refclef": ["berkeley", "unc"],
    "refcoco": ["unc"],  # ["unc", "google"],
    "refcoco+": ["unc"],
    # Previously ["umd", "google"]; google and umd seem to be identical apart from the
    # splits: they have the same aggregate stats.
    "refcocog": ["umd"],
    # "coconegref": ["berkeley"],
    "fprefcoco_v002": ["berkeley"],
    "fprefcoco+_v002": ["berkeley"],
    "fprefcocog_v002": ["berkeley"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, split_by: str = None) -> COCO:
    """Load one referring-expression dataset as a `COCO` object.

    NOTE(review): this redefines `build_refcoco` from an earlier cell with
    equivalent logic; only the `VALID_SPLITS` it reads has changed.

    Args:
        refseg_path: Directory containing one sub-directory per dataset.
        dataset_name: Dataset directory name; must be a key of `VALID_SPLITS`.
        split_by: Split scheme to load. When `None`, the first scheme listed
            for the dataset in `VALID_SPLITS` is used.

    Returns:
        The `COCO` object built from `<refseg_path>/<dataset_name>/instances.json`.

    Raises:
        AssertionError: If `dataset_name` is not in `VALID_SPLITS`, or `split_by`
            is given but not listed for that dataset.
    """
    assert dataset_name in VALID_SPLITS, dataset_name
    allowed_split_bys = VALID_SPLITS[dataset_name]
    if split_by is not None:
        assert split_by in allowed_split_bys
    else:
        # No scheme requested: fall back to the dataset's first listed scheme.
        split_by = allowed_split_bys[0]
    instances_json = refseg_path / dataset_name / "instances.json"
    return COCO(
        instances_json,
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )

## Load the Datasets

In [8]:
# Load every (dataset, split_by) combination listed in VALID_SPLITS, in dict order.
refcocos = []

for dataset_name, split_bys in VALID_SPLITS.items():
    for split_by in split_bys:
        print("\n")
        print("=" * 220)
        print(f"Dataset: {dataset_name}({split_by})")
        ref_coco = build_refcoco(REFSEG_DIR, dataset_name, split_by)
        print(ref_coco.dataset_name, ref_coco.split_by)
        refcocos.append(ref_coco)




Dataset: R-refcoco(unc)
Loading refs from '/shared/gbiamby/data/refer_seg/R-refcoco/refs(unc).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=3.52s)
creating index...
index created!
R-refcoco unc



Dataset: R-refcoco+(unc)
Loading refs from '/shared/gbiamby/data/refer_seg/R-refcoco+/refs(unc).p'
Loaded 49856 refs
loading annotations into memory...
Done (t=4.59s)
creating index...
index created!
R-refcoco+ unc



Dataset: R-refcocog(umd)
Loading refs from '/shared/gbiamby/data/refer_seg/R-refcocog/refs(umd).p'
Loaded 49822 refs
loading annotations into memory...
Done (t=3.55s)
creating index...
index created!
R-refcocog umd



Dataset: refcoco(unc)
Loading refs from '/shared/gbiamby/data/refer_seg/refcoco/refs(unc).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=1.10s)
creating index...
index created!
refcoco unc



Dataset: refcoco+(unc)
Loading refs from '/shared/gbiamby/data/refer_seg/refcoco+/refs(unc).p'
Loaded 49856 refs
loading annotations into

## Get image_id's for all dataset/split pairs

In [16]:
def get_img_ids(refcoco: COCO) -> dict[str, set[int]]:
    """Collect, for every split name, the ids of images having at least one ref in it.

    An image appears under every split that any of its refs belongs to, so the
    per-split sets are not necessarily disjoint.

    Args:
        refcoco: Dataset whose `img_to_refs` maps image_id to a list of refs,
            each ref providing a `"split"` value.

    Returns:
        Mapping from split name to the set of image ids (a `defaultdict(set)`).
    """
    split_to_imgs = defaultdict(set)
    for image_id, refs in tqdm(
        refcoco.img_to_refs.items(), total=len(refcoco.img_to_refs)
    ):
        for ref in refs:
            split_to_imgs[ref["split"]].add(image_id)

    # Output some stats. These are image counts; the message used to say "total refs".
    for split, image_ids in split_to_imgs.items():
        print(f"split: {split}: {len(image_ids)} total images")
    return split_to_imgs


# split_ids_refcoco_unc = get_img_ids(refcoco_unc)
# # split_ids_refg_umd = get_img_ids(refg_umd)
# # split_ids_fprefg_berkeley = get_img_ids(fprefg_berkeley)
# split_ids_refcoco_rrefg_g = get_img_ids(rrefg_g)

# Keyed by (dataset_name, split_by); each value is that dataset's split -> image_ids mapping.
split_img_ids = {}
for refcoco in refcocos:
    print("Dataset: ", refcoco.dataset_name, refcoco.split_by)
    split_img_ids[(refcoco.dataset_name, refcoco.split_by)] = get_img_ids(refcoco)

Dataset:  R-refcoco unc


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 599773.38it/s]


split: train: 16994 total refs
split: testB: 750 total refs
split: testA: 750 total refs
split: val: 1500 total refs
Dataset:  R-refcoco+ unc


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19992/19992 [00:00<00:00, 721070.14it/s]


split: train: 16992 total refs
split: testB: 750 total refs
split: testA: 750 total refs
split: val: 1500 total refs
Dataset:  R-refcocog umd


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25799/25799 [00:00<00:00, 945014.18it/s]


split: test: 2600 total refs
split: train: 21899 total refs
split: val: 1300 total refs
Dataset:  refcoco unc


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 679332.78it/s]


split: train: 16994 total refs
split: testB: 750 total refs
split: testA: 750 total refs
split: val: 1500 total refs
Dataset:  refcoco+ unc


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19992/19992 [00:00<00:00, 624725.46it/s]


split: train: 16992 total refs
split: testB: 750 total refs
split: testA: 750 total refs
split: val: 1500 total refs
Dataset:  refcocog umd


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25799/25799 [00:00<00:00, 737881.52it/s]


split: test: 2600 total refs
split: train: 21899 total refs
split: val: 1300 total refs
Dataset:  fprefcoco_v002 berkeley


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 676898.17it/s]


split: val: 1500 total refs
split: train: 16994 total refs
split: testB: 750 total refs
split: testA: 750 total refs
Dataset:  fprefcoco+_v002 berkeley


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19992/19992 [00:00<00:00, 919657.43it/s]


split: train: 16992 total refs
split: testB: 750 total refs
split: testA: 750 total refs
split: val: 1500 total refs
Dataset:  fprefcocog_v002 berkeley


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25799/25799 [00:00<00:00, 913571.15it/s]

split: train: 21899 total refs
split: val: 1300 total refs
split: test: 2600 total refs





## Show the train/val/test Splits For Each Dataset

In [14]:
def show_train_val_test(refcoco: COCO):
    """Display, per split, the number of distinct refs and distinct images.

    NOTE(review): this redefines `show_train_val_test` from an earlier cell
    with equivalent logic.

    Args:
        refcoco: Dataset whose `refs` values each provide `ref_id`, `split`
            and `image_id`.
    """
    records = [
        {"ref_id": ref["ref_id"], "split": ref["split"], "image_id": ref["image_id"]}
        for ref in refcoco.refs.values()
    ]
    per_split = (
        pd.DataFrame(records)
        .groupby("split")
        .agg(refs=("ref_id", "nunique"), images=("image_id", "nunique"))
    )
    display(per_split)


# One per-split summary table for every loaded dataset.
for refcoco in refcocos:
    print("Dataset: ", refcoco.dataset_name, refcoco.split_by)
    show_train_val_test(refcoco)

Dataset:  R-refcoco unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,42404,16994
val,3811,1500


Dataset:  R-refcoco+ unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1798,750
train,42278,16992
val,3805,1500


Dataset:  R-refcocog umd


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5023,2600
train,42226,21899
val,2573,1300


Dataset:  refcoco unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,42404,16994
val,3811,1500


Dataset:  refcoco+ unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1798,750
train,42278,16992
val,3805,1500


Dataset:  refcocog umd


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5023,2600
train,42226,21899
val,2573,1300


Dataset:  fprefcoco_v002 berkeley


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,42404,16994
val,3811,1500


Dataset:  fprefcoco+_v002 berkeley


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1798,750
train,42278,16992
val,3805,1500


Dataset:  fprefcocog_v002 berkeley


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5023,2600
train,42226,21899
val,2573,1300


## Generate New refseg COCO Objects, with test/val Images Excluded from All Train Splits 

In [18]:
def exclude_images(
    split_images_sources: typing.Union[
        dict[tuple[str, str], dict[str, set[int]]], list[dict[str, set[int]]]
    ],
    refcoco_target: COCO,
) -> COCO:
    """Relabel train refs whose image occurs in any source's non-train split.

    Works on a deep copy of `refcoco_target`; the argument itself is not modified.
    Every ref with split `"train"` whose `image_id` is found in a non-train split
    of any source gets its split changed to `"train_exclude"`.

    Args:
        split_images_sources: Per-dataset split -> image_ids mappings, given either
            as a dict (only its values are used) or as a plain list of such
            mappings. The old annotation said list while the code required a dict;
            both are accepted now.
        refcoco_target: Dataset whose train refs should be filtered.

    Returns:
        The modified deep copy of `refcoco_target`.
    """
    print("Cloning refcoco...")
    refcoco_target = deepcopy(refcoco_target)

    print("Gathering all val/test image_ids...")
    if hasattr(split_images_sources, "values"):
        sources = split_images_sources.values()
    else:
        sources = split_images_sources
    test_split_images = set()
    for split_images_source in sources:
        for split, imgs in split_images_source.items():
            # Any split whose name does not start with "train" counts as val/test.
            if not split.startswith("train"):
                print("including non-train source split: ", split, len(imgs))
                test_split_images |= imgs
    print("test_split_images: ", len(test_split_images))

    # Exclude test_split_images from the target
    print("Excluding val/test images from target refcoco...")
    removed = set()
    num_updates = 0
    num_train = 0
    for refs in tqdm(
        refcoco_target.img_to_refs.values(),
        total=len(refcoco_target.img_to_refs),
    ):
        for ref in refs:
            if ref["split"] != "train":
                continue
            num_train += 1
            if ref["image_id"] in test_split_images:
                ref["split"] = "train_exclude"
                removed.add(ref["image_id"])
                num_updates += 1

    print(
        f"num_train: {num_train}, num_updates: {num_updates}, num_imgs_removed: {len(removed)}"
    )
    return refcoco_target


# Every dataset is filtered against the val/test images of all loaded datasets.
# split_img_ids also contains the target dataset's own entry.
new_refcocos = []
for refcoco in refcocos:
    print("Dataset: ", refcoco.dataset_name, refcoco.split_by)
    new_refcocos.append(exclude_images(split_img_ids, refcoco))

Dataset:  R-refcoco unc
Cloning refcoco...
Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source s

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 995819.10it/s]

num_train: 42404, num_updates: 4470, num_imgs_removed: 1891
Dataset:  R-refcoco+ unc
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19992/19992 [00:00<00:00, 949352.69it/s]

num_train: 42278, num_updates: 4461, num_imgs_removed: 1891
Dataset:  R-refcocog umd
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25799/25799 [00:00<00:00, 1001943.06it/s]

num_train: 42226, num_updates: 4276, num_imgs_removed: 1860
Dataset:  refcoco unc
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 1049967.62it/s]

num_train: 42404, num_updates: 4470, num_imgs_removed: 1891
Dataset:  refcoco+ unc
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19992/19992 [00:00<00:00, 996998.10it/s]

num_train: 42278, num_updates: 4461, num_imgs_removed: 1891
Dataset:  refcocog umd
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25799/25799 [00:00<00:00, 985212.54it/s]

num_train: 42226, num_updates: 4276, num_imgs_removed: 1860
Dataset:  fprefcoco_v002 berkeley
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 670174.25it/s]

num_train: 42404, num_updates: 4470, num_imgs_removed: 1891
Dataset:  fprefcoco+_v002 berkeley
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19992/19992 [00:00<00:00, 827813.35it/s]

num_train: 42278, num_updates: 4461, num_imgs_removed: 1891
Dataset:  fprefcocog_v002 berkeley
Cloning refcoco...





Gathering all val/test image_ids...
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  val 1500
including non-train source split:  test 2600
including non-train source split:  val 1300
including non-train source split:  val 1500
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source split:  testB 750
including non-train source split:  testA 750
including non-train source

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25799/25799 [00:00<00:00, 868414.98it/s]

num_train: 42226, num_updates: 4276, num_imgs_removed: 1860





In [20]:
# Sanity check: for every rewritten dataset, print its name / split_by and then
# call show_train_val_test on it.
# NOTE(review): show_train_val_test is defined earlier in the notebook; judging
# by the cell output it renders per-split counts of refs and images -- confirm.
for refcoco in new_refcocos:
    print("Dataset: ", refcoco.dataset_name, refcoco.split_by)
    show_train_val_test(refcoco)

Dataset:  R-refcoco unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,37934,15103
train_exclude,4470,1891
val,3811,1500


Dataset:  R-refcoco+ unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1798,750
train,37817,15101
train_exclude,4461,1891
val,3805,1500


Dataset:  R-refcocog umd


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5023,2600
train,37950,20039
train_exclude,4276,1860
val,2573,1300


Dataset:  refcoco unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,37934,15103
train_exclude,4470,1891
val,3811,1500


Dataset:  refcoco+ unc


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1798,750
train,37817,15101
train_exclude,4461,1891
val,3805,1500


Dataset:  refcocog umd


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5023,2600
train,37950,20039
train_exclude,4276,1860
val,2573,1300


Dataset:  fprefcoco_v002 berkeley


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1810,750
train,37934,15103
train_exclude,4470,1891
val,3811,1500


Dataset:  fprefcoco+_v002 berkeley


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
testA,1975,750
testB,1798,750
train,37817,15101
train_exclude,4461,1891
val,3805,1500


Dataset:  fprefcocog_v002 berkeley


Unnamed: 0_level_0,refs,images
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,5023,2600
train,37950,20039
train_exclude,4276,1860
val,2573,1300


## Save Results

In [22]:
# Root directory passed to save_results() as `output_dir`; each dataset's refs
# pickle is written into the sub-folder OUTPUT_DIR / <dataset_name>.
OUTPUT_DIR = Path("/shared/gbiamby/data/refer_seg")


def save_results(dataset_name, split_by, refcoco: COCO, output_dir: Path):
    """Pickle ``refcoco.refs_data`` into the dataset's folder under ``output_dir``.

    The file is written to
    ``output_dir / dataset_name / "refs({split_by}_exclude_unified).p"``.

    Args:
        dataset_name: Name of the dataset sub-folder inside ``output_dir``.
        split_by: Split identifier embedded in the output file name.
        refcoco: Object whose ``refs_data`` attribute is serialized.
        output_dir: Root directory containing one sub-folder per dataset.

    Raises:
        AssertionError: If ``output_dir / dataset_name`` does not exist. The
            check runs before anything is written.
    """
    dataset_dir = output_dir / dataset_name
    # Fail early (before touching the filesystem) if the target folder is missing.
    assert dataset_dir.exists(), str(dataset_dir)

    refs_path = dataset_dir / f"refs({split_by}_exclude_unified).p"
    print("Saving to: ", refs_path)
    # Use a context manager so the handle is flushed and closed deterministically
    # instead of relying on garbage collection of an anonymous file object.
    with open(refs_path, "wb") as refs_file:
        pickle.dump(refcoco.refs_data, refs_file)


# Write the updated refs pickle for each dataset under OUTPUT_DIR.
for refcoco_new in new_refcocos:
    print("Dataset: ", refcoco_new.dataset_name, refcoco_new.split_by)
    save_results(
        dataset_name=refcoco_new.dataset_name,
        split_by=refcoco_new.split_by,
        refcoco=refcoco_new,
        output_dir=OUTPUT_DIR,
    )

Dataset:  R-refcoco unc
Saving to:  /shared/gbiamby/data/refer_seg/R-refcoco/refs(unc_exclude_unified).p
Dataset:  R-refcoco+ unc
Saving to:  /shared/gbiamby/data/refer_seg/R-refcoco+/refs(unc_exclude_unified).p
Dataset:  R-refcocog umd
Saving to:  /shared/gbiamby/data/refer_seg/R-refcocog/refs(umd_exclude_unified).p
Dataset:  refcoco unc
Saving to:  /shared/gbiamby/data/refer_seg/refcoco/refs(unc_exclude_unified).p
Dataset:  refcoco+ unc
Saving to:  /shared/gbiamby/data/refer_seg/refcoco+/refs(unc_exclude_unified).p
Dataset:  refcocog umd
Saving to:  /shared/gbiamby/data/refer_seg/refcocog/refs(umd_exclude_unified).p
Dataset:  fprefcoco_v002 berkeley
Saving to:  /shared/gbiamby/data/refer_seg/fprefcoco_v002/refs(berkeley_exclude_unified).p
Dataset:  fprefcoco+_v002 berkeley
Saving to:  /shared/gbiamby/data/refer_seg/fprefcoco+_v002/refs(berkeley_exclude_unified).p
Dataset:  fprefcocog_v002 berkeley
Saving to:  /shared/gbiamby/data/refer_seg/fprefcocog_v002/refs(berkeley_exclude_unifie