In [1]:
# Render matplotlib figures inline and enable the autoreload extension in mode 2.
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Show which JSON-related packages are installed (this notebook imports `simdjson` as `json`).
%pip list | grep json

fastjsonschema                    2.18.0
json5                             0.9.14
jsonpointer                       2.4
jsons                             1.6.3
jsonschema                        4.19.1
jsonschema-specifications         2023.7.1
pysimdjson                        5.0.2
python-json-logger                2.0.7
python-lsp-jsonrpc                1.1.1
ujson                             5.8.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import argparse
import copy
import csv
import decimal
import typing
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import simdjson as json
from IPython.display import display
from pycocotools.coco import COCO
from pycocotools.helpers import CocoClassDistHelper

In [4]:
# List the refcocog folder: instances.json plus one pickled refs file per split.
%ls -lah "/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/refcocog/"

total 96M
drwxr-xr-x 2 gbiamby users    5 Nov 20  2016  [0m[01;34m.[0m/
drwxr-xr-x 7 gbiamby users   16 Oct 23 21:25  [01;34m..[0m/
-rw-r--r-- 1 gbiamby users 119M Feb 11  2016  instances.json
-rw-r--r-- 1 gbiamby users  33M Feb 16  2016 'refs(google).p'
-rw-r--r-- 1 gbiamby users  33M Nov 20  2016 'refs(umd).p'


In [5]:
# Root folder that holds one sub-directory per referring-expression dataset.
REFSEG_DIR = Path("/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/")
# `/` on a Path already yields a Path, so no extra wrapping is needed.
dataset_json = REFSEG_DIR / "R-refcocog/instances.json"
# Loading prints the image and annotation totals of the file.
coco_dist = CocoClassDistHelper(dataset_json)

loading annotations into memory...
Done (t=8.20s)
creating index...
index created!
num images: 25799
num annotations: 208960



### refcocog AKA G-Ref has two versions, sort of
Quoted from the G-Ref (Google Refexp) paper:

> While we were collecting our dataset, we learned that Tamara Berg had independently applied her ReferIt game [27] to the MSCOCO dataset to generate expressions for 50,000 objects from 19,994 images. She kindly shared her data (named as UNC-Ref-COCO dataset) with us. For brevity, we call our Google Refexp dataset as G-Ref and the UNC-Ref-COCO as UNC-Ref. We report results on both datasets in this paper. However, due to differences in our collection methodologies, we have found that the descriptions in the two overlapped datasets exhibit significant qualitative differences, with descriptions in the UNC-Ref dataset tending to be more concise and to contain less flowery language than our descriptions. More specifically, the average lengths of expressions from our dataset and UNC-Ref are 8.43 and 3.61 respectively. And the size of the word dictionaries (keeping only words appearing more than 3 times) from our dataset and UNC-Ref are 4849 and 2890 respectively. See Figure 3 for some visual comparisons.

In [6]:
# Dataset name -> the `splitBy` values accepted for it.
# build_refcoco() uses the first entry of a list when no split is requested.
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    "refclef": ["berkeley", "unc"],
    "refcoco": ["google"],
    "refcoco+": ["unc"],
    "refcocog": ["google", "umd"],
}


def build_refcoco(dataset_name: str, splitBy: typing.Optional[str] = None) -> COCO:
    """Load one referring-expression dataset as a ``COCO`` object.

    Args:
        dataset_name: A key of ``VALID_SPLITS``; also the sub-directory of
            ``REFSEG_DIR`` that contains ``instances.json``.
        splitBy: Which refs split to load. ``None`` selects the first split
            listed for ``dataset_name`` in ``VALID_SPLITS``.

    Returns:
        The ``COCO`` object constructed with ``is_ref_dataset=True``.

    Raises:
        AssertionError: If ``dataset_name`` is unknown or ``splitBy`` is not
            listed for that dataset.
    """
    assert dataset_name in VALID_SPLITS, dataset_name
    allowed_splits = VALID_SPLITS[dataset_name]
    if splitBy is None:
        splitBy = allowed_splits[0]
    else:
        # Say what was wrong instead of failing with a bare AssertionError.
        assert splitBy in allowed_splits, (
            f"splitBy={splitBy!r} is not valid for {dataset_name!r}; "
            f"expected one of {allowed_splits}"
        )
    return COCO(
        REFSEG_DIR / dataset_name / "instances.json",
        is_ref_dataset=True,
        dataset_name=dataset_name,
        splitBy=splitBy,
    )


# refcocog with its default split, and the R-refcocog variant with the "umd" split.
ref_coco = build_refcoco(dataset_name="refcocog")
rref_coco = build_refcoco(dataset_name="R-refcocog", splitBy="umd")

Loading refs from '/shared/gbiamby/data/refer_seg/refcocog/refs(google).p'
Loaded 49822 refs
loading annotations into memory...
Done (t=3.18s)
creating index...
index created!
Loading refs from '/shared/patrickwu/dataset/refer_seg/R-refcocog/refs(umd).p'
Loaded 49822 refs
loading annotations into memory...
Done (t=2.68s)
creating index...
index created!


### The `refs(unc|umd|google|berkeley).p` files are just pickled Python objects

They are lists of `dict`s

The keys are:

```python
dict_keys(['image_id', 'split', 'sentences', 'file_name', 'category_id', 'ann_id', 'sent_ids', 'ref_id'])
```


In [7]:
# Confirm the shape of the raw refs container for both datasets.
for dataset in (ref_coco, rref_coco):
    print(type(dataset.refs_data))
    print(type(dataset.refs_data[0]))

<class 'list'>
<class 'dict'>
<class 'list'>
<class 'dict'>


In [None]:
# Thousands separators for floats, and never truncate wide text columns.
pd.set_option("display.float_format", "{:,}".format)
pd.options.display.max_colwidth = None


def get_property_details(obj):
    """Describe the nesting of a container as a short human-readable string.

    Only the first element at each level is sampled, so the result describes
    one representative entry rather than validating the whole container.

    Args:
        obj: Usually a dict or list (e.g. one of the index attributes of a
            ``COCO`` object). Any other type is accepted and reported as
            ``"UNKNOWN"``.

    Returns:
        A string naming the nesting pattern (``dict->list->dict``,
        ``dict->dict``, ``list->dict`` ...) together with the sampled keys,
        element types or element lengths. Empty containers are reported as
        such instead of raising.
    """
    if isinstance(obj, dict):
        if not obj:
            return "dict (empty)"
        first_value = next(iter(obj.values()))
        if isinstance(first_value, list):
            if not first_value:
                return "a: dict->list (empty)"
            first_item = first_value[0]
            if isinstance(first_item, dict):
                return "dict->list->dict:" + str(first_item.keys())
            # Lengths of the first few *items* of the list. Items that have no
            # length (e.g. ints) are skipped rather than raising.
            lengths = [
                len(item) for item in first_value[:5] if hasattr(item, "__len__")
            ]
            return (
                "a: dict->list->"
                + str(type(first_item))
                + ", lengths: "
                + str(lengths)
            )
        if isinstance(first_value, dict):
            if not first_value:
                return "b: dict->dict (empty)"
            second_value = next(iter(first_value.values()))
            if isinstance(second_value, list):
                if not second_value:
                    return "b: dict->dict->list (empty)"
                if isinstance(second_value[0], list):
                    lengths = [
                        len(item)
                        for item in second_value[:5]
                        if hasattr(item, "__len__")
                    ]
                    return (
                        "b: dict->dict->list["
                        + str(type(second_value[0]))
                        + "]"
                        + ", lengths: "
                        + str(lengths)
                    )
                return "b: dict->dict->list[" + str(type(second_value[0])) + "]"
            return (
                "b: dict->dict->["
                + str(type(second_value))
                + "]"
                + str(first_value.keys())
            )
        return "c: dict->UNKNOWN " + str(type(first_value)) + "]"
    if isinstance(obj, list):
        if not obj:
            return "list (empty)"
        first_item = obj[0]
        if isinstance(first_item, dict):
            if not first_item:
                return "d: list->dict (empty)"
            second_value = next(iter(first_item.values()))
            return (
                "d: list->dict" + str(type(second_value)) + "]" + str(first_item.keys())
            )
        return "UNKNOWN"
    # Neither a dict nor a list: always return a string, never None.
    return "UNKNOWN"


def get_coco_df(coco: COCO) -> pd.DataFrame:
    """Tabulate size, Python type and nesting of the main indices of ``coco``.

    Args:
        coco: A ref-dataset ``COCO`` object exposing the attributes listed in
            the mapping below.

    Returns:
        A DataFrame with one row per index and the columns ``property``,
        ``count``, ``python_type`` and ``dict_keys``. The same frame is also
        displayed.
    """
    # Row label -> attribute of `coco`; only "images" differs from its attribute name.
    label_to_attr = {
        "cats": "cats",
        "images": "imgs",
        "anns": "anns",
        "refs_data": "refs_data",
        "refs": "refs",
        "img_to_refs": "img_to_refs",
        "cat_to_refs": "cat_to_refs",
        "ann_to_ref": "ann_to_ref",
        "ref_to_ann": "ref_to_ann",
        "sents": "sents",
        "sent_to_ref": "sent_to_ref",
        "sent_to_tokens": "sent_to_tokens",
    }
    indices = {label: getattr(coco, attr) for label, attr in label_to_attr.items()}
    df_meta = pd.DataFrame(
        {
            "property": list(indices),
            "count": [len(index) for index in indices.values()],
            "python_type": [str(type(index)) for index in indices.values()],
            "dict_keys": [get_property_details(index) for index in indices.values()],
        }
    )
    display(df_meta)
    # Hand the table back so callers that assign the result get the frame, not None.
    return df_meta


# Summarise the indices of both datasets.
df_meta_ref = get_coco_df(coco=ref_coco)
df_meta_rref = get_coco_df(coco=rref_coco)

It appears that each `ref` points to exactly one `ann` of the original dataset (see `ref_to_ann`).

---

### Inspecting Refs


In [8]:
from copy import deepcopy


def inspect_refs(coco: COCO):
    """Print a walkthrough of the first ref in ``coco.refs_data`` and what it links to.

    Printed, in order: the ref (without its sentences) and its first sentence,
    the annotation and category the ref points to, every annotation of the
    ref's image, and every ref of that image with its sentences.

    A sentence without an ``"exist"`` key is counted as one where the object
    exists.

    Args:
        coco: A ref-dataset ``COCO`` object.
    """
    ref = coco.refs_data[0]
    print("ref:")
    print([(key, ref[key]) for key in ref.keys() if key not in {"sentences"}])
    print("First sentence: ", ref["sentences"][0])

    print("\nThe corresponding annotation:")
    ann = coco.ref_to_ann[ref["ref_id"]]

    print("ref_to_anns")
    print(ann)
    print(type(ann))

    print("ann category: ")
    print(coco.cats[ann["category_id"]])

    img_anns = coco.imgToAnns[ref["image_id"]]
    print(
        f"\nimage anns for image_id:{ref['image_id'], ref['file_name']}, (total anns: {len(img_anns)})"
    )
    for img_ann in img_anns:
        # Filtered shallow copy: drops the bulky fields without mutating (or
        # deep-copying) the annotation stored in `coco`.
        ann_summary = {
            key: value
            for key, value in img_ann.items()
            if key not in ("segmentation", "area")
        }
        ann_summary["cat"] = coco.cats[img_ann["category_id"]]
        print(ann_summary)

    img_refs = coco.img_to_refs[ref["image_id"]]
    print("\nHow many refs for this image: ", len(img_refs))
    for img_ref in img_refs:
        print("")
        # Use the ref being iterated, not the first ref of the dataset, so each
        # entry reports its own annotation id and category.
        ref_ann_id = img_ref["ann_id"]
        ref_cat = coco.cats[coco.anns[ref_ann_id]["category_id"]]
        print("ref_id: ", img_ref["ref_id"], ", ann: ", ref_ann_id, ", ref_cat: ", ref_cat)
        ref_summary = {
            key: value for key, value in img_ref.items() if key != "sentences"
        }
        print("ref: ", ref_summary)

        sentences = img_ref["sentences"]
        num_exist = sum(1 for s in sentences if s.get("exist", True))
        num_not_exist = sum(1 for s in sentences if not s.get("exist", True))
        print("Total sentences where obj exists: ", num_exist)
        print("Total sentences where obj does not exist: ", num_not_exist)
        print(len(sentences))
        for s in sentences:
            print("\t", s["raw"], f", exist: {s['exist']}" if "exist" in s else "")


# Run the same walkthrough on both datasets, separated by a ruler line.
for position, (title, dataset) in enumerate(
    (("refcocog", ref_coco), ("R-refcocog", rref_coco))
):
    if position > 0:
        print("=" * 200)
    print(title)
    inspect_refs(dataset)

refcocog
ref:
[('image_id', 546154), ('split', 'val'), ('file_name', 'COCO_train2014_000000546154_298801.jpg'), ('category_id', 32), ('ann_id', 298801), ('sent_ids', [2, 3]), ('ref_id', 0)]
First sentence:  {'tokens': ['the', 'tie', 'of', 'the', 'standing', 'man'], 'raw': 'The tie of the standing man', 'sent_id': 2, 'sent': 'the tie of the standing man'}

The corresponding annotation:
ref_to_anns
{'segmentation': [[286.76, 243.95, 290.38, 248.77, 291.58, 253.59, 290.38, 265.04, 293.99, 276.49, 299.41, 292.16, 301.22, 308.42, 303.63, 330.12, 303.63, 342.17, 304.84, 350.0, 306.65, 363.86, 310.86, 385.56, 316.89, 412.67, 317.49, 419.3, 322.92, 454.85, 340.99, 469.92, 351.84, 437.38, 345.81, 415.68, 342.8, 400.62, 337.98, 375.91, 333.76, 346.39, 329.14, 327.07, 323.97, 308.29, 321.16, 291.86, 315.99, 275.43, 311.3, 262.28, 306.13, 250.08, 301.91, 243.97, 301.91, 241.15, 292.98, 233.17, 287.35, 242.56]], 'area': 5730.7198, 'iscrowd': 0, 'image_id': 546154, 'bbox': [286.76, 233.17, 65.08, 23

In [9]:

filename = "COCO_train2014_000000380440.jpg"

# Case-insensitive lookup of the image record by file name. `None` means no
# match, so a miss is reported instead of silently showing the last image.
match = next(
    (
        (candidate_id, candidate)
        for candidate_id, candidate in rref_coco.imgs.items()
        if candidate["file_name"].lower() == filename.lower()
    ),
    None,
)

if match is None:
    print(f"No image with file_name {filename!r} found")
else:
    img_id, img = match
    print("Found img: ", img)
    print(img)
    print("")
    for ann in rref_coco.imgToAnns[img_id]:
        # Leave out the bulky geometry fields; the stored annotation is untouched.
        ann_summary = {
            key: value
            for key, value in ann.items()
            if key not in ("segmentation", "bbox")
        }
        print("Ann: ", ann_summary)
        print("Cat: ", rref_coco.cats[ann["category_id"]])

Found img:  {'license': 3, 'file_name': 'COCO_train2014_000000380440.jpg', 'coco_url': 'http://mscoco.org/images/380440', 'height': 376, 'width': 640, 'date_captured': '2013-11-19 19:48:52', 'flickr_url': 'http://farm9.staticflickr.com/8125/8614152509_bc463a8248_z.jpg', 'id': 380440}
{'license': 3, 'file_name': 'COCO_train2014_000000380440.jpg', 'coco_url': 'http://mscoco.org/images/380440', 'height': 376, 'width': 640, 'date_captured': '2013-11-19 19:48:52', 'flickr_url': 'http://farm9.staticflickr.com/8125/8614152509_bc463a8248_z.jpg', 'id': 380440}

Ann:  {'area': 15877.136, 'iscrowd': 0, 'image_id': 380440, 'category_id': 1, 'id': 470048}
Cat:  {'supercategory': 'person', 'id': 1, 'name': 'person'}
Ann:  {'area': 10315.838250000003, 'iscrowd': 0, 'image_id': 380440, 'category_id': 1, 'id': 491042}
Cat:  {'supercategory': 'person', 'id': 1, 'name': 'person'}
Ann:  {'area': 1658.1348500000036, 'iscrowd': 0, 'image_id': 380440, 'category_id': 35, 'id': 607150}
Cat:  {'supercategory': 

In [None]:
# Peek at the first 20 image file names of R-refcocog.
imgs = [record["file_name"] for record in rref_coco.imgs.values()]
print(imgs[:20])

In [None]:
# What is 607150? Check which index knows the id, in the order imgs, anns, refs
# (it looks like it is an ann_id).
mystery_id = 607150
for index in (rref_coco.imgs, rref_coco.anns, rref_coco.refs):
    print(mystery_id in index)

For the most part, refcocog has two sentences for each ref (sentences per ref → number of refs):

`Counter({2: 45028, 1: 4714, 3: 80})`


In [10]:
def show_sentence_counts(coco: COCO):
    """Count positive and negative sentences per ref and display three roll-ups.

    A sentence is negative when it has an ``"exist"`` key with a falsy value.
    Every other sentence, including one without the key, is positive.

    Displays the totals over all refs, the totals grouped by positive-sentence
    count, and the totals grouped by (positive, negative) sentence counts.

    Args:
        coco: A ref-dataset ``COCO`` object.

    Returns:
        ``(df, df_agg)``: the per-ref counts and the last roll-up, i.e. the one
        grouped by ``pos_sent_count`` and ``neg_sent_count``.
    """
    rows = []
    for ref in coco.refs_data:
        sentences = ref["sentences"]
        category = coco.cats[ref["category_id"]]
        rows.append(
            {
                "ref_id": ref["ref_id"],
                "ann_id": ref["ann_id"],
                "category_id": ref["category_id"],
                "category": category["name"],
                "supercategory": category["supercategory"],
                "sent_count": len(sentences),
                "pos_sent_count": sum(1 for s in sentences if s.get("exist", True)),
                "neg_sent_count": sum(
                    1 for s in sentences if not s.get("exist", True)
                ),
            }
        )

    df = pd.DataFrame(rows)
    print("pos/neg sentence_counts: ", df.pos_sent_count.sum(), df.neg_sent_count.sum())

    # One aggregation spec shared by all three roll-ups.
    agg_spec = {
        "num_refs": ("ref_id", "count"),
        "sent_count": ("sent_count", "sum"),
        "pos_sent_count": ("pos_sent_count", "sum"),
        "neg_sent_count": ("neg_sent_count", "sum"),
    }
    groupings = [
        (lambda x: True),  # a single group holding every ref
        ["pos_sent_count"],
        ["pos_sent_count", "neg_sent_count"],
    ]
    for grouping in groupings:
        df_agg = pd.DataFrame(df.groupby(grouping).agg(**agg_spec))
        display(df_agg)
    return df, df_agg


# NOTE: the printed labels say "refcoco" / "R-refcoco", but `ref_coco` and
# `rref_coco` hold the refcocog / R-refcocog datasets.
print("\nrefcoco:")
df_refcoco, ref_recoco_agg = show_sentence_counts(ref_coco)
print("\nR-refcoco:")
df_rrefcoco, df_rrefcoco_agg = show_sentence_counts(rref_coco)


refcoco:
pos/neg sentence_counts:  95010 0


Unnamed: 0,num_refs,sent_count,pos_sent_count,neg_sent_count
True,49822,95010,95010,0


Unnamed: 0_level_0,num_refs,sent_count,pos_sent_count,neg_sent_count
pos_sent_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4714,4714,4714,0
2,45028,90056,90056,0
3,80,240,240,0


Unnamed: 0_level_0,Unnamed: 1_level_0,num_refs,sent_count,pos_sent_count,neg_sent_count
pos_sent_count,neg_sent_count,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,4714,4714,4714,0
2,0,45028,90056,90056,0
3,0,80,240,240,0



R-refcoco:
pos/neg sentence_counts:  95010 159806


Unnamed: 0,num_refs,sent_count,pos_sent_count,neg_sent_count
True,49822,254816,95010,159806


Unnamed: 0_level_0,num_refs,sent_count,pos_sent_count,neg_sent_count
pos_sent_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4714,16107,4714,11393
2,45028,238115,90056,148059
3,80,594,240,354


Unnamed: 0_level_0,Unnamed: 1_level_0,num_refs,sent_count,pos_sent_count,neg_sent_count
pos_sent_count,neg_sent_count,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,4005,8010,4005,4005
1,10,667,7337,667,6670
1,11,9,108,9,99
1,12,7,91,7,84
1,13,6,84,6,78
1,14,5,75,5,70
1,15,2,32,2,30
1,16,5,85,5,80
1,18,1,19,1,18
1,19,1,20,1,19


In [None]:
import spacy
from Levenshtein import distance, hamming

# Request GPU execution for spaCy.
spacy.prefer_gpu()
import spacy_transformers

# English pipeline used by the inspection helpers below for noun chunks and tags.
nlp = spacy.load("en_core_web_trf")


def inspect_robust_ref(
    num_pos: int, num_neg: int, coco: COCO, df: pd.DataFrame, ref_index=None
):
    """Print one ref that has exactly ``num_pos`` positive and ``num_neg`` negative sentences.

    Shows the selected row of ``df``, the ref's annotation, category and
    sentences. Each negative sentence is compared with every positive sentence
    using token-level Levenshtein and Hamming distances.

    Args:
        num_pos: Required value of ``pos_sent_count``.
        num_neg: Required value of ``neg_sent_count``.
        coco: The ref-dataset ``COCO`` object that ``df`` was computed from.
        df: Per-ref counts with ``ref_id``, ``pos_sent_count`` and
            ``neg_sent_count`` columns.
        ref_index: Position within the matching rows. ``None`` or a negative
            value picks a random one.

    Returns:
        None. If no ref matches the requested counts, a message is printed and
        nothing else is shown.
    """
    print("=" * 220)
    df_ref_example = df[(df.pos_sent_count == num_pos) & (df.neg_sent_count == num_neg)]
    if df_ref_example.empty:
        # Report the empty selection instead of failing inside `iloc`.
        print(f"No refs with {num_pos} positive and {num_neg} negative sentences")
        return
    if ref_index is None or ref_index < 0:
        ref_index = int(np.random.uniform(0, len(df_ref_example)) // 1)
        print("ref_index: ", ref_index)
    row = df_ref_example.iloc[ref_index]
    display(pd.DataFrame(row))

    example_ref = coco.refs[row["ref_id"]]

    # Show the category and ann:
    ann = coco.ref_to_ann[example_ref["ref_id"]]
    print("ann: ", ann)
    print("category: ", coco.cats[ann["category_id"]])
    # Show the ref (without its sentences):
    ref_display = {
        key: value for key, value in example_ref.items() if key != "sentences"
    }
    print("ref: ", ref_display)

    # Split the sentences. A missing "exist" flag counts as positive, matching
    # how the counts in `df` are computed.
    sentences = example_ref["sentences"]
    pos_sents = [s for s in sentences if s.get("exist", True)]
    neg_sents = [s for s in sentences if not s.get("exist", True)]

    print("\npositive sentences:")
    for s in pos_sents:
        print({key: value for key, value in s.items() if key not in ("tokens", "raw")})
        doc = nlp(s["sent"])
        print("noun chunks: ", list(doc.noun_chunks))
        spacy.displacy.render(doc, style="dep")
        spacy.displacy.render(doc, style="span")

    print("\nnegative sentences:")
    for s in neg_sents:
        s_display = {
            key: value for key, value in s.items() if key not in ("tokens", "raw")
        }
        print("neg_sent: ", s_display)
        print("noun chunks: ", list(nlp(s["sent"]).noun_chunks))
        for ps in pos_sents:
            # NOTE(review): whether distance()/hamming() accept token lists of
            # different lengths depends on the installed Levenshtein version - confirm.
            distances = [distance(ps["tokens"], s["tokens"])]
            print(
                f"\tLevenstein (from pos:'{ps['sent_id']})': ",
                distances,
            )
            distances = [hamming(ps["tokens"], s["tokens"])]
            print(
                f"\tHamming    (from pos:'{ps['sent_id']})': ",
                distances,
            )


## Look at examples with 2 positive sentences and 10 or 33 negative sentences, to make sure the above counts make sense:
inspect_robust_ref(2, 10, rref_coco, df_rrefcoco)
inspect_robust_ref(2, 33, rref_coco, df_rrefcoco)

In [None]:
# An example with 1 positive and 11 negative sentences.
inspect_robust_ref(1, 11, rref_coco, df_rrefcoco)

In [None]:
# % pip install spacy[cuda11x,transformers]
# %python -m spacy download en_core_web_trf
# Shouldn't need this since we added transformers in the first line:
### %pip install spacy-transformers

### Tagging the data

In [None]:
# import spacy
# spacy.prefer_gpu()
# import spacy_transformers
# nlp = spacy.load("en_core_web_trf")

In [None]:
# Smoke-test the loaded pipeline on a trivial sentence.
doc = nlp("This is a sentence.")
print("noun chunks: ", list(doc.noun_chunks))
print([(token.text, token.pos_) for token in doc])

In [None]:
# Repeats the spaCy setup from the cell that defines inspect_robust_ref.
import spacy

spacy.prefer_gpu()
import spacy_transformers

nlp = spacy.load("en_core_web_trf")

In [None]:
def get_sents_df(coco: COCO, max_refs: typing.Optional[int] = 11) -> pd.DataFrame:
    """Tag the sentences of the first refs with spaCy and collect the results.

    For every sentence the tags are printed and also recorded as one row of the
    returned frame.

    Args:
        coco: A ref-dataset ``COCO`` object whose ``refs_data`` is a list.
        max_refs: How many refs from the start of ``coco.refs_data`` to
            process. ``None`` processes all of them. The default of 11 matches
            the previously hard-coded limit.

    Returns:
        A DataFrame with one row per sentence and the columns ``ref_id``,
        ``sent_id``, ``sent``, ``exist`` (``None`` when the sentence has no
        such flag), ``pos``, ``noun_chunks`` and ``ents``.
    """
    columns = ["ref_id", "sent_id", "sent", "exist", "pos", "noun_chunks", "ents"]
    rows = []
    for ref in coco.refs_data[:max_refs]:
        for sent in ref["sentences"]:
            doc = nlp(sent["sent"])
            # Not every dataset carries the "exist" flag; report None when absent.
            exist = sent.get("exist")
            pos_tags = [token.pos_ for token in doc]
            noun_chunks = list(doc.noun_chunks)

            print("sent: ", sent["sent"], ", ", exist)
            print("pos_:", pos_tags)
            print("nouns: ", noun_chunks)
            print("ents: ", doc.ents)

            print("spans: ", doc.spans)

            rows.append(
                {
                    "ref_id": ref["ref_id"],
                    "sent_id": sent["sent_id"],
                    "sent": sent["sent"],
                    "exist": exist,
                    "pos": pos_tags,
                    "noun_chunks": [chunk.text for chunk in noun_chunks],
                    "ents": [ent.text for ent in doc.ents],
                }
            )

    # Explicit columns keep the schema stable even when no sentence was processed.
    return pd.DataFrame(rows, columns=columns)


# Run the spaCy tagging over the first few R-refcocog refs.
df_sents = get_sents_df(rref_coco)

In [None]:
# Repeat the ref walkthrough for R-refcocog.
print("R-refcocog")
inspect_refs(rref_coco)

Look at sentences


In [None]:
# Define `ref` here so the cell runs on a fresh kernel. It is the first
# R-refcocog ref, the same one inspect_refs(rref_coco) walks through.
ref = rref_coco.refs_data[0]
sentences = ref["sentences"]

print("Total sentences: ", len(sentences))
# A sentence without an "exist" flag is counted as one where the object exists.
print(
    "Total sentences where obj exists: ",
    len([s for s in sentences if s.get("exist", True)]),
)
print(
    "Total sentences where obj does not exist: ",
    len([s for s in sentences if not s.get("exist", True)]),
)
for sent in sentences:
    print(sent)

In [None]:
# Per-category counts, most frequently annotated category first.
# `coco_dist` is the CocoClassDistHelper built from the R-refcocog instances.json.
# NOTE(review): assumes get_cat_counts() is provided by that helper - confirm.
df_counts = (
    pd.DataFrame(list(coco_dist.get_cat_counts().values()))
    .sort_values("ann_count", ascending=False)
    .reset_index(drop=True)
)
total_anns = df_counts.ann_count.sum()
# Share of all annotations per category, and its running total down the sorted table.
df_counts["ann_count_pdf"] = df_counts.ann_count / total_anns
df_counts["ann_count_cdf"] = df_counts.ann_count_pdf.cumsum()
display(df_counts)

## Add frequency bins based on annotation count Cumulative Distribution Function


In [None]:
# Bin each category by where it falls on the annotation-count CDF.
cdf = df_counts.ann_count_cdf
# Two bins: below 0.5 is "high", everything else "low".
df_counts["freq_bin_2"] = np.where(cdf < 0.5, "high", "low")
# Three bins: the first matching condition wins; anything above 0.667 is "low".
df_counts["freq_bin_3"] = np.select(
    [cdf < 0.333, cdf <= 0.667],
    ["high", "medium"],
    default="low",
)
display(df_counts)

In [None]:
# Bar chart of annotated-image counts per category, coloured by the 3-way frequency bin.
fig, ax = plt.subplots(figsize=(25, 15))
sns.barplot(
    data=df_counts.sort_values(["img_count"], ascending=False),
    x="name",
    y="img_count",
    # Name the column instead of passing `.values`: a bare array is matched to
    # the re-sorted rows by position and would colour bars with another
    # category's bin.
    hue="freq_bin_3",
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
ax.set_title("Per-category Annotated Image Counts")
plt.tight_layout()

In [None]:
# Bar chart of annotation counts per category, coloured by the 3-way frequency bin.
fig, ax = plt.subplots(figsize=(25, 15))
sns.barplot(
    data=df_counts.sort_values(["ann_count"], ascending=False),
    x="name",
    y="ann_count",
    hue="freq_bin_3",
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
ax.set_title("Per-category Annotation Counts")
plt.tight_layout()