In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension in mode 2, so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# List the installed packages whose name contains "json".
%pip list | grep json

fastjsonschema                    2.18.0
json5                             0.9.14
jsonpointer                       2.4
jsons                             1.6.3
jsonschema                        4.19.1
jsonschema-specifications         2023.7.1
pysimdjson                        5.0.2
python-json-logger                2.0.7
python-lsp-jsonrpc                1.1.1
ujson                             5.8.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import argparse
import copy
import csv
import decimal
import os
import typing
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import simdjson as json
from IPython.display import display
from pycocotools.coco import COCO
from pycocotools.helpers import CocoClassDistHelper

In [4]:
# Show the contents of the refcocog dataset directory: the annotation JSON plus one
# pickled `refs(<splitBy>).p` file per split.
%ls -lah "/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/refcocog/"

total 96M
drwxr-xr-x 2 gbiamby users    5 Nov 20  2016  [0m[01;34m.[0m/
drwxr-xr-x 7 gbiamby users   16 Oct 23 21:25  [01;34m..[0m/
-rw-r--r-- 1 gbiamby users 119M Feb 11  2016  instances.json
-rw-r--r-- 1 gbiamby users  33M Feb 16  2016 'refs(google).p'
-rw-r--r-- 1 gbiamby users  33M Nov 20  2016 'refs(umd).p'


In [5]:
# Root directory that holds one sub-directory per referring-expression dataset
# (each containing an "instances.json"). Set the REFSEG_DIR environment variable to run
# the notebook against a different checkout; the default is the original local path.
REFSEG_DIR = Path(
    os.environ.get(
        "REFSEG_DIR",
        "/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/",
    )
)
# dataset_json = Path(REFSEG_DIR / "R-refcocog/instances.json")
# coco_dist = CocoClassDistHelper(dataset_json)


### refcocog AKA G-Ref has two versions, sort of
> While we were collecting our dataset, we learned that Tamara Berg had independently applied her ReferIt game [27] to the MSCOCO dataset to generate expressions for 50,000 objects from 19,994 images. She kindly shared her data (named as UNC-Ref-COCO dataset) with us. For brevity, we call our Google Refexp dataset as G-Ref and the UNC-Ref-COCO as UNC-Ref. We report results on both datasets in this paper. However, due to differences in our collection methodologies, we have found that the descriptions in the two overlapped datasets exhibit significant qualitative differences, with descriptions in the UNC-Ref dataset tending to be more concise and to contain less flowery language than our descriptions. More specifically, the average lengths of expressions from our dataset and UNC-Ref are 8.43 and 3.61 respectively. And the size of the word dictionaries (keeping only words appearing more than 3 times) from our dataset and UNC-Ref are 4849 and 2890 respectively. See Figure 3 for some visual comparisons.

In [None]:
# Notebook-wide pandas display settings: format floats with thousands separators and
# never truncate the contents of wide columns.
pd.set_option("display.float_format", "{:,}".format)
pd.options.display.max_colwidth = None


def _sized_lengths(items, limit=5):
    """Return len() of each of the first `limit` items, skipping items that have no length."""
    return [len(item) for item in items[:limit] if hasattr(item, "__len__")]


def get_property_details(obj) -> typing.Optional[str]:
    """Describe the nesting structure of a dict or list as a short diagnostic string.

    Only the first element at each level is inspected, so the description assumes the
    container is homogeneous. Empty containers are reported as "(empty)" instead of
    raising IndexError.

    Args:
        obj: The container to describe (a dict or a list).

    Returns:
        A string such as "dict->list->dict:dict_keys([...])"; "UNKNOWN" for a list whose
        first item is not a dict; None when `obj` is neither a dict nor a list.
    """
    if isinstance(obj, dict):
        if not obj:
            return "dict (empty)"
        first_value = next(iter(obj.values()))
        if isinstance(first_value, list):
            if not first_value:
                return "a: dict->list (empty)"
            first_item = first_value[0]
            if isinstance(first_item, dict):
                return "dict->list->dict:" + str(first_item.keys())
            # Report the lengths of the first few list items (mirrors the "b" branch
            # below). Slicing `first_item` itself would crash on non-sliceable items
            # such as ints and would report per-character lengths for strings.
            return (
                "a: dict->list->"
                + str(type(first_item))
                + ", lengths: "
                + str(_sized_lengths(first_value))
            )
        if isinstance(first_value, dict):
            if not first_value:
                return "b: dict->dict (empty)"
            second_value = next(iter(first_value.values()))
            if isinstance(second_value, list):
                if not second_value:
                    return "b: dict->dict->list (empty)"
                inner_type = str(type(second_value[0]))
                if isinstance(second_value[0], list):
                    return (
                        "b: dict->dict->list["
                        + inner_type
                        + "]"
                        + ", lengths: "
                        + str(_sized_lengths(second_value))
                    )
                return "b: dict->dict->list[" + inner_type + "]"
            return (
                "b: dict->dict->["
                + str(type(second_value))
                + "]"
                + str(first_value.keys())
            )
        return "c: dict->UNKNOWN " + str(type(first_value)) + "]"

    if isinstance(obj, list):
        if not obj:
            return "list (empty)"
        first_item = obj[0]
        if isinstance(first_item, dict):
            if not first_item:
                return "d: list->dict (empty)"
            second_value = next(iter(first_item.values()))
            return (
                "d: list->dict" + str(type(second_value)) + "]" + str(first_item.keys())
            )
        return "UNKNOWN"

    # Neither a dict nor a list: nothing to describe.
    return None


def get_coco_df(coco: COCO) -> pd.DataFrame:
    """Build, display and return a one-row-per-index overview of a ref-style COCO object.

    For every index attribute listed below the table records its size, its Python type
    and a structural summary produced by `get_property_details`.

    Args:
        coco: A COCO object exposing the ref-dataset attributes listed in
            `property_attrs` (accessing a missing one raises AttributeError).

    Returns:
        The metadata frame with columns "property", "count", "python_type" and
        "dict_keys". It is also shown via `display` as a side effect.
    """
    # (label shown in the table, attribute name on `coco`); they differ only for images.
    property_attrs = [
        ("cats", "cats"),
        ("images", "imgs"),
        ("anns", "anns"),
        ("refs_data", "refs_data"),
        ("refs", "refs"),
        ("img_to_refs", "img_to_refs"),
        ("cat_to_refs", "cat_to_refs"),
        ("ann_to_ref", "ann_to_ref"),
        ("ref_to_ann", "ref_to_ann"),
        ("sents", "sents"),
        ("sent_to_ref", "sent_to_ref"),
        ("sent_to_tokens", "sent_to_tokens"),
    ]
    values = [getattr(coco, attr) for _, attr in property_attrs]
    df_meta = pd.DataFrame(
        {
            "property": [label for label, _ in property_attrs],
            "count": [len(value) for value in values],
            "python_type": [str(type(value)) for value in values],
            "dict_keys": [get_property_details(value) for value in values],
        }
    )
    display(df_meta)
    # Return the frame as the signature promises; previously the function fell off the
    # end and callers received None.
    return df_meta


# df_meta_ref = get_coco_df(ref_coco)
# df_meta_rref = get_coco_df(rref_coco)

In [6]:
def show_sentence_counts(coco: COCO, L=3):
    """Count positive/negative sentences per ref and display summary tables.

    A sentence is negative only when it has an "exist" key whose value is falsy; a
    sentence without that key counts as positive.

    Args:
        coco: Object providing `refs_data` (iterable of ref dicts) and `cats`
            (category id -> category dict).
        L: Level of detail. The overall totals are always displayed; `L >= 1` adds a
            breakdown by `pos_sent_count`; `L >= 2` adds a breakdown by
            (`pos_sent_count`, `neg_sent_count`).

    Returns:
        A tuple `(df, df_agg)`: the per-ref frame, and the last summary table that was
        displayed with its index reset.
    """

    def _summarise(grouped) -> pd.DataFrame:
        # The same four aggregates are reported for every grouping.
        return pd.DataFrame(
            grouped.agg(
                num_refs=("ref_id", "count"),
                sent_count=("sent_count", "sum"),
                total_pos_sents=("pos_sent_count", "sum"),
                total_neg_sents=("neg_sent_count", "sum"),
            )
        )

    rows = []
    for ref in coco.refs_data:
        sents = ref["sentences"]
        category = coco.cats[ref["category_id"]]
        if "supercategory" in category:
            supercategory = category["supercategory"]
        else:
            supercategory = str(category)
        n_positive = sum(1 for s in sents if "exist" not in s or s["exist"])
        n_negative = sum(1 for s in sents if "exist" in s and not s["exist"])
        rows.append(
            {
                "ref_id": ref["ref_id"],
                "ann_id": ref["ann_id"],
                "category_id": ref["category_id"],
                "category": category["name"],
                "supercategory": supercategory,
                "sent_count": len(sents),
                "pos_sent_count": n_positive,
                "neg_sent_count": n_negative,
            }
        )

    df = pd.DataFrame(rows)
    print(
        "pos/neg sentence_counts: ",
        df["pos_sent_count"].sum(),
        df["neg_sent_count"].sum(),
    )

    # Overall totals: a constant group key puts every ref into a single group.
    df_agg = _summarise(df.groupby(lambda _: True))
    display(df_agg)

    # Progressively finer breakdowns, each gated on the requested level of detail.
    breakdowns = (
        (1, ["pos_sent_count"]),
        (2, ["pos_sent_count", "neg_sent_count"]),
    )
    for min_level, keys in breakdowns:
        if L >= min_level:
            df_agg = _summarise(df.groupby(keys))
            display(df_agg)

    return df, df_agg.reset_index()


# print("\nrefcoco:")
# df_refcoco, ref_recoco_agg = show_sentence_counts(ref_coco)
# print("\nR-refcoco:")
# df_rrefcoco, df_rrefcoco_agg = show_sentence_counts(rref_coco)

In [None]:
# Maps each dataset directory name to the `splitBy` values that may be requested for it.
# The first entry of each list is the default `build_refcoco` uses when no `splitBy` is
# given.
# NOTE(review): presumably each value corresponds to a `refs(<splitBy>).p` file in the
# dataset directory (refcocog ships refs(google).p and refs(umd).p) — confirm for the
# other datasets.
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    "refclef": ["berkeley", "unc"],
    "refcoco": ["google"],
    "refcoco+": ["unc"],
    "refcocog": ["google", "umd"],
    "coconegref": ["berkeley"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, splitBy: str = None) -> COCO:
    """Load one referring-expression dataset as a ref-enabled COCO object.

    Args:
        refseg_path: Directory containing one sub-directory per dataset.
        dataset_name: Name of the dataset; must be a key of `VALID_SPLITS`.
        splitBy: Which split definition to load. Defaults to the first entry listed
            for the dataset in `VALID_SPLITS`.

    Returns:
        The COCO object built from `<refseg_path>/<dataset_name>/instances.json`.

    Raises:
        AssertionError: If the dataset name or the split is not listed in
            `VALID_SPLITS`.
    """
    assert dataset_name in VALID_SPLITS, dataset_name
    allowed_splits = VALID_SPLITS[dataset_name]
    if splitBy is not None:
        assert splitBy in allowed_splits
    else:
        splitBy = allowed_splits[0]
    instances_json = refseg_path / dataset_name / "instances.json"
    return COCO(
        instances_json,
        is_ref_dataset=True,
        dataset_name=dataset_name,
        splitBy=splitBy,
    )


# Load every dataset/split combination, display its metadata and sentence counts, and
# collect the per-dataset summaries into a single frame (`df_aggs`).
agg_frames = []
for dataset_name, splits in VALID_SPLITS.items():
    for split in splits:
        print("\n\n")
        print("=" * 220)
        print(f"Dataset: {dataset_name}({split})")
        # build_refcoco takes the dataset root directory as its first argument; without
        # it the dataset name and split are shifted into the wrong parameters.
        ref_coco = build_refcoco(REFSEG_DIR, dataset_name, split)
        df_meta = get_coco_df(ref_coco)
        df_refcoco, df_refcoco_agg = show_sentence_counts(ref_coco, L=1)
        df_refcoco_agg["dataset"] = f"{dataset_name}({split})"
        df_refcoco_agg["ann_count"] = len(ref_coco.anns)
        df_refcoco_agg["img_count"] = len(ref_coco.imgs)
        agg_frames.append(df_refcoco_agg)

df_aggs = pd.concat(agg_frames)
# Make 'dataset' the first column:
df_aggs.insert(0, "dataset", df_aggs.pop("dataset"))

display(df_aggs)

In [None]:
# Collapse the per-`pos_sent_count` rows into one row per dataset. The sentence and ref
# counts are summed; `ann_count` / `img_count` were assigned once per dataset, so `min`
# simply carries that constant through.
df_aggs.groupby("dataset").agg(
    **{
        column: (column, "sum")
        for column in ("num_refs", "sent_count", "total_pos_sents", "total_neg_sents")
    },
    total_ann_count=("ann_count", "min"),
    total_img_count=("img_count", "min"),
)

In [None]:
import spacy
from Levenshtein import distance, hamming

# Ask spaCy to use a GPU when one is available; called before the pipeline is loaded below.
spacy.prefer_gpu()
# NOTE(review): nothing below references this name directly — presumably it is imported
# for its side effects (transformer support for "en_core_web_trf"); confirm.
import spacy_transformers

# English transformer pipeline; `inspect_robust_ref` uses it for noun chunks and displaCy renders.
nlp = spacy.load("en_core_web_trf")


def inspect_robust_ref(
    num_pos: int, num_neg: int, coco: COCO, df: pd.DataFrame, ref_index=None
):
    """Print and visualise one ref with exactly `num_pos` positive and `num_neg` negative sentences.

    Shows the selected row of `df`, the ref's annotation and category, every positive
    sentence (with noun chunks and displaCy renders) and every negative sentence together
    with its token-level Levenshtein and Hamming distance to each positive sentence.

    Args:
        num_pos: Required value of `pos_sent_count`.
        num_neg: Required value of `neg_sent_count`.
        coco: Object providing `refs`, `ref_to_ann` and `cats`.
        df: Per-ref frame with `ref_id`, `pos_sent_count` and `neg_sent_count` columns
            (as returned by `show_sentence_counts`).
        ref_index: Position of the ref within the matching rows. When None or negative a
            random matching ref is chosen and its index is printed.

    Raises:
        IndexError: If no ref matches the requested counts, or `ref_index` is out of range.
    """
    print("=" * 220)
    df_ref_example = df[(df.pos_sent_count == num_pos) & (df.neg_sent_count == num_neg)]
    if len(df_ref_example) == 0:
        # Fail with an explicit message rather than an opaque positional-index error.
        raise IndexError(
            f"No refs with pos_sent_count == {num_pos} and neg_sent_count == {num_neg}"
        )
    if ref_index is None or ref_index < 0:
        ref_index = int(np.random.randint(len(df_ref_example)))
        print("ref_index: ", ref_index)
    selected_row = df_ref_example.iloc[ref_index]
    display(pd.DataFrame(selected_row))

    example_ref = coco.refs[selected_row["ref_id"]]

    # Show the category and ann:
    ann = coco.ref_to_ann[example_ref["ref_id"]]
    print("ann: ", ann)
    print("category: ", coco.cats[ann["category_id"]])
    # Show the ref (without its bulky sentence list):
    ref_display = {k: v for k, v in example_ref.items() if k != "sentences"}
    print("ref: ", ref_display)

    def _is_positive(sentence) -> bool:
        # Same convention as show_sentence_counts: a sentence without an "exist" key is
        # positive, so datasets lacking the flag do not raise KeyError here.
        return "exist" not in sentence or bool(sentence["exist"])

    def _for_display(sentence) -> dict:
        # Drop the verbose fields before printing; the original sentence is untouched.
        return {k: v for k, v in sentence.items() if k not in ("tokens", "raw")}

    # Show sentences:
    pos_sents = [s for s in example_ref["sentences"] if _is_positive(s)]
    neg_sents = [s for s in example_ref["sentences"] if not _is_positive(s)]
    print("\npositive sentences:")
    for s in pos_sents:
        print(_for_display(s))
        doc = nlp(s["sent"])
        print("noun chunks: ", list(doc.noun_chunks))
        spacy.displacy.render(doc, style="dep")
        spacy.displacy.render(doc, style="span")

    print("\nnegative sentences:")
    for s in neg_sents:
        # Normalise by the negative sentence's token count; `or 1` avoids a
        # ZeroDivisionError when the token list is empty.
        num_tokens = len(s["tokens"]) or 1
        print("neg_sent: ", _for_display(s))
        print("noun chunks: ", list(nlp(s["sent"]).noun_chunks))
        for ps in pos_sents:
            distances = [distance(ps["tokens"], s["tokens"])]
            print(
                f"\tLevenstein distances from pos_sent '{ps['sent_id']}': ",
                distances,
                [d / num_tokens for d in distances],
            )
            # NOTE(review): some Levenshtein releases require equal-length inputs for
            # hamming() — confirm the installed version handles differing lengths.
            distances = [hamming(ps["tokens"], s["tokens"])]
            print(
                f"\tHamming    distances from pos_sent '{ps['sent_id']}': ",
                distances,
                [d / num_tokens for d in distances],
            )


# The calls below need the R-refcoco dataset and its per-ref sentence counts. Neither
# name is assigned by any live cell, so build them here to keep the notebook runnable
# from a fresh kernel.
rref_coco = build_refcoco(REFSEG_DIR, "R-refcoco")
df_rrefcoco, df_rrefcoco_agg = show_sentence_counts(rref_coco)

## Look at examples with 2 positive and 10 (then 33) negative sentences, to make sure the above counts make sense:
inspect_robust_ref(2, 10, rref_coco, df_rrefcoco)
inspect_robust_ref(2, 33, rref_coco, df_rrefcoco)

In [None]:
# Spot-check a randomly chosen ref with 1 positive and 11 negative sentences.
# Requires `rref_coco` and `df_rrefcoco` to already exist in the kernel.
inspect_robust_ref(1, 11, rref_coco, df_rrefcoco)

In [None]:
# % pip install spacy[cuda11x,transformers]
# %python -m spacy download en_core_web_trf
# Shouldn't need this since we added transformers in the first line:
### %pip install spacy-transformers