In [15]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
%pip list | grep json

fastjsonschema                    2.18.0
json5                             0.9.14
jsonpointer                       2.4
jsons                             1.6.3
jsonschema                        4.19.1
jsonschema-specifications         2023.7.1
pysimdjson                        5.0.2
python-json-logger                2.0.7
python-lsp-jsonrpc                1.1.1
ujson                             5.8.0
Note: you may need to restart the kernel to use updated packages.


In [17]:
import argparse
import copy
import csv
import decimal
import os
import typing
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import simdjson as json
from IPython.display import display
from pycocotools.coco import COCO
from pycocotools.helpers import CocoClassDistHelper

In [18]:
%ls -lah "/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/refcocog/"

total 96M
drwxr-xr-x  2 gbiamby users    5 Nov 20  2016  [0m[01;34m.[0m/
drwxr-xr-x 17 gbiamby users   26 Oct 31 05:41  [01;34m..[0m/
-rw-r--r--  1 gbiamby users 119M Feb 11  2016  instances.json
-rw-r--r--  1 gbiamby users  33M Feb 16  2016 'refs(google).p'
-rw-r--r--  1 gbiamby users  33M Nov 20  2016 'refs(umd).p'


In [19]:
# Root directory that holds one sub-directory per referring-expression dataset
# (each containing an "instances.json").
# Set the REFSEG_DIR environment variable to point the notebook at another copy of the
# data; when it is unset the original hard-coded location is used.
REFSEG_DIR = Path(
    os.environ.get(
        "REFSEG_DIR",
        "/home/gbiamby/proj/geo-llm-ret/lib/neg_refer_llm/dataset/refer_seg/",
    )
)
# dataset_json = Path(REFSEG_DIR / "R-refcocog/instances.json")
# coco_dist = CocoClassDistHelper(dataset_json)


### refcocog AKA G-Ref has two versions, sort of
Quoted from the G-Ref (Google Refexp) paper:

> While we were collecting our dataset, we learned that Tamara Berg had independently applied her ReferIt game [27] to the MSCOCO dataset to generate expressions for 50,000 objects from 19,994 images. She kindly shared her data (named as UNC-Ref-COCO dataset) with us. For brevity, we call our Google Refexp dataset as G-Ref and the UNC-Ref-COCO as UNC-ref. We report results on both datasets in this paper. However, due to differences in our collection methodologies, we have found that the descriptions in the two overlapped datasets exhibit significant qualitative differences, with descriptions in the UNC-Ref dataset tending to be more concise and to contain less flowery language than our descriptions. More specifically, the average lengths of expressions from our dataset and UNC-Ref are 8.43 and 3.61 respectively. And the size of the word dictionaries (keeping only words appearing more than 3 times) from our dataset and UNC-Ref are 4849 and 2890 respectively. See Figure 3 for some visual comparisons.

In [31]:
# Render floats with thousands separators (e.g. 1,234.5) when DataFrames are displayed.
pd.options.display.float_format = "{:,}".format
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)


def _sample_lengths(items, limit=5):
    """Return ``len()`` of the first ``limit`` items, using None for items without a length."""
    return [len(item) if hasattr(item, "__len__") else None for item in items[:limit]]


def get_property_details(obj):
    """Describe the nesting structure of ``obj`` as a short human-readable string.

    Only the *first* element at each nesting level is inspected, so the result is a
    sample-based description, not a validation of the whole container.

    Args:
        obj: Usually a dict or a list. Anything else is reported as "UNKNOWN".

    Returns:
        A string such as ``"dict->list->dict:dict_keys([...])"``. Empty containers are
        reported explicitly ("EMPTY ...") instead of raising ``IndexError``.
    """
    if isinstance(obj, dict):
        if not obj:
            return "EMPTY dict"
        first_value = next(iter(obj.values()))

        if isinstance(first_value, list):
            if not first_value:
                return "a: dict->list->EMPTY"
            first_item = first_value[0]
            if isinstance(first_item, dict):
                return "dict->list->dict:" + str(first_item.keys())
            summary = "a: dict->list->" + str(type(first_item))
            # Lengths are sampled from the first few *list items*; they are only
            # reported when the items are sized (ints, for instance, are not).
            if hasattr(first_item, "__len__"):
                summary += ", lengths: " + str(_sample_lengths(first_value))
            return summary

        if isinstance(first_value, dict):
            if not first_value:
                return "b: dict->dict->EMPTY"
            second_value = next(iter(first_value.values()))
            if isinstance(second_value, list):
                if not second_value:
                    return "b: dict->dict->list[EMPTY]"
                inner_type = str(type(second_value[0]))
                if isinstance(second_value[0], list):
                    return (
                        "b: dict->dict->list["
                        + inner_type
                        + "]"
                        + ", lengths: "
                        + str(_sample_lengths(second_value))
                    )
                return "b: dict->dict->list[" + inner_type + "]"
            return (
                "b: dict->dict->["
                + str(type(second_value))
                + "]"
                + str(first_value.keys())
            )

        return "c: dict->UNKNOWN " + str(type(first_value)) + "]"

    if isinstance(obj, list):
        if not obj:
            return "EMPTY list"
        first_item = obj[0]
        if isinstance(first_item, dict):
            if not first_item:
                return "d: list->dict EMPTY"
            second_value = next(iter(first_item.values()))
            return (
                "d: list->dict" + str(type(second_value)) + "]" + str(first_item.keys())
            )
        return "UNKNOWN"

    # Neither a dict nor a list: say so explicitly rather than returning None.
    return "UNKNOWN"


def get_coco_df(coco: COCO) -> pd.DataFrame:
    """Summarise the lookup tables of a referring-expression ``COCO`` object.

    Args:
        coco: Object exposing the attributes listed in ``properties`` below
            (``cats``, ``imgs``, ``anns``, ``refs_data``, ...). Each must support ``len()``.

    Returns:
        A DataFrame with one row per attribute and the columns ``property`` (row label),
        ``count`` (``len()`` of the attribute), ``python_type`` and ``dict_keys``
        (the structure description produced by ``get_property_details``).
    """
    # (row label, attribute name on ``coco``); only "images" differs from its attribute.
    properties = [
        ("cats", "cats"),
        ("images", "imgs"),
        ("anns", "anns"),
        ("refs_data", "refs_data"),
        ("refs", "refs"),
        ("img_to_refs", "img_to_refs"),
        ("cat_to_refs", "cat_to_refs"),
        ("ann_to_ref", "ann_to_ref"),
        ("ref_to_ann", "ref_to_ann"),
        ("sents", "sents"),
        ("sent_to_ref", "sent_to_ref"),
        ("sent_to_tokens", "sent_to_tokens"),
    ]
    containers = [getattr(coco, attr) for _, attr in properties]
    df_meta = pd.DataFrame(
        {
            "property": [label for label, _ in properties],
            "count": [len(container) for container in containers],
            "python_type": [str(type(container)) for container in containers],
            "dict_keys": [get_property_details(container) for container in containers],
        }
    )
    # The frame used to be built and then dropped, so callers received None.
    return df_meta


# df_meta_ref = get_coco_df(ref_coco)
# df_meta_rref = get_coco_df(rref_coco)

In [60]:
def show_sentence_counts(coco: COCO, L=3, visualize=True):
    """Count positive / negative sentences per ref and aggregate the counts.

    A sentence counts as negative only when it has an ``"exist"`` key with a falsy value;
    sentences without that key are treated as positive.

    Args:
        coco: Object exposing ``refs_data`` (iterable of ref dicts with ``ref_id``,
            ``image_id``, ``ann_id``, ``category_id`` and ``sentences``), ``cats``
            (category_id -> category dict) and ``anns`` (ann_id -> annotation dict).
        L: Aggregation depth. The overall totals are always computed; ``L >= 1`` adds a
            breakdown by ``pos_sent_count`` and ``L >= 2`` a breakdown by
            ``(pos_sent_count, neg_sent_count)``.
        visualize: When true, ``display`` every aggregate that is computed.

    Returns:
        ``(df, df_agg, df_anns)``: the per-ref counts, the deepest aggregate that was
        computed (with its index reset), and the annotations with a ``category`` name
        column added and the ``segmentation`` / ``bbox`` columns removed.
    """
    ref_rows = []
    for ref in coco.refs_data:
        sentences = ref["sentences"]
        category = coco.cats[ref["category_id"]]
        # Missing "exist" means positive, so negatives are exactly the remainder.
        num_pos = sum(1 for s in sentences if s.get("exist", True))
        ref_rows.append(
            {
                "ref_id": ref["ref_id"],
                "image_id": ref["image_id"],
                "ann_id": ref["ann_id"],
                "category_id": ref["category_id"],
                "category": category["name"],
                # Fall back to the whole category dict when no supercategory is recorded.
                "supercategory": category["supercategory"]
                if "supercategory" in category
                else str(category),
                "sent_count": len(sentences),
                "pos_sent_count": num_pos,
                "neg_sent_count": len(sentences) - num_pos,
            }
        )
    df = pd.DataFrame(ref_rows)

    # Annotations: add the category name and drop the bulky geometry columns if present.
    df_anns = pd.DataFrame(list(coco.anns.values()))
    df_anns["category"] = df_anns.category_id.apply(
        lambda cat_id: coco.cats[cat_id]["name"]
    )
    df_anns = df_anns.drop(columns=["segmentation", "bbox"], errors="ignore")

    print("pos/neg sentence_counts: ", df.pos_sent_count.sum(), df.neg_sent_count.sum())

    agg_spec = {
        "num_refs": ("ref_id", "count"),
        "sent_count": ("sent_count", "sum"),
        "total_pos_sents": ("pos_sent_count", "sum"),
        "total_neg_sents": ("neg_sent_count", "sum"),
    }
    # Level 0 puts every row into a single group (overall totals); deeper levels
    # group by the sentence-count columns.
    groupings = [
        lambda _row_label: True,
        ["pos_sent_count"],
        ["pos_sent_count", "neg_sent_count"],
    ]
    df_agg = None
    for level, by in enumerate(groupings):
        if level and L < level:
            break
        df_agg = pd.DataFrame(df.groupby(by).agg(**agg_spec))
        if visualize:
            display(df_agg)
    return df, df_agg.reset_index(), df_anns


# print("\nrefcoco:")
# df_refcoco, ref_recoco_agg = show_sentence_counts(ref_coco)
# print("\nR-refcoco:")
# df_rrefcoco, df_rrefcoco_agg = show_sentence_counts(rref_coco)

In [91]:
# Dataset directory name -> list of ``split_by`` values to analyse for that dataset.
# The first entry of each list is the default used by ``build_refcoco``.
#
# refcocog: only "google" is listed (instead of ["google", "umd"]) because google and umd
# seem to be identical? They have the same aggregate stats.
# Not included: "coconegref": ["berkeley"]
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    "refclef": ["berkeley", "unc"],
    "refcoco": ["google"],
    "refcoco+": ["unc"],
    "refcocog": ["google"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, split_by: str = None) -> COCO:
    """Construct the ref-dataset ``COCO`` object for one dataset / split combination.

    Args:
        refseg_path: Directory containing one sub-directory per dataset.
        dataset_name: Key of ``VALID_SPLITS``; also the sub-directory name.
        split_by: One of ``VALID_SPLITS[dataset_name]``. ``None`` selects the first
            entry of that list.

    Returns:
        The ``COCO`` instance built from ``<refseg_path>/<dataset_name>/instances.json``.
    """
    assert dataset_name in VALID_SPLITS, dataset_name
    allowed_splits = VALID_SPLITS[dataset_name]
    if split_by is not None:
        assert split_by in allowed_splits
    else:
        split_by = allowed_splits[0]

    instances_file = refseg_path / dataset_name / "instances.json"
    return COCO(
        instances_file,
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )


# Per-dataset frames are gathered in lists and concatenated once after the loop, so the
# df_* names below are only ever bound to DataFrames.
ann_frames = []
ref_frames = []
agg_frames = []
for dataset_name, splits in VALID_SPLITS.items():
    for split in splits:
        dataset_label = f"{dataset_name}({split})"
        print("\n\n")
        print("=" * 220)
        print(f"Dataset: {dataset_label}")
        ref_coco = build_refcoco(REFSEG_DIR, dataset_name, split)
        df_meta = get_coco_df(ref_coco)  # not used further in this cell

        df_refcoco, df_refcoco_agg, df_ann = show_sentence_counts(
            ref_coco, L=1, visualize=False
        )
        # Dataset-level totals, repeated on every row of the aggregate.
        df_refcoco_agg["ann_count"] = len(ref_coco.anns)
        df_refcoco_agg["img_count"] = len(ref_coco.imgs)

        # Tag every frame with its dataset label and make 'dataset' the first column.
        for frame, bucket in (
            (df_refcoco_agg, agg_frames),
            (df_refcoco, ref_frames),
            (df_ann, ann_frames),
        ):
            frame["dataset"] = dataset_label
            frame.insert(0, "dataset", frame.pop("dataset"))
            bucket.append(frame)


df_refs = pd.concat(ref_frames)
df_aggs = pd.concat(agg_frames)
df_anns = pd.concat(ann_frames)

display(df_anns)
display(df_refs)
display(df_aggs)




Dataset: R-refcoco(unc)
Loading refs from '/shared/patrickwu/dataset/refer_seg/R-refcoco/refs(unc).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=3.31s)
creating index...
index created!
pos/neg sentence_counts:  142210 199869



Dataset: R-refcoco+(unc)
Loading refs from '/shared/patrickwu/dataset/refer_seg/R-refcoco+/refs(unc).p'
Loaded 49856 refs
loading annotations into memory...
Done (t=3.37s)
creating index...
index created!
pos/neg sentence_counts:  141564 196783



Dataset: R-refcocog(umd)
Loading refs from '/shared/patrickwu/dataset/refer_seg/R-refcocog/refs(umd).p'
Loaded 49822 refs
loading annotations into memory...
Done (t=5.86s)
creating index...
index created!
pos/neg sentence_counts:  95010 159806



Dataset: refclef(berkeley)
Loading refs from '/shared/gbiamby/data/refer_seg/refclef/refs(berkeley).p'
Loaded 99296 refs
loading annotations into memory...
Done (t=2.03s)
creating index...
index created!
pos/neg sentence_counts:  130364 0



Dataset: refcl

Unnamed: 0,dataset,area,iscrowd,image_id,category_id,id,category,mask_name
0,R-refcoco(unc),197.29899999999986,0.0,98304,18,3007,dog,
1,R-refcoco(unc),27152.935449999997,0.0,98304,63,99893,couch,
2,R-refcoco(unc),11087.449149999997,0.0,98304,62,108703,chair,
3,R-refcoco(unc),4841.5902000000015,0.0,98304,63,115415,couch,
4,R-refcoco(unc),19175.430549999997,0.0,98304,63,116865,couch,
...,...,...,...,...,...,...,...,...
208955,refcocog(google),566.7122999999999,0.0,393207,1,1202801,person,
208956,refcocog(google),729.3624500000001,0.0,393207,31,1836790,handbag,
208957,refcocog(google),88909.53885,0.0,524286,73,1099077,laptop,
208958,refcocog(google),59106.64675000001,0.0,524286,76,1116665,keyboard,


Unnamed: 0,dataset,ref_id,image_id,ann_id,category_id,category,supercategory,sent_count,pos_sent_count,neg_sent_count
0,R-refcoco(unc),0,581857,1719310,1,person,person,6,3,3
1,R-refcoco(unc),1,581857,463958,1,person,person,6,3,3
2,R-refcoco(unc),2,581839,495152,1,person,person,6,3,3
3,R-refcoco(unc),3,581839,485695,1,person,person,6,3,3
4,R-refcoco(unc),4,581789,453177,1,person,person,6,3,3
...,...,...,...,...,...,...,...,...,...,...
49817,refcocog(google),49817,287303,135604,3,car,vehicle,1,1,0
49818,refcocog(google),49818,355159,2166645,1,person,person,2,2,0
49819,refcocog(google),49819,400744,56109,19,horse,animal,2,2,0
49820,refcocog(google),49820,82135,584687,22,elephant,animal,2,2,0


Unnamed: 0,dataset,pos_sent_count,num_refs,sent_count,total_pos_sents,total_neg_sents,ann_count,img_count
0,R-refcoco(unc),1,2,13,2,11,196771,19994
1,R-refcoco(unc),2,8159,43210,16318,26892,196771,19994
2,R-refcoco(unc),3,41473,295620,124419,171201,196771,19994
3,R-refcoco(unc),4,360,3169,1440,1729,196771,19994
4,R-refcoco(unc),5,5,55,25,30,196771,19994
...,...,...,...,...,...,...,...,...
3,refcoco+(unc),4,521,2084,2084,0,196737,19992
4,refcoco+(unc),5,7,35,35,0,196737,19992
0,refcocog(google),1,4714,4714,4714,0,208960,25799
1,refcocog(google),2,45028,90056,90056,0,208960,25799


In [92]:
# Collapse the per-pos_sent_count rows of df_aggs into one row per dataset.
# ann_count / img_count hold the same value on every row of a dataset, so "min" simply
# picks that value.
dataset_totals_spec = {
    "num_refs": ("num_refs", "sum"),
    "sent_count": ("sent_count", "sum"),
    "total_pos_sents": ("total_pos_sents", "sum"),
    "total_neg_sents": ("total_neg_sents", "sum"),
    "total_ann_count": ("ann_count", "min"),
    "total_img_count": ("img_count", "min"),
}
df_aggs.groupby("dataset").agg(**dataset_totals_spec)

Unnamed: 0_level_0,num_refs,sent_count,total_pos_sents,total_neg_sents,total_ann_count,total_img_count
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
R-refcoco(unc),50000,342079,142210,199869,196771,19994
R-refcoco+(unc),49856,338347,141564,196783,196737,19992
R-refcocog(umd),49822,254816,95010,159806,208960,25799
refclef(berkeley),99296,130364,130364,0,99523,19997
refclef(unc),99296,130364,130364,0,99523,19997
refcoco(google),50000,142210,142210,0,196771,19994
refcoco+(unc),49856,141564,141564,0,196737,19992
refcocog(google),49822,95010,95010,0,208960,25799


In [93]:
# Show the concatenated annotation table (one row per annotation, tagged with its dataset).
display(df_anns)

Unnamed: 0,dataset,area,iscrowd,image_id,category_id,id,category,mask_name
0,R-refcoco(unc),197.29899999999986,0.0,98304,18,3007,dog,
1,R-refcoco(unc),27152.935449999997,0.0,98304,63,99893,couch,
2,R-refcoco(unc),11087.449149999997,0.0,98304,62,108703,chair,
3,R-refcoco(unc),4841.5902000000015,0.0,98304,63,115415,couch,
4,R-refcoco(unc),19175.430549999997,0.0,98304,63,116865,couch,
...,...,...,...,...,...,...,...,...
208955,refcocog(google),566.7122999999999,0.0,393207,1,1202801,person,
208956,refcocog(google),729.3624500000001,0.0,393207,31,1836790,handbag,
208957,refcocog(google),88909.53885,0.0,524286,73,1099077,laptop,
208958,refcocog(google),59106.64675000001,0.0,524286,76,1116665,keyboard,


In [94]:
# One row per (dataset, image), computed from the refs of that image.
# NOTE: img_cats uses "count", so it is the number of ref rows with a category_id for the
# image, not a number of categories; img_unique_cats is the distinct-category count.
refs_by_image = df_refs.groupby(["dataset", "image_id"])
df_img_ann = refs_by_image.agg(
    img_anns=("ann_id", "nunique"),
    img_cats=("category_id", "count"),
    img_unique_cats=("category_id", "nunique"),
)
df_img_ann = df_img_ann.sort_values(["dataset", "img_unique_cats"]).reset_index()
display(df_img_ann)

Unnamed: 0,dataset,image_id,img_anns,img_cats,img_unique_cats
0,R-refcoco(unc),72,2,2,1
1,R-refcoco(unc),110,4,4,1
2,R-refcoco(unc),113,3,3,1
3,R-refcoco(unc),144,3,3,1
4,R-refcoco(unc),154,2,2,1
...,...,...,...,...,...
171559,refcocog(google),226658,6,6,5
171560,refcocog(google),305772,5,5,5
171561,refcocog(google),318638,8,8,5
171562,refcocog(google),365659,7,7,5


In [95]:
# Show the concatenated per-ref sentence counts (one row per ref, tagged with its dataset).
display(df_refs)

Unnamed: 0,dataset,ref_id,image_id,ann_id,category_id,category,supercategory,sent_count,pos_sent_count,neg_sent_count
0,R-refcoco(unc),0,581857,1719310,1,person,person,6,3,3
1,R-refcoco(unc),1,581857,463958,1,person,person,6,3,3
2,R-refcoco(unc),2,581839,495152,1,person,person,6,3,3
3,R-refcoco(unc),3,581839,485695,1,person,person,6,3,3
4,R-refcoco(unc),4,581789,453177,1,person,person,6,3,3
...,...,...,...,...,...,...,...,...,...,...
49817,refcocog(google),49817,287303,135604,3,car,vehicle,1,1,0
49818,refcocog(google),49818,355159,2166645,1,person,person,2,2,0
49819,refcocog(google),49819,400744,56109,19,horse,animal,2,2,0
49820,refcocog(google),49820,82135,584687,22,elephant,animal,2,2,0


In [96]:
# TODO: these parts are unfinished.
# df_imgs still has to be gathered properly (by left joining to df_refs) to count
# refs_per_image and sents_per_image correctly.
# As written, every figure below is derived from df_refs (df_img_ann is itself built from
# df_refs), so presumably images without any ref never appear here — verify.
df_ref_img = (
    df_refs.merge(df_img_ann, how="inner", on=["dataset", "image_id"])
    .groupby(["dataset", "image_id"])
    .agg(
        total_anns=("ann_id", "nunique"),
        total_refs=("ref_id", "nunique"),
        total_cats=("category_id", "count"),
        total_unique_cats=("category_id", "nunique"),
        total_sents=("sent_count", "sum"),
        pos_sent_count=("pos_sent_count", "sum"),
        neg_sent_count=("neg_sent_count", "sum"),
        # The img_* columns are constant within an image, so "max" just carries them over.
        img_anns=("img_anns", "max"),
        img_cats=("img_cats", "max"),
        img_unique_cats=("img_unique_cats", "max"),
    )
    .sort_values(["dataset", "img_unique_cats"])
)
display(df_ref_img)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_anns,total_refs,total_cats,total_unique_cats,total_sents,pos_sent_count,neg_sent_count,img_anns,img_cats,img_unique_cats
dataset,image_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
R-refcoco(unc),72,2,2,2,1,10,5,5,2,2,1
R-refcoco(unc),110,4,4,4,1,22,11,11,4,4,1
R-refcoco(unc),113,3,3,3,1,24,12,12,3,3,1
R-refcoco(unc),144,3,3,3,1,22,11,11,3,3,1
R-refcoco(unc),154,2,2,2,1,28,8,20,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...
refcocog(google),226658,6,6,6,5,12,12,0,6,6,5
refcocog(google),305772,5,5,5,5,10,10,0,5,5,5
refcocog(google),318638,8,8,8,5,15,15,0,8,8,5
refcocog(google),365659,7,7,7,5,14,14,0,7,7,5


In [108]:

# Two-decimal float formatting for the summary table; column contents are never truncated.
pd.set_option("display.float_format", "{:,.2f}".format)
pd.set_option("display.max_colwidth", None)

# NOTE(review): df_ref_img has one row per (dataset, image_id), so the two "*_per_object"
# means below are averages per image, despite their names — confirm the intended label.
summary_spec = dict(
    images=("image_id", "nunique"),
    total_refs=("total_refs", "sum"),
    refs_per_object=("total_refs", "mean"),
    total_sents=("total_sents", "sum"),
    sents_per_object=("total_sents", "mean"),
    pos_sent_count=("pos_sent_count", "sum"),
    neg_sent_count=("neg_sent_count", "sum"),
    min_cats_per_img=("img_unique_cats", "min"),
    max_cats_per_img=("img_unique_cats", "max"),
    mean_cats_per_img=("img_unique_cats", "mean"),
)
df_summary = df_ref_img.reset_index().groupby(["dataset"]).agg(**summary_spec)

display(df_summary)

Unnamed: 0_level_0,images,total_refs,refs_per_object,total_sents,sents_per_object,pos_sent_count,neg_sent_count,min_cats_per_img,max_cats_per_img,mean_cats_per_img
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
R-refcoco(unc),19994,50000,2.5,342079,17.11,142210,199869,1,4,1.07
R-refcoco+(unc),19992,49856,2.49,338347,16.92,141564,196783,1,4,1.07
R-refcocog(umd),25799,49822,1.93,254816,9.88,95010,159806,1,6,1.17
refclef(berkeley),19997,99296,4.97,130364,6.52,130364,0,1,18,4.12
refclef(unc),19997,99296,4.97,130364,6.52,130364,0,1,18,4.12
refcoco(google),19994,50000,2.5,142210,7.11,142210,0,1,4,1.07
refcoco+(unc),19992,49856,2.49,141564,7.08,141564,0,1,4,1.07
refcocog(google),25799,49822,1.93,95010,3.68,95010,0,1,6,1.17


In [28]:
# Back-of-the-envelope cost estimate for processing a set of images.
num_images = 50000
tokens_per_image = 500
# Price applied per 1,000 tokens (the token count is scaled by 1/1000 before multiplying).
token_price = 3.0 / 100.0

total_tokens = num_images * tokens_per_image
price = total_tokens * (1.0 / 1000) * token_price
print(price)

750.0


In [None]:
import spacy
from Levenshtein import distance, hamming

# Ask spaCy to use a GPU when one is available; called before the pipeline is loaded.
spacy.prefer_gpu()
# NOTE(review): imported but never referenced by name in this cell — presumably needed so
# the transformer pipeline components are registered; confirm before removing.
import spacy_transformers

# English transformer pipeline; inspect_robust_ref uses it to parse each sentence.
nlp = spacy.load("en_core_web_trf")


def inspect_robust_ref(
    num_pos: int, num_neg: int, coco: COCO, df: pd.DataFrame, ref_index=None
):
    """Print one ref that has exactly ``num_pos`` positive and ``num_neg`` negative sentences.

    Shows the matching row, its annotation and category, the ref itself, every positive
    sentence (with noun chunks and displaCy renderings) and every negative sentence
    together with its edit distances to each positive sentence.

    Args:
        num_pos: Required value of ``pos_sent_count``.
        num_neg: Required value of ``neg_sent_count``.
        coco: Object exposing ``refs``, ``ref_to_ann`` and ``cats``.
        df: Per-ref frame with ``ref_id``, ``pos_sent_count`` and ``neg_sent_count`` columns.
        ref_index: Position within the matching rows; ``None`` or a negative value picks
            one at random.

    Raises:
        IndexError: If no ref matches the requested counts, or ``ref_index`` is out of range.
    """
    print("=" * 220)
    df_ref_example = df[(df.pos_sent_count == num_pos) & (df.neg_sent_count == num_neg)]
    if df_ref_example.empty:
        raise IndexError(
            f"no ref with {num_pos} positive and {num_neg} negative sentences"
        )
    if ref_index is None or ref_index < 0:
        ref_index = int(np.random.randint(len(df_ref_example)))
        print("ref_index: ", ref_index)
    ref_row = df_ref_example.iloc[ref_index]
    display(pd.DataFrame(ref_row))

    example_ref = coco.refs[ref_row["ref_id"]]

    # Show the category and ann:
    ann = coco.ref_to_ann[example_ref["ref_id"]]
    print("ann: ", ann)
    print("category: ", coco.cats[ann["category_id"]])
    # Show the ref (without its sentences, which are printed one by one below):
    ref_display = {k: v for k, v in example_ref.items() if k != "sentences"}
    print("ref: ", ref_display)

    def _without_bulky_keys(sentence):
        # Printable copy of a sentence without its token list and raw text.
        return {k: v for k, v in sentence.items() if k not in ("tokens", "raw")}

    # A sentence without an "exist" key counts as positive, matching show_sentence_counts.
    sentences = example_ref["sentences"]
    pos_sents = [s for s in sentences if s.get("exist", True)]
    neg_sents = [s for s in sentences if not s.get("exist", True)]

    print("\npositive sentences:")
    for s in pos_sents:
        print(_without_bulky_keys(s))
        doc = nlp(s["sent"])
        print("noun chunks: ", list(doc.noun_chunks))
        spacy.displacy.render(doc, style="dep")
        spacy.displacy.render(doc, style="span")

    print("\nnegative sentences:")
    for s in neg_sents:
        num_tokens = len(s["tokens"])
        print("neg_sent: ", _without_bulky_keys(s))
        print("noun chunks: ", list(nlp(s["sent"]).noun_chunks))

        def _normalized(values):
            # Distances relative to the negative sentence's token count; NaN when it
            # has no tokens (instead of a ZeroDivisionError).
            return [d / num_tokens if num_tokens else float("nan") for d in values]

        for ps in pos_sents:
            distances = [distance(ps["tokens"], s["tokens"])]
            print(
                f"\tLevenstein distances from pos_sent '{ps['sent_id']}': ",
                distances,
                _normalized(distances),
            )
            # NOTE(review): depending on the installed Levenshtein version, hamming() may
            # reject inputs of different lengths — verify.
            distances = [hamming(ps["tokens"], s["tokens"])]
            print(
                f"\tHamming    distances from pos_sent '{ps['sent_id']}': ",
                distances,
                _normalized(distances),
            )


## Look at examples with 2 positive and 10 or 33 negative sentences, to make sure the above counts make sense.
# NOTE(review): rref_coco and df_rrefcoco are not assigned by any live code in the visible
# part of this notebook (only in commented-out lines), so these calls presumably rely on
# state from an earlier session — confirm.
inspect_robust_ref(2, 10, rref_coco, df_rrefcoco)
inspect_robust_ref(2, 33, rref_coco, df_rrefcoco)

In [None]:
# Inspect a ref with 1 positive and 11 negative sentences.
# NOTE(review): rref_coco / df_rrefcoco are not assigned by any live code in the visible
# part of this notebook — confirm where they come from.
inspect_robust_ref(1, 11, rref_coco, df_rrefcoco)

In [None]:
# % pip install spacy[cuda11x,transformers]
# %python -m spacy download en_core_web_trf
# Shouldn't need this since we added transformers in the first line:
### %pip install spacy-transformers