## neg_refcocov001.ipynb

Create a COCO formatted dataset that uses a simplistic method to create false-premise referring expressions, along with correcting expressions. The method is to swap nouns with categories from a sibling class in the same COCO supercategory.

The referring expressions follow same format as refcoco/refcocog/refcoco+/R-refcoco/etc, i.e., a coco formatted json file, accompanied by a file with a `.p` extension, which contains the referring expression. The `.p` file is a python pickle file. These datasets can be loaded using the common `refer.py`, or the `COCO` class in `github.com/GiscardBiamby/cocobetter.git`.

### Assumptions:

- Positive samples acreated with a template: `The <class_name>`
- Negative samples are what we call the false premise referring expressions. I.e., expressions that specify objects that don't exist in the image.
- Each negative sample has a "parent", which is the possitive sample (a referring expression).
- In this simplified dataset, the positive samples are just class names, and the negative samples are some other COCO class name that is swapped in for the positive one.
- Positive/negative pairs are only created for unambiguous cases (ignoring noisy dataset issues, like unlabelled objects). This means we only create positive and negative referring expressions for "human" if the image contains exactly one human ground truth annotation. If there are two or more humans, we cannot easily create a referring expression that refers to one specific human, so we avoid trying to do that.

### Example: 

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%pip list | grep json

fastjsonschema                    2.18.0
json5                             0.9.14
jsonpointer                       2.4
jsons                             1.6.3
jsonschema                        4.19.1
jsonschema-specifications         2023.7.1
pysimdjson                        5.0.2
python-json-logger                2.0.7
python-lsp-jsonrpc                1.1.1
ujson                             5.8.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import argparse
import copy
import csv
import decimal
import json
import os
import typing
from collections import Counter, defaultdict
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL.Image as pil_img
import seaborn as sns
import simdjson as json
from IPython.display import display
from PIL import Image
from pycocotools.coco import COCO, Ann, Cat, Image, Ref
from pycocotools.helpers import CocoClassDistHelper, CocoJsonBuilder
from pycocotools.helpers.coco_builder import COCOShrinker
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
COCO_DIR = Path("/shared/gbiamby/data/coco")
IMG_DIR = COCO_DIR / "val2017"

In [None]:
# dataset_json = Path(COCO_DIR / "annotations" / "instances_val2017.json")
# coco_dist = CocoClassDistHelper(dataset_json)
# coco = COCO(dataset_json)
# for key in list(coco.anns.keys()):
#     ann = coco.anns[key]
#     if "is_neg" not in ann:
#         ann["is_neg"] = False
#     cat = coco.cats[ann["category_id"]]
#     ann["supercategory"] = cat["supercategory"]
#     ann["cat_name"] = cat["name"]

In [5]:
pd.options.display.float_format = "{:,}".format
pd.set_option("display.max_colwidth", None)

In [None]:
# import spacy

# spacy.prefer_gpu()
# import spacy_transformers

# nlp = spacy.load("en_core_web_trf")

In [None]:
# doc = nlp("This is a sentence.")
# print("noun chunks: ", list(doc.noun_chunks))
# print([(w.text, w.pos_) for w in doc])

In [None]:
# def get_img_info(img_dir: Path, img: dict):
#     img_path = img_dir / img["file_name"]
#     img = deepcopy(img)
#     result = {
#         "filename": img_path.name,
#         "suffix": img_path.suffix,
#         "img_dim": np.asarray(pil_img.open(img_path).convert("L")).shape,
#         "image_height": np.asarray(pil_img.open(img_path).convert("L")).shape[0],
#         "image_width": np.asarray(pil_img.open(img_path).convert("L")).shape[1],
#     }
#     result["area"] = result["img_dim"][0] * result["img_dim"][1]
#     img.update(result)
#     return img


# print(f"Found {len(coco.imgs)} images to process.")
# tqdm._instances.clear()
# df_imgs = pd.DataFrame(
#     get_img_info(IMG_DIR, img) for img in tqdm(list(coco.imgs.values()))
# )
# display(df_imgs)

In [None]:
# df_anns = pd.DataFrame(coco.anns.values()).drop(columns=["segmentation", "bbox"])
# display(df_anns)

In [None]:
# df_img_cat_counts = (
#     df_anns.groupby(["image_id", "supercategory", "category_id", "cat_name"])
#     .agg(total_anns=("id", "count"))
#     .reset_index()
# )
# display(df_img_cat_counts)

In [None]:
# df_imgs.merge(df_anns, how="inner", left_on="id", right_on="image_id")

In [None]:
# df_positive_cats = df_img_cat_counts[df_img_cat_counts.total_anns == 1].set_index(
#     ["image_id", "category_id"]
# )
# # df_positive_cats["]
# display(df_positive_cats)
# display(df_positive_cats.loc[581317])
# display(df_positive_cats.loc[581317, 77])

In [None]:
# def test_dataframe_indexing():
#     # Get one cat when there are many:
#     cats = df_positive_cats.loc[581317]
#     display(cats)
#     print(len(cats))
#     print(cats.sample(n=1, replace=False))

#     # When there is one cat
#     cats = df_positive_cats.loc[581615]
#     display(cats)
#     print(len(cats))
#     print(cats.sample(n=1, replace=False))

#     # # image_id doesn't exist
#     # cats = df_positive_cats.loc[58131887]
#     # display(cats)

#     # Check when category doesn't exist but img does


# test_dataframe_indexing()
# # df_positive_cats[df_positive_cats.index["image_id"]==581317]

In [None]:
# def get_img2cats(df: pd.DataFrame) -> dict[int, dict[str, Any]]:
#     img_cats = df.reset_index().to_dict(orient="records")
#     img2cats = defaultdict(dict)
#     for img_cat in img_cats:
#         img2cats[img_cat["image_id"]][img_cat["category_id"]] = img_cat

#     return img2cats


# cat_counts_pos_samples: dict = get_img2cats(df_positive_cats)
# cat_counts_all: dict = get_img2cats(df_img_cat_counts)
# print(len(cat_counts_pos_samples), len(df_img_cat_counts))

# print(cat_counts_pos_samples[581317])

In [9]:
def get_sibling_lookup(
    coco: COCO, img_cat_counts: dict[int, dict[int, dict]]
) -> dict[int, set[int]]:
    """
    Return a dictionary allowing lookup of the sibling categories, given a category_id.
    """
    sup_to_children = defaultdict(set)
    for cat_id, cat in coco.cats.items():
        sup = cat["supercategory"]
        sup_to_children[sup].add(cat_id)
    cat_to_siblings = defaultdict(set)
    for cat_id, cat in coco.cats.items():
        super_cat = cat["supercategory"]
        children = sup_to_children[super_cat]
        # don't count current cat as it's own sibling:
        siblings = children - set([cat_id])
        if len(siblings) == 0:
            print(
                f"Category {cat} has no siblings based on supercategory. Using all other classes as siblings instead."
            )
            siblings = {cat["id"] for cat in coco.cats.values()} - set([cat_id])
            print("\tAjusted siblings: ", len(siblings))
        cat_to_siblings[cat_id] = cat_to_siblings[cat_id].union(siblings)
    return cat_to_siblings


# sibling_lookup = get_sibling_lookup(coco, cat_counts_all)
# print("Num sibling lookups: ", len(sibling_lookup))

In [17]:
def add_sentence(
    sentence_id: int,
    cat_id: int,
    coco: COCO,
    ref: Ref,
    exist: bool,
    pos_sent: dict[str, Any] = None,
    true_cat_id: int = None,
):
    """
    Add sentence, and sent_ids to the ref object. exist=True/False means it is a positive/negative sample, resp.
    """
    cat = coco.cats[cat_id]
    s = f"The {cat['name'].lower()}"
    sent = {
        "tokens": s.split(" "),
        "raw": s,
        "sent_id": sentence_id,
        "sent": s,  # TODO: what exactly is the dif between "raw" and "sent"?
        "exist": exist,
    }
    if exist == False:
        assert pos_sent is not None
        assert true_cat_id is not None
        sent["source_sent"] = pos_sent["sent_id"]
        sent["true_cat_id"] = true_cat_id
    ref["sent_ids"].append(sent["sent_id"])
    ref["sentences"].append(sent)
    return sent


def get_img2cats(coco: COCO) -> dict[int, dict[str, Any]]:
    """
    This is pretty effed up, but it arose from jupyter notebook code that started for one purpose and then evolved over time.
    
    Returns:
        Dict 1: key = image_id (int), value = Dict
        Dict 2: key = category_id (int), value = Dict (category object)
    """
    df_anns = pd.DataFrame(coco.anns.values()).drop(columns=["segmentation", "bbox"])
    df_anns["supercategory"] = df_anns.category_id.apply(lambda x: coco.cats[x]["supercategory"])
    df_anns["cat_name"] = df_anns.category_id.apply(lambda x: coco.cats[x]["name"])
    display(df_anns)
    df_img_cat_counts = (
        df_anns.groupby(["image_id", "supercategory", "category_id", "cat_name"])
        .agg(total_anns=("id", "count"))
        .reset_index()
    )
    df_positive_cats = df_img_cat_counts[df_img_cat_counts.total_anns == 1].set_index(
        ["image_id", "category_id"]
    )
    img_cats = df_positive_cats.reset_index().to_dict(orient="records")
    img2cats = defaultdict(dict)
    for img_cat in img_cats:
        img2cats[img_cat["image_id"]][img_cat["category_id"]] = img_cat

    return img2cats


def coco_negref(
    args,
    split: str,
    dataset_name: str,
    split_by: str = "berkeley",
):
    """
    Generate COCO with negated annotations.

    Negated Anns are added to every image in the dataset. Negative classes are detected
    as those not appearing in an image ground truth.
    """
    np.random.seed(args.seed)
    coco_annotations_file: Path = args.coco_ann_path
    output_path: Path = args.output_path.absolute()
    output_path.mkdir(exist_ok=True, parents=True)
    assert coco_annotations_file.exists(), str(coco_annotations_file)

    coco_original = COCO(str(coco_annotations_file))
    coco_builder = CocoJsonBuilder(
        coco_original.dataset["categories"],
        dest_path=output_path,
        dest_name=f"instances.json",
        source_coco=coco_original,
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )
    cat_counts_pos_samples: dict[int, dict[int, dict[str, Any]]] = get_img2cats(coco_original)
    sibling_lookup = get_sibling_lookup(coco_original, cat_counts_pos_samples)
    current_sent_id = 1

    for i, (img_id, img) in tqdm(
        enumerate(coco_original.imgs.items()), total=len(coco_original.imgs)
    ):
        annotations: list[Ann] = deepcopy(coco_original.imgToAnns[img_id])
        pos_candidates = []
        # Sample the positive classes. Primarily this means classes with exactly
        # one annotation in the image, therefore we can create unambiguous refering expressions for them:
        img_cat_counts: list[dict[int, dict]] = list(
            cat_counts_pos_samples[img_id].values()
        )
        if img_cat_counts is not None and len(img_cat_counts):
            pos_candidates = np.random.choice(
                img_cat_counts, args.num_pos_parents_per_image
            ).tolist()
        refs: list[Ref] = []
        # All anns pass through to the new dataset (but won't be used unless a ref points to them).
        # Each ref points to an ann_id. Each ref has a list of sentences.

        ## What we're building looks like this:
        # img: {"id": , "height": , ...}
        #     anns: [{"id": , "category_id": , bbox: , "segmentation": , ...}]
        #     refs: [{"ref_id": , "ann_id": , "category_id": , "image_id": , sent_ids: ,
        #        "sentences": [
        #             {
        #                 'tokens': ['the', 'man', 'in', 'yellow', 'coat'],
        #                 'raw': 'the man in yellow coat',
        #                 'sent_id': 8,
        #                 'sent': 'the man in yellow coat',  # TODO: what exactly is the dif between "raw" and "sent"?
        #                 'exist': True,
        #                 ...,
        #             }
        #     ]}, ...]

        for pos_candidate in pos_candidates:
            ann = next(
                ann
                for ann in annotations
                if ann["category_id"] == pos_candidate["category_id"]
            )
            assert ann is not None
            ref = {
                "image_id": img_id,
                "split": split,
                "file_name": img["file_name"],
                "category_id": ann["category_id"],
                "ann_id": ann["id"],
                "sent_ids": [],
                "ref_id": -1,
                "sentences": [],
            }
            pos_sent = add_sentence(
                current_sent_id, ann["category_id"], coco_original, ref, exist=True
            )
            current_sent_id += 1

            # Sample candidate negative classes (siblings)
            siblings = sibling_lookup[pos_candidate["category_id"]]
            if len(siblings) > 0:
                neg_samples = set(
                    np.random.choice(list(siblings), args.num_negs_per_pos)
                )
                for ns in neg_samples:
                    neg_sent = add_sentence(
                        current_sent_id,
                        ann["category_id"],
                        coco_original,
                        ref,
                        exist=False,
                        pos_sent=pos_sent,
                        true_cat_id=ann["category_id"],
                    )
                    current_sent_id += 1
            else:
                print("Empty siblings for cat: ", pos_candidate)
            refs.append(ref)

        # Add the negated annotations:
        new_img: Image = img
        coco_builder.add_image(new_img, annotations, refs)
    neg_coco_path = coco_builder.save()

    # Output a miniature version of the dataset file just for debugging/inspection:
    print("\n\n")
    print("Building shrunken version")
    num_images = 50
    mini_dataset_name = f"{dataset_name}_mini"
    shrinker = COCOShrinker(
        neg_coco_path,
        is_ref_dataset=True,
        split_by=split_by,
        dataset_name=dataset_name,
    )
    shrink_path = Path(f"./output/ref_seg/{mini_dataset_name}")
    shrinker.shrink(
        "instances.json",
        size=num_images,
        output_dir=shrink_path,
        is_ref_dataset=True,
        dataset_name=mini_dataset_name,
        split_by="berkeley",
    )


if __name__ == "__main__":
    _dataset_name = f"refcoconeg_v001"

    for year in ["2014", "2017"]:
        for split in ["val", "train"]:
            dataset_name = f"{_dataset_name}_{split}{year}"
            dataset_json = Path(
                COCO_DIR / "annotations" / f"instances_{split}{year}.json"
            )
            args = argparse.Namespace(
                **{
                    "coco_ann_path": dataset_json,
                    "output_path": Path(f"./output/ref_seg/{dataset_name}").resolve(),
                    "num_pos_parents_per_image": 1,
                    "num_negs_per_pos": 5,
                    "seed": 42,
                }
            )
            # print(args)
            coco_negref(
                args,
                split,
                dataset_name=dataset_name,
                split_by="berkeley",
            )

loading annotations into memory...
Done (t=9.16s)
creating index...
index created!


Unnamed: 0,area,iscrowd,image_id,category_id,id,supercategory,cat_name
0,2765.1486500000005,0,558840,58,156,food,hot dog
1,1545.4213000000007,0,200365,58,509,food,hot dog
2,5607.661349999996,0,200365,58,603,food,hot dog
3,0.0,0,200365,58,918,food,hot dog
4,800.4132499999978,0,200365,58,1072,food,hot dog
...,...,...,...,...,...,...,...
291870,2644.0,1,349689,2,900200349689,vehicle,bicycle
291871,3603.0,1,442106,57,905700442106,food,carrot
291872,1520.0,1,382715,1,900100382715,person,person
291873,220834.0,1,250282,1,900100250282,person,person


Category {'supercategory': 'person', 'id': 1, 'name': 'person'} has no siblings based on supercategory. Using all other classes as siblings instead.
	Ajusted siblings:  79


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40504/40504 [00:21<00:00, 1855.89it/s]


counts(id):  291875
Num refs:  32926
counts(ref_id):  1
Counter({-1: 32926})
Total anns:291875
ann_ids: 291875
Writing coco_builder (num_img: 40504, num_ann: 291875) output to: '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2014/instances.json'
Saved /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2014/instances.json' (573.19MB)
Saving 32926 refs to file:  /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2014/refs(berkeley).p
Saved refs file /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2014/refs(berkeley).p' (12.77MB)



Building shrunken version
Creating subset of /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2014/instances.json, of size: 50, at: output/ref_seg/ref

Unnamed: 0,area,iscrowd,image_id,category_id,id,supercategory,cat_name
0,54652.9556,0,480023,58,86,food,hot dog
1,421.47274999999996,0,50518,58,89,food,hot dog
2,53535.29024999999,0,142589,58,93,food,hot dog
3,3892.3764,0,209263,58,113,food,hot dog
4,72576.18295,0,15307,58,116,food,hot dog
...,...,...,...,...,...,...,...
604902,4227.0,1,390883,1,900100390883,person,person
604903,6058.0,1,49902,53,905300049902,food,apple
604904,737.0,1,363764,43,904300363764,sports,tennis racket
604905,6478.0,1,554743,1,900100554743,person,person


Category {'supercategory': 'person', 'id': 1, 'name': 'person'} has no siblings based on supercategory. Using all other classes as siblings instead.
	Ajusted siblings:  79


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82783/82783 [00:52<00:00, 1578.29it/s]


counts(id):  604907
Num refs:  67082
counts(ref_id):  1
Counter({-1: 67082})
Total anns:604907
ann_ids: 604907
Writing coco_builder (num_img: 82783, num_ann: 604907) output to: '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2014/instances.json'
Saved /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2014/instances.json' (1181.69MB)
Saving 67082 refs to file:  /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2014/refs(berkeley).p
Saved refs file /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2014/refs(berkeley).p' (26.56MB)



Building shrunken version
Creating subset of /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2014/instances.json, of size: 50, at: output/

Unnamed: 0,area,iscrowd,image_id,category_id,id,supercategory,cat_name
0,702.1057499999998,0,289343,18,1768,animal,dog
1,27718.476299999995,0,61471,18,1773,animal,dog
2,78969.31690000003,0,472375,18,2551,animal,dog
3,108316.66515000002,0,520301,18,3186,animal,dog
4,75864.53530000002,0,579321,18,3419,animal,dog
...,...,...,...,...,...,...,...
36776,3773.0,1,15517,6,900600015517,vehicle,bus
36777,112181.0,1,439994,1,900100439994,person,person
36778,47024.0,1,117719,44,904400117719,kitchen,bottle
36779,27277.0,1,50149,52,905200050149,food,banana


Category {'supercategory': 'person', 'id': 1, 'name': 'person'} has no siblings based on supercategory. Using all other classes as siblings instead.
	Ajusted siblings:  79


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 1828.97it/s]


counts(id):  36781
Num refs:  4109
counts(ref_id):  1
Counter({-1: 4109})
Total anns:36781
ann_ids: 36781
Writing coco_builder (num_img: 5000, num_ann: 36781) output to: '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2017/instances.json'
Saved /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2017/instances.json' (71.86MB)
Saving 4109 refs to file:  /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2017/refs(berkeley).p
Saved refs file /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2017/refs(berkeley).p' (1.49MB)



Building shrunken version
Creating subset of /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2017/instances.json, of size: 50, at: output/ref_seg/refcoconeg_v0

Unnamed: 0,area,iscrowd,image_id,category_id,id,supercategory,cat_name
0,2765.1486500000005,0,558840,58,156,food,hot dog
1,1545.4213000000007,0,200365,58,509,food,hot dog
2,5607.661349999996,0,200365,58,603,food,hot dog
3,0.0,0,200365,58,918,food,hot dog
4,800.4132499999978,0,200365,58,1072,food,hot dog
...,...,...,...,...,...,...,...
859996,4227.0,1,390883,1,900100390883,person,person
859997,6058.0,1,49902,53,905300049902,food,apple
859998,737.0,1,363764,43,904300363764,sports,tennis racket
859999,6478.0,1,554743,1,900100554743,person,person


Category {'supercategory': 'person', 'id': 1, 'name': 'person'} has no siblings based on supercategory. Using all other classes as siblings instead.
	Ajusted siblings:  79


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 118287/118287 [01:14<00:00, 1585.99it/s]


counts(id):  860001
Num refs:  95899
counts(ref_id):  1
Counter({-1: 95899})
Total anns:860001
ann_ids: 860001
Writing coco_builder (num_img: 118287, num_ann: 860001) output to: '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2017/instances.json'
Saved /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2017/instances.json' (1679.71MB)
Saving 95899 refs to file:  /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2017/refs(berkeley).p
Saved refs file /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2017/refs(berkeley).p' (36.81MB)



Building shrunken version
Creating subset of /home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2017/instances.json, of size: 50, at: output

### Test Creating the new CocoNegRef Dataset, and Output Stats

In [8]:
VALID_SPLITS = {
    # "R-refcoco": ["unc"],
    # "R-refcoco+": ["unc"],
    # "R-refcocog": ["umd"],
    # "refclef": ["berkeley", "unc"],
    "refcoco": ["unc", "google"],
    "refcoco+": ["unc"],
    "refcocog": [
        "umd",
        "google",
    ],  # ["google", "umd"], # google and umd are identical, but have differet train/val/test splits.
    # "coconegref": ["berkeley"],
    "fprefcoco_v002": ["berkeley"],
    "fprefcoco+_v002": ["berkeley"],
    "fprefcocog_v002": ["berkeley"],
    "refcoconeg_v001_train2014": ["berkeley"],
    "refcoconeg_v001_train2014_mini": ["berkeley"],
    "refcoconeg_v001_val2014": ["berkeley"],
    "refcoconeg_v001_val2014_mini": ["berkeley"],
    "refcoconeg_v001_train2017": ["berkeley"],
    "refcoconeg_v001_train2017_mini": ["berkeley"],
    "refcoconeg_v001_val2017": ["berkeley"],
    "refcoconeg_v001_val2017_mini": ["berkeley"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, split_by: str = None) -> COCO:
    assert dataset_name in VALID_SPLITS, dataset_name
    if split_by is None:
        split_by = VALID_SPLITS[dataset_name][0]
    else:
        assert split_by in VALID_SPLITS[dataset_name]
    coco = COCO(
        refseg_path / dataset_name / "instances.json",
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )
    return coco



df_aggs = []
for ds_name in [
    "refcoconeg_v001_train2014",
    "refcoconeg_v001_train2014_mini",
    "refcoconeg_v001_val2014",
    "refcoconeg_v001_val2014_mini",
    "refcoconeg_v001_train2017",
    "refcoconeg_v001_train2017_mini",
    "refcoconeg_v001_val2017",
    "refcoconeg_v001_val2017_mini",
]:
    print("\n\n")
    print("=" * 220)
    print(f"Dataset: {ds_name}(berkeley)")
    coconegref_stats = CocoClassDistHelper(
        Path(f"./output/ref_seg").resolve(),
        is_ref_dataset=True,
        dataset_name=ds_name,
        split_by="berkeley",
    )
    df_refcoco, df_refcoco_agg = coconegref_stats.get_ref_stats()
    df_aggs.append(df_refcoco_agg)


df_aggs = pd.concat(df_aggs)


display(df_aggs)




Dataset: refcoconeg_v001_train2014(berkeley)
Loading refs from '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2014/refs(berkeley).p'
Loaded 67082 refs
loading annotations into memory...
Done (t=9.18s)
creating index...
index created!
num images: 82783
num annotations: 604907
pos/neg sentence_counts:  67082 251417



Dataset: refcoconeg_v001_train2014_mini(berkeley)
Loading refs from '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_train2014_mini/refs(berkeley).p'
Loaded 46 refs
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
num images: 50
num annotations: 524
pos/neg sentence_counts:  46 159



Dataset: refcoconeg_v001_val2014(berkeley)
Loading refs from '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001_val2014/refs(berkeley).p'
Loaded 32926 refs
loading annotat

Unnamed: 0,pos_sent_count,neg_sent_count,dataset,num_refs,sent_count,total_pos_sents,total_neg_sents,ann_count,img_count
0,1,1,refcoconeg_v001_train2014(berkeley),68,136,68,68,604907,82783
1,1,2,refcoconeg_v001_train2014(berkeley),4321,12963,4321,8642,604907,82783
2,1,3,refcoconeg_v001_train2014(berkeley),22391,89564,22391,67173,604907,82783
3,1,4,refcoconeg_v001_train2014(berkeley),25976,129880,25976,103904,604907,82783
4,1,5,refcoconeg_v001_train2014(berkeley),14326,85956,14326,71630,604907,82783
0,1,2,refcoconeg_v001_train2014_mini(berkeley),6,18,6,12,524,50
1,1,3,refcoconeg_v001_train2014_mini(berkeley),18,72,18,54,524,50
2,1,4,refcoconeg_v001_train2014_mini(berkeley),17,85,17,68,524,50
3,1,5,refcoconeg_v001_train2014_mini(berkeley),5,30,5,25,524,50
0,1,1,refcoconeg_v001_val2014(berkeley),36,72,36,36,291875,40504


In [9]:
df_aggs.groupby("dataset").agg(
    num_refs=("num_refs", "sum"),
    sent_count=("sent_count", "sum"),
    total_pos_sents=("total_pos_sents", "sum"),
    total_neg_sents=("total_neg_sents", "sum"),
    total_ann_count=("ann_count", "min"),
    total_img_count=("img_count", "min"),
)

Unnamed: 0_level_0,num_refs,sent_count,total_pos_sents,total_neg_sents,total_ann_count,total_img_count
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
refcoconeg_v001_train2014(berkeley),67082,318499,67082,251417,604907,82783
refcoconeg_v001_train2014_mini(berkeley),46,205,46,159,524,50
refcoconeg_v001_train2017(berkeley),95899,455467,95899,359568,860001,118287
refcoconeg_v001_train2017_mini(berkeley),45,197,45,152,470,49
refcoconeg_v001_val2014(berkeley),32926,156170,32926,123244,291875,40504
refcoconeg_v001_val2014_mini(berkeley),46,200,46,154,531,50
refcoconeg_v001_val2017(berkeley),4109,19543,4109,15434,36781,5000
refcoconeg_v001_val2017_mini(berkeley),34,152,34,118,382,48


In [None]:
from IPython.display import display

print(df.describe())
with pd.option_context("display.max_rows", 100, "display.max_columns", 10):
    display(df)

In [None]:
anns = {ann["id"] for ann in coco.anns.values()}
print(min(anns), max(anns))

In [None]:
import pickle

refs = pickle.load(
    open(
        "/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001/refs(berkeley).p",
        "rb",
    )
)
for idx, ref in enumerate(reversed(refs)):
    if idx > 1:
        break
    # print(ref["category_id"], ref["ann_id"])
    print(ref)
    # if ref["sent_ids"]:
    #     print(ref["sent_ids"])

In [None]:
print(list(coco.imgs.values())[-5:])

In [None]:
df_counts = (
    pd.DataFrame(list(coco_dist.get_cat_counts().values()))
    .sort_values("ann_count", ascending=False)
    .reset_index(drop=True)
)
total_anns = df_counts.ann_count.sum()
df_counts["ann_count_pdf"] = df_counts.ann_count / total_anns
df_counts["ann_count_cdf"] = df_counts.ann_count_pdf.cumsum()
display(df_counts)

In [None]:
df_

## Add frequency bins based on annotation count Cumulative Distribution Function


In [None]:
df_counts["freq_bin_2"] = df_counts.ann_count_cdf.apply(
    lambda x: "high" if x < 0.5 else "low"
)
df_counts["freq_bin_3"] = df_counts.ann_count_cdf.apply(
    lambda x: "high" if x < 0.333 else "medium" if x <= 0.667 else "low"
)
display(df_counts)

In [None]:
plt.figure(figsize=(25, 15))
color_map = plt.get_cmap("magma")
fig = sns.barplot(
    data=df_counts.sort_values(["img_count"], ascending=False),
    x="name",
    y="img_count",
    hue=df_counts.freq_bin_3.values,
)
fig.set_xticklabels(fig.get_xticklabels(), rotation=45, horizontalalignment="right")
fig.set_title(f"Per-category Annotated Image Counts")
plt.tight_layout()

In [None]:
plt.figure(figsize=(25, 15))
color_map = plt.get_cmap("magma")
fig = sns.barplot(
    data=df_counts.sort_values(["ann_count"], ascending=False),
    x="name",
    y="ann_count",
    hue="freq_bin_3",
)
fig.set_xticklabels(fig.get_xticklabels(), rotation=45, horizontalalignment="right")
fig.set_title(f"Per-category Annotation Counts")
plt.tight_layout()