## neg_refcocov001.ipynb

Create a COCO formatted dataset that uses a simplistic method to create false-premise referring expressions, along with correcting expressions. The method is to swap nouns with categories from a sibling class in the same COCO supercategory.

The referring expressions follow the same format as refcoco/refcocog/refcoco+/R-refcoco/etc., i.e., a COCO-formatted JSON file accompanied by a file with a `.p` extension, which contains the referring expressions. The `.p` file is a Python pickle file. These datasets can be loaded using the common `refer.py`, or the `COCO` class in `github.com/GiscardBiamby/cocobetter.git`.

### Assumptions:

- Negative samples are what we call the false premise referring expressions. I.e., expressions that specify objects that don't exist in the image.
- Each negative sample has a "parent", which is the positive sample (a referring expression).
- In this simplified dataset, the positive samples are just class names, and the negative samples are some other COCO class name that is swapped in for the positive one.
- Positive/negative pairs are only created for unambiguous cases (ignoring noisy dataset issues, like unlabelled objects). This means we only create positive and negative referring expressions for "human" if the image contains exactly one human ground truth annotation. If there are two or more humans, we cannot easily create a referring expression that refers to one specific human, so we avoid trying to do that.

### Example: 

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
%pip list | grep json

In [None]:
import argparse
import copy
import csv
import decimal
import json
import os
import typing
from collections import Counter, defaultdict
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL.Image as pil_img
import seaborn as sns
import simdjson as json
from IPython.display import display
from PIL import Image
from pycocotools.coco import COCO, Ann, Cat, Image, Ref
from pycocotools.helpers import CocoClassDistHelper, CocoJsonBuilder
from pycocotools.helpers.coco_builder import COCOShrinker
from tqdm.auto import tqdm

In [None]:
# Filesystem layout: COCO root, the val2017 image folder, and the val2017 instance annotations.
COCO_DIR = Path("/shared/gbiamby/data/coco")
IMG_DIR = COCO_DIR / "val2017"
dataset_json = COCO_DIR / "annotations" / "instances_val2017.json"

# The same annotation file backs both the class-distribution helper and the COCO index.
coco_dist = CocoClassDistHelper(dataset_json)
coco = COCO(dataset_json)

# Decorate every annotation in place: default the `is_neg` flag to False when absent,
# and copy the category's supercategory and name onto the annotation itself.
for key, ann in coco.anns.items():
    ann.setdefault("is_neg", False)
    cat = coco.cats[ann["category_id"]]
    ann.update(supercategory=cat["supercategory"], cat_name=cat["name"])

In [None]:
# Notebook-wide pandas display settings: format floats with thousands separators,
# and do not truncate the contents of wide columns.
pd.options.display.float_format = "{:,}".format
pd.set_option("display.max_colwidth", None)

In [None]:
# Peek at the first three image records to see which fields are available.
list(coco.imgs.values())[:3]

In [None]:
# Check which container type holds the category lookup.
type(coco.cats)

In [None]:
# import spacy

# spacy.prefer_gpu()
# import spacy_transformers

# nlp = spacy.load("en_core_web_trf")

In [None]:
# doc = nlp("This is a sentence.")
# print("noun chunks: ", list(doc.noun_chunks))
# print([(w.text, w.pos_) for w in doc])

In [None]:
def get_img_info(img_dir: Path, img: dict):
    """
    Return a copy of a COCO image record augmented with on-disk file details.

    The image file is opened once, only to read its dimensions, and the file
    handle is closed before returning.

    Args:
        img_dir: Directory that contains the image files.
        img: COCO image record; must contain a "file_name" key. Not modified.

    Returns:
        A deep copy of ``img`` with these keys added (or overwritten):
        "filename", "suffix", "img_dim" as ``(height, width)``,
        "image_height", "image_width" and "area" (height * width).
    """
    img_path = img_dir / img["file_name"]
    # PIL reports size as (width, height); reading it does not require decoding pixels.
    with pil_img.open(img_path) as opened:
        width, height = opened.size

    info = deepcopy(img)
    info.update(
        {
            "filename": img_path.name,
            "suffix": img_path.suffix,
            # Kept in (rows, cols) order, i.e. the shape of a 2-D grayscale array.
            "img_dim": (height, width),
            "image_height": height,
            "image_width": width,
            "area": height * width,
        }
    )
    return info


print(f"Found {len(coco.imgs)} images to process.")
# Empties tqdm's private `_instances` collection before starting a new bar
# (presumably to drop stale bars left by earlier cell runs -- TODO confirm).
tqdm._instances.clear()
# One row per image: the COCO image record plus the fields added by get_img_info.
df_imgs = pd.DataFrame(
    get_img_info(IMG_DIR, img) for img in tqdm(list(coco.imgs.values()))
)
display(df_imgs)

In [None]:
# One row per annotation; the "segmentation" and "bbox" columns are dropped
# because the per-image/per-category counting below does not need them.
df_anns = pd.DataFrame(coco.anns.values()).drop(columns=["segmentation", "bbox"])
display(df_anns)

In [None]:
# Count annotations per (image, category). The supercategory and category name are
# included in the grouping keys so they are carried along as columns.
df_img_cat_counts = (
    df_anns.groupby(["image_id", "supercategory", "category_id", "cat_name"])
    .agg(total_anns=("id", "count"))
    .reset_index()
)
display(df_img_cat_counts)

In [None]:
# Inspection only: join image rows to their annotation rows. The result is not
# assigned to a name, so it is only shown as the cell output.
df_imgs.merge(df_anns, how="inner", left_on="id", right_on="image_id")

In [None]:
# Keep only (image, category) pairs with exactly one annotation -- the unambiguous
# cases described in the assumptions at the top of the notebook -- and index them
# by (image_id, category_id) for direct lookup.
df_positive_cats = df_img_cat_counts[df_img_cat_counts.total_anns == 1].set_index(
    ["image_id", "category_id"]
)
# df_positive_cats["]
display(df_positive_cats)
# Spot checks with hard-coded ids: all rows for image 581317, then the single
# row for category 77 in that image.
display(df_positive_cats.loc[581317])
display(df_positive_cats.loc[581317, 77])

In [None]:
def test_dataframe_indexing():
    """
    Show how `df_positive_cats.loc[image_id]` behaves for two hard-coded image ids.

    For each id the selected rows are displayed, their count is printed, and a
    single row sampled without replacement is printed.
    """
    # Per the original notes: 581317 has many single-instance categories, 581615 has one.
    for image_id in (581317, 581615):
        cats = df_positive_cats.loc[image_id]
        display(cats)
        print(len(cats))
        print(cats.sample(n=1, replace=False))

    # Cases not exercised yet:
    #   - image_id doesn't exist, e.g. df_positive_cats.loc[58131887]
    #   - category doesn't exist but the image does


test_dataframe_indexing()
# df_positive_cats[df_positive_cats.index["image_id"]==581317]

In [None]:
def get_img2cats(df: pd.DataFrame) -> dict[int, dict[str, Any]]:
    """
    Build a nested lookup ``image_id -> category_id -> row`` from a counts dataframe.

    The index of ``df`` is reset first, so "image_id" and "category_id" may be
    either index levels or regular columns. Each row is stored as a plain dict.

    Returns:
        A ``defaultdict(dict)``: looking up an unknown image id yields (and
        inserts) an empty dict.
    """
    lookup = defaultdict(dict)
    for record in df.reset_index().to_dict(orient="records"):
        image_id, category_id = record["image_id"], record["category_id"]
        lookup[image_id][category_id] = record
    return lookup


# Nested lookups image_id -> category_id -> row: one restricted to categories with
# exactly one annotation in the image, one covering every (image, category) pair.
cat_counts_pos_samples: dict = get_img2cats(df_positive_cats)
cat_counts_all: dict = get_img2cats(df_img_cat_counts)
# NOTE(review): this prints the number of images in the positive lookup next to the
# number of ROWS in df_img_cat_counts; len(cat_counts_all) may have been intended.
print(len(cat_counts_pos_samples), len(df_img_cat_counts))

print(cat_counts_pos_samples[581317])

In [None]:
def get_sibling_lookup(
    coco: COCO, img_cat_counts: dict[int, dict[int, dict]]
) -> dict[int, set[int]]:
    """
    Map each category id to the ids of its sibling categories.

    Siblings are the other categories that share the category's supercategory.
    A category that is alone in its supercategory falls back to every other
    category in the dataset, and a message is printed for it.

    Note: ``img_cat_counts`` is accepted but not used by this function.

    Returns:
        A ``defaultdict(set)`` keyed by category id.
    """
    # Pass 1: group category ids under their supercategory.
    members_by_super = defaultdict(set)
    for cat_id, cat in coco.cats.items():
        members_by_super[cat["supercategory"]].add(cat_id)

    # Pass 2: a category's siblings are its group minus the category itself.
    lookup = defaultdict(set)
    for cat_id, cat in coco.cats.items():
        others = members_by_super[cat["supercategory"]] - {cat_id}
        if not others:
            print(
                f"Category {cat} has no siblings based on supercategory. Using all other classes as siblings instead."
            )
            others = {entry["id"] for entry in coco.cats.values()} - {cat_id}
            print("\tAjusted siblings: ", len(others))
        lookup[cat_id] |= others
    return lookup


# Notebook-level sanity check: build the sibling lookup and report how many
# categories received an entry.
sibling_lookup = get_sibling_lookup(coco, cat_counts_all)
print("Num sibling lookups: ", len(sibling_lookup))

In [None]:
def add_sentence(
    sentence_id: int,
    cat_id: int,
    coco: COCO,
    ref: Ref,
    exist: bool,
    pos_sent: typing.Optional[dict[str, Any]] = None,
    true_cat_id: typing.Optional[int] = None,
):
    """
    Build a "The <category name>" sentence and attach it to ``ref``.

    Args:
        sentence_id: Id stored as the sentence's "sent_id".
        cat_id: Category whose lower-cased name forms the sentence text.
        coco: Dataset providing the ``cats`` lookup for ``cat_id``.
        ref: Ref dict; its "sent_ids" and "sentences" lists are appended to in place.
        exist: True for a positive sample, False for a negative (false-premise) one.
        pos_sent: Parent positive sentence; required when ``exist`` is false.
        true_cat_id: Category actually referred to; required when ``exist`` is false.

    Returns:
        The sentence dict that was appended to ``ref["sentences"]``.

    Raises:
        ValueError: If a negative sentence is requested without ``pos_sent``
            or ``true_cat_id``.
    """
    # Validate before touching `ref`, with a real exception so the check survives `python -O`.
    if not exist and (pos_sent is None or true_cat_id is None):
        raise ValueError(
            "Negative sentences require both `pos_sent` and `true_cat_id`."
        )

    text = f"The {coco.cats[cat_id]['name'].lower()}"
    sent = {
        "tokens": text.split(" "),
        "raw": text,
        "sent_id": sentence_id,
        "sent": text,  # TODO: what exactly is the dif between "raw" and "sent"?
        "exist": exist,
    }
    if not exist:
        # Link the negative sentence back to its positive parent and the real class.
        sent["source_sent"] = pos_sent["sent_id"]
        sent["true_cat_id"] = true_cat_id
    ref["sent_ids"].append(sentence_id)
    ref["sentences"].append(sent)
    return sent


def coco_negref(
    args,
    img_cat_counts_all: dict[int, dict[int, dict[str, Any]]],
    img_cat_counts_pos_samples: dict[int, dict[int, dict[str, Any]]],
    dataset_name: str,
    split_by: str = "berkeley",
):
    """
    Generate a COCO-format referring-expression dataset with false-premise sentences.

    For each image, up to ``args.num_pos_parents_per_image`` categories are drawn
    from ``img_cat_counts_pos_samples[img_id]``. Each drawn category yields one ref
    holding a positive sentence ("The <category>") plus up to
    ``args.num_negs_per_pos`` negative sentences. A negative sentence names a
    sibling category (see ``get_sibling_lookup``) that has no annotation in the
    image. All annotations of every image are passed through to the output.

    After saving, a 50-image "<dataset_name>_mini" copy is written next to the
    main output directory for debugging/inspection.

    Args:
        args: Namespace providing ``seed``, ``coco_ann_path``, ``output_path``,
            ``num_pos_parents_per_image`` and ``num_negs_per_pos``.
        img_cat_counts_all: image_id -> category_id -> count row, all categories.
            Only forwarded to ``get_sibling_lookup``.
        img_cat_counts_pos_samples: image_id -> category_id -> count row, limited
            to the categories eligible as positive samples.
        dataset_name: Name of the dataset to write.
        split_by: Split identifier forwarded to the builder and the shrinker.

    Raises:
        FileNotFoundError: If ``args.coco_ann_path`` does not exist.
        ValueError: If a positive candidate has no matching annotation in its image.
    """
    np.random.seed(args.seed)
    coco_annotations_file: Path = args.coco_ann_path
    if not coco_annotations_file.exists():
        raise FileNotFoundError(str(coco_annotations_file))
    output_path: Path = args.output_path.absolute()
    output_path.mkdir(exist_ok=True, parents=True)

    # Use the dataset loaded from `args.coco_ann_path` for every lookup, rather than
    # whatever `coco` object happens to exist at notebook level.
    coco_original = COCO(str(coco_annotations_file))
    sibling_lookup = get_sibling_lookup(coco_original, img_cat_counts_all)
    coco_builder = CocoJsonBuilder(
        coco_original.dataset["categories"],
        dest_path=output_path,
        dest_name="instances.json",
        source_coco=coco_original,
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )
    current_sent_id = 1

    for img_id, img in tqdm(
        coco_original.imgs.items(), total=len(coco_original.imgs)
    ):
        # All anns pass through to the new dataset (but won't be used unless a ref points to them).
        annotations: list[Ann] = deepcopy(coco_original.imgToAnns[img_id])
        present_cat_ids = {a["category_id"] for a in annotations}

        # Each candidate looks like: {'image_id': 581317, 'category_id': 77,
        #     'supercategory': 'electronic', 'cat_name': 'cell phone', 'total_anns': 1}
        # `.get` avoids inserting empty entries when the lookup is a defaultdict.
        candidates = list(img_cat_counts_pos_samples.get(img_id, {}).values())
        pos_candidates = []
        if candidates:
            # Draw indices without replacement so no category is picked twice.
            num_pos = min(args.num_pos_parents_per_image, len(candidates))
            picked = np.random.choice(len(candidates), size=num_pos, replace=False)
            pos_candidates = [candidates[int(idx)] for idx in picked]

        ## What we're building looks like this:
        # img: {"id": , "height": , ...}
        #     anns: [{"id": , "category_id": , bbox: , "segmentation": , ...}]
        #     refs: [{"ref_id": , "ann_id": , "category_id": , "image_id": , sent_ids: ,
        #        "sentences": [
        #             {
        #                 'tokens': ['the', 'man', 'in', 'yellow', 'coat'],
        #                 'raw': 'the man in yellow coat',
        #                 'sent_id': 8,
        #                 'sent': 'the man in yellow coat',
        #                 'exist': True,
        #                 ...,
        #             }
        #     ]}, ...]
        refs: list[Ref] = []
        for pos_candidate in pos_candidates:
            pos_cat_id = pos_candidate["category_id"]
            ann = next(
                (a for a in annotations if a["category_id"] == pos_cat_id), None
            )
            if ann is None:
                raise ValueError(
                    f"Image {img_id} has no annotation for category {pos_cat_id}"
                )
            ref = {
                "image_id": img_id,
                "split": "val",  # TODO: what to use for this, it is just train/val/test?
                "file_name": img["file_name"],
                "category_id": ann["category_id"],
                "ann_id": ann["id"],
                "sent_ids": [],
                # Placeholder; presumably replaced by CocoJsonBuilder -- TODO confirm.
                "ref_id": -1,
                "sentences": [],
            }
            pos_sent = add_sentence(
                current_sent_id, ann["category_id"], coco_original, ref, exist=True
            )
            current_sent_id += 1

            # A false premise must name a class that is absent from the image, so
            # siblings that do have annotations here are excluded.
            absent_siblings = sorted(sibling_lookup[pos_cat_id] - present_cat_ids)
            if absent_siblings:
                num_negs = min(args.num_negs_per_pos, len(absent_siblings))
                neg_cat_ids = np.random.choice(
                    absent_siblings, size=num_negs, replace=False
                )
                for neg_cat_id in neg_cat_ids:
                    # The sentence names the swapped-in sibling class; `true_cat_id`
                    # records the class the ref actually points at.
                    add_sentence(
                        current_sent_id,
                        int(neg_cat_id),
                        coco_original,
                        ref,
                        exist=False,
                        pos_sent=pos_sent,
                        true_cat_id=ann["category_id"],
                    )
                    current_sent_id += 1
            else:
                print("Empty siblings for cat: ", pos_candidate)
            refs.append(ref)

        new_img: Image = img
        coco_builder.add_image(new_img, annotations, refs)
    neg_coco_path = coco_builder.save()

    # Output a miniature version of the dataset file just for debugging/inspection:
    print("\n\n")
    print("Building shrunken version")
    num_images = 50
    mini_dataset_name = f"{dataset_name}_mini"
    shrinker = COCOShrinker(
        neg_coco_path,
        is_ref_dataset=True,
        split_by=split_by,
        dataset_name=dataset_name,
    )
    # Write the mini dataset beside the main one and honour the caller's `split_by`.
    shrink_path = output_path.parent / mini_dataset_name
    shrinker.shrink(
        "instances.json",
        size=num_images,
        output_dir=shrink_path,
        is_ref_dataset=True,
        dataset_name=mini_dataset_name,
        split_by=split_by,
    )


if __name__ == "__main__":
    dataset_name = "refcoconeg_v001"
    # Run configuration: source annotations, destination folder, sampling sizes, RNG seed.
    args = argparse.Namespace(
        coco_ann_path=dataset_json,
        output_path=Path(f"./output/ref_seg/{dataset_name}").resolve(),
        num_pos_parents_per_image=1,
        num_negs_per_pos=5,
        seed=42,
    )
    print(args)
    # Build the dataset from the per-image category lookups computed earlier in the notebook.
    coco_negref(
        args,
        cat_counts_all,
        cat_counts_pos_samples,
        dataset_name=dataset_name,
        split_by="berkeley",
    )

### Test Creating the new CocoNegRef Dataset, and Output Stats

In [None]:
# Allowed `split_by` values for each known referring-expression dataset name.
# build_refcoco uses the first entry of a list as the default when no split is given.
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    "refclef": ["berkeley", "unc"],
    "refcoco": ["google"],
    "refcoco+": ["unc"],
    "refcocog": ["google", "umd"],
    "refcoconeg_v001": ["berkeley"],
    "refcoconeg_v001_mini": ["berkeley"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, split_by: str = None) -> COCO:
    """
    Load ``<refseg_path>/<dataset_name>/instances.json`` as a referring-expression COCO.

    ``dataset_name`` must be a key of VALID_SPLITS. When ``split_by`` is omitted the
    first split listed for the dataset is used; otherwise it must be one of the
    listed splits. Both checks are assertions.
    """
    assert dataset_name in VALID_SPLITS, dataset_name
    allowed_splits = VALID_SPLITS[dataset_name]
    if split_by is not None:
        assert split_by in allowed_splits
    else:
        split_by = allowed_splits[0]
    return COCO(
        refseg_path / dataset_name / "instances.json",
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )


# Load the freshly written dataset to confirm that it can be read back.
# `coconegref` itself is not used again in this cell.
coconegref = build_refcoco(
    Path(f"./output/ref_seg/").resolve(), dataset_name, split_by="berkeley"
)
# NOTE(review): the helper is given the ref_seg directory, not an instances.json path,
# unlike the CocoClassDistHelper(dataset_json) call earlier -- presumably it resolves
# the file from dataset_name when is_ref_dataset=True; confirm.
coconegref_stats = CocoClassDistHelper(
    Path(f"./output/ref_seg").resolve(),
    is_ref_dataset=True,
    dataset_name=dataset_name,
    split_by="berkeley",
)
df, df_agg = coconegref_stats.get_ref_stats()
# display(df)
display(df_agg)

# The constant grouping key puts every row into one group, so this yields a single
# row holding the column totals.
df_agg_2 = df_agg.groupby(lambda x: True).agg(
    num_refs=("num_refs", "sum"),
    sent_count=("sent_count", "sum"),
    total_pos_sents=("total_pos_sents", "sum"),
    total_neg_sents=("total_neg_sents", "sum"),
)
display(df_agg_2)

In [None]:
# `display` is already imported in the notebook's import cell; repeated here so
# the cell can be run on its own.
from IPython.display import display

# Summary statistics, then the stats frame with temporarily widened display limits.
print(df.describe())
with pd.option_context("display.max_rows", 100, "display.max_columns", 10):
    display(df)

In [None]:
# Range of annotation ids in the source dataset (`anns` is a set of ids, not records).
anns = {ann["id"] for ann in coco.anns.values()}
print(min(anns), max(anns))

In [None]:
import pickle

# NOTE(review): absolute, machine-specific path to the refs file written by the
# dataset-creation cell; adjust it when running elsewhere.
refs_path = "/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoconeg_v001/refs(berkeley).p"
# Unpickling can execute arbitrary code: only load files produced by this notebook.
# The `with` block closes the file handle once loading finishes.
with open(refs_path, "rb") as refs_file:
    refs = pickle.load(refs_file)

# Print the last two refs, starting from the end of the list.
for idx, ref in enumerate(reversed(refs)):
    if idx > 1:
        break
    print(ref)

In [None]:
# Show the last five image records of the source dataset.
print(list(coco.imgs.values())[-5:])

In [None]:
# Per-category counts, most-annotated category first.
df_counts = (
    pd.DataFrame(list(coco_dist.get_cat_counts().values()))
    .sort_values("ann_count", ascending=False)
    .reset_index(drop=True)
)
total_anns = df_counts.ann_count.sum()
# Share of all annotations held by each category, and the running total of that
# share in descending-count order.
df_counts["ann_count_pdf"] = df_counts.ann_count / total_anns
df_counts["ann_count_cdf"] = df_counts.ann_count_pdf.cumsum()
display(df_counts)

In [None]:
# NOTE: this cell used to contain only the dangling expression `df_`, a name that is
# never defined in this notebook; evaluating it raised NameError and halted "Run All".

## Add frequency bins based on annotation count Cumulative Distribution Function


In [None]:
def _two_way_freq_bin(cdf_value):
    """Return "high" while the cumulative share is below 0.5, otherwise "low"."""
    if cdf_value < 0.5:
        return "high"
    return "low"


def _three_way_freq_bin(cdf_value):
    """Return "high" below 0.333, "medium" up to and including 0.667, otherwise "low"."""
    if cdf_value < 0.333:
        return "high"
    if cdf_value <= 0.667:
        return "medium"
    return "low"


# Bin each category by where its cumulative annotation share falls.
df_counts["freq_bin_2"] = df_counts.ann_count_cdf.apply(_two_way_freq_bin)
df_counts["freq_bin_3"] = df_counts.ann_count_cdf.apply(_three_way_freq_bin)
display(df_counts)

In [None]:
# Per-category image counts, largest first, coloured by the 3-way frequency bin.
plt.figure(figsize=(25, 15))
color_map = plt.get_cmap("magma")
fig = sns.barplot(
    data=df_counts.sort_values(["img_count"], ascending=False),
    x="name",
    y="img_count",
    # Refer to the column by name so each bar takes the bin of its own row. A raw
    # array taken from the unsorted frame does not follow the re-sorted `data`.
    hue="freq_bin_3",
)
# `fig` is the object returned by sns.barplot; rotate its tick labels so that the
# category names stay readable.
fig.set_xticklabels(fig.get_xticklabels(), rotation=45, horizontalalignment="right")
fig.set_title(f"Per-category Annotated Image Counts")
plt.tight_layout()

In [None]:
# Per-category annotation counts, largest first, coloured by the 3-way frequency bin.
plt.figure(figsize=(25, 15))
# NOTE(review): `color_map` is assigned but not passed to the plot below.
color_map = plt.get_cmap("magma")
fig = sns.barplot(
    data=df_counts.sort_values(["ann_count"], ascending=False),
    x="name",
    y="ann_count",
    hue="freq_bin_3",
)
# Rotate the category names so that they do not overlap.
fig.set_xticklabels(fig.get_xticklabels(), rotation=45, horizontalalignment="right")
fig.set_title(f"Per-category Annotation Counts")
plt.tight_layout()