## neg_refcocov002.ipynb Create a false premise referring expressions dataset

Create a COCO formatted dataset that uses `gpt-3.5-turbo` to create false premise referring expressions that refer to objects that do not exist in the image. 

## Types of Modifications

We ask GPT to modify the ground truth referring expressions for each image.
We categorize each FP according to the type of modification. We have three categories:

- Modify the main subject of the sentence. This means changing from one noun or noun phrase to another one. "A woman..." -> "A cat..."
- Modify an attribute of the main subject. "A tall man..." -> "A short man"
- Modify some other portion of the description. This usually means either modifying a spatial relation, or a participatory object that the expression relates somehow to the main subject, or sometimes an attribute of the participatory object.

## File Format
The referring expressions follow same format as refcoco/refcocog/refcoco+/R-refcoco/etc, i.e., a COCO formatted json file, accompanied by a file with a `.p` extension, which contains the true and false referring expressions. The `.p` file is a python pickle file. These datasets can be loaded using the common `refer.py`, or the `COCO` class in `github.com/GiscardBiamby/cocobetter.git`. Examples can be found later in this notebook.



In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%pip list | grep json

fastjsonschema                    2.18.0
json5                             0.9.14
jsonpointer                       2.4
jsons                             1.6.3
jsonschema                        4.19.1
jsonschema-specifications         2023.7.1
pysimdjson                        5.0.2
python-json-logger                2.0.7
python-lsp-jsonrpc                1.1.1
ujson                             5.8.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
import argparse
import ast
import copy
import csv
import decimal
import json
import os
import pickle
import typing
from collections import Counter, defaultdict
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL.Image as pil_img
import regex as re
import seaborn as sns
import simdjson as json
from IPython.display import display
from PIL import Image
from pycocotools.coco import COCO, Ann, Cat, Image, Ref
from pycocotools.helpers import CocoClassDistHelper, CocoJsonBuilder
from pycocotools.helpers.coco_builder import COCOShrinker
from simplediff import diff, string_diff
from tqdm.auto import tqdm

# from geo_llm_ret.ref_datasets import build_ref_coco

In [4]:
COCO_DIR = Path("/shared/gbiamby/data/coco")
IMG_DIR = COCO_DIR / "val2017"

In [7]:
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    "refclef": ["berkeley", "unc"],
    "refcoco": ["google"],
    "refcoco+": ["unc"],
    "refcocog": ["google", "umd"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, split_by: str = None) -> COCO:
    assert dataset_name in VALID_SPLITS, dataset_name
    if split_by is None:
        split_by = VALID_SPLITS[dataset_name][0]
    else:
        assert split_by.replace("_enhanced", "") in VALID_SPLITS[dataset_name]
    coco = COCO(
        refseg_path / dataset_name / "instances.json",
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )
    return coco


IMG_DIR = Path("/shared/gbiamby/data/coco/train2014")
PROJ_ROOT = Path("../../../../../").resolve()
assert PROJ_ROOT.exists()
# REFSEG_DIR = Path("/shared/gbiamby/data/refer_seg")
REFSEG_DIR = Path("output/ref_seg")
# refcoco = build_refcoco(REFSEG_DIR, "refcocog", "google_enhanced")
refcoco = build_refcoco(REFSEG_DIR, "refcoco", "google_enhanced")

Loading refs from '/home/gbiamby/proj/geo-llm-ret/lib/cocobetter/PythonAPI/notebooks/ref_correct/output/ref_seg/refcoco/refs(google_enhanced).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=3.33s)
creating index...
index created!


In [8]:
api_results_dir = (
    PROJ_ROOT / "output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo"
)
assert api_results_dir.exists(), str(api_results_dir)
assert api_results_dir.is_dir(), str(api_results_dir)

In [9]:
def load_api_responses(api_results_dir: Path, max_results: int = None) -> list[dict]:
    response_files = sorted(api_results_dir.glob("responses/img_id_*.json"))
    if max_results is not None and max_results > 0:
        response_files = response_files[:max_results]
    results = []
    for f in tqdm(response_files):
        with open(f, "r", encoding="utf-8") as json_file:
            result = json.load(json_file)
            results.append(result)

    print(f"Loaded {len(results)} responses from {api_results_dir}")
    return results


api_responses = load_api_responses(api_results_dir)
print("Example response: ")
display(api_responses[0])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 22008.36it/s]

Loaded 19994 responses from /home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo
Example response: 





{'api_response': {'choices': [{'finish_reason': 'stop',
    'index': 0,
    'message': {'content': 'Altered Descriptions: ["woman behind guy in black", "spectator behind the guy in white", "audience in back", "guy in white trying to catch banana", "woman in front", "woman in white"]',
     'role': 'assistant'}}],
  'created': 1699353174,
  'id': 'chatcmpl-8IDg6XzsduM14vNplz1QPGIyJbtIi',
  'model': 'gpt-3.5-turbo-0613',
  'object': 'chat.completion',
  'usage': {'completion_tokens': 47,
   'prompt_tokens': 511,
   'total_tokens': 558}},
 'image_id': 100012,
 'request_info': {'ann_ids': [521105, 501646],
  'image_id': 100012,
  'ref_ids': [41276, 41277],
  'sent_ids': [[117346, 117347, 117348], [117349, 117350, 117351]],
  'sentences': ['man behind guy in white',
   'player behind the guy in white',
   'player in back',
   'guy in white trying to catch frisbee',
   'man in front',
   'man in white']}}

## Check Quality of the API Results - Filter Bad Responses

In [12]:
def parse_result(image_id: int, reply: str, warnings, errors) -> list[str]:
    def parse_result_main(reply: str):
        # print(f"ChatGPT Reply: \n\t{reply}")
        matches = re.match(
            # '.*Descriptions:*\\s*(\\(.{1,} sentence[s]{0,1}\\):){0,1}\\s*(?P<descriptions>\\[\\".*\\"\\])',
            '.*Description[s]{0,1}:*\\s*(\\(.{1,} sentence[s]{0,1}\\):){0,1}\\s*\\[{0,1}(?P<descriptions>\\".*\\")\\]{0,1}',
            reply,
            re.MULTILINE | re.DOTALL,
        )
        if matches is None:
            return None
        list_str = matches.group("descriptions")
        if not list_str.startswith("["):
            list_str = "[" + list_str
        if not list_str.endswith("]"):
            list_str = list_str + "]"
        new_sents = ast.literal_eval(list_str)
        # print("New Sents: ", new_sent)
        if new_sents is None:
            errors.append(
                {
                    "image_id": image_id,
                    "msg": f"No FP sents found (fp_sents is None)",
                    "raw_reply": reply,
                }
            )
            return None
        return new_sents

    def parse_result_multiline_list(reply: str):
        matches = re.match(
            '(?:.*Description[s]{0,1}:[ ]*(\\(.+ sentence[s]{0,1}\\):){0,1})\\n*?(?P<descriptions>\\n\\d\\.[ ]*\\"[^\\n\\"]+\\")+',
            reply,
            re.MULTILINE | re.DOTALL,
        )
        if matches is None:
            return None
        captures = matches.capturesdict()
        if (captures is None or len(captures) == 0) or (
            captures is not None and "descriptions" not in captures
        ):
            return None
        new_sents = []
        for cap in captures["descriptions"]:
            matches = re.match('(?:[\\d]\\.)\\s*?\\"(?P<sent>[^\\"]+)\\"', cap.strip())
            # print("match: ", matches)
            # print("sent: ", matches.groupdict()["sent"])
            new_sents.append(matches.groupdict()["sent"])
        return new_sents

    try:
        reply = reply.replace('\\"', '"')
        new_sents = parse_result_main(reply)
        if new_sents is None:
            new_sents = parse_result_multiline_list(reply)
            # if new_sent is None:
            #     print(reply)
        return new_sents
    except Exception as ex:
        errors.append(
            {
                "image_id": image_id,
                "msg": str(ex) + " ex type: " + str(type(ex)),
                "raw_reply": reply,
            }
        )
        return None


def verify_results(result: list[dict], refcoco: COCO):
    image_id = result["image_id"]
    warnings, errors = [], []
    raw_reply: list[str] = result["api_response"]["choices"][0]["message"]["content"]
    fp_sents = parse_result(image_id, raw_reply, warnings, errors)
    request_info = result["request_info"]
    gt_sents = request_info["sentences"]

    if fp_sents is None:
        return warnings, errors
    assert isinstance(fp_sents, list)

    # Ensure correct number of FP sentences were generated:
    if len(fp_sents) != len(gt_sents):
        errors.append(
            {
                "image_id": image_id,
                "msg": "Wrong number of FP sentences",
                "msg_detail": f"len(fp_sents):{len(fp_sents)}!=len(gt_sents):{len(gt_sents)}",
                "fp_sents": fp_sents,
                "gt_sents": gt_sents,
                "reply": result,
                "raw_reply": raw_reply,
            }
        )

    for sent, fp_sent in zip(gt_sents, fp_sents):
        # Warn if FP sentence is same as original sentence:
        if sent.lower() == fp_sent.lower():
            warnings.append(
                {
                    "image_id": image_id,
                    "msg": "FP is exact match for GT sentence",
                    "msg_detail": f"{sent}=={fp_sent}",
                    "fp_sents": fp_sent,
                    "gt_sent": sent,
                    "reply": result,
                    "raw_reply": raw_reply,
                }
            )
            continue

        # diff_result = string_diff(sent.lower(), fp_sent.lower())
        # num_matching_spans = len([res[0] for res in diff_result if res[0] == "="])

        # if num_matching_spans not in {1, 2}:
        #     warnings.append(
        #         {
        #             "image_id": image_id,
        #             "msg": "Wrong num_matching_spans",
        #             "msg_detail": f":{num_matching_spans}, diff:{diff_result}",
        #             "fp_sents": fp_sent,
        #             "gt_sent": sent,
        #             "reply": result,
        #             "raw_reply": raw_reply,
        #         }
        #     )
        # print("")
    result["fp_sents"] = fp_sents
    # print("warnings: ", len(warnings), "errors: ", len(errors))
    return warnings, errors


def check_fpsent_counts(results: list[dict]):
    has_fpsents_count = 0
    num_results = 0
    for result in results:
        num_results += 1
        if "fp_sents" in result and len(result["fp_sents"]) > 0:
            has_fpsents_count += 1

    print("num_results: ", num_results)
    print("has_fpsents_count: ", has_fpsents_count)


# Check all results:
api_responses = load_api_responses(api_results_dir)[:200000]
warnings = []
errors = []
for result in tqdm(api_responses):
    _warnings, _errors = verify_results(result, refcoco)
    warnings.extend(_warnings)
    errors.extend(_errors)

check_fpsent_counts(api_responses)

# Summarize Results:
print(f"Found {len(warnings)} warnings")
print(f"Found {len(errors)} errors")
# print("")
# print("=" * 220)
# print("Warnings:")
# for warn in warnings[:3]:
#     print("")
#     print("=" * 100)
#     print(json.dumps(warn, indent=4))
# print("")
# print("=" * 220)
# print("Errors:")
# for err in errors[:3]:
#     print("")
#     print("=" * 100)
#     print(json.dumps(err, indent=4))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:04<00:00, 4949.62it/s]


Loaded 19994 responses from /home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19994/19994 [00:00<00:00, 30355.00it/s]

num_results:  19994
has_fpsents_count:  19897
Found 30 errors





### Save Intermediate Results: API Responses With Parsed Results

In [13]:
parsed_results_path = api_results_dir / "parsed_results_001.pkl"
pickle.dump(api_responses, open(parsed_results_path, "wb"))

## Show Error Counts Grouped By Type

In [14]:
df_errors = pd.DataFrame(errors)
# display(df_errors)
df_err_counts = (
    df_errors.groupby(["msg"])
    .agg(
        total=("image_id", "count"),
        uniqe_imgs=("image_id", "nunique"),
    )
    .sort_values("total", ascending=False)
)
display(df_err_counts)

Unnamed: 0_level_0,total,uniqe_imgs
msg,Unnamed: 1_level_1,Unnamed: 2_level_1
Wrong number of FP sentences,29,29
"EOL while scanning string literal (<unknown>, line 1) ex type: <class 'SyntaxError'>",1,1


This amount of errors seems acceptable. The top two types of error are:

_refcocog_

- (59 errors) Invalid pythong list syntaks for the sentences, e.g., unmatched string quotes, missing commas
- (55 errors) Wrong number of false premise sentences returned by chat-gpt. If we wanted to, we could use whatever sentences gpt was able to provide.


_refcoco_

- (29 errors) Wrong number of false premise sentences returned by chat-gpt. If we wanted to, we could use whatever sentences gpt was able to provide.Wrong number of false premise sentences returned by chat-gpt. If we wanted to, we could use whatever sentences gpt was able to provide.
- (1 errors) `EOL while scanning string literal (<unknown>, line 1) ex type: <class 'SyntaxError'>	EOL while scanning string literal (<unknown>, line 1) ex type: <class 'SyntaxError'>	`



In [17]:
with pd.option_context("display.max_colwidth", None, "display.max_columns", None):
    display(
        df_errors[df_errors.msg == "Wrong number of FP sentences"].head(3).sort_values(
            "raw_reply"
        )
    )

Unnamed: 0,image_id,msg,msg_detail,fp_sents,gt_sents,reply,raw_reply
1,122854,Wrong number of FP sentences,len(fp_sents):9!=len(gt_sents):11,"[top chair on left, sleigh, top left sleigh, bottom right chair, bottom right sleigh, bottom right, chair rail right, chair top right, left bottom corner]","[top bed on left, bunk, top left bunk, bottom right bed, bottom right bunk, bottom right, bed rail right, bed top right, left bottom corner, bottom left pillow, bot left pillow]","{'api_response': {'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': '## Answer\n\nAltered Descriptions: \n1. ""top chair on left""\n2. ""sleigh""\n3. ""top left sleigh""\n4. ""bottom right chair""\n5. ""bottom right sleigh""\n6. ""bottom right""\n7. ""chair rail right""\n8. ""chair top right""\n9. ""left bottom corner""\n10. ""bottom left cushion""\n11. ""bot left cushion""', 'role': 'assistant'}}], 'created': 1699351138, 'id': 'chatcmpl-8ID9GJx8tqfNSA19OzdyPDthM57U2', 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'usage': {'completion_tokens': 87, 'prompt_tokens': 523, 'total_tokens': 610}}, 'image_id': 122854, 'request_info': {'ann_ids': [318183, 1612675, 1957359, 1612147], 'image_id': 122854, 'ref_ids': [39420, 39418, 39417, 39419], 'sent_ids': [[112085, 112086, 112087], [112079, 112080, 112081], [112077, 112078], [112082, 112083, 112084]], 'sentences': ['top bed on left', 'bunk', 'top left bunk', 'bottom right bed', 'bottom right bunk', 'bottom right', 'bed rail right', 'bed top right', 'left bottom corner', 'bottom left pillow', 'bot left pillow']}, 'fp_sents': ['top chair on left', 'sleigh', 'top left sleigh', 'bottom right chair', 'bottom right sleigh', 'bottom right', 'chair rail right', 'chair top right', 'left bottom corner']}","## Answer\n\nAltered Descriptions: \n1. ""top chair on left""\n2. ""sleigh""\n3. ""top left sleigh""\n4. ""bottom right chair""\n5. ""bottom right sleigh""\n6. ""bottom right""\n7. ""chair rail right""\n8. ""chair top right""\n9. ""left bottom corner""\n10. ""bottom left cushion""\n11. ""bot left cushion"""
2,130795,Wrong number of FP sentences,len(fp_sents):9!=len(gt_sents):14,"[pink plate to the left of 80, fifth toilet thing from left, blue toilet in front of elephant, second from right back, second toilet from the left blue, green frisbee toy, kangaroo, parrot, second pink toilet from the left]","[pink dish to the left of 80, fifth toilet thing from right, blue toilet in front of pig, second from right front, second toilet from the right blue, green frof toy, frog, frog, second pink toilet from the right, third toilet from right it is pink, white one toliet behind sign, pot furthest to the right, bottom right white thing, pink pottie on right]","{'api_response': {'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': '## Answer\n\nAltered Descriptions: (14 sentences):\n1. ""pink plate to the left of 80""\n2. ""fifth toilet thing from left""\n3. ""blue toilet in front of elephant""\n4. ""second from right back""\n5. ""second toilet from the left blue""\n6. ""green frisbee toy""\n7. ""kangaroo""\n8. ""parrot""\n9. ""second pink toilet from the left""\n10. ""third toilet from left it is pink""\n11. ""white one toilet behind tree""\n12. ""pot furthest to the left""\n13. ""bottom left white thing""\n14. ""pink potty on left""', 'role': 'assistant'}}], 'created': 1699349301, 'id': 'chatcmpl-8ICfddKTtdGcdEmAsonLpj5z1J1Kg', 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'usage': {'completion_tokens': 141, 'prompt_tokens': 568, 'total_tokens': 709}}, 'image_id': 130795, 'request_info': {'ann_ids': [1968294, 1623588, 1968384, 1623494, 1095804], 'image_id': 130795, 'ref_ids': [38688, 38689, 38687, 38690, 38691], 'sent_ids': [[110016, 110017], [110018, 110019, 110020], [110013, 110014, 110015], [110021, 110022, 110023], [110024, 110025, 110026]], 'sentences': ['pink dish to the left of 80', 'fifth toilet thing from right', 'blue toilet in front of pig', 'second from right front', 'second toilet from the right blue', 'green frof toy', 'frog', 'frog', 'second pink toilet from the right', 'third toilet from right it is pink', 'white one toliet behind sign', 'pot furthest to the right', 'bottom right white thing', 'pink pottie on right']}, 'fp_sents': ['pink plate to the left of 80', 'fifth toilet thing from left', 'blue toilet in front of elephant', 'second from right back', 'second toilet from the left blue', 'green frisbee toy', 'kangaroo', 'parrot', 'second pink toilet from the left']}","## Answer\n\nAltered Descriptions: (14 sentences):\n1. ""pink plate to the left of 80""\n2. ""fifth toilet thing from left""\n3. ""blue toilet in front of elephant""\n4. ""second from right back""\n5. ""second toilet from the left blue""\n6. ""green frisbee toy""\n7. ""kangaroo""\n8. ""parrot""\n9. ""second pink toilet from the left""\n10. ""third toilet from left it is pink""\n11. ""white one toilet behind tree""\n12. ""pot furthest to the left""\n13. ""bottom left white thing""\n14. ""pink potty on left"""
0,120926,Wrong number of FP sentences,len(fp_sents):9!=len(gt_sents):12,"[person in front with a unicorn on shirt, yellow shirt in front, child in pink, lady wearing a crown, wizard, pirate, left dragon, dragon on the right, dragon right]","[person in front with picture on shirt, white shirt in front, child in white, lady, woman, woman, right elephant, elephant on right, elephant right, elephant on the left, left elephant, elephant left]","{'api_response': {'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': 'Altered Descriptions: \n1. ""person in front with a unicorn on shirt""\n2. ""yellow shirt in front""\n3. ""child in pink""\n4. ""lady wearing a crown""\n5. ""wizard""\n6. ""pirate""\n7. ""left dragon""\n8. ""dragon on the right""\n9. ""dragon right""\n10. ""dragon on the unicorn""\n11. ""unicorn dragon""\n12. ""dragon left""', 'role': 'assistant'}}], 'created': 1699350239, 'id': 'chatcmpl-8ICulqZO3G7G0V8qEXu7nqhdjjk14', 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'usage': {'completion_tokens': 93, 'prompt_tokens': 530, 'total_tokens': 623}}, 'image_id': 120926, 'request_info': {'ann_ids': [465270, 469005, 583207, 584848], 'image_id': 120926, 'ref_ids': [39617, 39616, 39615, 39614], 'sent_ids': [[112646, 112647, 112648], [112643, 112644, 112645], [112640, 112641, 112642], [112637, 112638, 112639]], 'sentences': ['person in front with picture on shirt', 'white shirt in front', 'child in white', 'lady', 'woman', 'woman', 'right elephant', 'elephant on right', 'elephant right', 'elephant on the left', 'left elephant', 'elephant left']}, 'fp_sents': ['person in front with a unicorn on shirt', 'yellow shirt in front', 'child in pink', 'lady wearing a crown', 'wizard', 'pirate', 'left dragon', 'dragon on the right', 'dragon right']}","Altered Descriptions: \n1. ""person in front with a unicorn on shirt""\n2. ""yellow shirt in front""\n3. ""child in pink""\n4. ""lady wearing a crown""\n5. ""wizard""\n6. ""pirate""\n7. ""left dragon""\n8. ""dragon on the right""\n9. ""dragon right""\n10. ""dragon on the unicorn""\n11. ""unicorn dragon""\n12. ""dragon left"""


In [19]:
# with pd.option_context("display.max_colwidth", None, "display.max_columns", None):
#     display(df_errors.iloc[38]["raw_reply"])
#     print(df_errors.iloc[38]["raw_reply"])
#     print(df_errors.iloc[38]["raw_reply"].replace('\\"', '"'))

---

### Scratch Code to Debug Regex parsing of the chatGPT replies

In [None]:
# def parse_result(image_id: int, reply: str, warnings, errors) -> list[str]:
#     def parse_result_main(reply: str, debug=False):
#         # print(f"ChatGPT Reply: \n\t{reply}")
#         matches = re.match(
#             # '.*Descriptions:*\\s*(\\(.{1,} sentence[s]{0,1}\\):){0,1}\\s*(?P<descriptions>\\[\\".*\\"\\])',
#             '.*Description[s]{0,1}:*\\s*(\\(.{1,} sentence[s]{0,1}\\):){0,1}\\s*\\[{0,1}(?P<descriptions>\\".*\\")\\]{0,1}',
#             reply,
#             re.MULTILINE | re.DOTALL,
#         )
#         if matches is None:
#             if DEBUG:
#                 print("parse_result_main() No matches")
#             return None
#         list_str = matches.group("descriptions")
#         if not list_str.startswith("["):
#             list_str = "[" + list_str
#         if not list_str.endswith("]"):
#             list_str = list_str + "]"

#         new_sent = ast.literal_eval(list_str)
#         # print("New Sents: ", new_sent)
#         if new_sent is None:
#             errors.append(
#                 {
#                     "image_id": image_id,
#                     "msg": f"No FP sents found (fp_sents is None)",
#                     "raw_reply": reply,
#                 }
#             )
#             return None
#         return new_sent

#     def parse_result_multiline_list(reply: str, debug=False):
#         matches = re.match(
#             '(?:.*Description[s]{0,1}:[ ]*(\\(.+ sentence[s]{0,1}\\):){0,1})\\n*?(?P<descriptions>\\n\\d\\.[ ]*\\"[^\\n\\"]+\\")+',
#             reply,
#             re.MULTILINE | re.DOTALL,
#         )
#         if matches is None:
#             if DEBUG:
#                 print("parse_result_multiline_list() No matches")
#             return None
#         captures = matches.capturesdict()
#         if (captures is None or len(captures) == 0) or (
#             captures is not None and "descriptions" not in captures
#         ):
#             return None
#         sents = []
#         for cap in captures["descriptions"]:
#             matches = re.match('(?:[\\d]\\.)\\s*?\\"(?P<sent>[^\\"]+)\\"', cap.strip())
#             # print("match: ", matches)
#             # print("sent: ", matches.groupdict()["sent"])
#             sents.append(matches.groupdict()["sent"])
#         return sents

#     # try:
#     # reply = reply.replace('\\"', '"')
#     new_sent = parse_result_main(reply)
#     print("new_sent1: ", new_sent)
#     if new_sent is None:
#         new_sent = parse_result_multiline_list(reply)
#         print("new_sent2: ", new_sent)
#         # if new_sent is None:
#         #     print(reply)
#     return new_sent
#     # except Exception as ex:
#     #     print(
#     #         {
#     #             "image_id": image_id,
#     #             "msg": str(ex) + " ex type: " + str(type(ex)),
#     #             "raw_reply": reply,
#     #         }
#     #     )
#     #     return None
#     return None


# reply = """## Answer\n\nAltered Descriptions: (6 sentences): ["man in a purple and black jacket bending over", "an older gentleman reaching up to pick to sign a page \", "man wearing a yellow shirt", "man wearing glasses , purple shirt and khakis", "the chair behind the man in a yellow shirt", "the sofa in lavender color"]
# """
# reply = df_errors.iloc[38]["raw_reply"].replace('\\"', '"')
# print(reply)
# print(parse_result(-1, reply, [], []))

In [None]:
# # reply = """## Answer
# # Altered Descriptions: (4 sentences): ["the giraffe eating the grass on the ground", "the giraffe is grazing", "a giraffe looking straight at the camera", "giraffe on the left that is looking at cameraman"]
# # """
# # reply = df_errors.iloc[164]["raw_reply"]
# reply = '''Altered Descriptions:
# 1. "a bowl of some smelly food possibly applesauce"
# 2. "bowl of food with blue spoon in the bowl"'''

# print(reply)

# patterns = [
#     # re.compile(
#     #     '.*Descriptions:*\\s*(\\(.{1,} sentence[s]{0,1}\\):){0,1}\\s*(?P<descriptions>\\[\\".*\\"\\])',
#     #     re.MULTILINE | re.DOTALL,
#     # ),
#     re.compile(
#         # '(?:[^\\n\\"]*Descriptions:\\s*(\\(.+ sentences\\):)\\s*\n+)(\\n\\d\\.\\s*\\"[^\\n\\"]+\\")+',
#         '(?:.*Descriptions:[ ]*(\\(.+ sentences\\):){0,1})\\n*?(?P<descriptions>\\n\\d\\.[ ]*\\"[^\\n\\"]+\\")+',
#         re.MULTILINE | re.DOTALL,
#     ),
# ]
# for pat in patterns:
#     matches = pat.match(reply)
#     if matches is None:
#         print("NO MATCHES")
#         continue
#     print("\nmatches: ", matches)
#     print("\ngroupdict: ", matches.groupdict())
#     print("\ncapturesdict: ", matches.capturesdict())
#     print("len(ncapturesdict) ", len(matches.capturesdict()["descriptions"]))

#     if matches is not None and "descriptions" in matches.groupdict():
#         list_str = matches.group("descriptions")
#         print("list: ", list_str)
#         # Convert the string to a Python object
#         new_sent = ast.literal_eval(list_str)
#         print("new_sent: ", new_sent)
#         break

In [None]:
# import ast

# import regex as re

# replies = [
#     'Altered Description: "purple bush to the left of the sign"',
#     '''Altered Descriptions:
# 1. "a bowl of some smelly food possibly applesauce"
# 2. "bowl of food with blue spoon in the bowl"''',
#     """Altered Descriptions: (1 sentence): ["a purple chair facing the garden"]""",
#     """Altered Descriptions: (6 sentences): ["the hat of the standing man", "a purple hat with repetitive circular patterns", "the girl in the green dress standing next to the man", "a girl with sunglasses on", "a man with straight blonde hair in a blue suit with a yellow hat stands with a woman in sunglasses and a red see through outfit", "a man with light blonde, straight hair wearing a suit and hat standing next to a woman"]""",
#     """Altered Descriptions: ["the hat of the standing man", "a purple hat with repetitive circular patterns", "the girl in the green dress standing next to the man", "a girl with sunglasses on", "a man with straight blonde hair in a blue suit with a yellow hat stands with a woman in sunglasses and a red see through outfit", "a man with light blonde, straight hair wearing a suit and hat standing next to a woman"]""",
#     """Modified Descriptions: (6 sentences): ["the hat of the standing man", "a purple hat with repetitive circular patterns", "the girl in the green dress standing next to the man", "a girl with sunglasses on", "a man with straight blonde hair in a blue suit with a yellow hat stands with a woman in sunglasses and a red see through outfit", "a man with light blonde, straight hair wearing a suit and hat standing next to a woman"]""",
#     """Modified Descriptions: ["the hat of the standing man", "a purple hat with repetitive circular patterns", "the girl in the green dress standing next to the man", "a girl with sunglasses on", "a man with straight blonde hair in a blue suit with a yellow hat stands with a woman in sunglasses and a red see through outfit", "a man with light blonde, straight hair wearing a suit and hat standing next to a woman"]""",
#     """Altered Descriptions (6 sentences): ["the bowtie of the standing man", "a neon green tie with repetitive lightning bolt patterns", "the girl in the pink dress standing next to the man", "a girl with sunglasses on", "a man with blue hair in a purple suit with a neon green bowtie stands with a woman in sunglasses and a neon pink outfit", "a man with blonde, straight hair wearing a tuxedo and bowtie standing next to a woman"]""",
#     """## Answer Altered Descriptions: (4 sentences): ["the giraffe eating the grass on the ground", "the giraffe is grazing", "a giraffe looking straight at the camera", "giraffe on the left that is looking at cameraman"]""",
#     """## Answer
# Altered Descriptions: (4 sentences): ["the giraffe eating the grass on the ground", "the giraffe is grazing", "a giraffe looking straight at the camera", "giraffe on the left that is looking at cameraman"]""",
#     #     """Altered Descriptions: (6 sentences):
#     # 1. "the crown of the standing man"
#     # 2. "a purple crown with repetitive circular patterns"
#     # 3. "the girl in the pink dress standing next to the man"
#     # 4. "a girl with sunglasses on"
#     # 5. "a man with curly black hair in a black suit with a purple crown stands with a woman in sunglasses and a black see through outfit"
#     # 6. "a man with dark, curly hair wearing a suit and crown standing next to a woman"
#     # """
# ]

# for reply in replies:
#     print("")
#     # matches = re.match("Altered Descriptions[:]+ \\(.{1,} sentences\\):\\s+(?P<descriptions>\\[\\\".*\\\"\\])", reply)
#     matches = re.match(
#         '.*Description[s]{0,1}:*\\s*(\\(.{1,} sentence[s]{0,1}\\):){0,1}\\s*\\[{0,1}(?P<descriptions>\\".*\\")\\]{0,1}',
#         reply,
#         re.MULTILINE,
#     )
#     print("matches: ", matches)
#     if matches is not None and "descriptions" in matches.groupdict():
#         print("match.group: ", matches.group("descriptions"))

#     if matches is not None and "descriptions" in matches.groupdict():
#         list_str = matches.group("descriptions")
#         # Convert the string to a Python object
#         new_sent = ast.literal_eval(list_str)
#         print("new_sent: ", new_sent)

---

## Categorize False Premise Types

First compute spacy tags for each FP sentence and cache the results to disk. 

Then, use the spacy tags to categorize the FP sentence modifications.

In [20]:
import spacy

spacy.require_gpu()
import spacy_transformers

nlp = spacy.load("en_core_web_trf")

In [21]:
def get_fp_sentences_flat(responses: list[dict]) -> tuple[list[str], list[int]]:
    # Make a flat list of FP sentences so we can batch process with spacy:
    fp_sents_all = []
    img_ids_all = []
    for response in responses:
        if "fp_sents" not in response:
            continue
        fp_sents: list[str] = [s for s in response["fp_sents"] if len(s) > 0]
        fp_sents_all.extend(fp_sents)
        img_ids = [response["image_id"]] * len(fp_sents)
        img_ids_all.extend(img_ids)
    return fp_sents_all, img_ids_all


def get_gt_sentences_flat(refcoco: COCO) -> list[dict]:
    # Make a flat list of FP sentences so we can batch process with spacy:
    sents_all = []
    for ref_id, ref in refcoco.refs.items():
        sents: list[str] = ref["sentences"]
        sents_all.extend(
            [
                {
                    "sent": s,
                    "sent_id": s["sent_id"],
                    "image_id": ref["image_id"],
                    "ref_id": ref_id,
                }
                for s in sents
            ]
        )
    return sents_all


def get_spacy_docs(
    responses, refcoco: COCO, api_results_dir: Path, force_recompute: bool = False
) -> tuple[list[spacy.tokens.Doc], list[int]]:
    """
    Get spacy doc for FP sentences in all the responses. Caches the output to
    disk, and if a cached result already exists, it loads and returns that
    instead of re-computing the spacy Docs.

    Returns:
    :docs: flat list of spacy docs
    :img_ids: flat list (same length as docs) that maps indexes of docs to image_id
    """
    docs_path = api_results_dir / "fp_sentences_spacy_docs.pkl"
    fp_sents_all, doc_to_image = get_fp_sentences_flat(responses)

    if not force_recompute and docs_path.exists():
        print("loading cached spacy docs from disk")
        docs = pickle.load(open(docs_path, "rb"))
        assert len(docs) == len(fp_sents_all), f"{len(docs)} != {len(fp_sents_all)}"
    else:
        print("Computing spacy docs")
        B = 1000
        docs: list[spacy.tokens.Doc] = [
            d
            for d in tqdm(nlp.pipe(fp_sents_all, batch_size=B), total=len(fp_sents_all))
        ]
        assert len(docs) == len(fp_sents_all), f"{len(docs)} != {len(fp_sents_all)}"
        pickle.dump(docs, open(docs_path, "wb"))
    return docs, doc_to_image, fp_sents_all


# Only compute/load the docs if they aren't already in memory:
if (
    "docs" not in locals()
    or "doc_to_image" not in locals()
    or "fp_sents_all" not in locals()
    or True
):
    docs, doc_to_image, fp_sents_all = get_spacy_docs(
        api_responses, refcoco, api_results_dir, force_recompute=False
    )
print("Num spacy docs: ", len(docs))
print(len(docs), len(doc_to_image), len(fp_sents_all))

Computing spacy docs


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 141269/141269 [02:20<00:00, 1004.37it/s]


Num spacy docs:  141269
141269 141269 141269


In [22]:
import string


def get_main_subject(sent: dict, use_root: bool = True):
    """Should always pass use_root=True, when there is no nsubj the ROOT is the main subject"""
    subjects = [
        word
        for word, dep in zip(sent["spcy_WORD"], sent["spcy_DEP"])
        if dep == "nsubj" and word not in nlp.Defaults.stop_words
    ]
    if use_root and (subjects is None or len(subjects) == 0):
        subjects = [
            word
            for word, dep in zip(sent["spcy_WORD"], sent["spcy_DEP"])
            if (dep == "ROOT") and word not in nlp.Defaults.stop_words
        ]
    return subjects


def tag_fp_sentences(
    responses: list[dict],
    refcoco: COCO,
    docs: list[spacy.tokens.Doc],
    doc_to_img: list[int],
    fp_sents_all: list[str],
):
    """
    Creates a deep copy of refcoco's `.refs_data` property, and enhances the
    copy by adding SpaCy NLP parsing tags for POS, TAG, DEP, etc. See here
    for more info: https://spacy.io/usage/linguistic-features
    """
    # Map docs back to the fp_sents
    fp_sent_dicts = []
    for i, (fp_sent, doc, image_id) in tqdm(
        enumerate(zip(fp_sents_all, docs, doc_to_image)), total=len(docs)
    ):
        # response = img_to_response[image_id]
        fp_sent_dict = {
            # "ref_id": -1, we'll have to match it with the right ref_id at some point
            "tokens": [
                word.strip()
                for word in fp_sent.split(" ")
                if word.strip() not in string.punctuation
            ],
            "raw": fp_sent,
            "sent_id": -1,
            "sent": fp_sent,
            "spcy_WORD": [str(word) for word in doc],
            "spcy_DEP": [word.dep_ for word in doc],
            "spcy_POS": [word.pos_ for word in doc],
            "spcy_LEM": [word.lemma_ for word in doc],
            "spcy_TAG": [word.tag_ for word in doc],
            "spcy_IS_STOP": [word.is_stop for word in doc],
            "spcy_ENTS": [str(ent).strip() for ent in doc.ents],
            "spcy_NOUN_CHUNKS": [str(nc).strip() for nc in doc.noun_chunks],
            # "spcy_DOC": doc, # this takes up way too much space 2.7G vs 115MB
        }
        fp_sent_dict["main_subject"] = get_main_subject(fp_sent_dict)
        # print(fp_tags)
        fp_sent_dicts.append(fp_sent_dict)
        # if i == 0:
        #     print(fp_sent_dict)

    return fp_sent_dicts


fp_sent_dicts = tag_fp_sentences(
    api_responses, refcoco, docs, doc_to_image, fp_sents_all
)
print(len(fp_sent_dicts), len(docs))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 141269/141269 [00:05<00:00, 23694.24it/s]

141269 141269





In [23]:
gt_flat = get_gt_sentences_flat(refcoco)
gt_flat[0]

{'sent': {'tokens': ['zebra', 'creature', 'front', 'and', 'center'],
  'raw': 'zebra creature front and center',
  'sent_id': 13689,
  'sent': 'zebra creature front and center',
  'spcy_WORD': ['zebra', 'creature', 'front', 'and', 'center'],
  'spcy_DEP': ['compound', 'ROOT', 'advmod', 'cc', 'conj'],
  'spcy_POS': ['NOUN', 'NOUN', 'ADJ', 'CCONJ', 'NOUN'],
  'spcy_LEM': ['zebra', 'creature', 'front', 'and', 'center'],
  'spcy_TAG': ['NN', 'NN', 'JJ', 'CC', 'NN'],
  'spcy_IS_STOP': [False, False, True, True, False],
  'spcy_ENTS': [],
  'spcy_NOUN_CHUNKS': ['zebra creature']},
 'sent_id': 13689,
 'image_id': 526754,
 'ref_id': 4808}

# Correct Ambiguous FP vs. GT Sentence Matches

Sometimes GPT does not return an FP for every GT we input, i.e., we give it five ground truth sentences and it only outputs four false premise sentences. These cases are ambiguous (we don't know which sentence(s) GPT skipped) but we can attempt to match the outputs using string similarity metrics.

For the initial refcocog run, around 10% of the images consist of these ambiguous cases, so it is worth correcting. We do the corrections in the below cell.

#### fp vs gt sentence count 
- num_match: 23,128 images
- not_match: 2,496 images


In [24]:
from Levenshtein import distance as levenshtein_distance


def sentence_similarities(list_a: list[str], list_b: list[str]) -> dict[int, list[int]]:
    """Calculates the edit distance between elements of list_a and list_b."""
    similarity_scores = {}

    for i, a_item in enumerate(list_a):
        scores = []
        for b_item in list_b:
            score = levenshtein_distance(a_item, b_item)
            scores.append(score)
        similarity_scores[i] = scores

    return similarity_scores


def match_sentences(
    img_fps: list[dict], gt_refs_and_sents: list[tuple[dict, dict]], debug=False
):
    """
    Two list s of sentences for a single image, and attempt unambiguous match
    between them. img_fps is shorter in length than gt_refs_and_sents.
    """
    sim_scores = sentence_similarities(
        [sent["sent"] for sent in img_fps],
        [gt["sent"] for ref, gt in gt_refs_and_sents],
    )

    match_indices = []
    for (fp_idx, scores), fp in zip(sim_scores.items(), img_fps):
        idx = np.argmin(scores)
        match_indices.append(idx)

    if debug:
        print("match_indices: ", match_indices)
    # Consider matches unambiguous if each fp sentence is matched to a unique gt sentence
    is_unambiguous = len(set(match_indices)) == len(match_indices)
    result = []
    if is_unambiguous:
        for fp, match_idx in zip(img_fps, match_indices):
            gt_match = gt_refs_and_sents[match_idx]
            if debug:
                print(f"\t match: '{fp['sent']}', '{gt_match[1]['sent']}'")
            result.append((fp, gt_match))
    # else:
    #     print("No match! ", match_indices)
    return result


def match_fp_with_gt(fp_sent_dicts: list[dict], doc_to_image: list[int], refcoco: COCO):
    """
    Return a copy of the refcoco object where the false premise sentences in 
    fp_sent_dicts are added to the refcoco.imgs.refs["sentences"] lists. The
    fp sentence are mapped to their ground truth sentences
    """
    DEBUG = True
    refcoco = deepcopy(refcoco)
    gt_flat = get_gt_sentences_flat(refcoco)
    print("fp: ", len(fp_sent_dicts))
    print("gt: ", len(gt_flat))
    img_fps_all = defaultdict(list)
    {img_fps_all[img_id].append(fp) for fp, img_id in zip(fp_sent_dicts, doc_to_image)}
    num_match, num_not_match = 0, 0
    num_corrected = 0

    for idx, (image_id, img_fps) in tqdm(
        enumerate(img_fps_all.items()), total=len(img_fps_all)
    ):
        # img_fps is a list of dicts. Each dict has keys: ['tokens',
        #    'raw', 'sent_id', 'sent', 'spcy_WORD', 'spcy_DEP', 'spcy_POS',
        #    'spcy_LEM', 'spcy_TAG', 'spcy_IS_STOP', 'spcy_ENTS',
        #    'spcy_NOUN_CHUNKS', 'main_subject']
        # This function adds the following keys to these dicts:
        #    ['ref_id', 'ann_id', 'gt_sent_id', 'gt_sent']
        gt_refs_and_sents: list[tuple[dict, dict]] = []

        for img_ref in refcoco.img_to_refs[image_id]:
            # img_ref keys: ['image_id', 'split', 'sentences', 'file_name',
            #    'category_id', 'ann_id', 'sent_ids', 'ref_id']
            for s in img_ref["sentences"]:
                gt_refs_and_sents.append((img_ref, s))
        if len(gt_refs_and_sents) == len(img_fps):
            num_match += 1
            for fp, (img_ref, gt_sent) in zip(img_fps, gt_refs_and_sents):
                if len(fp["sent"].strip()) == 0:
                    print("EMPTY1 (img_id: {image_id}): ", fp)
                fp["ref_id"] = img_ref["ref_id"]
                fp["ann_id"] = img_ref["ann_id"]
                fp["gt_sent_id"] = gt_sent["sent_id"]
                fp["gt_sent"] = gt_sent["sent"]
                fp["is_false_premise"] = True
                img_ref["sentences"].append(fp)
        else:
            num_not_match += 1
            matches = match_sentences(img_fps, gt_refs_and_sents, False)
            if matches:
                num_corrected += 1
            for fp, (img_ref, gt_sent) in matches:
                if len(fp["sent"].strip()) == 0:
                    print(f"EMPTY2 (img_id: {image_id}): ", fp)
                fp["ref_id"] = img_ref["ref_id"]
                fp["ann_id"] = img_ref["ann_id"]
                fp["gt_sent_id"] = gt_sent["sent_id"]
                fp["gt_sent"] = gt_sent["sent"]
                fp["is_false_premise"] = True
                img_ref["sentences"].append(fp)
        # DEBUG:
        if DEBUG and num_corrected == 1:
            print("")
            print("=" * 200)
            print("Num gt: ", len(gt_refs_and_sents))
            print("img_fps: ", len(img_fps), img_fps[0].keys())
            img_refs = refcoco.img_to_refs[image_id]
            for img_ref in img_refs:
                print("")
                print("ref_id: ", img_ref["ref_id"])
                print("# sentences: ", len(img_ref["sentences"]))
                # print("# sentences: ", len([ref img_refs["sentences"]]))
                for s in img_ref["sentences"]:
                    print(
                        "sentence: ",
                        s["sent_id"],
                        s["sent"],
                        (
                            f"gt: ({s['gt_sent_id']}) {s['is_false_premise']}, {s['gt_sent']}"
                        )
                        if "is_false_premise" in s
                        else "",
                    )

            for i in range(len(img_fps)):
                display(
                    "img_fps: ",
                    [
                        f"'{k}': {v}"
                        for k, v in img_fps[i].items()
                        if k
                        in {
                            "sent_id",
                            "main_subject",
                            "ref_id",
                            "ann_id",
                            "gt_sent_id",
                            "gt_sent",
                            "sent",
                        }
                    ],
                )
            DEBUG = False

    print(
        f"num_match: {num_match}, not_match: {num_not_match}, num_corrected: {num_corrected}"
    )
    return refcoco


# REFSEG_DIR = Path("/shared/gbiamby/data/refer_seg")
# REFSEG_DIR = Path("output/ref_seg")
# refcoco = build_refcoco(REFSEG_DIR, "refcocog", "google_enhanced")
refcoco_new = match_fp_with_gt(fp_sent_dicts, doc_to_image, refcoco)

fp:  141269
gt:  142210


  0%|                                                                                                                                                                                                                                                                                     | 0/19897 [00:00<?, ?it/s]


Num gt:  7
img_fps:  5 dict_keys(['tokens', 'raw', 'sent_id', 'sent', 'spcy_WORD', 'spcy_DEP', 'spcy_POS', 'spcy_LEM', 'spcy_TAG', 'spcy_IS_STOP', 'spcy_ENTS', 'spcy_NOUN_CHUNKS', 'main_subject', 'ref_id', 'ann_id', 'gt_sent_id', 'gt_sent', 'is_false_premise'])

ref_id:  38322
# sentences:  6
sentence:  108974 guy in background 
sentence:  108975 shoulder of the man not eating 
sentence:  108976 dude to the left of the dude grubbing 
sentence:  -1 elephant in background gt: (108974) True, guy in background
sentence:  -1 shoulder of the elephant not eating gt: (108975) True, shoulder of the man not eating
sentence:  -1 dude to the left of the dude snacking gt: (108976) True, dude to the left of the dude grubbing

ref_id:  38321
# sentences:  6
sentence:  108970 the face of the guy eating 
sentence:  108971 guy eating 
sentence:  108972 man eating 
sentence:  108973 man 
sentence:  -1 the face of the giraffe eating gt: (108970) True, the face of the guy eating
sentence:  -1 giraffe eati

'img_fps: '

["'sent_id': -1",
 "'sent': elephant in background",
 "'main_subject': ['elephant']",
 "'ref_id': 38322",
 "'ann_id': 476249",
 "'gt_sent_id': 108974",
 "'gt_sent': guy in background"]

'img_fps: '

["'sent_id': -1",
 "'sent': shoulder of the elephant not eating",
 "'main_subject': ['shoulder']",
 "'ref_id': 38322",
 "'ann_id': 476249",
 "'gt_sent_id': 108975",
 "'gt_sent': shoulder of the man not eating"]

'img_fps: '

["'sent_id': -1",
 "'sent': dude to the left of the dude snacking",
 "'main_subject': ['dude']",
 "'ref_id': 38322",
 "'ann_id': 476249",
 "'gt_sent_id': 108976",
 "'gt_sent': dude to the left of the dude grubbing"]

'img_fps: '

["'sent_id': -1",
 "'sent': the face of the giraffe eating",
 "'main_subject': ['face']",
 "'ref_id': 38321",
 "'ann_id': 1739621",
 "'gt_sent_id': 108970",
 "'gt_sent': the face of the guy eating"]

'img_fps: '

["'sent_id': -1",
 "'sent': giraffe eating",
 "'main_subject': ['eating']",
 "'ref_id': 38321",
 "'ann_id': 1739621",
 "'gt_sent_id': 108971",
 "'gt_sent': guy eating"]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19897/19897 [00:00<00:00, 88454.02it/s]

num_match: 19868, not_match: 29, num_corrected: 15





---

### Detect Which Part of Each FP Sentence is Changed 

In [26]:
def get_num_changes(diff):
    diff_ops = [d[0] for d in diff]
    num_subs = "".join(diff_ops).count("-+")
    num_deletions = "".join(diff_ops).replace("-+", "").count("-")
    num_additions = "".join(diff_ops).replace("-+", "").count("+")
    total_changes = num_subs + num_deletions + num_additions
    return total_changes, num_subs, num_deletions, num_additions


def get_sentence_lookup(refcoco: COCO) -> dict[int, dict[str, Any]]:
    """Returns dict of all the sentences, with sent_id as the key"""
    sent_lookup = {}
    for ref_id, ref in refcoco.refs.items():
        for sent in ref["sentences"]:
            if "is_false_premise" in sent and sent["is_false_premise"]:
                continue
            sent_lookup[sent["sent_id"]] = sent
    return sent_lookup


def get_change_type(fp_sent: dict, change_info: dict):
    change_type = ""

    if change_info["num_changes"] == 1 and change_info["num_subs"] == 1:
        # Sentence has a single change. Get the subtracted and added words:
        _sub_words, _add_words = None, None
        for op, words in change_info["diff"]:
            # print(op)
            if op == "-":
                _sub_words = words
                # print("_sub_words ", _sub_words)
            if op == "+":
                _add_words = words
                # print("_add_words ",_add_words)
                break
        assert _sub_words, str(change_info["diff"])
        assert _add_words, str(change_info["diff"])
        # Categorize:
        gt_subject = change_info["gt_subj"]
        if isinstance(gt_subject, list) and len(gt_subject) > 0:
            gt_subject = gt_subject[0]
        if gt_subject in _sub_words:
            change_type = "main_subject"
            if len(_sub_words) > 1:
                change_type += f"(+{len(_sub_words)})"
        else:
            change_type = "NOT_MAIN_SUBJ"
            new_phrase = " ".join(_add_words)
            for nc in change_info["fp_NOUN_CHUNKS"]:
                if new_phrase in nc:
                    change_type = "other_subject"

    return change_type


def detect_changes(refcoco: COCO):
    sent_lookup = get_sentence_lookup(refcoco)
    items = []
    for idx, (ref_id, ref) in tqdm(
        enumerate(refcoco.refs.items()), total=len(refcoco.refs)
    ):
        sentences: list[str] = [
            s
            for s in ref["sentences"]
            if ("is_false_premise" in s and s["is_false_premise"])
        ]
        for sent in sentences:
            gt_sent = sent_lookup[sent["gt_sent_id"]]
            diffs = string_diff(sent["gt_sent"], sent["sent"])
            num_changes, subs, deletions, additions = get_num_changes(diffs)
            items.append(
                {
                    "ref_id": ref_id,
                    "image_id": ref["image_id"],
                    "cat_id": ref["category_id"],
                    "gt_subj": get_main_subject(gt_sent),
                    "fp_subj": sent["main_subject"],
                    "gt_sent": sent["gt_sent"],
                    "fp_sent": sent["sent"],
                    "num_changes": num_changes,
                    "num_subs": subs,
                    "num_del": deletions,
                    "num_add": additions,
                    "diff_ops": tuple([d[0] for d in diffs]),
                    "diff": diffs,
                    "gt_NOUN_CHUNKS": gt_sent["spcy_NOUN_CHUNKS"],
                    "fp_NOUN_CHUNKS": sent["spcy_NOUN_CHUNKS"],
                }
            )
            items[-1]["change_type"] = get_change_type(sent, items[-1])
            sent["change_type"] = items[-1]["change_type"]
            if idx < 2:
                print("")
                print(sent["main_subject"])
                print(sent["gt_sent"])
                print(sent["sent"])
                print(diffs)
                print("num changes: ", num_changes)

    return pd.DataFrame(items)


df_changes = detect_changes(refcoco_new)
display(df_changes)

  8%|████████████████████▍                                                                                                                                                                                                                                                  | 3885/50000 [00:00<00:02, 19901.22it/s]


['creature']
zebra creature front and center
unicorn creature front and center
[('-', ['zebra']), ('+', ['unicorn']), ('=', ['creature', 'front', 'and', 'center'])]
num changes:  1

['unicorn']
zebra
unicorn
[('-', ['zebra']), ('+', ['unicorn'])]
num changes:  1

['unicorn']
whole zebra
whole unicorn
[('=', ['whole']), ('-', ['zebra']), ('+', ['unicorn'])]
num changes:  1

['rhino']
left most buny
left most rhino
[('=', ['left', 'most']), ('-', ['buny']), ('+', ['rhino'])]
num changes:  1

['panda']
left side bunny
left side panda
[('=', ['left', 'side']), ('-', ['bunny']), ('+', ['panda'])]
num changes:  1

['left']
left most bunny
left most unicorn
[('=', ['left', 'most']), ('-', ['bunny']), ('+', ['unicorn'])]
num changes:  1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:02<00:00, 23384.63it/s]


Unnamed: 0,ref_id,image_id,cat_id,gt_subj,fp_subj,gt_sent,fp_sent,num_changes,num_subs,num_del,num_add,diff_ops,diff,gt_NOUN_CHUNKS,fp_NOUN_CHUNKS,change_type
0,4808,526754,24,[creature],[creature],zebra creature front and center,unicorn creature front and center,1,1,0,0,"(-, +, =)","[(-, [zebra]), (+, [unicorn]), (=, [creature, ...",[zebra creature],[unicorn creature],other_subject
1,4808,526754,24,[zebra],[unicorn],zebra,unicorn,1,1,0,0,"(-, +)","[(-, [zebra]), (+, [unicorn])]",[zebra],[unicorn],main_subject
2,4808,526754,24,[zebra],[unicorn],whole zebra,whole unicorn,1,1,0,0,"(=, -, +)","[(=, [whole]), (-, [zebra]), (+, [unicorn])]",[whole zebra],[whole unicorn],main_subject
3,37010,150948,61,[buny],[rhino],left most buny,left most rhino,1,1,0,0,"(=, -, +)","[(=, [left, most]), (-, [buny]), (+, [rhino])]",[left most buny],[left most rhino],main_subject
4,37010,150948,61,[bunny],[panda],left side bunny,left side panda,1,1,0,0,"(=, -, +)","[(=, [left, side]), (-, [bunny]), (+, [panda])]",[left side bunny],[left side panda],main_subject
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141138,27625,263111,28,[umb],[umb],rainbow umb,unicorn umb,1,1,0,0,"(-, +, =)","[(-, [rainbow]), (+, [unicorn]), (=, [umb])]",[rainbow umb],[unicorn umb],other_subject
141139,27625,263111,28,[umbrella],[umbrella],colorful umbrella,magical umbrella,1,1,0,0,"(-, +, =)","[(-, [colorful]), (+, [magical]), (=, [umbrell...",[colorful umbrella],[magical umbrella],other_subject
141140,35602,168643,63,[couch],[bed],left couch,left bed,1,1,0,0,"(=, -, +)","[(=, [left]), (-, [couch]), (+, [bed])]",[left couch],[left bed],main_subject
141141,35602,168643,63,[couch],[bed],3 cushion couch,3 cushion bed,1,1,0,0,"(=, -, +)","[(=, [3, cushion]), (-, [couch]), (+, [bed])]",[3 cushion couch],[3 cushion bed],main_subject


In [27]:
display(
    df_changes.groupby(["num_changes"], dropna=False).agg(total=("ref_id", "count"))
)
display(
    df_changes.groupby(["num_changes", "diff_ops"], dropna=False)
    .agg(total=("ref_id", "count"))
    .sort_values(["num_changes", "total"], ascending=[True, False])
)
display(
    df_changes.groupby(["num_changes", "num_subs"], dropna=False)
    .agg(total=("ref_id", "count"))
    .sort_values(["num_changes", "total"], ascending=[True, False])
)
display(
    df_changes.groupby(["change_type"], dropna=False).agg(total=("ref_id", "count"))
)

Unnamed: 0_level_0,total
num_changes,Unnamed: 1_level_1
0,6424
1,122366
2,11606
3,674
4,65
5,5
6,2
7,1


Unnamed: 0_level_0,Unnamed: 1_level_0,total
num_changes,diff_ops,Unnamed: 2_level_1
0,"(=,)",6424
1,"(-, +, =)",42511
1,"(=, -, +)",38689
1,"(=, -, +, =)",22860
1,"(-, +)",17999
...,...,...
5,"(=, -, +, =, -, +, =, -, +, =, -, +, =, -, +, =)",2
5,"(-, +, =, -, +, =, -, +, =, -, +, =, -, +)",1
6,"(-, +, =, -, +, =, -, +, =, -, +, =, -, +, =, -, +, =)",1
6,"(=, -, +, =, -, +, =, -, +, =, -, +, =, -, +, =, -, +, =)",1


Unnamed: 0_level_0,Unnamed: 1_level_0,total
num_changes,num_subs,Unnamed: 2_level_1
0,0,6424
1,1,122059
1,0,307
2,2,11456
2,0,82
2,1,68
3,3,657
3,1,10
3,2,7
4,4,62


Unnamed: 0_level_0,total
change_type,Unnamed: 1_level_1
,19084
NOT_MAIN_SUBJ,7965
main_subject,59055
main_subject(+2),6818
main_subject(+3),372
main_subject(+4),20
main_subject(+5),2
main_subject(+6),1
main_subject(+9),1
other_subject,47825


In [28]:
# with pd.option_context(
#     "display.max_colwidth", None, "display.max_columns", None, "display.max_rows", 200
# ):
#     display(df_changes[df_changes.diff_ops == ("-", "+")])

# with pd.option_context(
#     "display.max_colwidth", None, "display.max_columns", None, "display.max_rows", 200
# ):
#     display(df_changes[df_changes.num_changes == 1])


with pd.option_context(
    "display.max_colwidth", None, "display.max_columns", None, "display.max_rows", 200
):
    display(df_changes[df_changes.change_type == "NOT_MAIN_SUBJ"])

Unnamed: 0,ref_id,image_id,cat_id,gt_subj,fp_subj,gt_sent,fp_sent,num_changes,num_subs,num_del,num_add,diff_ops,diff,gt_NOUN_CHUNKS,fp_NOUN_CHUNKS,change_type
29,46535,40912,1,[figure],[figure],figure on left,figure on right,1,1,0,0,"(=, -, +)","[(=, [figure, on]), (-, [left]), (+, [right])]","[figure, left]",[],NOT_MAIN_SUBJ
39,6407,508281,1,[boy],[boy],boy on the bottom right reaching,boy on the bottom right sneezing,1,1,0,0,"(=, -, +)","[(=, [boy, on, the, bottom, right]), (-, [reaching]), (+, [sneezing])]","[boy, right]","[boy, the bottom right]",NOT_MAIN_SUBJ
40,6407,508281,1,[kid],[kid],kid reaching for shelf bottom right corner,kid sneezing for shelf bottom right corner,1,1,0,0,"(=, -, +, =)","[(=, [kid]), (-, [reaching]), (+, [sneezing]), (=, [for, shelf, bottom, right, corner])]","[kid, shelf, bottom right corner]","[kid, shelf]",NOT_MAIN_SUBJ
63,41742,94828,25,[head],[right],giraffe head right,elephant head right,1,1,0,0,"(-, +, =)","[(-, [giraffe]), (+, [elephant]), (=, [head, right])]",[giraffe head],[],NOT_MAIN_SUBJ
119,36953,151434,4,[closest],[closest],silver bike closest,silver fish closest,1,1,0,0,"(=, -, +, =)","[(=, [silver]), (-, [bike]), (+, [fish]), (=, [closest])]",[silver bike closest],[],NOT_MAIN_SUBJ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141090,29346,243645,52,[right],[right],banana top right,orange top right,1,1,0,0,"(-, +, =)","[(-, [banana]), (+, [orange]), (=, [top, right])]",[],[],NOT_MAIN_SUBJ
141091,29346,243645,52,[right],[pear],banana top right,pear top right,1,1,0,0,"(-, +, =)","[(-, [banana]), (+, [pear]), (=, [top, right])]",[],[],NOT_MAIN_SUBJ
141108,6613,506030,1,[person],[dancing],person on right reaching out,person on right dancing,1,1,0,0,"(=, -, +)","[(=, [person, on, right]), (-, [reaching, out]), (+, [dancing])]","[person, right]",[right],NOT_MAIN_SUBJ
141120,31988,211955,1,[girl],[girl],little girl on the right leaning over,little girl on the right dancing,1,1,0,0,"(=, -, +)","[(=, [little, girl, on, the, right]), (-, [leaning, over]), (+, [dancing])]","[little girl, the right]","[little girl, the right]",NOT_MAIN_SUBJ


In [34]:
def scrub(refcoco: COCO):
    refcoco = deepcopy(refcoco)
    total_sents = 0
    total_sents_kept = 0
    for ref_id, ref in tqdm(list(refcoco.refs.items()), total=len(refcoco.refs)):
        sentences_new = []
        total_sents += len(ref["sentences"])
        for s in ref["sentences"]:
            if "is_false_premise" not in s:
                s["is_false_premise"] = False
            s["exist"] = not s["is_false_premise"]
            if s["is_false_premise"] and (s["sent"] == s["gt_sent"]):
                continue
            # if not s["change_type"]:
            #     s["change_type"]
            sentences_new.append(s)
        total_sents_kept += len(sentences_new)
        ref["sentences"] = sentences_new
    print(f"total_sents: {total_sents}, total_kept: {total_sents_kept}")
    return refcoco


refcoco_new_scrubbed = scrub(refcoco_new)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 295284.94it/s]


total_sents: 283353, total_kept: 276930


## Save new RefCOCO Dataset

Enhanced version augments the `ref["sentences"]` dictionaries with spacy tagging info (parts of speech, dependency parsing, etc.


---

In [35]:
print(refcoco_new_scrubbed.refs[1000])
print("")
print(refcoco_new_scrubbed.refs_data[1000])
print(len(refcoco_new_scrubbed.refs_data), len(refcoco_new_scrubbed.refs))

{'sent_ids': [2836, 2837, 2838], 'file_name': 'COCO_train2014_000000570951_4.jpg', 'ann_id': 500240, 'ref_id': 1000, 'image_id': 570951, 'split': 'val', 'sentences': [{'tokens': ['chef', 'in', 'back'], 'raw': 'chef in back', 'sent_id': 2836, 'sent': 'chef in back', 'spcy_WORD': ['chef', 'in', 'back'], 'spcy_DEP': ['ROOT', 'prep', 'pobj'], 'spcy_POS': ['PROPN', 'ADP', 'NOUN'], 'spcy_LEM': ['chef', 'in', 'back'], 'spcy_TAG': ['NNP', 'IN', 'NN'], 'spcy_IS_STOP': [False, True, True], 'spcy_ENTS': [], 'spcy_NOUN_CHUNKS': ['chef', 'back'], 'is_false_premise': False, 'exist': True}, {'tokens': ['right', 'guy'], 'raw': 'right guy', 'sent_id': 2837, 'sent': 'right guy', 'spcy_WORD': ['right', 'guy'], 'spcy_DEP': ['intj', 'ROOT'], 'spcy_POS': ['INTJ', 'NOUN'], 'spcy_LEM': ['right', 'guy'], 'spcy_TAG': ['UH', 'NN'], 'spcy_IS_STOP': [False, False], 'spcy_ENTS': [], 'spcy_NOUN_CHUNKS': ['right guy'], 'is_false_premise': False, 'exist': True}, {'tokens': ['cook', 'in', 'the', 'back'], 'raw': 'cook i

In [36]:
import shutil


def save_refs(refcoco: COCO, save_dir: Path, split_by: str):
    assert save_dir.exists(), str(save_dir)
    refs_path = save_dir / f"refs({split_by}).p"
    print("Saving refs: ", refs_path)
    pickle.dump(refcoco.refs_data, open(refs_path, "wb"))


def make_new_dataset(refcoco: COCO, save_dir: Path, dataset_name: str, split_by: str):
    new_dataset_path = save_dir / "refer_seg" / f"{dataset_name}"
    new_dataset_path.mkdir(exist_ok=True, parents=True)
    # Copy coco instances.json:
    source_path = refcoco.DATA_DIR / "instances.json"
    assert source_path.exists(), str(source_path)
    print("saving instances.json: ", new_dataset_path / "instances.json")
    shutil.copy(source_path, new_dataset_path / "instances.json")
    save_refs(refcoco, new_dataset_path, split_by)
    print("Saved new refer_seg dataset to: ", new_dataset_path)
    return new_dataset_path


new_ds_path = make_new_dataset(
    refcoco_new_scrubbed, api_results_dir, "fprefcoco_v002", "berkeley"
)

saving instances.json:  /home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo/refer_seg/fprefcoco_v002/instances.json
Saving refs:  /home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo/refer_seg/fprefcoco_v002/refs(berkeley).p
Saved new refer_seg dataset to:  /home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo/refer_seg/fprefcoco_v002


## Load and Inspect the Newly Saved RefCOCO Dataset

In [37]:
VALID_SPLITS = {
    "R-refcoco": ["unc"],
    "R-refcoco+": ["unc"],
    "R-refcocog": ["umd"],
    "refclef": ["berkeley", "unc"],
    "refcoco": ["google"],
    "refcoco+": ["unc"],
    "refcocog": ["google", "umd"],
    "fprefcoco_v002": ["berkeley"],
    "fprefcocog_v002": ["berkeley"],
}


def build_refcoco(refseg_path: Path, dataset_name: str, split_by: str = None) -> COCO:
    assert dataset_name in VALID_SPLITS, dataset_name
    if split_by is None:
        split_by = VALID_SPLITS[dataset_name][0]
    else:
        assert split_by in VALID_SPLITS[dataset_name]
    coco = COCO(
        refseg_path / dataset_name / "instances.json",
        is_ref_dataset=True,
        dataset_name=dataset_name,
        split_by=split_by,
    )
    return coco


df_aggs = []
for ds_name in ["fprefcoco_v002"]:
    print("\n\n")
    print("=" * 220)
    print(f"Dataset: {ds_name}(berkeley)")
    coconegref_stats = CocoClassDistHelper(
        new_ds_path.parent,
        is_ref_dataset=True,
        dataset_name=ds_name,
        split_by="berkeley",
    )
    df_refcoco, df_refcoco_agg = coconegref_stats.get_ref_stats()
    df_aggs.append(df_refcoco_agg)


df_aggs = pd.concat(df_aggs)


display(df_aggs)




Dataset: fprefcoco_v002(berkeley)
Loading refs from '/home/gbiamby/proj/geo-llm-ret/output/refcoco_google-gb006_remove_guidelines-gpt-3.5-turbo/refer_seg/fprefcoco_v002/refs(berkeley).p'
Loaded 50000 refs
loading annotations into memory...
Done (t=9.04s)
creating index...
index created!
num images: 19994
num annotations: 196771
pos/neg sentence_counts:  142210 134720


Unnamed: 0,pos_sent_count,neg_sent_count,dataset,num_refs,sent_count,total_pos_sents,total_neg_sents,ann_count,img_count
0,1,1,fprefcoco_v002(berkeley),2,4,2,2,196771,19994
1,2,0,fprefcoco_v002(berkeley),119,238,238,0,196771,19994
2,2,1,fprefcoco_v002(berkeley),589,1767,1178,589,196771,19994
3,2,2,fprefcoco_v002(berkeley),7451,29804,14902,14902,196771,19994
4,3,0,fprefcoco_v002(berkeley),515,1545,1545,0,196771,19994
5,3,1,fprefcoco_v002(berkeley),781,3124,2343,781,196771,19994
6,3,2,fprefcoco_v002(berkeley),3442,17210,10326,6884,196771,19994
7,3,3,fprefcoco_v002(berkeley),36735,220410,110205,110205,196771,19994
8,4,0,fprefcoco_v002(berkeley),8,32,32,0,196771,19994
9,4,1,fprefcoco_v002(berkeley),5,25,20,5,196771,19994


In [44]:
display(
    df_aggs.groupby(lambda x: True).agg(
        total_refs=("num_refs", "sum"),
        sent_count=("sent_count", "sum"),
        total_pos_sents=("total_pos_sents", "sum"),
        total_neg_sents=("total_neg_sents", "sum"),
    )
)

Unnamed: 0,total_refs,sent_count,total_pos_sents,total_neg_sents
True,50000,276930,142210,134720


In [45]:
def show_a_refexp(refcoco: COCO):
    ref = refcoco.refs[1000]
    print("ref has keys: ", ref.keys())
    print(f"ref has {len(ref['sentences'])} sentences")
    for s in ref["sentences"]:
        # print(s.keys())
        print(
            f"sent_id:{s['sent_id']}, is_FP:{s['is_false_premise']}, sent: '{s['sent']}'"
        )
        if s["is_false_premise"]:
            print("\tchange_type: ", s["change_type"])
            print(f"\tparent_sent_id: {s['gt_sent_id']}, parent_sent: '{s['gt_sent']}'")


show_a_refexp(coconegref_stats)

ref has keys:  dict_keys(['sent_ids', 'file_name', 'ann_id', 'ref_id', 'image_id', 'split', 'sentences', 'category_id'])
ref has 6 sentences
sent_id:2836, is_FP:False, sent: 'chef in back'
sent_id:2837, is_FP:False, sent: 'right guy'
sent_id:2838, is_FP:False, sent: 'cook in the back'
sent_id:-1, is_FP:True, sent: 'plumber in back'
	change_type:  main_subject
	parent_sent_id: 2836, parent_sent: 'chef in back'
sent_id:-1, is_FP:True, sent: 'right alien'
	change_type:  main_subject
	parent_sent_id: 2837, parent_sent: 'right guy'
sent_id:-1, is_FP:True, sent: 'engineer in the back'
	change_type:  main_subject
	parent_sent_id: 2838, parent_sent: 'cook in the back'


---