In [1]:
import os 
# Display the kernel's current working directory; the relative paths
# configured later in this notebook (e.g. DATA_ROOT) are resolved against it.
os.getcwd()

'/sfs/weka/scratch/jq2uw/MME/instruct_vlm_edit/data_raw'

In [2]:
import json
import pandas as pd
from pathlib import Path

# All inputs are addressed relative to the current working directory.
DATA_ROOT = Path("./")
AOKVQA_DIR = DATA_ROOT / "aokvqa"  # directory holding the A-OKVQA annotation JSON files
COCO_DIR = DATA_ROOT / "coco"  # root under which COCO image paths are built

# Annotation file for each dataset split.
JSON_FILES = {
    "train": AOKVQA_DIR / "aokvqa_v1p0_train.json",
    "val": AOKVQA_DIR / "aokvqa_v1p0_val.json", # IMPORTANT: val is used as the test set because its ground truth is available
    "test": AOKVQA_DIR / "aokvqa_v1p0_test.json",
}

def choose_answer(item):
    """Pick the answer text for a single annotation record.

    The multiple-choice option selected by ``correct_choice_idx`` is preferred
    when both ``choices`` and ``correct_choice_idx`` are present and the index
    is an int within range. Otherwise the first entry of ``direct_answers`` is
    used, and "" is returned when no answer is available at all.
    """
    has_mc_fields = "choices" in item and "correct_choice_idx" in item
    if has_mc_fields:
        options = item["choices"]
        pick = item.get("correct_choice_idx")
        if isinstance(pick, int) and 0 <= pick < len(options):
            return options[pick]

    # Fall back to free-form answers when no valid choice index exists.
    fallback = item.get("direct_answers") or []
    if not fallback:
        return ""
    return fallback[0]

def choose_rationale(item):
    """Return the first entry of the record's ``rationales``, or "" if there is none."""
    candidates = item.get("rationales")
    if not candidates:
        return ""
    return candidates[0]

def format_choices(item):
    """Render the record's ``choices`` as newline-separated '(A) text' lines.

    Options are labelled with consecutive uppercase letters starting at 'A'.
    A missing or empty ``choices`` field yields an empty string.
    """
    labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lines = []
    for position, option in enumerate(item.get("choices") or []):
        lines.append(f"({labels[position]}) {option}")
    return "\n".join(lines)

def coco_image_path(split, image_id):
    """Build the image path string for ``image_id`` within a split.

    The file lives under ``COCO_DIR/<split>2017`` and is named with the id
    zero-padded to 12 digits plus a ``.jpg`` extension.
    """
    image_file = f"{image_id:012d}.jpg"
    target = COCO_DIR / f"{split}2017" / image_file
    return str(target)


def map_image_path(p):
    """Return ``p`` as a string with every 'coco/' replaced by 'data/images/aokvqa/'."""
    return str(p).replace("coco/", "data/images/aokvqa/")
    
def build_df(split, json_path):
    """Load one annotation JSON file into a flat DataFrame.

    Parameters
    ----------
    split : str
        Split name; forwarded to ``coco_image_path`` to locate the images.
    json_path : path-like
        UTF-8 JSON file containing a list of annotation records.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``image_path``, ``question``,
        ``answer``, ``rationale`` and ``choices``. ``image_path`` has been
        passed through ``map_image_path``.
    """
    column_order = ["image_path", "question", "answer", "rationale", "choices"]

    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    rows = [
        {
            "image_path": coco_image_path(split, record["image_id"]),
            "question": record.get("question", ""),
            "answer": choose_answer(record),
            "rationale": choose_rationale(record),
            "choices": format_choices(record),
        }
        for record in records
    ]

    frame = pd.DataFrame(rows, columns=column_order)
    # Rewrite the local COCO prefix to the published image location.
    frame["image_path"] = frame["image_path"].apply(map_image_path)
    return frame



In [3]:

# Build one DataFrame per split, in train / val / test order.
df_train, df_val, df_test = (
    build_df(split_name, JSON_FILES[split_name])
    for split_name in ("train", "val", "test")
)
df_val

Unnamed: 0,image_path,question,answer,rationale,choices
0,data/images/aokvqa/val2017/000000461751.jpg,What is in the motorcyclist's mouth?,cigarette,He's smoking while riding.,(A) toothpick\n(B) food\n(C) popsicle stick\n(...
1,data/images/aokvqa/val2017/000000377368.jpg,Which number birthday is probably being celebr...,thirty,There is a birthday cake on the table with the...,(A) one\n(B) ten\n(C) nine\n(D) thirty
2,data/images/aokvqa/val2017/000000563603.jpg,What best describes the pool of water?,dirty,The pool is dark brown.,(A) frozen\n(B) fresh\n(C) dirty\n(D) boiling
3,data/images/aokvqa/val2017/000000329542.jpg,What is the white substance on top of the cupc...,icing,This is frosting used to decorate and add more...,(A) butter\n(B) mayo\n(C) ice cream\n(D) icing
4,data/images/aokvqa/val2017/000000182202.jpg,What type of device is sitting next to the lap...,mobile phone,It has the name of it on the top,(A) mouse\n(B) mobile phone\n(C) pen\n(D) keyb...
...,...,...,...,...,...
1140,data/images/aokvqa/val2017/000000127092.jpg,What must be activated so the parked cars stay...,emergency brake,Cars are parked on a hill on a street.,(A) headlights\n(B) sunroof\n(C) radio\n(D) em...
1141,data/images/aokvqa/val2017/000000479030.jpg,What mechanism might all different modes of tr...,brakes,None of the other options make sense with this...,(A) hammer time\n(B) brakes\n(C) bat signal\n(...
1142,data/images/aokvqa/val2017/000000345397.jpg,What brand of hair product does he have?,head and shoulders,A dove bottle is on the shelf in the shower.,(A) finesse\n(B) dove\n(C) loreal\n(D) head an...
1143,data/images/aokvqa/val2017/000000313588.jpg,What is the pattern on the woman's coat called?,plaid,The woman's jacket is in a crisscross pattern.,(A) pinstripe\n(B) polka dot\n(C) houndstooth\...


## Cleaning: drop rows whose choices or answer label cannot be parsed

In [4]:
import re
def extract_choice_pairs(s: str):
    """Parse labelled options such as '(A) foo' / '(B) bar' out of ``s``.

    Only the labels A-D are recognised. Matches are reported in the order
    they occur in the text, regardless of which letter they carry.

    Returns a list of ``(letter, text)`` tuples with ``text`` stripped of
    surrounding whitespace.
    """
    parsed = []
    for match in re.finditer(r"\(([A-D])\)\s*(.+)", s):
        letter, text = match.groups()
        parsed.append((letter, text.strip()))
    return parsed


def extract_choices(question: str):
    """Return just the option texts found in ``question``, in order of appearance."""
    option_texts = []
    for _letter, text in extract_choice_pairs(question):
        option_texts.append(text)
    return option_texts

# The helpers below rely on extract_choice_pairs() defined above.

def add_label_letter_df(df):
    """Return a copy of ``df`` with an added ``label_letter`` column.

    For each row, the options parsed from ``choices`` are compared with
    ``answer`` (both stripped and lower-cased). ``label_letter`` is the letter
    of the first matching option, or ``None`` when nothing matches. The input
    frame is not modified.
    """

    def _match_letter(row):
        # Parse first, then normalise the answer for comparison.
        parsed = extract_choice_pairs(row["choices"])
        target = str(row["answer"]).strip().lower()
        return next(
            (
                letter
                for letter, text in parsed
                if str(text).strip().lower() == target
            ),
            None,
        )

    annotated = df.copy()
    annotated["label_letter"] = [
        _match_letter(row) for _, row in annotated.iterrows()
    ]
    return annotated

def find_bad_rows(df):
    """Select the rows of ``df`` that fail validation.

    A row is bad when its ``choices`` do not parse to exactly 4 options or
    when its ``label_letter`` is missing. ``df`` must already carry a
    ``label_letter`` column.
    """

    def _is_bad(row):
        option_count = len(extract_choice_pairs(row["choices"]))
        return option_count != 4 or pd.isna(row["label_letter"])

    flagged = [label for label, row in df.iterrows() if _is_bad(row)]
    return df.loc[flagged]

# apply to your built DataFrames via a simple cleaner

def drop_bad_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out rows that fail validation, keeping the original columns.

    A row is removed when
    - its choices don't parse to exactly 4 options, or
    - its answer matches none of the options (no label_letter).

    The temporary ``label_letter`` column is used only for the check and is
    not part of the returned frame.
    """
    annotated = add_label_letter_df(df)
    rejected_labels = find_bad_rows(annotated).index
    surviving_labels = annotated.index.difference(rejected_labels)
    # Select from the original frame so the helper column is not carried over.
    return df.loc[surviving_labels].copy()



In [5]:
# Clean the train and val splits only. df_test is deliberately left as built
# (NOTE(review): presumably because the test split has no ground-truth answer
# to validate against - confirm).
df_train, df_val = drop_bad_rows(df_train), drop_bad_rows(df_val)
df_val

Unnamed: 0,image_path,question,answer,rationale,choices
0,data/images/aokvqa/val2017/000000461751.jpg,What is in the motorcyclist's mouth?,cigarette,He's smoking while riding.,(A) toothpick\n(B) food\n(C) popsicle stick\n(...
1,data/images/aokvqa/val2017/000000377368.jpg,Which number birthday is probably being celebr...,thirty,There is a birthday cake on the table with the...,(A) one\n(B) ten\n(C) nine\n(D) thirty
2,data/images/aokvqa/val2017/000000563603.jpg,What best describes the pool of water?,dirty,The pool is dark brown.,(A) frozen\n(B) fresh\n(C) dirty\n(D) boiling
3,data/images/aokvqa/val2017/000000329542.jpg,What is the white substance on top of the cupc...,icing,This is frosting used to decorate and add more...,(A) butter\n(B) mayo\n(C) ice cream\n(D) icing
4,data/images/aokvqa/val2017/000000182202.jpg,What type of device is sitting next to the lap...,mobile phone,It has the name of it on the top,(A) mouse\n(B) mobile phone\n(C) pen\n(D) keyb...
...,...,...,...,...,...
1140,data/images/aokvqa/val2017/000000127092.jpg,What must be activated so the parked cars stay...,emergency brake,Cars are parked on a hill on a street.,(A) headlights\n(B) sunroof\n(C) radio\n(D) em...
1141,data/images/aokvqa/val2017/000000479030.jpg,What mechanism might all different modes of tr...,brakes,None of the other options make sense with this...,(A) hammer time\n(B) brakes\n(C) bat signal\n(...
1142,data/images/aokvqa/val2017/000000345397.jpg,What brand of hair product does he have?,head and shoulders,A dove bottle is on the shelf in the shower.,(A) finesse\n(B) dove\n(C) loreal\n(D) head an...
1143,data/images/aokvqa/val2017/000000313588.jpg,What is the pattern on the woman's coat called?,plaid,The woman's jacket is in a crisscross pattern.,(A) pinstripe\n(B) polka dot\n(C) houndstooth\...


In [6]:
PARQUET_DIR = AOKVQA_DIR / "parquet"
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

# Output file for each frame. The val/test names are swapped on export:
# df_val is written as test.parquet and df_test as val.parquet.
# NOTE(review): this appears to follow the note on JSON_FILES that val serves
# as the test set because its ground truth is available - confirm the swap is
# intended before changing it.
for split_frame, parquet_name in (
    (df_train, "train.parquet"),
    (df_test, "val.parquet"),
    (df_val, "test.parquet"),
):
    split_frame.to_parquet(PARQUET_DIR / parquet_name, index=False)

In [7]:
from tokens import HF_TOKEN
from huggingface_hub import HfApi, create_repo, upload_folder, upload_file
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

# Authenticated Hub client. Every Hub call below goes through `api` so that
# HF_TOKEN is actually used; the module-level create_repo / upload_folder
# helpers do not see this client and fall back to ambient credentials.
api = HfApi(token=HF_TOKEN)
repo_id = "JJoy333/RationaleVQA"

# Create the dataset repo if it does not exist yet (no error if it does).
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Upload the parquet directory under the AOKVQA/ prefix of the repo.
# The returned CommitInfo is the cell's displayed output.
api.upload_folder(
    folder_path=str(PARQUET_DIR),
    repo_id=repo_id,
    repo_type="dataset",
    path_in_repo="AOKVQA",
)


CommitInfo(commit_url='https://huggingface.co/datasets/JJoy333/RationaleVQA/commit/a6520ecaffd2f978cb972743bdc4be09c42d6c84', commit_message='Upload folder using huggingface_hub', commit_description='', oid='a6520ecaffd2f978cb972743bdc4be09c42d6c84', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/JJoy333/RationaleVQA', endpoint='https://huggingface.co', repo_type='dataset', repo_id='JJoy333/RationaleVQA'), pr_revision=None, pr_num=None)