In [1]:
import os 
# Display the kernel's working directory: the data paths configured below are
# relative, so they resolve against whatever this prints.
os.getcwd()

'/sfs/weka/scratch/jq2uw/MME/instruct_vlm_edit/data_raw'

In [2]:
import json
import pandas as pd
from pathlib import Path

# All locations are relative to the notebook's working directory.
DATA_ROOT = Path("./")
AOKVQA_DIR = DATA_ROOT.joinpath("aokvqa")
COCO_DIR = DATA_ROOT.joinpath("coco")

# Annotation file for each split, keyed by split name.
# Important: the "val" split is the one used as the test set downstream,
# because it is the held-out split that ships with ground truth.
JSON_FILES = {
    split: AOKVQA_DIR / f"aokvqa_v1p0_{split}.json"
    for split in ("train", "val", "test")
}

def choose_answer(item):
    """Pick a single answer string for one annotation record.

    The multiple-choice answer wins when the record has both ``choices`` and
    ``correct_choice_idx`` and the index is an in-range integer. Otherwise the
    first entry of ``direct_answers`` is used, and ``""`` is returned when
    that list is missing or empty.
    """
    if "choices" in item and "correct_choice_idx" in item:
        options = item["choices"]
        pick = item["correct_choice_idx"]
        if isinstance(pick, int) and 0 <= pick < len(options):
            return options[pick]

    # No usable multiple-choice answer: fall back to the free-form answers.
    fallback = item.get("direct_answers")
    if not fallback:
        return ""
    return fallback[0]

def choose_rationale(item):
    """Return the first entry of ``item["rationales"]``, or ``""`` if there is none."""
    candidates = item.get("rationales")
    if candidates:
        return candidates[0]
    return ""

def format_choices(item):
    """Render ``item["choices"]`` as lettered lines: ``(A) ...``, ``(B) ...``.

    Lines are joined with newlines; a missing or empty ``choices`` value gives
    an empty string. Letters come from A-Z, so more than 26 options raises
    ``IndexError``.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    options = item.get("choices") or []
    lines = []
    for position, text in enumerate(options):
        lines.append(f"({letters[position]}) {text}")
    return "\n".join(lines)

def coco_image_path(split, image_id):
    """Return ``COCO_DIR/<split>2017/<image_id>.jpg`` as a string.

    ``image_id`` is formatted with ``012d``, i.e. zero-padded to 12 digits,
    so it must be an integer.
    """
    return str(COCO_DIR / f"{split}2017" / f"{image_id:012d}.jpg")


def map_image_path(p):
    """Rewrite a local COCO path to the published image layout.

    ``p`` is converted with ``str`` and every ``"coco/"`` substring is replaced
    by ``"data/images/aokvqa/"``; strings without that substring are returned
    unchanged.
    """
    return str(p).replace("coco/", "data/images/aokvqa/")
    
def build_df(split, json_path):
    """Load one annotation JSON file into a flat DataFrame.

    Parameters
    ----------
    split : str
        Split name, forwarded to ``coco_image_path`` to locate the image.
    json_path : path-like
        UTF-8 JSON file holding a list of annotation records. Every record
        must have an ``image_id``; a missing ``question`` becomes ``""``.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_path``, ``question``, ``answer``, ``rationale`` and
        ``choices``, one row per record, with ``image_path`` already rewritten
        by ``map_image_path``.
    """
    columns = ["image_path", "question", "answer", "rationale", "choices"]

    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    table = pd.DataFrame(
        [
            {
                "image_path": coco_image_path(split, record["image_id"]),
                "question": record.get("question", ""),
                "answer": choose_answer(record),
                "rationale": choose_rationale(record),
                "choices": format_choices(record),
            }
            for record in records
        ],
        columns=columns,
    )

    # Swap the local COCO prefix for the published image location.
    table["image_path"] = table["image_path"].apply(map_image_path)
    return table



In [3]:

# Build one frame per annotation file, in train -> val -> test order.
df_train, df_val, df_test = (
    build_df(split, JSON_FILES[split]) for split in ("train", "val", "test")
)
df_train

Unnamed: 0,image_path,question,answer,rationale,choices
0,data/images/aokvqa/train2017/000000299207.jpg,What is the man by the bags awaiting?,cab,"A train would not be on the street, he would n...",(A) skateboarder\n(B) train\n(C) delivery\n(D)...
1,data/images/aokvqa/train2017/000000039446.jpg,Where does this man eat pizza?,office,The man is eating pizza at a work desk in an o...,(A) office\n(B) cafe\n(C) motel\n(D) outside
2,data/images/aokvqa/train2017/000000312452.jpg,What is the occupation of the person driving?,farmer,The place is full of sheep that shows the pers...,(A) waiter\n(B) farmer\n(C) cashier\n(D) musician
3,data/images/aokvqa/train2017/000000046408.jpg,How were the drivers of the cars able to park ...,airport workers,These drivers work at the airport.,(A) firemen\n(B) airport workers\n(C) police\n...
4,data/images/aokvqa/train2017/000000282150.jpg,How many people can ride this motorcycle at a ...,two,Two people can be on the bike.,(A) four\n(B) two\n(C) three\n(D) one
...,...,...,...,...,...
17051,data/images/aokvqa/train2017/000000003366.jpg,The Horse and rider here are part of what?,parade,There are people with small flags lining the s...,(A) runaway horse\n(B) parade\n(C) rodeo round...
17052,data/images/aokvqa/train2017/000000069366.jpg,Who need to obey the stop sign shown?,runners,It is a footpath and people need to yield to p...,(A) cars\n(B) buses\n(C) motorcycles\n(D) runners
17053,data/images/aokvqa/train2017/000000529087.jpg,What is the red object sitting on the stove?,teapot,The object is a teapot.,(A) mug\n(B) teapot\n(C) bag\n(D) container
17054,data/images/aokvqa/train2017/000000230160.jpg,To which elevation will the persons pictured h...,lower,They are on the downhill section and there is ...,(A) sea level\n(B) higher\n(C) same\n(D) lower


In [4]:
# Parquet exports are written to a "parquet" folder inside the A-OKVQA
# directory; mkdir is safe to re-run because of exist_ok=True.
PARQUET_DIR = AOKVQA_DIR / "parquet"
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

df_train.to_parquet(PARQUET_DIR / "train.parquet", index=False)
# NOTE(review): the val/test file names are crossed relative to the source
# splits: the frame built from the test JSON is saved as val.parquet and the
# frame built from the val JSON is saved as test.parquet. This looks deliberate
# (the note next to JSON_FILES says val is used as the test set because it has
# ground truth) -- confirm before "fixing" it.
df_test.to_parquet(PARQUET_DIR / "val.parquet", index=False)
df_val.to_parquet(PARQUET_DIR / "test.parquet", index=False)

In [5]:
from tokens import HF_TOKEN
from huggingface_hub import HfApi, create_repo, upload_folder, upload_file
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

# Authenticate once and send every Hub call through this client. The
# module-level create_repo()/upload_folder() helpers do not see HF_TOKEN and
# would fall back to whatever credentials happen to be cached on the machine.
api = HfApi(token=HF_TOKEN)
repo_id = "JJoy333/RationaleVQA"

# exist_ok=True keeps the cell re-runnable once the dataset repo exists.
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Upload the contents of PARQUET_DIR under the "AOKVQA" folder of the dataset
# repo. Left as the last expression so the returned CommitInfo is displayed.
api.upload_folder(
    folder_path=str(PARQUET_DIR),
    repo_id=repo_id,
    repo_type="dataset",
    path_in_repo="AOKVQA",
)


CommitInfo(commit_url='https://huggingface.co/datasets/JJoy333/RationaleVQA/commit/8ac42552d5b5cf773acbf5eea2d6cb98e695bcc5', commit_message='Upload folder using huggingface_hub', commit_description='', oid='8ac42552d5b5cf773acbf5eea2d6cb98e695bcc5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/JJoy333/RationaleVQA', endpoint='https://huggingface.co', repo_type='dataset', repo_id='JJoy333/RationaleVQA'), pr_revision=None, pr_num=None)