In [None]:
import os 
# Display the current working directory: DATA_ROOT is defined as the relative
# path "./", so every dataset path in this notebook is resolved against it.
os.getcwd()

In [None]:
import json
import pandas as pd
from pathlib import Path

# Root folder for all inputs; relative, so it follows the kernel's working directory.
DATA_ROOT = Path("./")
# Folder holding the annotation JSON files (one per split).
AOKVQA_DIR = DATA_ROOT.joinpath("aokvqa")
# Folder holding the image sub-directories.
COCO_DIR = DATA_ROOT.joinpath("coco")

# Split name -> path of that split's annotation file.
JSON_FILES = dict(
    train=AOKVQA_DIR.joinpath("aokvqa_v1p0_train.json"),
    val=AOKVQA_DIR.joinpath("aokvqa_v1p0_val.json"),
    test=AOKVQA_DIR.joinpath("aokvqa_v1p0_test.json"),
)

def choose_answer(item):
    """Pick a single answer string for one annotation item.

    Preference order:
      1. ``item["choices"][item["correct_choice_idx"]]`` when the index is an
         int that falls inside the choices list.
      2. The first entry of ``item["direct_answers"]``.
      3. The empty string when neither is available.

    Args:
        item: dict-like annotation record.

    Returns:
        The selected answer, or ``""`` if the item carries no usable answer.
    """
    # `or []` also covers an explicit `"choices": None`, matching how
    # format_choices reads the same field; previously that case raised
    # TypeError on len(None).
    choices = item.get("choices") or []
    idx = item.get("correct_choice_idx")
    if isinstance(idx, int) and 0 <= idx < len(choices):
        return choices[idx]
    das = item.get("direct_answers") or []
    return das[0] if das else ""

def choose_rationale(item):
    """Return the first entry of ``item["rationales"]``.

    Args:
        item: dict-like annotation record.

    Returns:
        The first rationale, or ``""`` when the key is missing or its value
        is empty/None.
    """
    rationales = item.get("rationales")
    if not rationales:
        return ""
    return rationales[0]

def format_choices(item):
    """Render the item's choices as lettered lines: ``(A) ...``, ``(B) ...``.

    Args:
        item: dict-like annotation record; ``item["choices"]`` may be
            missing, None or empty.

    Returns:
        The lettered options joined with newlines, or ``""`` when there are
        no choices.

    Raises:
        IndexError: if there are more than 26 choices (only A-Z are available).
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    options = item.get("choices") or []
    lines = []
    for position, option in enumerate(options):
        lines.append(f"({letters[position]}) {option}")
    return "\n".join(lines)

def coco_image_path(split, image_id):
    """Build the path string of an image file under ``COCO_DIR``.

    Args:
        split: split name; the image folder is named ``"<split>2017"``.
        image_id: integer image id; zero-padded to 12 digits to form the
            ``.jpg`` file name.

    Returns:
        ``COCO_DIR/<split>2017/<12-digit id>.jpg`` as a ``str``.
    """
    folder = COCO_DIR / f"{split}2017"
    return str(folder / f"{image_id:012d}.jpg")


def map_image_path(p):
    """Rewrite a local image path to the published layout.

    Every occurrence of the substring ``"coco/"`` in ``str(p)`` is replaced
    with ``"data/images/aokvqa/"``; paths without that substring are returned
    unchanged.

    Args:
        p: path-like or string.

    Returns:
        The rewritten path as a ``str``.
    """
    return str(p).replace("coco/", "data/images/aokvqa/")
    
def build_df(split, json_path):
    """Load one split's annotation JSON into a flat DataFrame.

    Each item in the JSON list becomes one row with the columns
    ``image_path``, ``question``, ``answer``, ``rationale`` and ``choices``.
    Image paths are first built with ``coco_image_path`` and then rewritten
    with ``map_image_path``.

    Args:
        split: split name, forwarded to ``coco_image_path``.
        json_path: path of the UTF-8 encoded annotation file.

    Returns:
        A ``pandas.DataFrame`` with the five columns listed above.

    Raises:
        KeyError: if an item has no ``"image_id"``.
    """
    column_order = ["image_path", "question", "answer", "rationale", "choices"]

    with open(json_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    records = [
        {
            "image_path": coco_image_path(split, entry["image_id"]),
            "question": entry.get("question", ""),
            "answer": choose_answer(entry),
            "rationale": choose_rationale(entry),
            "choices": format_choices(entry),
        }
        for entry in entries
    ]

    frame = pd.DataFrame(records, columns=column_order)
    # Swap the local image prefix for the one used in the published dataset.
    frame["image_path"] = frame["image_path"].apply(map_image_path)
    return frame



In [None]:

# Build one DataFrame per split from its annotation file.
df_train = build_df(split="train", json_path=JSON_FILES["train"])
df_val = build_df(split="val", json_path=JSON_FILES["val"])
df_test = build_df(split="test", json_path=JSON_FILES["test"])
# Last expression of the cell: render the training frame for a visual check.
df_train

In [None]:
# Output folder for the parquet exports; created on demand, reused if present.
PARQUET_DIR = AOKVQA_DIR.joinpath("parquet")
PARQUET_DIR.mkdir(exist_ok=True, parents=True)

# One parquet file per split; the DataFrame index is not written.
df_train.to_parquet(PARQUET_DIR.joinpath("train.parquet"), index=False)
df_val.to_parquet(PARQUET_DIR.joinpath("val.parquet"), index=False)
df_test.to_parquet(PARQUET_DIR.joinpath("test.parquet"), index=False)

In [None]:
from tokens import HF_TOKEN
from huggingface_hub import HfApi, create_repo, upload_folder, upload_file
from huggingface_hub.utils import disable_progress_bars
# Keep the cell output free of upload progress bars.
disable_progress_bars()

# Client authenticated with the token imported from the local `tokens` module.
api = HfApi(token=HF_TOKEN)
repo_id = "JJoy333/RationaleVQA"

# Call through `api` so HF_TOKEN is actually used. The module-level
# create_repo/upload_folder helpers go through the library's default client,
# which never receives this token and relies on a cached login or env variable.
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Upload every file in PARQUET_DIR under the "AOKVQA" folder of the dataset repo.
api.upload_folder(
    folder_path=str(PARQUET_DIR),
    repo_id=repo_id,
    repo_type="dataset",
    path_in_repo="AOKVQA",
)
