In [None]:
import os 
# Show the kernel's working directory: the relative paths used in the cells
# below ("./", "../data") resolve against it.
os.getcwd()

In [None]:
from pathlib import Path
import shutil

# Source (raw release) and destination (packaged data folder) of the images.
DATA_RAW = Path("./")
SRC = DATA_RAW.joinpath("fvqa", "new_dataset_release/images")
DST = Path("../data").joinpath("images", "fvqa")

# Fail early when the raw images are not where we expect them.
if not SRC.exists():
    raise FileNotFoundError(str(SRC))

# Copy once; an already existing destination is left untouched.
if not DST.exists():
    DST.parent.mkdir(parents=True, exist_ok=True)
    shutil.copytree(SRC, DST)
    print(f"Copied folder: {SRC} -> {DST}")
else:
    print(f"Destination exists, skipping: {DST}")

In [None]:
import json
import pandas as pd
from pathlib import Path

# Locations inside the raw FVQA download, relative to the working directory.
DATA_ROOT = Path("./")
FVQA_DIR = DATA_ROOT.joinpath("fvqa")
# NOTE(review): the copy cell reads images from "new_dataset_release/images",
# whereas this points at "fvqa/Images" — confirm which one is intended.
IMG_DIR = FVQA_DIR.joinpath("Images")
QS_JSON = FVQA_DIR.joinpath("new_dataset_release", "all_qs_dict_release.json")
SPLIT_DIR = FVQA_DIR.joinpath("Name_Lists")
def load_split_list(path):
    """Read a split name list into a set of entries.

    Args:
        path: Location of the text file (``str`` or ``Path``), one entry per line.

    Returns:
        A set holding every non-blank line with surrounding whitespace removed.
        An empty set is returned (and a warning is emitted) when the file does
        not exist.
    """
    import warnings  # local import keeps the cell's import block untouched

    path = Path(path)
    if not path.exists():
        # A missing list is tolerated, but make it visible: with an empty set no
        # image matches this split, and unmatched records fall back to "train".
        warnings.warn(f"Split list not found, treating as empty: {path}", stacklevel=2)
        return set()
    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return {entry for entry in stripped if entry}

# One name-list file per split, all following the "<split>_list_0.txt" pattern.
SPLIT_FILES = {
    name: SPLIT_DIR / f"{name}_list_0.txt"
    for name in ("train", "val", "test")
}
# Entries listed for each split, keyed by split name.
split_sets = {name: load_split_list(list_path) for name, list_path in SPLIT_FILES.items()}

# Parse the question file into a dict of records.
with QS_JSON.open("r", encoding="utf-8") as f:
    qs_dict = json.loads(f.read())

def record_to_row(rec):
    """Flatten one question record into the row layout used for the DataFrames.

    Args:
        rec: Mapping read from the question JSON. The keys ``img_file``,
            ``question``, ``answer`` and ``fact_surface`` are looked up; a
            missing key yields an empty string.

    Returns:
        A dict with the keys ``image_path``, ``question``, ``answer``,
        ``rationale`` and ``choices`` (``choices`` is always empty).
    """
    image_file = rec.get("img_file", "")
    row = dict(
        image_path=str(IMG_DIR / image_file),
        question=rec.get("question", ""),
        answer=rec.get("answer", ""),
        rationale=rec.get("fact_surface", ""),
        choices="",
    )
    return row

# Bucket every question record by the split its image is listed in.
# Lookup order is test, then val; everything else (listed under train or not
# listed anywhere) ends up in train.
rows_by_split = {name: [] for name in ("train", "val", "test")}
for rec in qs_dict.values():
    img = rec.get("img_file", "")
    split = next((name for name in ("test", "val") if img in split_sets[name]), "train")
    rows_by_split[split].append(record_to_row(rec))

# df_train = pd.DataFrame(rows_by_split["train"], columns=["image_path", "question", "answer", "rationale", "choices"])
# df_val = pd.DataFrame(rows_by_split["val"], columns=["image_path", "question", "answer", "rationale", "choices"])
# df_test = pd.DataFrame(rows_by_split["test"], columns=["image_path", "question", "answer", "rationale", "choices"])


In [None]:
import re
import pandas as pd

def map_image_path(p):
    """Rewrite a raw FVQA image path so it points into the packaged image folder.

    Every occurrence of ``fvqa/Images`` or ``fvqa/images`` is replaced with
    ``data/image/fvqa``. Backslash separators are converted to forward slashes
    first, so paths produced on Windows are remapped as well.

    Args:
        p: Path-like object or string locating an image.

    Returns:
        The remapped path as a forward-slash string; paths without the FVQA
        image prefix are returned unchanged apart from separator normalisation.
    """
    # Without this, "fvqa\\Images\\x" (str() of a Windows path) never matches.
    s = str(p).replace("\\", "/")
    # Single pass: chaining two str.replace calls lets the second one match the
    # freshly inserted "...fvqa" followed by a file name starting with "images".
    # NOTE(review): the target is "data/image/fvqa" while the copy cell writes to
    # "../data/images/fvqa" — confirm which spelling is the intended one.
    return re.sub(r"fvqa/[Ii]mages", "data/image/fvqa", s)

def clean_rationale(text):
    """Normalise a rationale string into a plain sentence.

    Steps: drop ``[[`` / ``]]`` markers, remove every character outside
    letters, digits, whitespace and ``. , ! ? ; : ' " ( ) - /``, collapse runs
    of whitespace, trim, and make sure the result ends with a period.

    Args:
        text: Raw rationale; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned sentence, or ``""`` when the input is falsy or nothing is
        left after cleaning.
    """
    if not text:
        return ""
    t = str(text).replace("[[", "").replace("]]", "")
    t = re.sub(r"[^A-Za-z0-9\s\.,!?;:'\"()\-/]", "", t)  # remove special chars like *
    t = re.sub(r"\s+", " ", t).strip()
    if not t:
        # Input consisted only of markers/special characters: keep it empty
        # instead of emitting a lone "." as the rationale.
        return ""
    if not t.endswith("."):
        t += "."
    return t

def build_df(rows_by_split, split):
    """Assemble the cleaned DataFrame for one split.

    Args:
        rows_by_split: Mapping from split name to a list of row dicts.
        split: Name of the split to build.

    Returns:
        A DataFrame with the columns ``image_path``, ``question``, ``answer``,
        ``rationale`` and ``choices``, where ``image_path`` has been passed
        through ``map_image_path`` and ``rationale`` through ``clean_rationale``.
    """
    columns = ["image_path", "question", "answer", "rationale", "choices"]
    raw = pd.DataFrame(rows_by_split[split], columns=columns)
    return raw.assign(
        image_path=raw["image_path"].apply(map_image_path),
        rationale=raw["rationale"].apply(clean_rationale),
    )

In [None]:
# Build the two frames that get exported, then display the test frame.
df_train, df_test = (build_df(rows_by_split, name) for name in ("train", "test"))
df_test

In [None]:
# Distinct images referenced by each exported split.
print(df_train["image_path"].nunique())
# print(df_val.image_path.nunique())
print(df_test["image_path"].nunique())
# Number of .JPEG files sitting directly under ../data/images/fvqa.
sum(1 for _ in Path("../data/images/fvqa").glob("*.JPEG"))


In [None]:
# Write the exported splits as parquet files next to the raw data.
PARQUET_DIR = FVQA_DIR / "parquet"
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

# Only train and test are written; the val split is not exported.
for split_name, frame in (("train", df_train), ("test", df_test)):
    frame.to_parquet(PARQUET_DIR / f"{split_name}.parquet", index=False)

In [None]:
from tokens import HF_TOKEN
from huggingface_hub import HfApi, create_repo, upload_folder, upload_file
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

# Route every Hub call through this client so HF_TOKEN is actually used; the
# module-level create_repo/upload_folder helpers do not receive the token
# given to HfApi.
api = HfApi(token=HF_TOKEN)
repo_id = "JJoy333/RationaleVQA"

# Create the dataset repository if needed (no error when it already exists).
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Push the parquet folder into the "FVQA" sub-directory of the dataset repo.
api.upload_folder(
    folder_path=str(PARQUET_DIR),
    repo_id=repo_id,
    repo_type="dataset",
    path_in_repo="FVQA",
)
