In [None]:
import os 
# Display the kernel's current working directory; the relative paths used
# in the cells below are resolved against it.
os.getcwd()

In [None]:
from pathlib import Path
import shutil

# Source: the images folder inside the raw FVQA release.
# Destination: the project-level image store one directory up.
DATA_RAW = Path("./")
SRC = DATA_RAW.joinpath("fvqa", "new_dataset_release/images")
DST = Path("../data").joinpath("images", "fvqa")

# Fail early when the raw images are not where they are expected.
if not SRC.exists():
    raise FileNotFoundError(str(SRC))

if not DST.exists():
    # copytree creates DST itself, so only its parent has to exist beforehand.
    DST.parent.mkdir(parents=True, exist_ok=True)
    shutil.copytree(SRC, DST)
    print(f"Copied folder: {SRC} -> {DST}")
else:
    # An existing destination is left untouched so the cell can be re-run.
    print(f"Destination exists, skipping: {DST}")

In [None]:
import json
import pandas as pd
from pathlib import Path

# Locations inside the raw FVQA download, all relative to the working directory.
DATA_ROOT = Path("./")
FVQA_DIR = DATA_ROOT.joinpath("fvqa")
# Prefix used when building per-record image paths.
IMG_DIR = FVQA_DIR.joinpath("Images")
# JSON file holding the question records.
QS_JSON = FVQA_DIR.joinpath("new_dataset_release", "all_qs_dict_release.json")
# Folder holding the train/val/test image name lists.
SPLIT_DIR = FVQA_DIR.joinpath("Name_Lists")
def load_split_list(path):
    """Read a split name list and return its non-empty lines as a set.

    Args:
        path: Location of the list file, given as a ``str`` or ``Path``.

    Returns:
        set[str]: Each non-blank line with surrounding whitespace removed.
        An empty set is returned when the file does not exist; a warning is
        emitted in that case so a missing list does not go unnoticed.
    """
    import warnings  # local import keeps this block self-contained

    path = Path(path)
    if not path.exists():
        warnings.warn(f"Split list not found, treating as empty: {path}", stacklevel=2)
        return set()
    with path.open("r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return {name for name in stripped if name}

# Name-list file for each split (list index 0 only).
SPLIT_FILES = dict(
    train=SPLIT_DIR / "train_list_0.txt",
    val=SPLIT_DIR / "val_list_0.txt",
    test=SPLIT_DIR / "test_list_0.txt",
)
# Set of image names per split, keyed like SPLIT_FILES.
split_sets = dict(zip(SPLIT_FILES, map(load_split_list, SPLIT_FILES.values())))

# Parse the question records into qs_dict.
with QS_JSON.open("r", encoding="utf-8") as f:
    qs_dict = json.loads(f.read())

def record_to_row(rec):
    """Convert one question record into a flat row dict.

    Args:
        rec: Mapping read with ``.get``; the keys ``img_file``, ``question``,
            ``answer`` and ``fact_surface`` are used, each defaulting to "".

    Returns:
        dict with keys ``image_path`` (``IMG_DIR`` joined with ``img_file``,
        as a string), ``question``, ``answer``, ``rationale`` (taken from
        ``fact_surface``) and ``choices`` (always the empty string here).
    """
    field = rec.get
    row = {"image_path": str(IMG_DIR / field("img_file", ""))}
    # (output column, source key) pairs copied over with "" as the default.
    for column, source_key in (
        ("question", "question"),
        ("answer", "answer"),
        ("rationale", "fact_surface"),
    ):
        row[column] = field(source_key, "")
    row["choices"] = ""
    return row

# Route every record to a split based on which name list contains its image.
rows_by_split = {"train": [], "val": [], "test": []}
for rec in qs_dict.values():
    img = rec.get("img_file", "")
    # Precedence is test, then val; anything else (listed under train or not
    # listed at all) ends up in train.
    split = next((name for name in ("test", "val") if img in split_sets[name]), "train")
    rows_by_split[split].append(record_to_row(rec))

# df_train = pd.DataFrame(rows_by_split["train"], columns=["image_path", "question", "answer", "rationale", "choices"])
# df_val = pd.DataFrame(rows_by_split["val"], columns=["image_path", "question", "answer", "rationale", "choices"])
# df_test = pd.DataFrame(rows_by_split["test"], columns=["image_path", "question", "answer", "rationale", "choices"])


In [None]:
import re
import pandas as pd

def map_image_path(p):
    """Rewrite a raw FVQA image path to the project image location.

    The directory pair ``fvqa/Images`` (or ``fvqa/images``) is replaced by
    ``data/images/fvqa``. The replacement is done in a single pass and only
    when the match is a whole path component (followed by ``/`` or the end of
    the string), so a file name that itself starts with ``images`` is not
    rewritten a second time.

    Args:
        p: Path-like value; it is converted with ``str`` first.

    Returns:
        str: The rewritten path, or the input string unchanged when it does
        not contain the source directory pair.
    """
    return re.sub(r"fvqa/[Ii]mages(?=/|$)", "data/images/fvqa", str(p))

def clean_rationale(text):
    """Normalise a rationale string into a single plain sentence.

    Steps: drop ``[[`` / ``]]`` markers, remove every character outside
    letters, digits, whitespace and ``.,!?;:'"()-/``, collapse whitespace
    runs to one space, and make sure the result ends with a period.

    Args:
        text: Raw rationale; falsy values (``None``, ``""``) are accepted.

    Returns:
        str: The cleaned sentence, or ``""`` when the input is falsy or
        nothing is left after cleaning (so no bare "." is produced).
    """
    if not text:
        return ""
    t = str(text).replace("[[", "").replace("]]", "")
    t = re.sub(r"[^A-Za-z0-9\s\.,!?;:'\"()\-/]", "", t)  # remove special chars like *
    t = re.sub(r"\s+", " ", t).strip()
    if not t:
        # Input consisted only of removed characters/whitespace.
        return ""
    if not t.endswith("."):
        t += "."
    return t

def build_df(rows_by_split, split):
    """Create the DataFrame for one split with paths and rationales cleaned.

    Args:
        rows_by_split: Mapping from split name to a list of row dicts.
        split: Key of the split to build.

    Returns:
        pandas.DataFrame with columns ``image_path``, ``question``,
        ``answer``, ``rationale`` and ``choices``; ``image_path`` is passed
        through ``map_image_path`` and ``rationale`` through
        ``clean_rationale``.
    """
    columns = ["image_path", "question", "answer", "rationale", "choices"]
    frame = pd.DataFrame(rows_by_split[split], columns=columns)
    # Column-wise clean-up, applied in the same order as listed.
    for column, transform in (
        ("image_path", map_image_path),
        ("rationale", clean_rationale),
    ):
        frame[column] = frame[column].apply(transform)
    return frame

In [None]:
# Build one cleaned DataFrame per split (train, val, test — in that order).
df_train, df_val, df_test = (
    build_df(rows_by_split, split_name) for split_name in ("train", "val", "test")
)
df_test

In [None]:
# Distinct image paths referenced by the train, val and test frames, one per line.
print(*(frame.image_path.nunique() for frame in (df_train, df_val, df_test)), sep="\n")
# Number of files matching *.JPEG directly under ../data/images/fvqa.
len(list(Path("../data/images/fvqa").glob("*.JPEG")))


# GPT-4o mini
Generate negative samples (the incorrect choices B, C and D) for each question.

In [None]:
# ! pip install openai
import os, json, random
from openai import OpenAI           # pip install --upgrade openai
from tokens import openai_key
# Shared OpenAI client, authenticated with the key imported from tokens.
client = OpenAI(api_key=openai_key)
# OpenAI API model ID for GPT-4o mini (the ID includes the "gpt-" prefix).
MODEL_NAME = "gpt-4o-mini"

In [None]:
import time
import re
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def _clean_item(s: str) -> str:
    # Only strip bullet formats like "(A)", "A.", "A)", not words starting with A-D
    s = re.sub(r"^\s*\([A-Da-d]\)\.?\s*", "", s.strip())  # strip (A), (B), etc.
    s = re.sub(r"^\s*[A-Da-d]\.\s*", "", s.strip())  # strip "A.", "B.", etc.
    s = re.sub(r"^\s*[\-\*\[\]]\s*", "", s.strip())  # strip bullets like "-", "*", "[", "]"
    s = s.strip().strip('"').strip("'")
    return s

def generate_distractors(client: OpenAI, question: str, correct: str, n: int = 3, model: str = None, temperature: float = 0.2, max_retries: int = 3) -> List[str]:
    """Return n wrong answers (strings) for the given question, distinct from correct.

    The model is asked for ``n`` short incorrect answers joined by ``|``. The
    reply is split on ``|``, each piece is cleaned with ``_clean_item``, and
    pieces equal to the correct answer or to an earlier piece (compared
    case-insensitively) are dropped. If fewer than ``n`` remain, the list is
    padded with ``"Option<k>"`` placeholders.

    Args:
        client: OpenAI client used for the chat completion request.
        question: Question text inserted into the prompt.
        correct: Correct answer that the distractors must differ from.
        n: Number of distractors to return.
        model: Model ID; falls back to ``MODEL_NAME`` when falsy.
        temperature: Sampling temperature passed to the API.
        max_retries: Number of attempts. Between failed attempts the function
            sleeps, starting at 1 second and doubling each time.

    Returns:
        List[str]: Exactly ``n`` strings. When every attempt fails (or
        ``max_retries`` is not positive) the list is ``["Option1", ...]`` and
        a ``RuntimeWarning`` carrying the last error is emitted, so the
        fallback is visible instead of silent.
    """
    import warnings  # local import keeps this block self-contained

    model = model or MODEL_NAME
    prompt = (
        f"Generate {n} different incorrect answers to the question: {question}\n"
        f"They must be different from the correct answer: {correct}\n"
        f"Each answer should be a short word or phrase. "
        f"Return them as a single line joined by the '|' character only."
    )
    correct_key = str(correct).strip().lower()
    backoff = 1.0
    last_error = None
    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=80,
            )
            text = resp.choices[0].message.content.strip()
            # De-duplicate case-insensitively; seeding the seen-set with the
            # correct answer filters it out with the same comparison.
            seen = {correct_key}
            uniq = []
            for segment in text.split("|"):
                item = _clean_item(segment)
                key = item.lower()
                if item and key not in seen:
                    seen.add(key)
                    uniq.append(item)
            # pad if fewer than n
            while len(uniq) < n:
                uniq.append(f"Option{len(uniq)+1}")
            return uniq[:n]
        except Exception as exc:
            last_error = exc
            # Back off before the next attempt; no sleep after the final one.
            if attempt < max_retries - 1:
                time.sleep(backoff)
                backoff *= 2
    # Last-resort fallback. The message omits the question so that repeated
    # identical failures collapse under the default warning filter.
    warnings.warn(
        f"Distractor generation failed after {max_retries} attempt(s); "
        f"returning placeholder options. Last error: {last_error!r}",
        RuntimeWarning,
        stacklevel=2,
    )
    return [f"Option{i}" for i in range(1, n + 1)]

def format_choices(correct: str, wrongs: List[str]) -> str:
    """Render the four-option MCQ string: A is the correct answer, B-D the wrong ones.

    Args:
        correct: Text shown for option (A).
        wrongs: List of wrong answers; only the first three are used. When
            fewer than three are given, the literal strings "B", "C", "D"
            fill the remaining slots in that order.

    Returns:
        str: Four lines of the form ``(X) text`` joined by newlines.
    """
    # Appending the fillers before slicing guarantees exactly three wrong options.
    options = [correct] + (wrongs + ["B", "C", "D"])[:3]
    return "\n".join(f"({label}) {text}" for label, text in zip("ABCD", options))

def add_mcq_choices(df, batch_size: int = 50, max_workers: int = 10):
    """Fill ``df['choices']`` with MCQ strings built from generated distractors.

    One ``generate_distractors`` call is submitted per row to a thread pool;
    results are stored by row position, so the output order matches the
    input order regardless of completion order.

    Args:
        df: DataFrame with ``question`` and ``answer`` columns. It is
            modified in place (the ``choices`` column is added/overwritten).
        batch_size: Accepted for interface compatibility; not used by the body.
        max_workers: Thread pool size.

    Returns:
        The same ``df`` object, with ``choices`` populated.
    """
    questions = df["question"].tolist()
    answers = df["answer"].tolist()
    n_total = len(questions)

    # Slot per row, filled as the corresponding request finishes.
    results = [None] * n_total
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to the row position it belongs to.
        pending = {
            pool.submit(generate_distractors, client, q, a, n=3, model=MODEL_NAME): position
            for position, (q, a) in enumerate(zip(questions, answers))
        }
        for done in tqdm(as_completed(pending), total=n_total, desc="Generating distractors"):
            results[pending[done]] = done.result()

    df["choices"] = [format_choices(answer, wrongs) for answer, wrongs in zip(answers, results)]
    return df

# Example:
# df_tmp = df_test.sample(10)
# add_mcq_choices(df_tmp)


In [None]:

# # Example:
# df_tmp = df_test.sample(10)
# add_mcq_choices(df_tmp)


In [None]:
# Process df_test in chunks of 100 rows
# Generate MCQ choices for the test split, one add_mcq_choices call per 100 rows.
chunk_size = 100
df_result_chunks = []

n_rows = len(df_test)
for start in tqdm(range(0, n_rows, chunk_size), desc="Processing chunks"):
    stop = min(start + chunk_size, n_rows)
    # Work on a copy of the slice: add_mcq_choices writes its "choices" column in place.
    df_result_chunks.append(add_mcq_choices(df_test.iloc[start:stop].copy()))
    print(f"Processed rows {start} to {stop}")

# Stitch the processed chunks back together with a fresh 0..n-1 index.
df_test_final = pd.concat(df_result_chunks, ignore_index=True)
print(f"Completed: {len(df_test_final)} rows processed")



In [None]:
# Process df_train in chunks of 100 rows
# Generate MCQ choices for the train split, one add_mcq_choices call per 100 rows.
chunk_size = 100
df_result_chunks = []

n_rows = len(df_train)
for start in tqdm(range(0, n_rows, chunk_size), desc="Processing chunks"):
    stop = min(start + chunk_size, n_rows)
    # Work on a copy of the slice: add_mcq_choices writes its "choices" column in place.
    df_result_chunks.append(add_mcq_choices(df_train.iloc[start:stop].copy()))
    print(f"Processed rows {start} to {stop}")

# Stitch the processed chunks back together with a fresh 0..n-1 index.
df_train_final = pd.concat(df_result_chunks, ignore_index=True)
print(f"Completed: {len(df_train_final)} rows processed")
df_train_final.head()


In [None]:
# Output folder for the parquet exports, created on demand.
PARQUET_DIR = FVQA_DIR / "parquet"
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

# Train and test carry the generated choices; val is written as built by build_df.
for file_name, frame in (
    ("train.parquet", df_train_final),
    ("val.parquet", df_val),
    ("test.parquet", df_test_final),
):
    frame.to_parquet(PARQUET_DIR / file_name, index=False)

In [None]:
from tokens import HF_TOKEN
from huggingface_hub import HfApi, create_repo, upload_folder, upload_file
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

# Route every Hub call through this client so the HF_TOKEN passed here is
# actually used for authentication.
api = HfApi(token=HF_TOKEN)
repo_id = "JJoy333/RationaleVQA"
# exist_ok=True makes the cell safe to re-run once the repo exists.
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Upload the parquet folder into the "FVQA" directory of the dataset repo.
api.upload_folder(
    folder_path=str(PARQUET_DIR),
    repo_id=repo_id,
    repo_type="dataset",
    path_in_repo="FVQA",
)
