<a href="https://colab.research.google.com/github/JenanHajajra/VLM-Project---NLP/blob/main/FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1: Environment Setup and Imports

In [None]:
# --- Install core deps (first run only) ---
!pip -q install "transformers>=4.43.0" accelerate sentencepiece safetensors bitsandbytes pillow

# (Optional) mount Drive if your images live there.
# NOTE: `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os, torch, re
from PIL import Image

# Pick your image root here if using Drive.
# NOTE(review): the experiment cells further down define their own PROJECT_ROOT
# with the same path; keep the two in sync if you change this.
IMG_ROOT = "/content/drive/MyDrive/NLP_Project"  # <-- change if needed
# Quick sanity check that the runtime actually has a CUDA device attached.
print("GPU available:", torch.cuda.is_available())


61.3/61.3 MB 38.7 MB/s eta 0:00:00
Mounted at /content/drive
GPU available: True


# Part 2: Load VLMs


In [None]:
# --- Configuration: choose your VLM here ---
# MODEL_CHOICE is read at call time by ask_vlm() and is embedded in the result
# file names, so change it *before* running an experiment cell.
MODEL_CHOICE = "llava"   # "llava" or "qwen"

# You can also override the exact checkpoints here if you like
# (Hugging Face Hub model ids).
LLAVA_ID = "llava-hf/llava-1.5-7b-hf"
QWEN_ID  = "Qwen/Qwen2-VL-7B-Instruct"  # strong, instruction-tuned

# Shared, strict VQA-style instruction (keeps answers short & comparable).
# The same text is prepended to every question for both models.
VQA_INSTRUCTION = (
    "Answer concisely based ONLY on the image.\n"
    "- If the answer is yes/no → reply exactly 'yes' or 'no'.\n"
    "- If the answer is a number → output only the numeral (e.g., 2).\n"
    "- Otherwise → reply with a short phrase (<= 3 words)."
)

# "cuda" when a GPU is visible to torch, otherwise "cpu".
# NOTE(review): the loaders in this notebook place the models with
# device_map="auto" and do not read this variable.
device = "cuda" if torch.cuda.is_available() else "cpu"

def tidy_text(txt: str) -> str:
    """Trim *txt* and squeeze every run of whitespace down to one space."""
    return re.sub(r"\s+", " ", txt.strip())


# VLM Function



LLaVA loader + ask() function:

In [None]:
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Module-level LLaVA state, filled in by load_llava() and read by ask_llava().
llava_model = None
llava_processor = None
llava_quant_mode = None   # "fp16" or "4-bit" once a model has been loaded

def load_llava(model_id=LLAVA_ID):
    """Load a LLaVA checkpoint and its processor into the module-level globals.

    The model is first loaded in fp16. If that raises for any reason, the load
    is retried with 4-bit NF4 quantization. The fallback passes an explicit
    ``BitsAndBytesConfig`` through ``quantization_config`` (the same way the
    Qwen2-VL loader in this notebook does) instead of handing the
    ``load_in_4bit`` / ``bnb_4bit_*`` options to ``from_pretrained`` directly.

    Args:
        model_id: Hugging Face Hub id (or local path) of the checkpoint.

    Side effects:
        Sets ``llava_model``, ``llava_processor`` and ``llava_quant_mode``,
        switches the model to eval mode and prints which mode was used.
    """
    global llava_model, llava_processor, llava_quant_mode
    # Local import: the cell-level import only brings in AutoProcessor and
    # LlavaForConditionalGeneration.
    from transformers import BitsAndBytesConfig

    try:
        llava_model = LlavaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
        )
        llava_quant_mode = "fp16"
    except Exception as e:
        print("fp16 load failed, falling back to 4-bit NF4 quantization:", e)
        bnb_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        llava_model = LlavaForConditionalGeneration.from_pretrained(
            model_id,
            quantization_config=bnb_cfg,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        llava_quant_mode = "4-bit"
    # use_fast=False is kept from the original cell (slow processor variant).
    llava_processor = AutoProcessor.from_pretrained(model_id, use_fast=False)
    llava_model.eval()
    print(f"[LLaVA] Loaded {model_id} in {llava_quant_mode}")

@torch.inference_mode()
def ask_llava(image_path: str, question: str, max_new_tokens: int = 24) -> str:
    """Ask the loaded LLaVA model one question about one image.

    Args:
        image_path: Path of the image file; it is converted to RGB.
        question: Question text, appended after the shared VQA_INSTRUCTION.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer (prompt tokens removed), whitespace-normalized by
        tidy_text().

    Raises:
        RuntimeError: If load_llava() has not populated the model/processor.
    """
    # Explicit raise instead of `assert`: assertions are stripped under `python -O`.
    if llava_model is None or llava_processor is None:
        raise RuntimeError("Call load_llava() first.")

    # convert() returns an independent in-memory image, so the file handle can
    # be released right away (this also covers formats Pillow keeps open).
    with Image.open(image_path) as src:
        image = src.convert("RGB")

    # LLaVA chat-style prompt (must include <image>)
    prompt = f"USER: <image>\n{VQA_INSTRUCTION}\nQuestion: {question}\nASSISTANT:"

    inputs = llava_processor(images=image, text=prompt, return_tensors="pt").to(llava_model.device)
    output_ids = llava_model.generate(
        **inputs,
        do_sample=False,                 # deterministic for evaluation
        max_new_tokens=max_new_tokens,
        pad_token_id=llava_processor.tokenizer.eos_token_id,
        eos_token_id=llava_processor.tokenizer.eos_token_id,
    )
    # Keep only newly generated tokens (drop the prompt)
    input_len = inputs["input_ids"].shape[1]
    gen_ids = output_ids[:, input_len:]
    text = llava_processor.batch_decode(gen_ids, skip_special_tokens=True)[0]
    return tidy_text(text)


**Qwen2-VL loader + ask() function**

In [None]:
# --- Qwen2-VL loader + ask() (fixed) ---
!pip -q install "transformers>=4.44.0" bitsandbytes  # ensure recent transformers
from transformers import AutoProcessor, BitsAndBytesConfig
from transformers import Qwen2VLForConditionalGeneration
import torch
from PIL import Image
import re

# Module-level Qwen2-VL state, populated by load_qwen() and read by ask_qwen().
qwen_model = qwen_processor = qwen_quant_mode = None

def load_qwen(model_id=QWEN_ID):
    """Load Qwen2-VL plus its processor into the module-level globals.

    fp16 is attempted first; if that raises, the checkpoint is loaded again
    with a 4-bit NF4 BitsAndBytesConfig. The model is put in eval mode and the
    mode that succeeded is printed.
    """
    global qwen_model, qwen_processor, qwen_quant_mode

    placement = {"device_map": "auto"}
    try:
        # First choice: plain half precision.
        qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=torch.float16, **placement
        )
        qwen_quant_mode = "fp16"
    except Exception as e:
        print("fp16 load failed, falling back to 4-bit NF4 quantization:", e)
        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id, quantization_config=nf4_config, **placement
        )
        qwen_quant_mode = "4-bit"

    qwen_processor = AutoProcessor.from_pretrained(model_id)
    qwen_model.eval()
    print(f"[Qwen2-VL] Loaded {model_id} in {qwen_quant_mode}")

def tidy_text(txt: str) -> str:
    """Collapse whitespace runs in *txt* to single spaces and trim both ends."""
    squeezed = re.sub(r"\s+", " ", txt)
    return squeezed.strip()

@torch.inference_mode()
def ask_qwen(image_path: str, question: str, max_new_tokens: int = 24) -> str:
    """Ask the loaded Qwen2-VL model one question about one image.

    A single user message carrying the image and the instruction + question is
    rendered with the processor's chat template, then answered with greedy
    decoding.

    Args:
        image_path: Path of the image file; it is converted to RGB.
        question: Question text, appended after the shared VQA_INSTRUCTION.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer (prompt tokens removed), whitespace-normalized by
        tidy_text().

    Raises:
        RuntimeError: If load_qwen() has not populated the model/processor.
    """
    # Explicit raise instead of `assert`: assertions are stripped under `python -O`.
    if qwen_model is None or qwen_processor is None:
        raise RuntimeError("Call load_qwen() first.")

    # convert() returns an independent in-memory image, so the file handle can
    # be released right away (this also covers formats Pillow keeps open).
    with Image.open(image_path) as src:
        image = src.convert("RGB")

    content = [
        {"type": "image", "image": image},
        {"type": "text",  "text": f"{VQA_INSTRUCTION}\nQuestion: {question}"},
    ]
    messages = [{"role": "user", "content": content}]

    # Build chat prompt (text) and tensors
    chat_text = qwen_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Important: pass lists so batch dims align
    inputs = qwen_processor(
        text=[chat_text],
        images=[image],
        return_tensors="pt"
    ).to(qwen_model.device)

    out = qwen_model.generate(
        **inputs,
        do_sample=False,  # deterministic for evaluation
        max_new_tokens=max_new_tokens,
        pad_token_id=qwen_processor.tokenizer.eos_token_id,
        eos_token_id=qwen_processor.tokenizer.eos_token_id,
    )

    # Keep only the newly generated tokens
    input_len = inputs["input_ids"].shape[1]
    gen_ids = out[:, input_len:]
    text = qwen_processor.batch_decode(gen_ids, skip_special_tokens=True)[0]
    return tidy_text(text)


**Unified loader + unified ask_vlm(...)**

In [None]:
# --- Unified load/ask interface ---

def load_vlm(which: str = MODEL_CHOICE, force: bool = False):
    """Make sure the requested VLM is loaded, loading it only when needed.

    The per-model loaders always call ``from_pretrained``. This function is
    invoked again by the experiment cells, so an unconditional call would
    reload a multi-GB checkpoint that is already in memory. The load is
    therefore skipped when both the model and its processor globals are set.

    Args:
        which: "llava" or "qwen" (case-insensitive).
        force: Reload even if the model is already loaded, e.g. after changing
            LLAVA_ID / QWEN_ID.

    Raises:
        ValueError: If *which* is neither "llava" nor "qwen".
    """
    which = which.lower()
    if which == "llava":
        prefix, loader, model_id = "llava", load_llava, LLAVA_ID
    elif which == "qwen":
        prefix, loader, model_id = "qwen", load_qwen, QWEN_ID
    else:
        raise ValueError("MODEL_CHOICE must be 'llava' or 'qwen'.")

    # globals().get(): the state variables exist only once the matching loader
    # cell has been executed, so a plain name lookup could raise NameError.
    state = globals()
    already_loaded = (
        state.get(f"{prefix}_model") is not None
        and state.get(f"{prefix}_processor") is not None
    )
    if already_loaded and not force:
        print(f"[load_vlm] '{which}' is already loaded; skipping (pass force=True to reload).")
        return
    loader(model_id)

def ask_vlm(image_path: str, question: str, max_new_tokens: int = 24) -> str:
    """Route one image/question pair to the backend named by MODEL_CHOICE.

    MODEL_CHOICE is read (case-insensitively) on every call. Anything other
    than "llava" or "qwen" raises ValueError.
    """
    backend = MODEL_CHOICE.lower()
    if backend not in ("llava", "qwen"):
        raise ValueError("MODEL_CHOICE must be 'llava' or 'qwen'.")
    if backend == "qwen":
        return ask_qwen(image_path, question, max_new_tokens=max_new_tokens)
    return ask_llava(image_path, question, max_new_tokens=max_new_tokens)


In [None]:
# --- Load the chosen model ---
# Populates the module-level model/processor globals for MODEL_CHOICE.
load_vlm(MODEL_CHOICE)
# NOTE(review): requests and io are not referenced by the cells visible in this
# part of the notebook; presumably used elsewhere - confirm before removing.
import requests, io

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

chat_template.json: 0.00B [00:00, ?B/s]

[Qwen2-VL] Loaded Qwen/Qwen2-VL-7B-Instruct in fp16


**Part 3: Experiments**

# Experiment 1: Real images

**Experiment A: real images with Qwen and LLaVA (first run)**

In [None]:
# ================================
# Experiment 1: LLaVA and Qwen - real and generated images --> Real_Generated_results folder
# ================================
# This cell:
#  - Loads questions from JSON file
#  - For each split (real / generated), finds the image by image_id in the given folder
#  - Asks the selected VLM (MODEL_CHOICE: "llava" or "qwen") the question about the image
#  - Saves results to JSON with fields: image_id, question_id, answer
#
# REQUIREMENTS:
#  - You already ran the earlier cells (setup, loaders, and ask_vlm).
#  - MODEL_CHOICE is set to "llava" or "qwen".
#  - Drive is mounted if images are in Drive.

import os, json, re, glob, time
from pathlib import Path
from typing import Dict, List, Optional

# ---------- CHANGE ME: your project paths ----------
# All experiment inputs and outputs are resolved relative to PROJECT_ROOT.
PROJECT_ROOT = Path("/content/drive/MyDrive/NLP_Project")   # <--- your NLP_Project folder
REAL_FOLDER  = PROJECT_ROOT / "real_images"                       # <--- folder name for real images
GEN_FOLDER   = PROJECT_ROOT / "captionsasis_photorealistic"                  # <--- folder name for generated images
QUESTIONS_JSON = PROJECT_ROOT / "filtered_questions.json"           # <--- your questions file path

# Optional: a file with image IDs to include (one per line or a JSON list).
# Set to None to use all IDs in the questions file.
IMAGE_ID_LIST_PATH: Optional[Path] = None  # e.g., PROJECT_ROOT / "image_ids.txt"

# Where to save results (created on the spot if it does not exist yet)
RESULTS_DIR = PROJECT_ROOT / "Real_Generated_results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# ---------- Helper: load the optional image id whitelist ----------
def load_id_list(path: Optional[Path]) -> Optional[set]:
    """Load an optional whitelist of image ids.

    Supported files:
      - ``.txt``: one integer id per line (blank lines are ignored)
      - ``.json``: either a list of ids or ``{"ids": [...]}``

    Args:
        path: Whitelist file, or None for "no whitelist".

    Returns:
        A set of int ids, or None when *path* is None **or** the file could
        not be loaded. In the failure case a warning is printed; callers treat
        None as "no filtering", so a broken whitelist means all ids are used.
    """
    if path is None:
        return None
    p = str(path)
    try:
        if p.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                # int() tolerates surrounding whitespace / the trailing newline.
                return {int(line) for line in f if line.strip()}
        if p.endswith(".json"):
            # Context manager so the handle is closed even if parsing fails.
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict) and "ids" in data:
                return {int(x) for x in data["ids"]}
            if isinstance(data, list):
                return {int(x) for x in data}
            raise ValueError("Unsupported JSON format for ID list (use list or {'ids': [...]})")
        raise ValueError("Unsupported ID list file type (use .txt or .json)")
    except (OSError, ValueError, TypeError) as e:
        # Best effort by design: report the problem and fall back to "no whitelist".
        print(f"WARNING: Failed to load id list from {path}: {e}")
        return None

# ---------- Helper: index a folder by image_id (expects img{ID}.jpg/png) ----------
# File extensions treated as images (compared case-insensitively).
VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp"}

def build_index_by_id(folder: Path) -> Dict[int, Path]:
    """Scan *folder* recursively and map image_id -> image path.

    The id is taken from the lower-cased file stem: ``img73`` gives 73;
    otherwise the last run of digits in the stem is used. Files without any
    digit in the stem are ignored.

    Files are visited in sorted path order and extensions are matched
    case-insensitively, so ``IMG7.JPG`` is indexed too and, when two files map
    to the same id, the one that is kept is the same on every run.

    Args:
        folder: Root directory to scan.

    Returns:
        Dict from image id to the first path found for it. Empty (with a
        printed warning) when *folder* does not exist.
    """
    idx: Dict[int, Path] = {}
    if not folder.exists():
        print(f"WARNING: folder does not exist: {folder}")
        return idx

    # One directory walk; sorted() pins the order used for duplicate handling.
    files = sorted(
        p for p in folder.rglob("*")
        if p.is_file() and p.suffix.lower() in VALID_EXTS
    )

    pat_primary = re.compile(r"^img(\d+)$")  # exact match without extension
    for p in files:
        stem = p.stem.lower()  # filename without extension
        m = pat_primary.match(stem)
        if m:
            # clean case: img73 -> 73
            image_id = int(m.group(1))
        else:
            # fallback: grab the last integer sequence in the stem
            nums = re.findall(r"\d+", stem)
            if not nums:
                continue
            image_id = int(nums[-1])

        # Keep first occurrence; report any later file that maps to the same id.
        kept = idx.setdefault(image_id, p)
        if kept != p:
            print(f"NOTE: duplicate image_id {image_id} in {folder.name}:")
            print(f"      keeping {kept.name}, ignoring {p.name}")

    print(f"Indexed {len(idx)} images in '{folder.name}'")
    return idx

# ---------- Helper: run over one split/folder ----------
def run_split(folder: Path, split_name: str, questions_path: Path, id_whitelist: Optional[set],
              out_path: Path, max_new_tokens: int = 24, progress_every: int = 100):
    """Ask the selected VLM every question for one image folder and save the answers.

    Args:
        folder: Where the images live (e.g., REAL_FOLDER or GEN_FOLDER).
        split_name: Label such as "real" or "generated" (used for logging only).
        questions_path: JSON file holding a list of objects with the keys
            ``image_id``, ``question`` and ``question_id``.
        id_whitelist: Optional set of image ids to restrict to (None = all).
        out_path: JSON output file; its parent directory is created if needed.
        max_new_tokens: Generation budget forwarded to ask_vlm().
        progress_every: Print a progress line every N answered questions
            (0 or None disables it).

    Returns:
        List of ``{image_id, question_id, answer}`` dicts. When ask_vlm()
        raises, the answer is an ``__ERROR__: ...`` string rather than the
        whole run being aborted.
    """
    # Load questions
    with open(questions_path, "r", encoding="utf-8") as f:
        questions = json.load(f)

    # Build image index for quick lookup
    index = build_index_by_id(folder)

    results: List[dict] = []
    missing_imgs = 0
    asked = 0
    t0 = time.time()

    for q in questions:
        image_id = int(q["image_id"])
        question  = q["question"]
        qid       = int(q["question_id"])

        if id_whitelist is not None and image_id not in id_whitelist:
            continue

        img_path = index.get(image_id)
        if img_path is None:
            # No image for this id in the folder: count it and move on.
            missing_imgs += 1
            continue

        try:
            ans = ask_vlm(str(img_path), question, max_new_tokens=max_new_tokens)
        except Exception as e:
            # Keep going: one failing sample should not lose the whole split.
            ans = f"__ERROR__: {type(e).__name__}: {e}"

        results.append({
            "image_id": image_id,
            "question_id": qid,
            "answer": ans,
        })
        asked += 1

        if progress_every and asked % progress_every == 0:
            dt = time.time() - t0
            print(f"[{split_name}] processed {asked} Qs  | missing imgs: {missing_imgs} | elapsed: {dt:.1f}s")

    # Save as JSON array (make sure the target directory exists first)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"[{split_name}] DONE. Saved {len(results)} items to: {out_path}")
    if missing_imgs:
        print(f"[{split_name}] WARNING: {missing_imgs} questions were skipped (image not found in folder).")

    return results

# ---------- Run both splits ----------
# Ensure your model is loaded (respects MODEL_CHOICE from earlier)
load_vlm(MODEL_CHOICE)

whitelist = load_id_list(IMAGE_ID_LIST_PATH)  # or None
# One timestamp shared by both output files so the pair is easy to match up.
ts = time.strftime("%Y%m%d-%H%M%S")

out_real = RESULTS_DIR / f"exp1_real_{MODEL_CHOICE}_{ts}.json"
out_gen  = RESULTS_DIR / f"exp1_generated_{MODEL_CHOICE}_{ts}.json"

# Both splits are asked the same questions file; only the image folder differs.
print("Running REAL split...")
res_real = run_split(REAL_FOLDER, "real", QUESTIONS_JSON, whitelist, out_real, max_new_tokens=24)

print("\nRunning GENERATED split...")
res_gen  = run_split(GEN_FOLDER, "generated", QUESTIONS_JSON, whitelist, out_gen, max_new_tokens=24)

print("\nSummary:")
print(f"  Real:      {len(res_real)} answers  -> {out_real}")
print(f"  Generated: {len(res_gen)} answers  -> {out_gen}")


`torch_dtype` is deprecated! Use `dtype` instead!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

KeyboardInterrupt: 

# General Code for Experiments

In [None]:
# ===========================================
# General VLM Experiment Runner (JSON output)
# - Works with LLaVA or Qwen2-VL via ask_vlm()
# - Supports both question schemas:
#   v1: {image_id:int, question:str, question_id:int}
#   v2: {image_id:str, question:str, question_id:str, answer:optional}
# - Expects images named like "img73.jpg/png" (extracts last integer)
# - Saves per-split JSON to results/<exp_name>/
# ===========================================

import os, json, re, time
from pathlib import Path
from typing import Dict, List, Optional

# ---------- Project root & default results root ----------
# (Change PROJECT_ROOT once; per-experiment cells override only folders & files)
PROJECT_ROOT = Path("/content/drive/MyDrive/NLP_Project")
# Base folder for experiment outputs; run_experiment() writes to a
# sub-folder named after the experiment. Created here if missing.
DEFAULT_RESULTS_ROOT = PROJECT_ROOT / "results"
DEFAULT_RESULTS_ROOT.mkdir(parents=True, exist_ok=True)

# ---------- Filename parsing ----------
# File extensions treated as images (compared case-insensitively).
VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp"}

def build_index_by_id(folder: Path) -> Dict[int, Path]:
    """Index a folder recursively: map image_id (int) -> Path.

    Accepts filenames like 'img73.jpg', 'img00073.png', or any basename with
    digits; the *last* integer in the stem is used as the id. Files whose stem
    holds no digit are ignored.

    Files are visited in sorted path order and extensions are matched
    case-insensitively, so 'IMG7.JPG' is indexed too and the file kept for a
    duplicated id is the same on every run.

    Args:
        folder: Root directory to scan.

    Returns:
        Dict from image id to the first path found for it. Empty (with a
        printed warning) when *folder* does not exist.
    """
    idx: Dict[int, Path] = {}
    if not folder.exists():
        print(f"[WARN] folder does not exist: {folder}")
        return idx

    # One directory walk; sorted() pins the order used for duplicate handling.
    files = sorted(
        p for p in folder.rglob("*")
        if p.is_file() and p.suffix.lower() in VALID_EXTS
    )

    pat_exact = re.compile(r"^img(\d+)$", re.IGNORECASE)

    for p in files:
        stem = p.stem.lower()
        m = pat_exact.match(stem)
        if m:
            image_id = int(m.group(1))
        else:
            nums = re.findall(r"\d+", stem)
            if not nums:
                continue
            image_id = int(nums[-1])

        # Keep first, warn on duplicates
        kept = idx.setdefault(image_id, p)
        if kept != p:
            print(f"[NOTE] duplicate image_id {image_id} in {folder.name}; keeping {kept.name}, ignoring {p.name}")

    print(f"[index] {folder.name}: {len(idx)} images indexed")
    return idx

# ---------- Question loading (schema-agnostic) ----------
def _to_int_or_none(x):
    try:
        return int(str(x).strip())
    except:
        return None

def load_questions_generic(questions_json: Path) -> List[dict]:
    """
    Read a questions JSON file and normalise every entry to
      { 'image_id': int, 'question_id': str, 'question': str, 'gt_answer': Optional[str] }

    Entries whose image_id cannot be parsed as an integer, or whose question
    text is empty after stripping, are dropped. 'gt_answer' is taken from the
    entry's 'answer' key and is None when that key is absent.
    """
    with open(questions_json, "r", encoding="utf-8") as fh:
        raw_entries = json.load(fh)

    unified = []
    for entry in raw_entries:
        image_id = _to_int_or_none(entry.get("image_id"))
        if image_id is None:
            continue

        question_id = str(entry.get("question_id"))
        text = str(entry.get("question", "")).strip()
        if text:
            unified.append(
                {
                    "image_id": image_id,
                    "question_id": question_id,
                    "question": text,
                    "gt_answer": entry.get("answer"),  # may be absent
                }
            )
    return unified

# ---------- Optional whitelist ----------
def load_id_list(path: Optional[Path]) -> Optional[set]:
    """Load an optional whitelist of image ids.

    Supported files:
      - ``.txt``: one integer id per line (blank lines are ignored)
      - ``.json``: either a list of ids or ``{"ids": [...]}``

    Args:
        path: Whitelist file, or None for "no whitelist".

    Returns:
        A set of int ids, or None when *path* is None **or** the list could
        not be loaded (unreadable file, bad content, unsupported type/shape).
        Every failure prints a ``[WARN]`` line; callers treat None as "no
        filtering", so a broken whitelist means all ids are used.
    """
    if path is None:
        return None
    name = str(path)
    try:
        if name.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                # int() tolerates surrounding whitespace / the trailing newline.
                return {int(line) for line in f if line.strip()}
        if name.endswith(".json"):
            # Context manager so the handle is closed even if parsing fails.
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict) and "ids" in data:
                return {int(x) for x in data["ids"]}
            if isinstance(data, list):
                return {int(x) for x in data}
            print(f"[WARN] unsupported JSON format in id list {path} (use list or {{'ids': [...]}})")
            return None
        print(f"[WARN] unsupported id list file type {path} (use .txt or .json)")
    except (OSError, ValueError, TypeError) as e:
        # Best effort by design: report the problem and fall back to "no whitelist".
        print(f"[WARN] failed to load id list {path}: {e}")
    return None

# ---------- Core split runner ----------
def run_split(
    split_name: str,
    image_folder: Path,
    questions: List[dict],
    out_path: Path,
    id_whitelist: Optional[set] = None,
    max_new_tokens: int = 24,
    progress_every: int = 100,
    minimal_json: bool = True,
):
    """
    Answer every question for one split and dump the rows to a JSON array.

    - split_name: label like 'real', 'generated', 'cartoon', etc. (logging/output)
    - image_folder: Path to images for this split
    - questions: unified list from load_questions_generic()
    - out_path: where to write the JSON array (parent folders are created)
    - id_whitelist: optional set of image ids; other ids are skipped
    - minimal_json=True writes rows of {image_id, question_id, answer};
      False adds question, split, model, image_path and gt_answer_from_file.

    Questions whose image is not in the folder index are counted and skipped.
    An exception from ask_vlm() is recorded as an "__ERROR__: ..." answer.
    Returns the list of rows.
    """
    id_to_path = build_index_by_id(image_folder)

    rows: List[dict] = []
    n_missing = 0
    n_asked = 0
    started = time.time()

    for item in questions:
        iid = item["image_id"]
        if not (id_whitelist is None or iid in id_whitelist):
            continue

        qid, question = item["question_id"], item["question"]
        img_path = id_to_path.get(iid)
        if img_path is None:
            n_missing += 1
            continue

        try:
            pred = ask_vlm(str(img_path), question, max_new_tokens=max_new_tokens)
        except Exception as err:
            pred = f"__ERROR__: {type(err).__name__}: {err}"

        # Start from the minimal row; debugging fields are appended on request.
        row = {"image_id": iid, "question_id": qid, "answer": pred}
        if not minimal_json:
            row.update(
                question=question,
                split=split_name,
                model=MODEL_CHOICE,
                image_path=str(img_path),
                gt_answer_from_file=item.get("gt_answer"),
            )
        rows.append(row)
        n_asked += 1

        if progress_every and n_asked % progress_every == 0:
            elapsed = time.time() - started
            print(f"[{split_name}] {n_asked} Qs | missing imgs: {n_missing} | {elapsed:.1f}s")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as sink:
        json.dump(rows, sink, ensure_ascii=False, indent=2)

    print(f"[{split_name}] saved {len(rows)} rows -> {out_path}")
    if n_missing:
        print(f"[{split_name}] WARNING: {n_missing} questions skipped (image not found).")

    return rows

# ---------- One-call experiment runner ----------
def run_experiment(
    exp_name: str,
    split_folders: Dict[str, Path],
    questions_path: Path,
    results_root: Path = DEFAULT_RESULTS_ROOT,
    image_id_list_path: Optional[Path] = None,
    max_new_tokens: int = 24,
    progress_every: int = 100,
    minimal_json: bool = True,
):
    """
    Run one experiment: every split is asked the same questions.

    exp_name: name for this experiment (outputs go to <results_root>/<exp_name>/)
    split_folders: mapping like {"real": REAL_FOLDER, "generated": GEN_FOLDER}
    questions_path: path to questions JSON (v1 or v2)
    results_root: base results folder
    image_id_list_path: optional whitelist file handed to load_id_list()

    The remaining arguments are forwarded unchanged to run_split(). One file
    named <split>_<MODEL_CHOICE>_<timestamp>.json is written per split, and a
    per-split answer count is printed at the end.
    """
    # Model first (driven by the global MODEL_CHOICE), then the shared inputs.
    load_vlm(MODEL_CHOICE)
    questions = load_questions_generic(questions_path)
    whitelist = load_id_list(image_id_list_path)

    # A single timestamp tags all files produced by this call.
    stamp = time.strftime("%Y%m%d-%H%M%S")
    exp_dir = results_root / exp_name
    exp_dir.mkdir(parents=True, exist_ok=True)

    forwarded = {
        "questions": questions,
        "id_whitelist": whitelist,
        "max_new_tokens": max_new_tokens,
        "progress_every": progress_every,
        "minimal_json": minimal_json,
    }

    answer_counts = {}
    for split_name, folder in split_folders.items():
        target = exp_dir / f"{split_name}_{MODEL_CHOICE}_{stamp}.json"
        print(f"\n>>> Running split '{split_name}' from {folder}")
        rows = run_split(
            split_name=split_name,
            image_folder=folder,
            out_path=target,
            **forwarded,
        )
        answer_counts[split_name] = len(rows)

    print("\n=== Experiment done ===")
    for label, count in answer_counts.items():
        print(f"  {label}: {count} answers")
    print(f"Results in: {exp_dir}")


# EXPERIMENT 2 - Real vs Generated (RealAsIs) - DONE



In [None]:
# --------- EXPERIMENT 2 CELL: second question schema ---------
# Compares answers on the real images with answers on the photorealistic
# generated images, using the same question file for both splits.

EXP_NAME = "exp2_real_vs_generated"                         # results/exp2_real_vs_generated/
QUESTIONS_JSON = PROJECT_ROOT / "Lama_Questions" /"vqa_pairs_from_caption_new.json"         # v2 schema with string IDs and optional 'answer'

# split label -> image folder; the label is used in logs and output file names
SPLITS = {
    "real":      PROJECT_ROOT / "real_images",
    "generated": PROJECT_ROOT / "captionsasis_photorealistic",
}

IMAGE_ID_LIST = None  # or a path to filter your 632 ids

# Uses whichever model MODEL_CHOICE currently selects.
run_experiment(
    exp_name=EXP_NAME,
    split_folders=SPLITS,
    questions_path=QUESTIONS_JSON,
    results_root=DEFAULT_RESULTS_ROOT,  # will write to results/exp2_real_vs_generated/
    image_id_list_path=IMAGE_ID_LIST,
    max_new_tokens=24,
    progress_every=100,
    minimal_json=True,
)


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[LLaVA] Loaded llava-hf/llava-1.5-7b-hf in fp16

>>> Running split 'real' from /content/drive/MyDrive/NLP_Project/real_images
[index] real_images: 632 images indexed
[real] 100 Qs | missing imgs: 0 | 26.6s
[real] 200 Qs | missing imgs: 0 | 41.7s
[real] 300 Qs | missing imgs: 0 | 56.7s
[real] 400 Qs | missing imgs: 3 | 72.3s
[real] 500 Qs | missing imgs: 6 | 87.0s
[real] 600 Qs | missing imgs: 6 | 101.7s
[real] 700 Qs | missing imgs: 6 | 116.1s
[real] 800 Qs | missing imgs: 6 | 130.8s
[real] 900 Qs | missing imgs: 9 | 145.4s
[real] 1000 Qs | missing imgs: 9 | 160.3s
[real] 1100 Qs | missing imgs: 9 | 175.1s
[real] 1200 Qs | missing imgs: 9 | 189.9s
[real] 1300 Qs | missing imgs: 9 | 204.8s
[real] 1400 Qs | missing imgs: 12 | 219.9s
[real] 1500 Qs | missing imgs: 15 | 234.5s
[real] 1600 Qs | missing imgs: 15 | 249.5s
[real] 1700 Qs | missing imgs: 15 | 264.6s
[real] 1800 Qs | missing imgs: 15 | 279.4s
[real] saved 1890 rows -> /content/drive/MyDrive/NLP_Project/results/exp2_real_vs_gener

# EXPERIMENT 3 - RealAsIs vs CartoonAsIs

In [None]:
# --------- EXPERIMENT 3 CELL ----------------------
# Compares two generated image sets: photorealistic vs cartoonish.
EXP_NAME = "exp3_RealAsIs_vs_cartoonAsIs"                         # results/exp3_RealAsIs_vs_cartoonAsIs/
QUESTIONS_JSON = PROJECT_ROOT / "Lama_Questions" /"vqa_pairs_from_caption_new.json"
# NOTE: "RealAsIs" points at the photorealistic *generated* folder
# (captionsasis_photorealistic), not at real_images.
SPLITS = {
    "RealAsIs":      PROJECT_ROOT / "captionsasis_photorealistic",
    "cartoonAsIs": PROJECT_ROOT / "captionasis_cartoonish",
}

IMAGE_ID_LIST = None  # or a path to filter your 632 ids

# Uses whichever model MODEL_CHOICE currently selects.
run_experiment(
    exp_name=EXP_NAME,
    split_folders=SPLITS,
    questions_path=QUESTIONS_JSON,
    results_root=DEFAULT_RESULTS_ROOT,
    image_id_list_path=IMAGE_ID_LIST,
    max_new_tokens=24,
    progress_every=100,
    minimal_json=True,
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

chat_template.json: 0.00B [00:00, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[Qwen2-VL] Loaded Qwen/Qwen2-VL-7B-Instruct in fp16

>>> Running split 'RealAsIs' from /content/drive/MyDrive/NLP_Project/captionsasis_photorealistic
[index] captionsasis_photorealistic: 632 images indexed
[RealAsIs] 100 Qs | missing imgs: 0 | 18.1s
[RealAsIs] 200 Qs | missing imgs: 0 | 33.0s
[RealAsIs] 300 Qs | missing imgs: 0 | 48.0s
[RealAsIs] 400 Qs | missing imgs: 3 | 63.3s
[RealAsIs] 500 Qs | missing imgs: 6 | 78.5s
[RealAsIs] 600 Qs | missing imgs: 6 | 93.2s
[RealAsIs] 700 Qs | missing imgs: 6 | 108.2s
[RealAsIs] 800 Qs | missing imgs: 6 | 123.3s
[RealAsIs] 900 Qs | missing imgs: 9 | 138.6s
[RealAsIs] 1000 Qs | missing imgs: 9 | 153.9s
[RealAsIs] 1100 Qs | missing imgs: 9 | 169.0s
[RealAsIs] 1200 Qs | missing imgs: 9 | 184.2s
[RealAsIs] 1300 Qs | missing imgs: 9 | 199.3s
[RealAsIs] 1400 Qs | missing imgs: 12 | 214.6s
[RealAsIs] 1500 Qs | missing imgs: 15 | 230.2s
[RealAsIs] 1600 Qs | missing imgs: 15 | 245.5s
[RealAsIs] 1700 Qs | missing imgs: 15 | 260.9s
[RealAsIs] 1800 Qs | mi

# EXPERIMENT 4: RealAsIs VS RealSimp

In [None]:
# --------- EXPERIMENT 4 CELL ----------------------
# Runs the same question set over two image folders so their answers can be
# compared: photorealistic images from "as is" captions vs. from simplified
# captions (reading of the folder names -- TODO confirm).
EXP_NAME = "exp4_RealAsIs_vs_RealSimp"
QUESTIONS_JSON = PROJECT_ROOT / "Lama_Questions" /"vqa_pairs_from_both.json"  # one question file shared by both splits
SPLITS = {
    # split label -> folder holding that split's images
    "RealAsIs":      PROJECT_ROOT / "captionsasis_photorealistic",
    "RealSimp": PROJECT_ROOT / "captionsimplified_photorealistic",
}

IMAGE_ID_LIST = None  # or a path to filter your 632 ids

# NOTE(review): the model is not selected in this cell; presumably
# run_experiment picks it up from MODEL_CHOICE -- confirm before re-running.
run_experiment(
    exp_name=EXP_NAME,
    split_folders=SPLITS,
    questions_path=QUESTIONS_JSON,
    results_root=DEFAULT_RESULTS_ROOT,
    image_id_list_path=IMAGE_ID_LIST,
    max_new_tokens=24,
    progress_every=100,
    minimal_json=True,
)


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[LLaVA] Loaded llava-hf/llava-1.5-7b-hf in fp16

>>> Running split 'RealAsIs' from /content/drive/MyDrive/NLP_Project/captionsasis_photorealistic
[index] captionsasis_photorealistic: 632 images indexed
[RealAsIs] 100 Qs | missing imgs: 0 | 16.8s
[RealAsIs] 200 Qs | missing imgs: 0 | 31.5s
[RealAsIs] 300 Qs | missing imgs: 0 | 46.5s
[RealAsIs] 400 Qs | missing imgs: 3 | 61.6s
[RealAsIs] 500 Qs | missing imgs: 6 | 76.5s
[RealAsIs] 600 Qs | missing imgs: 6 | 91.1s
[RealAsIs] 700 Qs | missing imgs: 6 | 106.1s
[RealAsIs] 800 Qs | missing imgs: 6 | 121.1s
[RealAsIs] 900 Qs | missing imgs: 9 | 135.7s
[RealAsIs] 1000 Qs | missing imgs: 9 | 151.1s
[RealAsIs] 1100 Qs | missing imgs: 9 | 166.4s
[RealAsIs] 1200 Qs | missing imgs: 9 | 181.4s
[RealAsIs] 1300 Qs | missing imgs: 9 | 196.2s
[RealAsIs] 1400 Qs | missing imgs: 12 | 211.1s
[RealAsIs] 1500 Qs | missing imgs: 15 | 225.9s
[RealAsIs] 1600 Qs | missing imgs: 15 | 240.7s
[RealAsIs] 1700 Qs | missing imgs: 15 | 255.4s
[RealAsIs] 1800 Qs | missin

# EXPERIMENT 5: CartoonAsIs VS CartoonSimp

In [None]:
# --------- EXPERIMENT 5 CELL ----------------------
# Runs the same question set over two cartoon-style image folders so their
# answers can be compared ("as is" captions vs. simplified captions, judging
# by the folder names -- TODO confirm).
EXP_NAME = "exp5_CartoonAsIs_vs_CartoonSimp"
QUESTIONS_JSON = PROJECT_ROOT / "Lama_Questions" /"vqa_pairs_from_both.json"
SPLITS = {
    # split label -> folder holding that split's images.
    # The folder really is "captionasis_..." (no "s" after "caption"), unlike
    # the photorealistic "captionsasis_..." folder; do not "fix" the spelling.
    "CartoonAsIs":      PROJECT_ROOT / "captionasis_cartoonish",
    "CartoonSimp": PROJECT_ROOT / "captionsimplified_cartoonish",
}

IMAGE_ID_LIST = None  # or a path to filter your 632 ids

# NOTE(review): the model is not selected in this cell; presumably
# run_experiment picks it up from MODEL_CHOICE -- confirm before re-running.
run_experiment(
    exp_name=EXP_NAME,
    split_folders=SPLITS,
    questions_path=QUESTIONS_JSON,
    results_root=DEFAULT_RESULTS_ROOT,
    image_id_list_path=IMAGE_ID_LIST,
    max_new_tokens=24,
    progress_every=100,
    minimal_json=True,
)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

[Qwen2-VL] Loaded Qwen/Qwen2-VL-7B-Instruct in fp16

>>> Running split 'CartoonAsIs' from /content/drive/MyDrive/NLP_Project/captionasis_cartoonish
[index] captionasis_cartoonish: 632 images indexed


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[CartoonAsIs] 100 Qs | missing imgs: 0 | 27.5s
[CartoonAsIs] 200 Qs | missing imgs: 0 | 41.7s
[CartoonAsIs] 300 Qs | missing imgs: 0 | 57.4s
[CartoonAsIs] 400 Qs | missing imgs: 3 | 72.5s
[CartoonAsIs] 500 Qs | missing imgs: 6 | 87.1s
[CartoonAsIs] 600 Qs | missing imgs: 6 | 101.5s
[CartoonAsIs] 700 Qs | missing imgs: 6 | 116.1s
[CartoonAsIs] 800 Qs | missing imgs: 6 | 131.0s
[CartoonAsIs] 900 Qs | missing imgs: 9 | 145.9s
[CartoonAsIs] 1000 Qs | missing imgs: 9 | 160.9s
[CartoonAsIs] 1100 Qs | missing imgs: 9 | 175.6s
[CartoonAsIs] 1200 Qs | missing imgs: 9 | 190.2s
[CartoonAsIs] 1300 Qs | missing imgs: 9 | 205.4s
[CartoonAsIs] 1400 Qs | missing imgs: 12 | 220.2s
[CartoonAsIs] 1500 Qs | missing imgs: 15 | 234.8s
[CartoonAsIs] 1600 Qs | missing imgs: 15 | 249.1s
[CartoonAsIs] 1700 Qs | missing imgs: 15 | 264.0s
[CartoonAsIs] 1800 Qs | missing imgs: 15 | 278.9s
[CartoonAsIs] saved 1848 rows -> /content/drive/MyDrive/NLP_Project/results/exp5_CartoonAsIs_vs_CartoonSimp/CartoonAsIs_qwen_20

# EXPERIMENT 6: RealSimp VS CartoonSimp

In [None]:
# --------- EXPERIMENT 6 CELL ----------------------
# Runs the same question set over the two "simplified caption" image folders
# (photorealistic vs. cartoonish, judging by the folder names -- TODO confirm).
EXP_NAME = "exp6_RealSimp_vs_CartoonSimp"
QUESTIONS_JSON = PROJECT_ROOT / "Lama_Questions" /"vqa_pairs_from_both.json"
SPLITS = {
    # split label -> folder holding that split's images
    "RealSimp":      PROJECT_ROOT / "captionsimplified_photorealistic",
    "CartoonSimp": PROJECT_ROOT / "captionsimplified_cartoonish",
}

IMAGE_ID_LIST = None  # or a path to filter your 632 ids

# NOTE(review): the model is not selected in this cell; presumably
# run_experiment picks it up from MODEL_CHOICE -- confirm before re-running.
run_experiment(
    exp_name=EXP_NAME,
    split_folders=SPLITS,
    questions_path=QUESTIONS_JSON,
    results_root=DEFAULT_RESULTS_ROOT,
    image_id_list_path=IMAGE_ID_LIST,
    max_new_tokens=24,
    progress_every=100,
    minimal_json=True,
)


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[LLaVA] Loaded llava-hf/llava-1.5-7b-hf in fp16

>>> Running split 'RealSimp' from /content/drive/MyDrive/NLP_Project/captionsimplified_photorealistic
[index] captionsimplified_photorealistic: 632 images indexed
[RealSimp] 100 Qs | missing imgs: 0 | 17.3s
[RealSimp] 200 Qs | missing imgs: 0 | 32.6s
[RealSimp] 300 Qs | missing imgs: 0 | 47.8s
[RealSimp] 400 Qs | missing imgs: 3 | 63.5s
[RealSimp] 500 Qs | missing imgs: 6 | 78.6s
[RealSimp] 600 Qs | missing imgs: 6 | 93.7s
[RealSimp] 700 Qs | missing imgs: 6 | 108.7s
[RealSimp] 800 Qs | missing imgs: 6 | 123.9s
[RealSimp] 900 Qs | missing imgs: 9 | 139.2s
[RealSimp] 1000 Qs | missing imgs: 9 | 154.7s
[RealSimp] 1100 Qs | missing imgs: 9 | 170.1s
[RealSimp] 1200 Qs | missing imgs: 9 | 185.4s
[RealSimp] 1300 Qs | missing imgs: 9 | 200.5s
[RealSimp] 1400 Qs | missing imgs: 12 | 216.3s
[RealSimp] 1500 Qs | missing imgs: 15 | 231.3s
[RealSimp] 1600 Qs | missing imgs: 15 | 246.5s
[RealSimp] 1700 Qs | missing imgs: 15 | 261.1s
[RealSimp] 1800 Q

# Similarities and Analysis - ASEEL


# General Evaluation Script

In [None]:
import json
import torch
from transformers import CLIPModel, CLIPProcessor
from collections import Counter
import re

# Setup CLIP
# These three module-level names are read by clip_score() below, so this cell
# must run before any evaluation cell. The model is placed on the GPU when
# one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def preprocess_text(text):
    """Normalize an answer string so that two answers can be compared.

    The text is lowercased, punctuation is removed, runs of whitespace are
    collapsed to a single space, and surrounding whitespace is trimmed.
    Finally, a few whole-answer synonyms are mapped to one canonical form.

    Args:
        text: Raw answer string.

    Returns:
        The normalized answer string.
    """
    text = text.lower()
    # Remove punctuation (anything that is neither a word char nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Normalize spaces. Trimming has to happen *after* punctuation removal:
    # "yes ." would otherwise become "yes " and fail an exact-match check.
    text = re.sub(r'\s+', ' ', text).strip()
    # Map common synonyms (extend as needed). Only an answer that consists
    # entirely of the synonym is replaced; words inside longer answers are kept.
    replacements = {
        "bike": "bicycle",
        "tv": "television",
        "cellphone": "phone",
        "mobile": "phone",
    }
    return replacements.get(text, text)


def clip_score(pred, refs):
    """Score a prediction against reference answers with CLIP text embeddings.

    The prediction and every reference are normalized with preprocess_text,
    embedded in a single batch by the CLIP text encoder, and compared by
    cosine similarity.

    Args:
        pred: Predicted answer string.
        refs: Iterable of reference answer strings.

    Returns:
        A list of cosine similarities, one per reference, in the order of refs.
    """
    # Row 0 of the batch is the prediction; the references follow.
    batch = [preprocess_text(pred)]
    batch.extend(preprocess_text(ref) for ref in refs)

    encoded = clip_processor(
        text=batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        embeddings = clip_model.get_text_features(**encoded)

    # Slicing with [:1] keeps the batch dimension on the prediction row.
    query, candidates = embeddings[:1], embeddings[1:]
    return torch.cosine_similarity(query, candidates, dim=1).cpu().tolist()

# --- VQA-style evaluation (10 answers) ---
def evaluate_vqa(preds_path, anno_path):
    """Score predictions against VQA-style annotations with several answers each.

    Both files are JSON lists. Each prediction needs "question_id" and
    "answer"; each annotation needs "question_id", "answers" (a list of
    objects with an "answer" field) and "multiple_choice_answer".
    Predictions whose question id has no annotation, or whose annotation has
    no reference answers, are skipped.

    Args:
        preds_path: Path to the predictions JSON file.
        anno_path: Path to the annotations JSON file.

    Returns:
        A dict with the mean over all scored questions of:
          "max":      highest CLIP similarity to any reference answer,
          "majority": CLIP similarity to "multiple_choice_answer",
          "weighted": CLIP similarity averaged over the distinct reference
                      answers, weighted by how often each occurs,
          "official": min(1, matches / 3) on normalized exact string match.
        Every value is 0.0 when no question could be scored.
    """
    with open(preds_path, 'r') as f:
        preds = json.load(f)
    with open(anno_path, 'r') as f:
        annotations = json.load(f)

    gt_map = {ann["question_id"]: ann for ann in annotations}
    results = {"max": [], "majority": [], "weighted": [], "official": []}

    for item in preds:
        qid = item["question_id"]
        pred = item["answer"]
        ann = gt_map.get(qid)
        if ann is None:
            continue

        refs = [a["answer"] for a in ann["answers"]]
        if not refs:
            # No reference answers: max() and the frequency weights would be
            # undefined, so the question cannot be scored.
            continue
        counts = Counter(refs)
        total = len(refs)
        unique_refs = list(counts)

        # One CLIP pass over the distinct references serves both the "max"
        # and the "weighted" score; duplicates cannot change the maximum.
        sims = clip_score(pred, unique_refs)

        # max similarity
        results["max"].append(max(sims))

        # majority (dataset consensus)
        results["majority"].append(clip_score(pred, [ann["multiple_choice_answer"]])[0])

        # weighted by frequency
        weighted = sum(sim * (counts[ans] / total) for sim, ans in zip(sims, unique_refs))
        results["weighted"].append(weighted)

        # official VQA metric (string exact match); normalize the prediction once
        norm_pred = preprocess_text(pred)
        matches = sum(1 for r in refs if preprocess_text(r) == norm_pred)
        results["official"].append(min(1.0, matches / 3.0))

    return {k: sum(v) / len(v) if v else 0.0 for k, v in results.items()}

# --- Generated Q/A evaluation (1 answer) ---
def evaluate_generated(preds_path, anno_path):
    """Score predictions against generated Q/A pairs that have one answer each.

    Both files are JSON lists whose items carry a "question_id"; predictions
    also carry an "answer". A prediction is scored only when an annotation
    with the same question id exists and that annotation has an "answer".

    Args:
        preds_path: Path to the predictions JSON file.
        anno_path: Path to the annotations JSON file.

    Returns:
        {"clip_similarity": mean CLIP similarity over the scored predictions},
        or 0.0 as the value when nothing was scored.
    """
    with open(preds_path, 'r') as pred_file:
        predictions = json.load(pred_file)
    with open(anno_path, 'r') as anno_file:
        gold_entries = json.load(anno_file)

    gold_by_qid = {entry["question_id"]: entry for entry in gold_entries}

    similarities = []
    for record in predictions:
        qid, predicted = record["question_id"], record["answer"]
        entry = gold_by_qid.get(qid)
        # Skip unknown questions and annotations without a gold answer.
        if entry is None or "answer" not in entry:
            continue
        similarities.append(clip_score(predicted, [entry["answer"]])[0])

    if similarities:
        mean_similarity = sum(similarities) / len(similarities)
    else:
        mean_similarity = 0.0
    return {"clip_similarity": mean_similarity}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

**Print low-score Examples**

In [None]:
def inspect_low_scores(preds_path, anno_path, k=10):
    """Print the k question/answer pairs with the lowest CLIP similarity.

    Intended for debugging the generated (single-answer) dataset. Only
    predictions whose question id has an annotation with an "answer" field
    are considered.

    Args:
        preds_path: Path to the predictions JSON file.
        anno_path: Path to the annotations JSON file.
        k: Number of lowest-scoring examples to print.
    """
    with open(preds_path, 'r') as pred_file:
        predictions = json.load(pred_file)
    with open(anno_path, 'r') as anno_file:
        gold_entries = json.load(anno_file)

    gold_by_qid = {entry["question_id"]: entry for entry in gold_entries}

    rows = []
    for record in predictions:
        qid, predicted = record["question_id"], record["answer"]
        entry = gold_by_qid.get(qid)
        if entry is None or "answer" not in entry:
            continue
        reference = entry["answer"]
        similarity = clip_score(predicted, [reference])[0]
        rows.append((similarity, qid, entry.get("question", ""), reference, predicted))

    # Ascending by similarity only, so ties keep their order from the predictions file.
    worst = sorted(rows, key=lambda row: row[0])[:k]

    print(f"Lowest {k} scoring examples:")
    separator = "=" * 60
    for similarity, qid, question, reference, predicted in worst:
        print(separator)
        print(f"QID: {qid}")
        print(f"Question: {question}")
        print(f"Gold Answer: {reference}")
        print(f"Model Prediction: {predicted}")
        print(f"CLIP Similarity: {similarity:.4f}")

# Experiment 1: Real images

LLaVA:

In [None]:
# Experiment 1, LLaVA: score the real-image predictions against the
# multi-answer VQA annotations (all four evaluate_vqa metrics).
real_path = "/content/drive/My Drive/NLP_Project/results/exp1_real_VQA/exp1_real_llava_20250923-170654.json"
anno_path = "/content/drive/My Drive/NLP_Project/VQA/VQA_annotations.json"

real_eval = evaluate_vqa(real_path, anno_path)

print("Real Images:", real_eval)

Real Images: {'max': 0.9837213574749667, 'majority': 0.9628511457545001, 'weighted': 0.9532556863147311, 'official': 0.7494918699186992}


Qwen:

In [None]:
# Experiment 1, Qwen: score the real-image predictions against the
# multi-answer VQA annotations (all four evaluate_vqa metrics).
real_path = "/content/drive/My Drive/NLP_Project/results/exp1_real_VQA/exp1_real_qwen_20250925-114223.json"
anno_path = "/content/drive/My Drive/NLP_Project/VQA/VQA_annotations.json"

real_eval = evaluate_vqa(real_path, anno_path)

print("Real Images:", real_eval)

Real Images: {'max': 0.989928912862045, 'majority': 0.9702899942492567, 'weighted': 0.9592556699565271, 'official': 0.8266260162601626}


# Experiment 2: Real vs Generated (as is)

LLaVA:

In [None]:
# Experiment 2, LLaVA: the same single-answer question file is used to score
# the "real" and the "generated" prediction files, so the two means are
# directly comparable.
real_path = "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/real_llava_20250925-150348.json"
gen_path = "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/generated_llava_20250925-150348.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json"

real_eval = evaluate_generated(real_path, anno_path)
gen_eval = evaluate_generated(gen_path, anno_path)

print("Real Images:", real_eval)
print("Generated Images:", gen_eval)

Real Images: {'clip_similarity': 0.9227603209712518}
Generated Images: {'clip_similarity': 0.92420543249322}


In [None]:
# Experiment 2, LLaVA, real images: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/real_llava_20250925-150348.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=10  # show the 10 worst examples
)

Lowest 10 scoring examples:
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: Fruit
CLIP Similarity: 0.6101
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: Brown
CLIP Similarity: 0.6250
QID: 22103_3
Question: What are the people doing?
Gold Answer: feeding
Model Prediction: Feeding giraffe
CLIP Similarity: 0.6555
QID: 8045_1
Question: What is on the platform?
Gold Answer: tub and shower
Model Prediction: Bench
CLIP Similarity: 0.6691
QID: 12706_2
Question: What is the scene around?
Gold Answer: a crowd of pedestrians
Model Prediction: Eiffel tower
CLIP Similarity: 0.6762
QID: 14175_2
Question: What is the woman walking past?
Gold Answer: posters and graffiti
Model Prediction: Fire hydrant
CLIP Similarity: 0.6837
QID: 48708_1
Question: What color is the chair?
Gold Answer: lime green
Model Prediction: Brown
CLIP Similarity: 0.6859
QID: 20470_3
Question: What are the two giraffe heads 

In [None]:
# Experiment 2, LLaVA, generated images: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/generated_llava_20250925-150348.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=10  # show the 10 worst examples
)

Lowest 10 scoring examples:
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: Fruit
CLIP Similarity: 0.6101
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: White
CLIP Similarity: 0.6438
QID: 27065_2
Question: Where is the towel hanging?
Gold Answer: beneath the sink
Model Prediction: Wall
CLIP Similarity: 0.6674
QID: 3466_2
Question: What is the man wearing?
Gold Answer: orange and black shoes
Model Prediction: Shirt
CLIP Similarity: 0.6732
QID: 19817_3
Question: What is the toilet sitting next to?
Gold Answer: sink
Model Prediction: Shower curtain
CLIP Similarity: 0.6847
QID: 1779_1
Question: What is in the intersection?
Gold Answer: street sign and a building
Model Prediction: Car
CLIP Similarity: 0.6943
QID: 12754_1
Question: What is the scene from?
Gold Answer: Mad Men
Model Prediction: Kitchen
CLIP Similarity: 0.6950
QID: 20470_3
Question: What are the two giraffe heads standing

Qwen:

In [None]:
# Experiment 2, Qwen: the same single-answer question file is used to score
# the "real" and the "generated" prediction files, so the two means are
# directly comparable.
real_path = "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/real_qwen_20250925-124559.json"
gen_path = "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/generated_qwen_20250925-124559.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json"

real_eval = evaluate_generated(real_path, anno_path)
gen_eval = evaluate_generated(gen_path, anno_path)

print("Real Images:", real_eval)
print("Generated Images:", gen_eval)

Real Images: {'clip_similarity': 0.9248381099688313}
Generated Images: {'clip_similarity': 0.9225965219515341}


Inspect low-score examples:


In [None]:
# Experiment 2, Qwen, real images: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/real_qwen_20250925-124559.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=10  # show the 10 worst examples
)

Lowest 10 scoring examples:
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: fruit
CLIP Similarity: 0.6101
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: brown and white
CLIP Similarity: 0.6508
QID: 22103_3
Question: What are the people doing?
Gold Answer: feeding
Model Prediction: feeding giraffe
CLIP Similarity: 0.6555
QID: 44165_1
Question: What is the caretaker doing?
Gold Answer: looking after
Model Prediction: feeding elephant
CLIP Similarity: 0.6710
QID: 12706_2
Question: What is the scene around?
Gold Answer: a crowd of pedestrians
Model Prediction: Eiffel Tower
CLIP Similarity: 0.6762
QID: 1146_1
Question: What is the man wearing?
Gold Answer: black attire
Model Prediction: shirt and tie
CLIP Similarity: 0.6789
QID: 48708_1
Question: What color is the chair?
Gold Answer: lime green
Model Prediction: brown
CLIP Similarity: 0.6859
QID: 8045_1
Question: What is on the platfor

In [None]:
# Experiment 2, Qwen, generated images: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp2_real_vs_generated/generated_qwen_20250925-124559.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=10  # show the 10 worst examples
)

Lowest 10 scoring examples:
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: Orange/white
CLIP Similarity: 0.6501
QID: 42492_1
Question: What is in the center of the kitchen?
Gold Answer: stove top island
Model Prediction: Man reading.
CLIP Similarity: 0.6562
QID: 49255_2
Question: What is next to the pole?
Gold Answer: a large green tree
Model Prediction: Power lines.
CLIP Similarity: 0.6599
QID: 45099_2
Question: What is in the picture?
Gold Answer: sink and toilet
Model Prediction: A woman taking a selfie in a bathroom.
CLIP Similarity: 0.6785
QID: 44520_2
Question: What are the signs on the pole?
Gold Answer: stop sign and railway crossing sign
Model Prediction: Traffic.
CLIP Similarity: 0.6826
QID: 23731_2
Question: What is the kitty sleeping in?
Gold Answer: an orange chair
Model Prediction: bed
CLIP Similarity: 0.6830
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: Yogurt, bl

# Experiment 3: Realistic vs Cartoonish (as is)

LLaVA:

In [None]:
# Experiment 3, LLaVA: score the RealAsIs and cartoonAsIs prediction files
# against the same single-answer question file.
realistic_path = "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/RealAsIs_llava_20250925-155035.json"
cartoonish_path = "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/cartoonAsIs_llava_20250925-155035.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json"

realistic_eval = evaluate_generated(realistic_path, anno_path)
cartoonish_eval = evaluate_generated(cartoonish_path, anno_path)

print("Realistic Images:", realistic_eval)
print("Cartoonish Images:", cartoonish_eval)

Realistic Images: {'clip_similarity': 0.92420543249322}
Cartoonish Images: {'clip_similarity': 0.9222634219303333}


In [None]:
# Experiment 3, LLaVA, RealAsIs split: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/RealAsIs_llava_20250925-155035.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=100  # show the 100 worst examples
)

Lowest 100 scoring examples:
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: Fruit
CLIP Similarity: 0.6101
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: White
CLIP Similarity: 0.6438
QID: 27065_2
Question: Where is the towel hanging?
Gold Answer: beneath the sink
Model Prediction: Wall
CLIP Similarity: 0.6674
QID: 3466_2
Question: What is the man wearing?
Gold Answer: orange and black shoes
Model Prediction: Shirt
CLIP Similarity: 0.6732
QID: 19817_3
Question: What is the toilet sitting next to?
Gold Answer: sink
Model Prediction: Shower curtain
CLIP Similarity: 0.6847
QID: 1779_1
Question: What is in the intersection?
Gold Answer: street sign and a building
Model Prediction: Car
CLIP Similarity: 0.6943
QID: 12754_1
Question: What is the scene from?
Gold Answer: Mad Men
Model Prediction: Kitchen
CLIP Similarity: 0.6950
QID: 20470_3
Question: What are the two giraffe heads standin

In [None]:
# Experiment 3, LLaVA, cartoonAsIs split: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/cartoonAsIs_llava_20250925-155035.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=100  # show the 100 worst examples
)

Lowest 100 scoring examples:
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: Fruit
CLIP Similarity: 0.6101
QID: 10800_1
Question: What is in the bathroom?
Gold Answer: sink, counter, toilet, shower curtain
Model Prediction: Mirror
CLIP Similarity: 0.6219
QID: 49444_1
Question: What is the woman standing in front of?
Gold Answer: an elephant float
Model Prediction: Fence
CLIP Similarity: 0.6409
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: White
CLIP Similarity: 0.6438
QID: 27065_2
Question: Where is the towel hanging?
Gold Answer: beneath the sink
Model Prediction: Wall
CLIP Similarity: 0.6674
QID: 3466_2
Question: What is the man wearing?
Gold Answer: orange and black shoes
Model Prediction: Jeans
CLIP Similarity: 0.6686
QID: 44520_2
Question: What are the signs on the pole?
Gold Answer: stop sign and railway crossing sign
Model Prediction: Traffic
CLIP Similarity: 0.6826
QID: 2

Qwen:

In [None]:
# Experiment 3, Qwen: score the RealAsIs and cartoonAsIs prediction files
# against the same single-answer question file.
realistic_path = "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/RealAsIs_qwen_20250925-163204.json"
cartoonish_path = "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/cartoonAsIs_qwen_20250925-163204.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json"

realistic_eval = evaluate_generated(realistic_path, anno_path)
cartoonish_eval = evaluate_generated(cartoonish_path, anno_path)

print("Realistic Images:", realistic_eval)
print("Cartoonish Images:", cartoonish_eval)

Realistic Images: {'clip_similarity': 0.9225965219515341}
Cartoonish Images: {'clip_similarity': 0.921841015954497}


Low scores:

In [None]:
# Experiment 3, Qwen, RealAsIs split: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/RealAsIs_qwen_20250925-163204.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=10  # show the 10 worst examples
)

Lowest 10 scoring examples:
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: Orange/white
CLIP Similarity: 0.6501
QID: 42492_1
Question: What is in the center of the kitchen?
Gold Answer: stove top island
Model Prediction: Man reading.
CLIP Similarity: 0.6562
QID: 49255_2
Question: What is next to the pole?
Gold Answer: a large green tree
Model Prediction: Power lines.
CLIP Similarity: 0.6599
QID: 45099_2
Question: What is in the picture?
Gold Answer: sink and toilet
Model Prediction: A woman taking a selfie in a bathroom.
CLIP Similarity: 0.6785
QID: 44520_2
Question: What are the signs on the pole?
Gold Answer: stop sign and railway crossing sign
Model Prediction: Traffic.
CLIP Similarity: 0.6826
QID: 23731_2
Question: What is the kitty sleeping in?
Gold Answer: an orange chair
Model Prediction: bed
CLIP Similarity: 0.6830
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: Yogurt, bl

In [None]:
# Experiment 3, Qwen, cartoonAsIs split: list the weakest predictions.
inspect_low_scores(
    "/content/drive/My Drive/NLP_Project/results/exp3_RealAsIs_vs_cartoonAsIs/cartoonAsIs_qwen_20250925-163204.json",
    "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_caption_new.json",
    k=10  # show the 10 worst examples
)

Lowest 10 scoring examples:
QID: 23731_1
Question: What color is the kitty?
Gold Answer: calico
Model Prediction: White
CLIP Similarity: 0.6438
QID: 45099_2
Question: What is in the picture?
Gold Answer: sink and toilet
Model Prediction: Woman taking selfie.
CLIP Similarity: 0.6522
QID: 22103_3
Question: What are the people doing?
Gold Answer: feeding
Model Prediction: Feeding giraffe
CLIP Similarity: 0.6555
QID: 8718_1
Question: What is in the breakfast bowl?
Gold Answer: blueberry sconce and strawberry
Model Prediction: Ice cream, blueberries, strawberries.
CLIP Similarity: 0.6678
QID: 27285_2
Question: What is on top of the cabinets?
Gold Answer: oak
Model Prediction: range hood
CLIP Similarity: 0.6679
QID: 3466_2
Question: What is the man wearing?
Gold Answer: orange and black shoes
Model Prediction: Jeans
CLIP Similarity: 0.6686
QID: 45176_1
Question: What is looking away from its image?
Gold Answer: pug dog
Model Prediction: No
CLIP Similarity: 0.6802
QID: 1779_1
Question: What i

# EXPERIMENT 4: RealAsIs VS RealSimp

LLaVA:

In [None]:
# Experiment 4, LLaVA: score the RealAsIs and RealSimp prediction files
# against the shared "from_both" question file.
realasis_path = "/content/drive/My Drive/NLP_Project/results/exp4_RealAsIs_vs_RealSimp/RealAsIs_llava_20250926-123231.json"
realsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp4_RealAsIs_vs_RealSimp/RealSimp_llava_20250926-123231.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_both.json"

realasis_eval = evaluate_generated(realasis_path, anno_path)
realsimplified_eval = evaluate_generated(realsimplified_path, anno_path)

print("as is Images:", realasis_eval)
print("simplified Images:", realsimplified_eval)

as is Images: {'clip_similarity': 0.9267699686866818}
simplified Images: {'clip_similarity': 0.9184878476254352}


Qwen:

In [None]:
# Experiment 4, Qwen: score the RealAsIs and RealSimp prediction files
# against the shared "from_both" question file.
realasis_path = "/content/drive/My Drive/NLP_Project/results/exp4_RealAsIs_vs_RealSimp/RealAsIs_qwen_20250926-121715.json"
realsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp4_RealAsIs_vs_RealSimp/RealSimp_qwen_20250926-121715.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_both.json"

realasis_eval = evaluate_generated(realasis_path, anno_path)
realsimplified_eval = evaluate_generated(realsimplified_path, anno_path)

print("as is Images:", realasis_eval)
print("simplified Images:", realsimplified_eval)

as is Images: {'clip_similarity': 0.9253802693638451}
simplified Images: {'clip_similarity': 0.9180462026557366}


# EXPERIMENT 5: CartoonAsIs VS CartoonSimp

LLaVA:

In [None]:
# Experiment 5, LLaVA: score the CartoonAsIs and CartoonSimp prediction files
# against the shared "from_both" question file.
cartoonasis_path = "/content/drive/My Drive/NLP_Project/results/exp5_CartoonAsIs_vs_CartoonSimp/CartoonAsIs_llava_20250926-124519.json"
cartoonsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp5_CartoonAsIs_vs_CartoonSimp/CartoonSimp_llava_20250926-124519.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_both.json"

cartoonasis_eval = evaluate_generated(cartoonasis_path, anno_path)
cartoonsimplified_eval = evaluate_generated(cartoonsimplified_path, anno_path)

print("as is Images:", cartoonasis_eval)
print("simplified Images:", cartoonsimplified_eval)

as is Images: {'clip_similarity': 0.9247826178159032}
simplified Images: {'clip_similarity': 0.9182502226460547}


Qwen:

In [None]:
# Experiment 5, Qwen: score the CartoonAsIs and CartoonSimp prediction files
# against the shared "from_both" question file.
cartoonasis_path = "/content/drive/My Drive/NLP_Project/results/exp5_CartoonAsIs_vs_CartoonSimp/CartoonAsIs_qwen_20250927-081305.json"
cartoonsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp5_CartoonAsIs_vs_CartoonSimp/CartoonSimp_qwen_20250927-081305.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_both.json"

cartoonasis_eval = evaluate_generated(cartoonasis_path, anno_path)
cartoonsimplified_eval = evaluate_generated(cartoonsimplified_path, anno_path)

print("as is Images:", cartoonasis_eval)
print("simplified Images:", cartoonsimplified_eval)

as is Images: {'clip_similarity': 0.9244877906653272}
simplified Images: {'clip_similarity': 0.9177374112464133}


# EXPERIMENT 6: RealSimp VS CartoonSimp

LLaVA:

In [None]:
# Experiment 6, LLaVA: score the RealSimp and CartoonSimp prediction files
# against the shared "from_both" question file.
# NOTE: the saved output of this cell is a NameError for evaluate_generated,
# i.e. it was last run in a session where the "General Evaluation Script"
# cell had not been executed. Run that cell first, then re-run this one.
realsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp6_RealSimp_vs_CartoonSimp/RealSimp_llava_20250927-085004.json"
cartoonsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp6_RealSimp_vs_CartoonSimp/CartoonSimp_llava_20250927-085004.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_both.json"

realsimplified_eval = evaluate_generated(realsimplified_path, anno_path)
cartoonsimplified_eval = evaluate_generated(cartoonsimplified_path, anno_path)

print("real simplified Images:", realsimplified_eval)
print("cartoon simplified Images:", cartoonsimplified_eval)

NameError: name 'evaluate_generated' is not defined

Qwen:

In [None]:
# Experiment 6, Qwen: score the RealSimp and CartoonSimp prediction files
# against the shared "from_both" question file.
# Requires evaluate_generated from the "General Evaluation Script" cell to be
# defined in the current session.
realsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp6_RealSimp_vs_CartoonSimp/RealSimp_qwen_20250927-083701.json"
cartoonsimplified_path = "/content/drive/My Drive/NLP_Project/results/exp6_RealSimp_vs_CartoonSimp/CartoonSimp_qwen_20250927-083701.json"
anno_path = "/content/drive/My Drive/NLP_Project/Lama_Questions/vqa_pairs_from_both.json"

realsimplified_eval = evaluate_generated(realsimplified_path, anno_path)
cartoonsimplified_eval = evaluate_generated(cartoonsimplified_path, anno_path)

print("real simplified Images:", realsimplified_eval)
print("cartoon simplified Images:", cartoonsimplified_eval)