**CELL 1 — Setup (Drive + paths + model id)**

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never paste a token literal into the notebook -- anything saved here
# is leaked to everyone the notebook is shared with. The token is taken from the
# HF_TOKEN environment variable; if it is not set, it is prompted for with hidden
# input and cached in the process environment so later cells do not prompt again.
# NOTE(review): any token that was ever saved in this notebook should be revoked
# at https://huggingface.co/settings/tokens and replaced with a new one.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ").strip()
login(token=os.environ["HF_TOKEN"])


In [None]:
!pip -q install transformers accelerate torch --upgrade

from google.colab import drive
drive.mount('/content/drive')

import os, json, csv, uuid
from typing import List, Dict

# ====== PATHS (edit BASE_DIR if your folder name differs) ======
# Everything lives under one Drive folder: inputs are read from <BASE_DIR>/inputs
# and results are written to <BASE_DIR>/outputs. Both folders are created on
# first run so the later cells can rely on them existing.
BASE_DIR = "/content/drive/MyDrive/NLP_Project"
INPUTS_DIR = BASE_DIR + "/inputs"      # put your CSV/JSONL here
OUTPUTS_DIR = BASE_DIR + "/outputs"
for _dir in (INPUTS_DIR, OUTPUTS_DIR):
    os.makedirs(_dir, exist_ok=True)

# Choose one input file; both formats are supported.
#   CSV   -> columns: image_id, caption
#   JSONL -> one object per line: {"image_id": ..., "caption": ...}
CSV_PATH = INPUTS_DIR + "/filtered_first_only_ordered.csv"
JSONL_PATH = INPUTS_DIR + "/filtered_first_only_ordered.jsonl"

# ====== MODEL (LLaMA Instruct) ======
# HF_TOKEN is optional here (only needed if your HF account requires login for
# the model). MODEL_ID can be swapped for another instruct model, e.g.
# "Qwen/Qwen2.5-7B-Instruct".
HF_TOKEN = ""
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**CELL 2 — Load model**

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: the token must not be written into the notebook. It is read from the
# HF_TOKEN environment variable, or prompted for with hidden input when missing,
# and cached in the process environment so this section can be re-run without
# prompting again.
# NOTE(review): any token that was ever saved in this notebook should be revoked
# at https://huggingface.co/settings/tokens and replaced with a new one.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ").strip()
login(token=os.environ["HF_TOKEN"])


In [None]:
import os
from getpass import getpass

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# STEP 1: Obtain the token WITHOUT hardcoding it in the notebook.
# It is read from the HF_TOKEN environment variable; when that is not set the
# user is prompted with hidden input. The value is cached in the process
# environment so re-running the cell does not prompt again.
# NOTE(review): any token that was ever saved in this notebook should be revoked
# at https://huggingface.co/settings/tokens and replaced with a new one.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ").strip()
os.environ["HF_TOKEN"] = HF_TOKEN

# STEP 2: Login with your token
login(token=HF_TOKEN)

# STEP 3: Set dtype and load model
# bfloat16 when a CUDA device is available, float32 otherwise.
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=dtype,          # `torch_dtype` is deprecated in the installed transformers
    device_map="auto",    # let accelerate place the weights on the available device(s)
    token=HF_TOKEN,
)


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

**CELL 3 — Read data (only caption)**


In [None]:
def read_csv(path: str) -> List[Dict]:
    """Load caption records from a UTF-8 CSV file with a header row.

    Args:
        path: Location of the CSV file; the `caption` and `image_id` columns
            are read from each row.

    Returns:
        One ``{"image_id": str, "caption": str}`` dict per row whose caption is
        non-empty after stripping whitespace. ``image_id`` is converted with
        ``str()`` and stripped, so a missing column yields the text ``"None"``.
    """
    records = []
    with open(path, "r", newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            caption = (record.get("caption") or "").strip()
            if caption:
                image_id = str(record.get("image_id")).strip()
                records.append({"image_id": image_id, "caption": caption})
    return records

def read_jsonl(path: str) -> List[Dict]:
    """Load caption records from a UTF-8 JSON Lines file.

    Each non-blank line is parsed as a JSON object and its `caption` and
    `image_id` keys are read.

    Args:
        path: Location of the JSONL file.

    Returns:
        One ``{"image_id": str, "caption": str}`` dict per line whose caption is
        non-empty after stripping whitespace. Both values are converted with
        ``str()``, so a numeric caption or id is accepted.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON; the message names
            the file and the 1-based line number of the offending line.
    """
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, but say where in the file
                # the bad record is so it can actually be found and fixed.
                raise json.JSONDecodeError(
                    f"{exc.msg} (in {path} at file line {lineno})", exc.doc, exc.pos
                ) from exc
            # str() guards against truthy non-string captions (e.g. numbers),
            # which would otherwise crash on .strip().
            cap = str(obj.get("caption") or "").strip()
            if not cap:
                continue
            rows.append({
                "image_id": str(obj.get("image_id")).strip(),
                "caption": cap,
            })
    return rows

def load_input(csv_path: str = None, jsonl_path: str = None) -> List[Dict]:
    """Read the caption records from whichever input file exists.

    The CSV path is checked first; the JSONL path is only used when the CSV
    path is empty or does not exist on disk.

    Args:
        csv_path: Optional path handed to `read_csv`.
        jsonl_path: Optional path handed to `read_jsonl`.

    Returns:
        The list of records produced by the matching reader.

    Raises:
        FileNotFoundError: If neither path is set and present on disk.
    """
    csv_found = bool(csv_path) and os.path.exists(csv_path)
    if csv_found:
        return read_csv(csv_path)

    jsonl_found = bool(jsonl_path) and os.path.exists(jsonl_path)
    if not jsonl_found:
        raise FileNotFoundError("No input file found. Put CSV/JSONL in NLP_Project/inputs and set the path.")
    return read_jsonl(jsonl_path)

# Read the caption records (the CSV path is tried before the JSONL path),
# report how many were loaded, and preview the first two as the cell output.
rows = load_input(csv_path=CSV_PATH, jsonl_path=JSONL_PATH)
print("Loaded", len(rows), "rows")
rows[:2]


Loaded 637 rows


[{'image_id': '14338',
  'caption': 'The hull of a boat that is producing a wake'},
 {'image_id': '32777', 'caption': 'A man that is walking next to a train.'}]

**CELL 4 — Prompt + generation (caption only)**

In [None]:
# System message for the chat model: questions must come STRICTLY from the single
# caption supplied in the user message.
# The text is built from adjacent string literals (implicit concatenation) and is
# sent verbatim as the "system" role content by generate_pairs_from_caption.
# It has four parts: task statement, rules, one worked example, output schema.
# The schema's top-level "pairs" key and the per-item "question"/"answer" keys are
# the ones the response parser reads, so they must stay in sync with it.
# NOTE(review): the task statement asks for 2–3 pairs while the worked example
# lists six; confirm this mismatch is intended, since it may invite longer outputs.
SYSTEM_PROMPT = (
"You are a careful question writer. "
"You will get one short image caption. "
"Write 2–3 diverse, unambiguous question–answer pairs that are strictly inferable from THIS caption only.\n"
"\n"
"Rules:\n"
"- Only ask about facts explicitly supported by the caption; do not use world knowledge or assumptions.\n"
"- Prefer concrete details: counts, objects, attributes, relations, locations, actions.\n"
"- Avoid speculation and avoid anything not stated in the text.\n"
"- Keep each answer short (1–3 words) or 'yes'/'no'.\n"
"\n"
# Worked example shown to the model.
"Example:\n"
'Caption: "Two bears are lying down on the ice."\n'
"Pairs:\n"
'  - Question: How many bears are lying on the ice? → Answer: two\n'
'  - Question: What are the two animals lying on the ice? → Answer: bears\n'
'  - Question: What are the bears doing? → Answer: lying down\n'
'  - Question: Two bears are lying down on what? → Answer: ice\n'
'  - Question: Where are the bears lying? → Answer: on the ice\n'
'  - Question: Are the bears on the ice? → Answer: yes\n'
"\n"
# Required output format: a single JSON object and nothing else.
"Output ONLY valid JSON exactly in this schema:\n"
'{\n  "pairs": [\n    {"question": "...", "answer": "..."},\n    {"question": "...", "answer": "..."}\n  ]\n}\n'
"No extra text."
)

def build_user_prompt(caption: str) -> str:
    """Format the caption as the single `Caption: "..."` line of the user message.

    Args:
        caption: Caption text to embed between the double quotes.

    Returns:
        The caption line, terminated by a newline.
    """
    template = 'Caption: "{}"\n'
    return template.format(caption)

def _parse_qa_pairs(text: str) -> list:
    """Extract cleaned {'question', 'answer'} dicts from raw model output ([] if unusable)."""
    # Be tolerant to accidental leading/trailing text around the JSON object.
    start = text.find("{")
    end = text.rfind("}")
    payload = text[start:end + 1] if (start != -1 and end > start) else text
    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        return []

    pairs = data.get("pairs") if isinstance(data, dict) else None
    if not isinstance(pairs, list):
        return []

    cleaned = []
    for item in pairs:
        if not isinstance(item, dict):
            continue  # skip a malformed entry instead of discarding the whole reply
        question = str(item.get("question") or "").strip()
        answer = item.get("answer")
        if isinstance(answer, bool):
            # JSON true/false -> the yes/no wording the prompt asks for
            answer = "yes" if answer else "no"
        # Counts often come back as JSON numbers (e.g. 2); keep them as text.
        answer = "" if answer is None else str(answer).strip()
        if question and answer:
            cleaned.append({"question": question, "answer": answer})
    return cleaned


@torch.inference_mode()
def generate_pairs_from_caption(
    caption: str,
    temperature=0.1, top_p=0.9, max_new_tokens=200, retries=2
):
    """Return up to 3 {'question': str, 'answer': str} strictly from caption.

    The caption is sent to the chat model together with SYSTEM_PROMPT and the
    reply is parsed as JSON with a top-level "pairs" list.

    Args:
        caption: The image caption to write questions about.
        temperature: Initial sampling temperature; 0 selects greedy decoding.
        top_p: Initial nucleus-sampling threshold (used only when sampling).
        max_new_tokens: Generation budget per attempt.
        retries: Number of extra attempts after the first one.

    Returns:
        A list of at most three pairs, or [] if no attempt produced a usable pair.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user_prompt(caption)},
    ]
    # return_dict=True also yields the attention mask, which generate() cannot
    # infer on its own here because the pad token is set to the EOS token.
    encoded = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
    ).to(model.device)
    prompt_len = encoded["input_ids"].shape[-1]

    for _ in range(retries + 1):
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if temperature > 0.0:
            gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
        else:
            # Greedy decoding: sampling parameters are not passed at all.
            gen_kwargs["do_sample"] = False

        out = model.generate(**encoded, **gen_kwargs)
        # Decode only the newly generated tokens, not the echoed prompt.
        text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

        pairs = _parse_qa_pairs(text)
        if pairs:
            # Extra pairs are truncated rather than treated as a failure.
            return pairs[:3]

        # Any unusable reply (not only a parse error): relax sampling a bit so
        # the retry does not simply reproduce the same output.
        temperature = min(0.6, temperature + 0.2)
        top_p = min(0.95, top_p + 0.03)
    return []


**CELL 5 — Run for all rows & save outputs**

In [None]:
import csv

# Output files: the same records are written in three formats.
OUT_JSONL = f"{OUTPUTS_DIR}/vqa_pairs_from_caption_new.jsonl"
OUT_CSV   = f"{OUTPUTS_DIR}/vqa_pairs_from_caption_new.csv"
OUT_JSON = f"{OUTPUTS_DIR}/vqa_pairs_from_caption_new.json"


all_records = []   # one flat record per generated question
bad = 0            # captions for which no usable pairs were produced

for i, r in enumerate(rows, 1):
    img_id = r["image_id"]
    cap    = r["caption"]
    pairs  = generate_pairs_from_caption(cap)

    if pairs:
        for k, p in enumerate(pairs, 1):
            # question_id = "<image_id>_<1-based index of the pair>"
            qid = f"{img_id}_{k}"
            all_records.append({
                "question_id": qid,
                "image_id": img_id,
                "question": p["question"],
                "answer": p["answer"],
            })
    else:
        bad += 1

    # Reported on every 25th row, including rows that produced no pairs
    # (a `continue` on failure would silently skip the progress line).
    if i % 25 == 0:
        print(f"[{i}/{len(rows)}] ok; {bad} with no valid JSON")

# Save JSONL (one record per line)
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for rec in all_records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# Save JSON (single pretty-printed array)
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(all_records, f, indent=2, ensure_ascii=False)

print("✅ Also saved pretty JSON to:", OUT_JSON)

# Save CSV
fieldnames = ["question_id", "image_id", "question", "answer"]
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_records)

print(f"Done. Wrote {len(all_records)} QA rows.")
print(f"Captions with no usable pairs: {bad} of {len(rows)}")
print("JSONL:", OUT_JSONL)
print("CSV:  ", OUT_CSV)


[25/637] ok; 1 with no valid JSON
[50/637] ok; 1 with no valid JSON
[75/637] ok; 1 with no valid JSON
[100/637] ok; 1 with no valid JSON
[125/637] ok; 2 with no valid JSON
[150/637] ok; 2 with no valid JSON
[175/637] ok; 2 with no valid JSON
[200/637] ok; 2 with no valid JSON
[225/637] ok; 2 with no valid JSON
[250/637] ok; 2 with no valid JSON
[275/637] ok; 2 with no valid JSON
[300/637] ok; 2 with no valid JSON
[325/637] ok; 2 with no valid JSON
[350/637] ok; 2 with no valid JSON
[375/637] ok; 2 with no valid JSON
[400/637] ok; 2 with no valid JSON
[425/637] ok; 2 with no valid JSON
[450/637] ok; 2 with no valid JSON
[475/637] ok; 2 with no valid JSON
[500/637] ok; 2 with no valid JSON
[525/637] ok; 2 with no valid JSON
[550/637] ok; 2 with no valid JSON
[575/637] ok; 2 with no valid JSON
[600/637] ok; 2 with no valid JSON
[625/637] ok; 2 with no valid JSON
✅ Also saved pretty JSON to: /content/drive/MyDrive/NLP_Project/outputs/vqa_pairs_from_caption_new.json
Done. Wrote 1905 QA rows