In [None]:
#  SPIQA to LLaVA LoRA Fine-Tuning Pipeline (Colab Ready)

# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Install required libraries
!pip install -q bitsandbytes accelerate peft transformers datasets huggingface_hub
!git clone https://github.com/haotian-liu/LLaVA.git
%cd LLaVA
!pip install -e .

# 3. Download a snapshot of the google/spiqa dataset repo into /content/spiqa
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="google/spiqa",
    repo_type="dataset",
    local_dir="/content/spiqa",
)

# 4. Unpack the train/val image archive
import os
import zipfile

# Both names are read by later steps, so they stay at notebook scope.
zip_path = "/content/spiqa/train_val/SPIQA_train_val_Images.zip"
extract_path = "/content/spiqa_images"

# Extract every member of the archive under extract_path.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

# 5. Load and convert SPIQA_train.json to LLaVA format
import json
import os

# Explicit UTF-8 so the load does not depend on the runtime's locale default.
with open("/content/spiqa/train_val/SPIQA_train.json", "r", encoding="utf-8") as f:
    spiqa_data = json.load(f)

# NOTE(review): the loop below assumes the JSON root is an iterable of
# per-paper dicts with "figures" (items carrying "id" and "file_path") and
# "qa_pairs" (items carrying "figure_id", "question", "answer").
# Confirm this against the SPIQA dataset card before relying on the output.
converted = []
skipped_unknown_figure = 0  # QA pairs with no figure_id, or one not listed for the paper
skipped_missing_image = 0   # QA pairs whose figure file is not on disk

for entry in spiqa_data:
    # Figure id -> file path; the path is joined onto extract_path below.
    figures = {fig["id"]: fig["file_path"] for fig in entry.get("figures", [])}
    for qa in entry.get("qa_pairs", []):
        fig_id = qa.get("figure_id")
        if not fig_id or fig_id not in figures:
            skipped_unknown_figure += 1
            continue
        image_path = os.path.join(extract_path, figures[fig_id])
        # Only keep samples whose image was actually extracted.
        if not os.path.exists(image_path):
            skipped_missing_image += 1
            continue
        # One LLaVA sample: an image plus a single human/gpt exchange, with
        # the "<image>" placeholder leading the human turn.
        converted.append({
            "image": image_path,
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{qa['question']}"
                },
                {
                    "from": "gpt",
                    "value": qa["answer"]
                }
            ]
        })

# Surface the skip counts so an empty or heavily reduced dataset (for example
# from a schema or path mismatch) is noticed before it is saved.
print(
    f"Converted {len(converted)} QA pairs "
    f"(skipped {skipped_unknown_figure} without a known figure, "
    f"{skipped_missing_image} with a missing image file)"
)

# 6. Save converted dataset to Google Drive
output_path = "/content/drive/MyDrive/SPIQA_train_converted.json"

# Write the converted samples as JSON with a 2-space indent.
with open(output_path, mode="w") as out_file:
    json.dump(converted, out_file, indent=2)

print(f"Saved converted dataset to {output_path}")
