In [1]:
import os
import time
from tqdm import tqdm
from image_generator import LlavaGenerator
from utils import load_data, save_batch_output, log_time
from tqdm.auto import tqdm

In [2]:
# --- Run configuration ---
# Source table. The processing loop below reads its 'old_question', 'doc_id'
# and 'reference_figure' columns (load_data may expect more; not visible here).
INPUT_CSV = "../Data/VisDoM-main/spiqa/spiqa.csv"

# Root folder of the figures; the loop resolves each image as
# <IMAGE_FOLDER>/<doc_id>/<reference_figure>.
IMAGE_FOLDER = "../Data/spiqa/test-A/SPIQA_testA_Images/SPIQA_testA_Images"

# Number of rows processed, and written to one output file, per batch.
BATCH_SIZE = 10

# Destination for the per-batch result files; created up front if missing.
OUTPUT_DIR = "../Eval_outputs/SPIQA/vision_only"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# Base URL handed to LlavaGenerator, read from the LOCAL_URL environment
# variable. os.getenv takes the variable *name* as a string; the bare
# identifier LOCAL_URL is not defined in the cells above and raised NameError
# on a fresh kernel. The result is None when the variable is not set.
BASE_URL = os.getenv("LOCAL_URL")

In [4]:
# Key passed to LlavaGenerator as api_key. The literal "lm-studio" looks like a
# placeholder for a local server rather than a real secret — TODO confirm, and
# load it from the environment instead if it ever becomes a real credential.
API_KEY = "lm-studio"

In [5]:
# Prompt template for the vision-only run, passed to LlavaGenerator as
# prompt_template. "<image>" and "<question>" are presumably placeholders the
# generator substitutes per row — confirm in image_generator.
vision_prompt = "\n".join([
    "<image>",
    "",
    "Use the image provided to answer the question as accurately as possible.",
    "",
    "Question: <question>",
    "",
    "Answer:",
])

In [6]:
# Build the answer generator from the endpoint, key and prompt configured
# above, then load the input table that the batch loop iterates over.
generator = LlavaGenerator(
    base_url=BASE_URL,
    api_key=API_KEY,
    prompt_template=vision_prompt,
)
df = load_data(INPUT_CSV)

In [10]:
# Detect completed batches so the processing loop can skip them on resume.
# Output files are named batch_<start>_<timestamp>.csv; <start> is the second
# "_"-separated field and is the value the loop uses as the batch id.
existing_batches = set()
for fname in os.listdir(OUTPUT_DIR):
    if not (fname.startswith("batch_") and fname.endswith(".csv")):
        continue
    # Strip the extension before splitting so a bare "batch_10.csv" still parses.
    start_field = fname[:-len(".csv")].split("_")[1]
    try:
        existing_batches.add(int(start_field))
    except ValueError:
        # A stray file such as "batch_notes.csv" is not one of our outputs;
        # ignore it instead of aborting resume detection.
        continue
existing_batches

{0}

In [None]:
# Batch processing: run the vision-only generator over df in slices of
# BATCH_SIZE rows and write one output file per batch, so an interrupted run
# can resume. `i` is the starting row offset of the slice and doubles as the
# batch id used in file names and in existing_batches.
for i in tqdm(range(0, len(df), BATCH_SIZE)):
    if i in existing_batches:
        print(f"Skipping batch {i} (already processed)")
        continue

    batch_df = df.iloc[i:i + BATCH_SIZE]
    results = []
    # perf_counter is monotonic; time.time() can jump when the system clock is
    # adjusted, which would corrupt the logged duration of a long batch.
    start = time.perf_counter()

    for idx, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Batch {i}"):
        try:
            image_path = os.path.join(IMAGE_FOLDER, row["doc_id"], row["reference_figure"])
            question = row["old_question"]
            answer = generator.generate_answer(image_path, question)

            results.append({
                "index": idx,
                "question": question,
                "image": image_path,
                "response": answer
            })
        except Exception as e:
            # Best-effort by design: report the failing row and move on to the
            # next one; the row is simply absent from this batch's output.
            print(f"Error processing row {idx}: {e}")

    duration = time.perf_counter() - start

    if not results:
        # Every row failed (e.g. the model server is unreachable). Writing an
        # empty file would make the resume check treat this batch as done and
        # it would never be retried, so leave it unsaved.
        print(f"Batch {i}: no rows succeeded; not saving so it is retried on the next run")
        continue

    save_batch_output(results, OUTPUT_DIR, i)
    log_time(OUTPUT_DIR, i, duration, prefix="vision_only")

  0%|          | 0/59 [00:00<?, ?it/s]

Skipping batch 0 (already processed)


Batch 10:   0%|          | 0/10 [00:00<?, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/vision_only/batch_10_20250420_202013.csv


Batch 20:   0%|          | 0/10 [00:00<?, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/vision_only/batch_20_20250420_203208.csv


Batch 30:   0%|          | 0/10 [00:00<?, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/vision_only/batch_30_20250420_204533.csv


Batch 40:   0%|          | 0/10 [00:00<?, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/vision_only/batch_40_20250420_205639.csv


Batch 50:   0%|          | 0/10 [00:00<?, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/vision_only/batch_50_20250420_211100.csv


Batch 60:   0%|          | 0/10 [00:00<?, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/vision_only/batch_60_20250420_212639.csv


Batch 70:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
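# Scratch cell kept for reference: manual re-runs of the save/log step.
# NOTE: the string durations used below ('3454', 'test') are presumably what
# produced "ValueError: Unknown format code 'f' for object of type 'str'";
# log_time appears to format the duration as a float, so pass a number.
# duration = time.time() - start
# save_batch_output(results, OUTPUT_DIR, i)
# log_time(OUTPUT_DIR, i=5, duration='3454',prefix="vision_only")
# log_time(output_dir=OUTPUT_DIR,batch_index="cc",duration='test',prefix='vision')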

ValueError: Unknown format code 'f' for object of type 'str'