In [1]:
import os
from tqdm.auto import tqdm

In [2]:
from text_generator import MistralGenerator
from image_generator import LlavaGenerator

In [3]:
from text_generator import MultimodalAnswerPipeline

In [5]:
import os
import time
# from tqdm import tqdm
# from image_generator import LlavaGenerator
from utils import load_data, save_batch_output, log_time
from tqdm.auto import tqdm

In [14]:
# Build the two model clients and combine them into the answer pipeline.
# Each endpoint URL is read from the environment; os.environ.get returns None
# when the variable is unset.
# NOTE(review): whether the generators accept base_url=None is not visible
# here -- confirm, or export REMOTE_URL / LOCAL_URL before running this cell.
mistral_gen = MistralGenerator(
    model="mistral-instruct-7b@q4_k_m",
    base_url=os.environ.get("REMOTE_URL"),
)
llava_gen = LlavaGenerator(
    model="llava-v1.5-7b@q5_k_m",
    base_url=os.environ.get("LOCAL_URL"),
)

# The pipeline receives both generators by keyword and is what the batch loop
# calls via pipeline.answer_question(...).
pipeline = MultimodalAnswerPipeline(
    mistral=mistral_gen,
    llava=llava_gen,
)

In [8]:
import pandas as pd
from PIL import Image

# Input locations, relative to the notebook's working directory.
DATASET_PATH = "../Data/VisDoM-main/spiqa/spiqa.csv"
IMAGE_FOLDER = "../Data/spiqa/test-A/SPIQA_testA_Images/SPIQA_testA_Images"

# Read the question table once; the batch loop slices this frame by row
# position.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)

# Question ids as strings, in the same order as the rows of df.
id_list = list(df["q_id"].astype(str))

In [9]:
# Directory for this run's outputs. Later cells list it to find batches that
# were already written and pass it to save_batch_output / log_time.
OUTPUT_DIR = "../Eval_outputs/SPIQA/ReAlignQA"
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [10]:
def _batch_id_from_filename(filename):
    """Extract the batch id from a ``batch_<id>[_<suffix>].csv`` file name.

    Args:
        filename: A bare file name (no directory part).

    Returns:
        The integer between ``batch_`` and the next ``_`` (or the ``.csv``
        extension when there is no further ``_``), or ``None`` when the name
        does not follow that pattern. Returning ``None`` instead of raising
        means one unrelated file such as ``batch_notes.csv`` cannot break the
        resume check.
    """
    prefix, suffix = "batch_", ".csv"
    if not (filename.startswith(prefix) and filename.endswith(suffix)):
        return None
    stem = filename[len(prefix):-len(suffix)]
    # Only the first "_"-separated field is the id; anything after it (for
    # example a timestamp) is ignored.
    id_field = stem.split("_", 1)[0]
    # str.isdecimal() is True only for strings that int() can parse as a
    # non-negative integer, so the conversion below cannot raise.
    return int(id_field) if id_field.isdecimal() else None


# Ids of batches that already have an output file in OUTPUT_DIR. The batch
# loop skips any batch whose id is in this set.
existing_batches = {
    batch_id
    for batch_id in map(_batch_id_from_filename, os.listdir(OUTPUT_DIR))
    if batch_id is not None
}
existing_batches

set()

In [11]:
# Number of dataset rows handled per batch. The batch loop steps through df in
# strides of this size and uses each stride's starting row offset as the batch
# id, so changing this value after a partial run changes which offsets the
# skip check compares against files that were already written.
BATCH_SIZE = 10

In [16]:
# Batch processing
#
# Walk the dataset in strides of BATCH_SIZE rows. The starting row offset `i`
# is the batch id: it is what gets checked against `existing_batches` and what
# is passed to save_batch_output / log_time.
for i in tqdm(range(0, len(df), BATCH_SIZE)):
    # Resume support: skip batches that already had an output file when
    # `existing_batches` was computed.
    if i in existing_batches:
        print(f"Skipping batch {i} (already processed)")
        continue

    batch_df = df.iloc[i:i + BATCH_SIZE]
    results = []
    start = time.time()

    for idx, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Batch {i}"):
        try:
            image_path = os.path.join(IMAGE_FOLDER, row["doc_id"], row["reference_figure"])
            question = row["old_question"]
            caption = row["caption"]
            answer = pipeline.answer_question(question, caption, image_path)
        except Exception as e:
            # Best effort: one bad row (missing field, model/server error)
            # must not abort the batch. The row is reported and left out of
            # the saved results.
            print(f"Error processing row {idx}: {e}")
        else:
            results.append({
                "index": idx,
                "question": question,
                "caption": caption,
                "image": image_path,
                "response": answer
            })

    duration = time.time() - start

    # If every row failed (for example the model endpoint is unreachable),
    # do not persist anything: a saved file for this batch id would make the
    # skip check above treat the batch as finished on the next run, and the
    # rows would never be retried.
    if not results:
        print(f"Batch {i}: no rows succeeded; not saving so it is retried on the next run")
        continue

    save_batch_output(results, OUTPUT_DIR, i)
    # NOTE(review): the prefix is "vision_only" although the pipeline is built
    # from both a Mistral and a LLaVA generator -- confirm this label is intended.
    log_time(OUTPUT_DIR, i, duration, prefix="vision_only")

  0%|          | 0/59 [00:00<?, ?it/s]

Batch 0:   0%|          | 0/10 [00:00<?, ?it/s]

ic| self.mistral.generate_answer(detection_prompt): 'YES'
ic| self.mistral.generate_answer(synthesis_prompt): ('Figure 9 in the paper "Disentangling Language and Knowledge in Task-Oriented '
                                                     'Dialogs" depicts the transformation process between the original and '
                                                     'pre-processed SMD Navigate datasets. The visualization is presented as a '
                                                     'table with two columns - one for the "Original SMD Navigate Dataset" and '
                                                     'another for the "Preprocessed SMD Navigate Dataset." Each column contains '
                                                     'several rows of words or phrases, with an arrow pointing from the left to '
                                                     'the right. This visual representation illustrates how pre-processing '
                                         

File saved to:  ../Eval_outputs/SPIQA/ReAlignQA/batch_0_20250424_233542.csv


Batch 10:   0%|          | 0/10 [00:00<?, ?it/s]

ic| self.mistral.generate_answer(detection_prompt): 'NO'
ic| self.mistral.generate_answer(detection_prompt): 'Yes'
ic| self.mistral.generate_answer(synthesis_prompt): ('Based on the data presented in the "BDD100K: A Diverse Driving Dataset for '
                                                     'Heterogeneous Multitask Learning" paper and Table 11 of that same paper, the '
                                                     'object category with the highest combined total of bounding box and instance '
                                                     "track annotations in the BDD100K MOT dataset is 'Vehicle'.")
ic| self.mistral.generate_answer(detection_prompt): 'Yes'
ic| self.mistral.generate_answer(synthesis_prompt): ('The provided information indicates that the visual model has identified '
                                                     "'Table 1' as 'Dataset Statistics.' However, no specific data or numbers have "
                                                     '

File saved to:  ../Eval_outputs/SPIQA/ReAlignQA/batch_10_20250424_234733.csv


Batch 20:   0%|          | 0/10 [00:00<?, ?it/s]

ic| self.mistral.generate_answer(detection_prompt): ('YES (Assuming that the figure provides sufficient data to compare the '
                                                     'runtime performance of One Step ALOQ and WSN for F-SRE1 and F-SRE2)')
ic| self.mistral.generate_answer(synthesis_prompt): ('Based on the visual comparison provided in the paper "Alternating '
                                                     'Optimisation and Quadrature for Robust Control," One Step ALOQ does not '
                                                     'appear to show superior runtime efficiency over WSN for both F-SRE1 and '
                                                     'F-SRE2. The graph indicates that, in some instances, WSN has a faster '
                                                     "runtime compared to One Step ALOQ. However, it's important to note that this "
                                                     'analysis is based solely on the provided visual model and d

File saved to:  ../Eval_outputs/SPIQA/ReAlignQA/batch_20_20250425_000533.csv


Batch 30:   0%|          | 0/10 [00:00<?, ?it/s]

ic| self.mistral.generate_answer(detection_prompt): 'Yes'
ic| self.mistral.generate_answer(synthesis_prompt): ('The highest number of network-related reboots during the deployment in the '
                                                     'tier-1 datacenter occurred within a time window from approximately 18:05 to '
                                                     '19:04.')
ic| self.mistral.generate_answer(detection_prompt): 'YES'
ic| self.mistral.generate_answer(synthesis_prompt): ('The TF-IDF (Term Frequency-Inverse Document Frequency) and BM25 (Best '
                                                     'Matching Baseline 25) features differ in their approach to estimating '
                                                     'document relevance within the "Unbiased Learning to Rank with Unbiased '
                                                     'Propensity Estimation" framework. While TF-IDF calculates a weighted score '
                                                

File saved to:  ../Eval_outputs/SPIQA/ReAlignQA/batch_30_20250425_003221.csv


Batch 40:   0%|          | 0/10 [00:00<?, ?it/s]

ic| self.mistral.generate_answer(detection_prompt): 'YES'
ic| self.mistral.generate_answer(synthesis_prompt): ('The qualitative results from the ablation figure in the Arbitrary Talking '
                                                     'Face Generation paper suggest that while both the baseline method and the '
                                                     'proposed methods generate faces, there is an improvement in visual clarity '
                                                     'and facial detail in the proposed methods compared to the baseline. '
                                                     "Specifically, the 'A' indicates better results for the proposed methods, "
                                                     "while the '-' shows worse performance for the baseline, and the 'D' suggests "
                                                     'a decline in quality when using the ablation of specific components in the '
                                  

File saved to:  ../Eval_outputs/SPIQA/ReAlignQA/batch_40_20250425_005431.csv


Batch 50:   0%|          | 0/10 [00:00<?, ?it/s]

ic| self.mistral.generate_answer(detection_prompt): 'NO'
ic| self.mistral.generate_answer(detection_prompt): 'Yes'
ic| self.mistral.generate_answer(synthesis_prompt): ('The Pairwise Confusion (PC) method improves the localization performance of '
                                                     'the VGG-16 model by focusing more accurately on the target objects in the '
                                                     'Grad-CAM heatmaps compared to the baseline model. This is demonstrated in '
                                                     'Figure 3 of the paper, where PC leads to tighter and more accurate '
                                                     'localization of target objects like birds, whereas the baseline VGG-16 '
                                                     'network often focuses on artifacts even when making correct predictions. The '
                                                     'comparison between the heatmaps generated from this meth

KeyboardInterrupt: 