<a href="https://colab.research.google.com/github/NaGho/reasoning_multimodal_LLMs/blob/main/vllm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install qwen_vl_utils
# !pip install transformers
# !pip install datasets
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from datasets import load_dataset  # Assuming the GMS8k dataset is HuggingFace-compatible
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd
import numpy as np



In [None]:
# NOTE(review): `compute` is not read by any of the visible cells — confirm
# whether it is meant to gate the (expensive) inference step.
compute = True

# Pick the fastest available backend: CUDA (e.g. a Colab GPU runtime), then
# Apple-silicon MPS, and finally plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model_name = "Qwen/Qwen2-VL-2B-Instruct"
# NOTE(review): the dataset actually loaded further down is
# "MathLLMs/MathVision", not this name — confirm which one the output file
# should be tagged with.
dataset_name = "deepcs233/Visual-CoT"
# Output path is built from the last path component of the dataset and model ids.
file_name = f"data/output/{dataset_name.split('/')[-1]}_{model_name.split('/')[-1]}.csv"

# Load the model: full precision on CPU, bfloat16 on an accelerator.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float32 if device == "cpu" else torch.bfloat16,
    device_map=None,
)

# Initialize processor. The pixel budget is expressed as a number of
# 28x28-pixel tiles: between 256 and 1280 tiles per image.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(model_name, min_pixels=min_pixels, max_pixels=max_pixels)



In [8]:
# Load the dataset. A split must be requested explicitly: without it
# load_dataset returns a DatasetDict keyed by split name, so row access such
# as dataset_test[0] would not address an example.
dataset_test = load_dataset("MathLLMs/MathVision", split="test")



In [7]:
# Display the first record to inspect the fields available on each example.
dataset_test[0]

{'id': '1',
 'question': 'Which number should be written in place of the question mark?\n<image1>',
 'options': [],
 'image': 'images/1.jpg',
 'decoded_image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1176x178>,
 'answer': '60',
 'solution': None,
 'level': 2,
 'subject': 'arithmetic'}

In [None]:
def generate_answer(input):
    """Run the vision-language model on one dataset row.

    Args:
        input: Mapping-like row (for example a pandas Series) providing an
            ``"image"`` entry (image path relative to ``img_folder_path``)
            and a ``"question"`` entry (the prompt text). The parameter name
            shadows the ``input`` builtin; it is kept for compatibility with
            existing callers.

    Returns:
        dict with two keys: ``"prompt"`` (the question as given) and
        ``"generated_text"`` (the decoded completion, without the echoed
        prompt tokens).
    """
    # NOTE(review): img_folder_path is not defined in the visible cells; it
    # must exist as a global before this function is called.
    image_path = f"{img_folder_path}/{input['image']}"
    prompt = input["question"]

    # Prepare input: a single user turn holding the image and the question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Move inputs and model to the device. Module.to() moves the parameters
    # in place, so the global `model` must NOT be re-assigned here: an
    # assignment would make `model` a local name and raise UnboundLocalError.
    inputs = inputs.to(device)
    model.to(device)

    # Perform inference
    generated_ids = model.generate(**inputs, max_new_tokens=256)

    # generate() returns the prompt tokens followed by the new tokens; keep
    # only the completion so "generated_text" does not repeat the prompt.
    prompt_length = inputs.input_ids.shape[1]
    completion_ids = generated_ids[:, prompt_length:]

    output_text = processor.batch_decode(
        completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Store results
    return {"prompt": prompt, "generated_text": output_text}


# Turn the dataset into a DataFrame so each example becomes one row.
dataset_test = pd.DataFrame(data=dataset_test)

# Run the model row by row; the dict returned for each row is expanded into
# the two result columns.
output_columns = ["prompt", "generated_text"]
dataset_test[output_columns] = dataset_test.apply(
    generate_answer,
    axis="columns",
    result_type="expand",
)



In [None]:

def final_answer(text: str):
    """Extract the final answer that follows the phrase "answer is".

    The text is lower-cased and the part after the LAST occurrence of
    "answer is" is taken, so a revised conclusion wins over an earlier one.
    An optional colon after the phrase, ``$`` math delimiters, surrounding
    whitespace and a sentence-ending period are removed.

    Args:
        text: Model output to search.

    Returns:
        The cleaned, lower-cased answer string, or ``np.nan`` when the
        phrase "answer is" does not occur in ``text``.
    """
    _, marker, tail = text.lower().rpartition("answer is")
    if not marker:
        return np.nan
    # Same marker for detection and extraction, with or without a colon.
    answer = tail.lstrip(":").replace("$", "").strip()
    return answer.rstrip(".").rstrip()
import json

# Collect one JSON-serializable record per example from the inference
# DataFrame. Only plain-text columns are kept (the decoded PIL image cannot
# be serialized); the dataset's "answer" column is the ground truth.
results = [
    {
        "id": row["id"],
        "prompt": row["prompt"],
        "generated_text": row["generated_text"],
        "ground_truth": row["answer"],
    }
    for row in dataset_test[["id", "prompt", "generated_text", "answer"]].to_dict("records")
]

# Example metric: String matching (very basic)
# final_answer() lower-cases its output, so the ground truth is normalized
# the same way; a missing prediction (NaN) never equals a string.
ground_truths = [str(result["ground_truth"]).strip().lower() for result in results]
generated_texts = [final_answer(result["generated_text"]) for result in results]
exact_matches = [gt == gen for gt, gen in zip(ground_truths, generated_texts)]
# Guard against an empty evaluation set instead of dividing by zero.
accuracy = sum(exact_matches) / len(exact_matches) if exact_matches else 0.0

print(f"Exact Match Accuracy: {accuracy:.4f}")

# Optionally save results
with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)