In [17]:
# !pip install qwen_vl_utils
# !pip install transformers
# !pip install datasets
import torch
from transformers import (
    Qwen2VLForConditionalGeneration, AutoProcessor,
    QwenImageProcessor, QwenTokenizer, Trainer, TrainingArguments
)
from qwen_vl_utils import process_vision_info
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from PIL import Image
import torch
import os



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
compute = True

# Check device compatibility
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

model_name = "Qwen/Qwen2-VL-2B-Instruct"

img_folder_path = "/content/drive/MyDrive/MATH-V-main"

dataset_name = "MathLLMs/MathVision"

file_name = f"data/output/{dataset_name.split('/')[-1]}_{model_name.split('/')[-1]}.csv"



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [1]:
# Load the dataset
data_list = load_dataset(dataset_name, split='test')


# Split data into training and validation sets
train_size = int(0.7 * len(data_list))
val_size = int(0.1 * len(data_list))
test_size = len(data_list) - train_size - val_size
data_list = data_list.shuffle(seed=42)  # Shuffle the dataset for randomness

train_data = data_list.select(range(train_size))
val_data = data_list.select(range(train_size, train_size + val_size))
test_data = data_list.select(range(train_size+val_size, len(data_list)))

# test_size = 100
# # Split into train and test
# test_dataset = Dataset.from_dict(
#     train_dataset[-test_size:]
# )

# train_dataset = Dataset.from_dict(
#     train_dataset[:-test_size]
# )

NameError: name 'load_dataset' is not defined

In [33]:
test_dataset[0]

{'id': '2941',
 'question': 'A sphere is inscribed in a cone with height 4 and base radius 3.  What is the ratio of the volume of the sphere to the volume of the cone?\n\n<image1>',
 'options': [],
 'image': 'images/2941.jpg',
 'decoded_image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2520x3355>,
 'answer': '\\frac{3}{8}',
 'solution': 'We start by finding the radius of the inscribed sphere. If we slice the diagram by a plane that contains the central axis of the cone, we get a circle inscribed in an isosceles triangle with base 6 and height 4, and the radius of the inscribed circle is the same as the radius of the sphere (since any plane that contains the central axis of the cone contains a diameter of the inscribed sphere). We label the points as shown in the diagram below.\n\n[asy]\ndraw((0,3)--(4,0)--(0,-3)--cycle);\ndraw(circle((1.5,0),1.5));\ndraw((0,0)--(4,0),dashed);\nlabel("$A$",(0,3),NW);\nlabel("$B$",(4,0),E);\nlabel("$C$",(0,-3),SW);\nlabel("$D$",(0,0),W);\ndr

In [None]:
# Load pretrained model and tokenizer
model_name = "Qwen/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name)
tokenizer = QwenTokenizer.from_pretrained(model_name)
image_processor = QwenImageProcessor.from_pretrained(model_name)

# Custom Dataset Class
class MathQAImageDataset(Dataset):
    def __init__(self, data_list, tokenizer, image_processor, max_length=256):
        self.data_list = data_list
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        sample = self.data_list[idx]

        # Load and process image
        image_path = sample['image_path']
        image = Image.open(image_path).convert("RGB")
        processed_image = self.image_processor(image, return_tensors="pt")

        # Tokenize input question
        question = sample['question']
        prompt = f"Question: {question}\n Please provide the bounding box coordinate of the region that can help you answer the question better. Answer:"
        inputs = self.tokenizer(
            question,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize output answer
        answer = sample['answer']
        labels = self.tokenizer(
            answer,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        labels = labels.input_ids.squeeze()

        # Replace padding token id in labels with -100 to ignore loss calculation on padding
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "pixel_values": processed_image["pixel_values"].squeeze(),
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels
        }

# Create Dataset objects
train_dataset = MathQAImageDataset(train_data, tokenizer, image_processor)
val_dataset = MathQAImageDataset(val_data, tokenizer, image_processor)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,  # Use mixed precision if supported
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Disable report to wandb/huggingface hub
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()


In [43]:
# test

# Load the model
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float32, #if device == "cpu" else torch.bfloat16,
    device_map=None
)

# Initialize processor
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(model_name, min_pixels=min_pixels, max_pixels=max_pixels)


results = []

for i, input in enumerate(tqdm(test_dataset)):
    image_path = f"{img_folder_path}/{input['image']}"
    prompt = input["question"]

    # Prepare input
    messages = [
        {"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": prompt}]}
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Move inputs to the device
    inputs = inputs.to(device)
    model = model.to(device)

    # Perform inference
    generated_ids = model.generate(**inputs, max_new_tokens=256)

    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Store results
    results.append(input | {
        "prompt": prompt, "generated_text": output_text
    })
    # if i > 4:
    #   break


pd.DataFrame(results).to_csv(f"{img_folder_path}/test_generated_answer.csv")

100%|██████████| 100/100 [28:11<00:00, 16.91s/it]


OSError: Cannot save file into a non-existent directory: 'data/output'

In [45]:
import re
remove_list = ['the answer', 'is', ':', 'boxed', '$']

results = pd.DataFrame(results)
def final_answer(text: str):
    text = text.lower()
    if 'answer is' not in text:
        return np.nan
    # Create a regex pattern to match all words in the remove_list
    pattern = r'(' + r'|'.join(re.escape(word) for word in remove_list) + r'|\s)'
    # Use re.sub to replace matched patterns with an empty string
    result = re.sub(pattern, '', text.split('answer is')[-1])
    result = re.sub(r'\\\{(\\frac\{[^{}]+\}\{[^{}]+\})\}', r'\1', result)
    return re.sub(r'\\\\', r'\\', result)
# Example metric: String matching (very basic)
results['prediction'] = results['generated_text'].apply(final_answer)
results['exact_match'] = results['prediction'] == results['answer']
display(results)
pd.DataFrame(results).to_csv(f"{img_folder_path}/test_generated_answer.csv")

accuracy = sum(results['exact_match']) / len(results)
print(f"Exact Match Accuracy: {accuracy:.4f}")

# Optionally save results
# import json
# with open("evaluation_results.json", "w") as f:
#     json.dump(results, f, indent=4)

Unnamed: 0,id,question,options,image,decoded_image,answer,solution,level,subject,prompt,generated_text,prediction,exact_match
0,2941,A sphere is inscribed in a cone with height 4 ...,[],images/2941.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,\frac{3}{8},We start by finding the radius of the inscribe...,2,solid geometry,A sphere is inscribed in a cone with height 4 ...,system\nYou are a helpful assistant.\nuser\nA ...,\frac{3}{1}.,False
1,2942,The length of the diameter of this spherical b...,[],images/2942.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,27000-4500\pi,The box has volume $30^3=27000$ cubic cm.\n\nT...,1,solid geometry,The length of the diameter of this spherical b...,system\nYou are a helpful assistant.\nuser\nTh...,,False
2,2943,"In the circle below, $\overline{AB} \| \overli...",[],images/2943.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,8\pi,"By symmetry, $\widehat{BD}=\widehat{CA}=100^\c...",4,metric geometry - length,"In the circle below, $\overline{AB} \| \overli...",system\nYou are a helpful assistant.\nuser\nIn...,,False
3,2944,"In right triangle $ABC$, $\angle B = 90^\circ$...",[],images/2944.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2\sqrt{3},"Let $\overline{DE}$ have length $x$, so $\over...",4,metric geometry - length,"In right triangle $ABC$, $\angle B = 90^\circ$...",system\nYou are a helpful assistant.\nuser\nIn...,,False
4,2945,The area of square $ABCD$ is 100 square centim...,[],images/2945.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,68,"Since $AE = 2$, $EB = 8$, but since $EFGH$ is ...",1,metric geometry - area,The area of square $ABCD$ is 100 square centim...,system\nYou are a helpful assistant.\nuser\nTh...,100,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3036,The two externally tangent circles each have a...,[],images/3036.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,8-2\pi,Each diameter of a circle is 2 units. The rect...,1,metric geometry - area,The two externally tangent circles each have a...,system\nYou are a helpful assistant.\nuser\nTh...,,False
96,3037,The area of $\triangle ABC$ is 6 square centim...,[],images/3037.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,54,"Since $AB \parallel DE,$ we know that $\angle ...",4,metric geometry - area,The area of $\triangle ABC$ is 6 square centim...,system\nYou are a helpful assistant.\nuser\nTh...,,False
97,3038,"In the diagram, $K$, $O$ and $M$ are the cente...",[],images/3038.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1250\pi,We know that $OA$ and $OB$ are each radii of t...,4,metric geometry - area,"In the diagram, $K$, $O$ and $M$ are the cente...",system\nYou are a helpful assistant.\nuser\nIn...,512\pi,False
98,3039,The volume of the cylinder shown is $45\pi$ cu...,[],images/3039.jpg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,5,The volume of the cylinder is $bh=\pi r^2h$. T...,1,solid geometry,The volume of the cylinder shown is $45\pi$ cu...,system\nYou are a helpful assistant.\nuser\nTh...,,False


Exact Match Accuracy: 0.0200
