In [1]:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the model in half-precision
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

# Get two different images
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_stop = Image.open(requests.get(url, stream=True).raw)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image_cats = Image.open(requests.get(url, stream=True).raw)

# Prepare a batch of two prompts
conversation_1 = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]

conversation_2 = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]

prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
prompts = [prompt_1, prompt_2]

# We can simply feed images in the order they have to be used in the text prompt
inputs = processor(images=[image_stop, image_cats], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16)

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
processor.batch_decode(generate_ids, skip_special_tokens=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.


['USER:  \nWhat is shown in this image? ASSISTANT: The image shows a street scene with a stop sign, a car driving down the street, and a red and white building in the background. There are',
 'USER:  \nWhat is shown in this image? ASSISTANT: The image shows two cats lying on a couch, with one of them sleeping and the other one awake.']

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!pip install datasets
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, LlavaForConditionalGeneration,
    LlavaProcessor, AutoProcessor
)
from tqdm import tqdm
import re
from transformers import Trainer
from PIL import Image
import pandas as pd

IMAGE_PATH = '/content/drive/MyDrive/MATH-V-main'

class MathVisionLlavaEvaluator:
    def __init__(self, dataset_name="MathLLMs/MathVision"):
        """
        Initialize the evaluator with the dataset and Llava model

        :param dataset_name: Name of the dataset to evaluate
        """
        # Load dataset
        self.dataset = load_dataset(dataset_name)

        # Load Llava 1.5 model and processor
        self.model_id = "llava-hf/llava-1.5-7b-hf"

        self.model = LlavaForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=torch.float16,
            device_map='auto'
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id) # LlavaProcessor.from_pretrained(self.model_id)

        # Evaluation results
        self.results = {'zero_shot': [], 'few_shot': []}

    def _prepare_input(self, example, few_shot=False):
        """
        Prepare input for the model

        :param example: Single dataset example
        :param few_shot: Whether to use few-shot prompting
        :return: Processed inputs
        """
        # Prepare base prompt
        base_prompt = f"Solve the following math problem step by step, given the image attached. Write the final answer after <Answer:> \n{example['question']}" # . Only write the solution in the output.

        if few_shot:
            # Add 1-2 example solutions to provide context
            few_shot_examples = self.dataset['test'][:2]
            base_prompt = "Here are example problem-solving approaches:\n" + \
                "\n".join([
                    f"Problem: {ex['question']}\nSolution: {ex['answer']}"
                    for ex in few_shot_examples
                ]) + \
                f"\n\nNow solve this problem:\n{base_prompt}"

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": base_prompt},
                ],
            },
        ]
        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)

        # If the example contains an image, process it
        if 'image' in example and example['image']:
            images = [Image.open(f"{IMAGE_PATH}/{example['image']}")]
        else:
            images = None
        inputs = self.processor(images=images, text=[prompt], padding=True, return_tensors="pt").to(self.model.device, torch.float16)

        return inputs

    def _extract_solution(self, generated_text):
        """
        Extract the solution from generated text

        :param generated_text: Full generated text
        :return: Extracted solution
        """
        # Use regex to extract solution after specific markers
        print(generated_text)
        solution_match = re.search(r'Solution:?\s*(.*)', generated_text[0], re.DOTALL)
        if solution_match:
            return solution_match.group(1).strip()
        return generated_text

    def evaluate(self, prompt_types=['zero_shot']): # , 'few_shot'
        """
        Evaluate Llava model on the dataset

        :param prompt_types: Types of prompting to use
        """
        # for split in ['testmini']:
        for prompt_type in prompt_types:
            model_results = []
            for i, example in enumerate(tqdm(self.dataset['testmini'],
                                desc=f"Evaluating Llava - {prompt_type}")):
                # Prepare input
                inputs = self._prepare_input(
                    example,
                    few_shot=(prompt_type == 'few_shot')
                )

                # Generate response
                generate_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    # do_sample=False
                )

                # Decode and process response
                # processor.batch_decode(generate_ids, skip_special_tokens=True)
                generated_text = self.processor.batch_decode(generate_ids, skip_special_tokens=True)
                extracted_solution = self._extract_solution(generated_text)

                # Compare with ground truth
                model_results.append({
                    'question': example['question'],
                    'ground_truth': example['answer'],
                    'model_prediction': extracted_solution,
                    # 'correct': self._check_answer(example['answer'], extracted_solution)
                })
                if i % 50 == 0:
                  pd.DataFrame(model_results).to_csv(f'{IMAGE_PATH}/zero_shot_results.csv')


            # Store results
            self.results[prompt_type] = model_results

        return self.results

    def _check_answer(self, ground_truth, prediction):
        """
        Basic answer checking method

        :param ground_truth: Correct answer from dataset
        :param prediction: Model's generated answer
        :return: Boolean indicating correctness
        """
        # Simple numeric comparison if possible
        try:
            ground_float = float(ground_truth)
            pred_float = float(prediction)
            return abs(ground_float - pred_float) < 1e-2
        except (ValueError, TypeError):
            # Fallback to string matching
            return ground_truth.lower() in prediction.lower()

    def print_summary(self):
        """
        Print summary of evaluation results
        """
        for prompt_type, results in self.results.items():
            correct_count = sum(res['correct'] for res in results)
            total_count = len(results)
            accuracy = correct_count / total_count * 100

            print(f"{prompt_type.replace('_', ' ').title()} Accuracy: {accuracy:.2f}%")
            print(f"Correct Predictions: {correct_count}/{total_count}")

# Main execution
# if __name__ == "__main__":
evaluator = MathVisionLlavaEvaluator()
evaluator.evaluate()
pd.DataFrame(evaluator.results['zero_shot']).to_csv(f'{IMAGE_PATH}/zero_shot_results.csv')
# pd.DataFrame(evaluator.results['few_shot']).to_csv('few_shot_results.csv')
evaluator.print_summary()



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating Llava - zero_shot:   0%|          | 1/304 [00:09<48:56,  9.69s/it]

['USER:  \nSolve the following math problem step by step, given the image attached:\nHow many different digits can you find in this picture?\n<image1> ASSISTANT: In the image, there are two different digits: 5 and 8.']


Evaluating Llava - zero_shot:   1%|          | 2/304 [01:29<4:15:19, 50.73s/it]

['USER:  \nSolve the following math problem step by step, given the image attached:\nWhich number do you have to write in the last daisy?\n<image1> ASSISTANT: In the image, there is a daisy with a number 6 written in the center. The daisy is surrounded by numbers, including 1, 2, 3, 4, 5, 6, 7, and 8. To determine which number to write in the last daisy, we need to find the missing number between the last two numbers in the sequence.\n\nThe last two numbers in the sequence are 6 and 7. To find the missing number, we can subtract 6 from 7:\n\n7 - 6 = 1\n\nTherefore, the last daisy should have a number 1 written in the center.']


Evaluating Llava - zero_shot:   1%|          | 3/304 [03:04<5:57:03, 71.17s/it]

['USER:  \nSolve the following math problem step by step, given the image attached:\nMisty the cat has five kittens: two of them are striped, one spotty, the rest of them are absolutely white. In which picture can we see the kittens of Misty, knowing that the ears of one of them are of different colour?\n<image1> ASSISTANT: To solve this problem, we need to identify the kittens of Misty based on the image description. The kittens are in different positions, and we can see their ears and tails.\n\n1. The first kitten is white and has a black tail.\n2. The second kitten is striped and has a black tail.\n3. The third kitten is spotty and has a black tail.\n4. The fourth kitten is white and has a black tail.\n5. The fifth kitten is white and has a black tail.\n\nSince the ears of one of the kittens are of different color, we can conclude that the first kitten is the one with the different-colored ears. The other four kittens are all white with black tails.']


Evaluating Llava - zero_shot:   1%|▏         | 4/304 [04:22<6:09:57, 73.99s/it]

['USER:  \nSolve the following math problem step by step, given the image attached:\nHow many bricks are missing in the wall?\n<image1> ASSISTANT: To determine the number of bricks missing in the wall, we need to identify the missing bricks in the image. The image shows a brick wall with a missing section. The missing section consists of a few bricks, which are not present in the wall.\n\n1. Count the number of bricks in the wall: There are a total of 12 bricks in the wall.\n2. Identify the missing section: The missing section consists of 3 bricks.\n3. Subtract the missing section from the total number of bricks: 12 - 3 = 9\n\nThere are 9 bricks remaining in the wall.']


Evaluating Llava - zero_shot:   2%|▏         | 5/304 [05:50<6:33:51, 79.03s/it]

["USER:  \nSolve the following math problem step by step, given the image attached:\nThe sums of the all the three numbers on each side of the triangle are equal. Two numbers happened to be stained with ink. How much is the sum of these two numbers?\n<image1> ASSISTANT: To solve this problem, we need to find the sum of the three numbers on each side of the triangle. Let's start by identifying the numbers on each side of the triangle.\n\n1. Top left: 7\n2. Top right: 16\n3. Bottom left: 1\n4. Bottom right: 3\n\nNow, we need to find the sum of these four numbers. To do this, we can add the numbers together:\n\n7 + 16 + 1 + 3 = 37\n\nSince the sums of the numbers on each side of the triangle are equal, we can conclude that the sum of the two numbers that are stained with ink is 16."]


Evaluating Llava - zero_shot:   2%|▏         | 5/304 [07:27<7:25:49, 89.46s/it]


KeyboardInterrupt: 