In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

class MathVisionEvaluator:
    """
    Evaluate language models on a question/answer dataset with zero-shot and
    few-shot prompting.

    Prompts are built from each example's 'question' field only; no image
    input is passed to the models. A prediction counts as correct when the
    example's 'answer' occurs as a case-insensitive substring of the text the
    model generated.
    """

    # Splits scored by evaluate(), in order. Splits the dataset lacks are skipped.
    EVAL_SPLITS = ('test', 'validation')
    # Split that supplies the worked examples for few-shot prompts.
    FEW_SHOT_SPLIT = 'train'
    # Maximum number of worked examples placed in a few-shot prompt.
    FEW_SHOT_COUNT = 2

    def __init__(self, dataset_name="MathLLMs/MathVision", models=None):
        """
        Initialize the evaluator with the dataset and models

        :param dataset_name: Name of the dataset to evaluate
        :param models: Dictionary of models to test
            {'model_name': {'model': model, 'tokenizer': tokenizer}}.
            When None, the two default models are loaded immediately.
        """
        # Load dataset
        self.dataset = load_dataset(dataset_name)

        # Default model configurations if not provided
        if models is None:
            models = {
                'llava1.5': self._load_llava_model(),
                'qwen2_vl': self._load_qwen2_model()
            }
        self.models = models

        # Per-model result lists, one per prompt type
        self.results = {model_name: {'zero_shot': [], 'few_shot': []}
                        for model_name in models.keys()}

    def _load_model_and_tokenizer(self, model_id):
        """
        Load a model in float16 with automatic device placement, plus its tokenizer

        :param model_id: Hugging Face Hub identifier of the checkpoint
        :return: {'model': model, 'tokenizer': tokenizer}
        """
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map='auto'
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        return {'model': model, 'tokenizer': tokenizer}

    def _load_llava_model(self):
        """Load Llava 1.5 model and tokenizer"""
        # NOTE(review): LLaVA checkpoints are usually loaded through
        # LlavaForConditionalGeneration / AutoProcessor; confirm that
        # AutoModelForCausalLM accepts this checkpoint in the installed
        # transformers version.
        return self._load_model_and_tokenizer("llava-hf/llava-1.5-7b-hf")

    def _load_qwen2_model(self):
        """Load the Qwen VL model and tokenizer"""
        # NOTE(review): the method name says Qwen2 VL but the checkpoint id is
        # "Qwen/Qwen-VL"; that checkpoint presumably also needs
        # trust_remote_code=True -- confirm which model is intended.
        return self._load_model_and_tokenizer("Qwen/Qwen-VL")

    def _few_shot_examples(self):
        """
        Collect the worked examples used in few-shot prompts

        :return: List of at most FEW_SHOT_COUNT example dicts taken from the
            start of the FEW_SHOT_SPLIT split
        :raises KeyError: If the dataset has no FEW_SHOT_SPLIT split
        """
        if self.FEW_SHOT_SPLIT not in self.dataset:
            raise KeyError(
                f"few-shot prompting needs a '{self.FEW_SHOT_SPLIT}' split; "
                f"dataset provides: {list(self.dataset)}"
            )
        source = self.dataset[self.FEW_SHOT_SPLIT]
        # Read row by row: slicing a `datasets.Dataset` returns a dict of
        # columns, not a sequence of rows, so iterating a slice would yield
        # column names instead of examples.
        count = min(self.FEW_SHOT_COUNT, len(source))
        return [source[index] for index in range(count)]

    def _generate_prompt(self, example, few_shot=False):
        """
        Generate prompts for the models

        :param example: Single dataset example (must provide 'question')
        :param few_shot: Whether to use few-shot prompting
        :return: Prompt string
        """
        base_prompt = f"Question: {example['question']}\nPlease solve this math problem step by step."

        if not few_shot:
            return base_prompt

        few_shot_examples = self._few_shot_examples()
        if not few_shot_examples:
            # Nothing to demonstrate with; an empty "examples" header would
            # only add noise to the prompt.
            return base_prompt

        demonstrations = "\n".join(
            f"Example {i+1} Question: {ex['question']}\nSolution: {ex['answer']}"
            for i, ex in enumerate(few_shot_examples)
        )
        return (
            "Here are a couple of example solutions:\n" +
            demonstrations +
            f"\n\nNow solve this problem:\n{base_prompt}"
        )

    def _generate_text(self, model, tokenizer, prompt, max_new_tokens=200):
        """
        Run one generation and return only the newly generated text

        :param model: Model exposing `generate` and `device`
        :param tokenizer: Tokenizer matching the model
        :param prompt: Prompt string
        :param max_new_tokens: Generation length limit
        :return: Decoded continuation, without the prompt
        """
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Decoder-only models return prompt + continuation. Drop the prompt
        # tokens so the answer check cannot match text from the question or
        # from the few-shot example solutions.
        prompt_length = inputs['input_ids'].shape[-1]
        return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    def evaluate(self, prompt_types=('zero_shot', 'few_shot')):
        """
        Evaluate models on the dataset

        Every split in EVAL_SPLITS that the dataset provides is scored, and
        the records of all splits are stored together per model and prompt
        type; each record carries the split it came from.

        :param prompt_types: Types of prompting to use; 'few_shot' enables
            few-shot prompts, any other name is treated as zero-shot
        :return: The results dictionary
            {model_name: {prompt_type: [record, ...]}}
        :raises KeyError: If none of EVAL_SPLITS exists, or if 'few_shot' is
            requested and the few-shot split is missing
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(prompt_types, str):
            prompt_types = [prompt_types]
        else:
            prompt_types = list(prompt_types)

        splits = [name for name in self.EVAL_SPLITS if name in self.dataset]
        if not splits:
            raise KeyError(
                f"dataset has none of the evaluation splits {self.EVAL_SPLITS}; "
                f"it provides: {list(self.dataset)}"
            )

        if 'few_shot' in prompt_types:
            # Fail before any generation is run rather than after a full
            # zero-shot pass.
            self._few_shot_examples()

        for model_name, model_config in self.models.items():
            model, tokenizer = model_config['model'], model_config['tokenizer']

            for prompt_type in prompt_types:
                use_few_shot = (prompt_type == 'few_shot')

                # One list per (model, prompt type) across all splits, so a
                # later split does not overwrite an earlier one.
                model_results = []
                for split in splits:
                    for example in tqdm(self.dataset[split],
                                        desc=f"Evaluating {model_name} - {prompt_type}"):
                        prompt = self._generate_prompt(example, few_shot=use_few_shot)
                        generated_text = self._generate_text(model, tokenizer, prompt)

                        # Compare with ground truth
                        model_results.append({
                            'split': split,
                            'question': example['question'],
                            'ground_truth': example['answer'],
                            'model_prediction': generated_text,
                            'correct': self._check_answer(example['answer'], generated_text)
                        })

                # Store results
                self.results.setdefault(model_name, {})[prompt_type] = model_results

        return self.results

    def _check_answer(self, ground_truth, prediction):
        """
        Basic answer checking method
        Can be enhanced with more sophisticated comparison

        Substring matching is lenient: a short answer such as a single option
        letter matches any occurrence of that letter in the prediction.

        :param ground_truth: Correct answer from dataset
        :param prediction: Model's generated answer
        :return: Boolean indicating correctness
        """
        if ground_truth is None or prediction is None:
            return False

        # str() tolerates numeric answers.
        expected = str(ground_truth).strip().lower()
        if not expected:
            # An empty string is a substring of everything; never count it
            # as a correct answer.
            return False

        return expected in str(prediction).lower()

    def print_summary(self):
        """
        Print summary of evaluation results
        """
        for model_name, prompt_types in self.results.items():
            print(f"\n{model_name.upper()} Model Performance:")
            for prompt_type, results in prompt_types.items():
                correct_count = sum(res['correct'] for res in results)
                total_count = len(results)
                # Prompt types that were never evaluated have no results.
                accuracy = correct_count / total_count * 100 if total_count else 0.0

                print(f"{prompt_type.replace('_', ' ').title()} Accuracy: {accuracy:.2f}%")
                print(f"Correct Predictions: {correct_count}/{total_count}")

# Main execution: build the evaluator with its default dataset and default
# models, run the evaluation with the default prompt types, then print the
# per-model accuracy summary.
# NOTE: constructing MathVisionEvaluator() with no arguments loads the dataset
# and both default models inside __init__, before evaluate() is called.
if __name__ == "__main__":
    evaluator = MathVisionEvaluator()
    evaluator.evaluate()
    evaluator.print_summary()