In [3]:
# Mount Google Drive at /content/drive; the image, model and result paths
# defined below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Fetch the project repository and put its root on sys.path so that modules
# located there can be imported from this notebook.
!git clone https://github.com/NaGho/reasoning_multimodal_LLMs.git
import sys
sys.path.append('/content/reasoning_multimodal_LLMs')

# Third-party packages installed into the runtime (versions are not pinned).
!pip install av
!pip install datasets
!pip install langchain
!pip install -U langchain-community
!pip install chromadb
# FAISS is not installed; only the Chroma vector store is imported below.
# !pip install faiss-gpu
import torch
from datasets import load_dataset
from transformers import LlavaForConditionalGeneration, AutoProcessor
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma # FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PIL import Image
import pandas as pd
import networkx as nx
from tqdm import tqdm
import re
import os
import transformers
from arguments import ModelArguments, DataArguments, TrainingArguments, LoraArguments
from collators import COLLATORS
from loaders import LOADERS
from supported_models import MODULE_KEYWORDS

# Drive folder that test images are opened relative to and that the
# per-method result CSV files are written into.
IMAGE_PATH = "/content/drive/MyDrive/MATH-V-main"
# Drive folder under which MathVisionEvaluator creates one sub-directory per model id.
LOCAL_MODEL_PATH = "/content/drive/MyDrive/reasoning_multimodal_LLMs"

class MathVisionEvaluator:
    def __init__(self, methods, dataset_name="MathLLMs/MathVision"):
        """
        Initialize the evaluator with RAG and graphRAG capabilities.

        Loads the evaluation dataset and the training CSV, loads the base
        model through the project loader, optionally loads the fine-tuned
        checkpoint, and finally builds the vector store and the problem graph.

        Args:
            methods: Collection of method names. In this constructor it is
                only inspected for 'fine_tuned', which triggers loading the
                fine-tuned checkpoint into ``self.fine_tuned_model``.
            dataset_name: Dataset identifier handed to ``load_dataset``.
        """
        # Evaluation data comes from `load_dataset`; the training examples used
        # for retrieval come from a CSV inside the cloned repository.
        self.dataset = load_dataset(dataset_name)
        self.train_dataset = pd.read_csv("/content/reasoning_multimodal_LLMs/example_data/mathvision_train.csv")
        # Filled by evaluate(): method name -> list of per-example result dicts.
        self.results = {}
        self.model_id = "llava-1.5-7b-hf" # "qwen2-vl-7b-instruct" #
        local_model_path = f"{LOCAL_MODEL_PATH}/{self.model_id}"

        # Local-cache branch. The leading `False` disables it, so the `else`
        # branch below always runs.
        if False and os.path.exists(local_model_path):
            print(f"Loading model from local path: {local_model_path}")
            self.model = LlavaForConditionalGeneration.from_pretrained(
                local_model_path,
                torch_dtype=torch.float16,
                device_map='auto'
            )
            self.processor = AutoProcessor.from_pretrained(local_model_path, use_fast=True)
        else:
            print(f"Downloading model from {self.model_id}")
            # Create the directory if it doesn't exist
            # NOTE(review): nothing is written into this directory while the
            # save step further down stays disabled.
            os.makedirs(local_model_path, exist_ok=True)

            # Previous direct Hugging Face loading path, kept for reference:
            # self.model = LlavaForConditionalGeneration.from_pretrained(
            #     self.model_id,
            #     torch_dtype=torch.float16,
            #     device_map='auto'
            # )
            # self.processor = AutoProcessor.from_pretrained(self.model_id, use_fast=True)

            # Build the project argument dataclasses from a minimal argv; only
            # the model and data arguments are used, the other two are discarded.
            parser = transformers.HfArgumentParser(
                (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
            )
            default_args = [
                "--output_dir", "/content/drive/MyDrive/reasoning_multimodal_LLMs/outputs",
            ]
            model_args, data_args, _, _ = parser.parse_args_into_dataclasses(default_args)

            # NOTE(review): data_args is populated here but not read again in
            # this method — confirm whether anything else consumes it.
            data_args.data_path = "/content/reasoning_multimodal_LLMs/example_data/mathvision_train_with_solution.json" # mathvision_train
            data_args.image_folder = "/content/reasoning_multimodal_LLMs/example_data/images" # "/content/drive/MyDrive/MATH-V-main"

            # The loader class is selected by model family; its load() returns
            # the (model, tokenizer, processor, config) tuple stored below.
            loader = LOADERS[model_args.model_family_id](
                model_hf_path=model_args.model_hf_path,
                model_local_path=model_args.model_local_path,
            )
            # if len(set(['fine_tuned']) - set(methods)) > 0:
            self.model, self.tokenizer, self.processor, self.config = loader.load()

            # Saving to the local cache is disabled by the constant condition.
            if False:
                # Save the model and processor locally
                print(f"Saving model to {local_model_path}")
                self.model.save_pretrained(local_model_path)
                self.processor.save_pretrained(local_model_path)


            # The fine-tuned checkpoint is loaded in addition to the base
            # model, so both are held at the same time when requested.
            if 'fine_tuned' in methods:
                self.fine_tune_path = "/content/drive/MyDrive/reasoning_multimodal_LLMs/fine_tuned/checkpoint-6000"
                self.fine_tuned_model = LlavaForConditionalGeneration.from_pretrained(
                    self.fine_tune_path,
                    torch_dtype=torch.float16,
                    device_map='cuda'
                )

        # Initialize RAG components
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # NOTE(review): the knowledge base and the problem graph are built on
        # every construction, regardless of which names are in `methods`.
        # Create knowledge base from training data
        self._initialize_knowledge_base()

        # Create problem graph
        self._initialize_problem_graph()

    def _initialize_knowledge_base(self, method='Chroma'):
        """
        Initialize RAG knowledge base from training data
        """

        # Prepare documents from training data
        documents = []
        metadatas = []
        for _, example in self.train_dataset.iterrows():
            if example['solution'] is None:
                continue
            doc = f"Question: {example['question']}\nSolution: {example['solution']}\nAnswer: {example['answer']}\nSubject: {example['subject']}\nlevel: {example['level']}"
            documents.append(doc)
            metadatas.append({
                'subject': example['subject'],
                'level': example['level']
            })

        if method=='Chroma': #Chroma
            # Create Chroma vector store
            self.vector_store = Chroma.from_texts(
                texts=documents,
                metadatas=metadatas,
                embedding=self.embeddings,
                persist_directory="./chroma_db"  # This will persist the database locally
            )
        else: # FAISS
            # Split documents into chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=50
            )
            texts = text_splitter.create_documents(documents)

            # Create vector store
            self.vector_store = FAISS.from_documents(texts, self.embeddings)


    def _initialize_problem_graph(self):
        """
        Create a graph representation of problems and their relationships
        """
        self.problem_graph = nx.Graph()

        # Add nodes for each training example
        for i, example in self.train_dataset.iterrows():
            self.problem_graph.add_node(i,
                                      subject=example['subject'],
                                      level=example['level'],
                                      question=example['question'],
                                      solution=example['solution'],
                                      answer=example['answer'])

        # Add edges between similar problems (based on subject and level)
        for i in range(len(self.train_dataset)):
            for j in range(i + 1, len(self.train_dataset)):
                if (self.problem_graph.nodes[i]['subject'] == self.problem_graph.nodes[j]['subject'] and
                    abs(self.problem_graph.nodes[i]['level'] - self.problem_graph.nodes[j]['level']) <= 1):
                    self.problem_graph.add_edge(i, j, weight=1)

    def _get_relevant_context(self, question, subject, level, options, method='rag'):
        """
        Get relevant context using either RAG or graphRAG
        """
        if method == 'rag':
            # Use traditional RAG to find similar problems
            similar_docs = self.vector_store.similarity_search(
                f"Question: {question} level: {level}", k=3, # Subject: {subject}
                filter={"subject": subject}
            )
            context = "\n\n".join([doc.page_content for doc in similar_docs])

        else:  # graph_rag
            # Find most similar node in graph
            similar_problems = []
            for node in self.problem_graph.nodes():
                node_data = self.problem_graph.nodes[node]
                if (node_data['subject'] == subject and
                    abs(node_data['level'] - level) <= 1):
                    similar_problems.append(node)

            # Get connected problems from graph
            context_problems = []
            for problem in similar_problems[:2]:  # Get top 2 similar problems
                neighbors = list(self.problem_graph.neighbors(problem))
                if neighbors:
                    context_problems.extend([self.problem_graph.nodes[n] for n in neighbors[:2]])

            context = "\n\n".join([
                f"Question: {p['question']}\nSolution: {p['solution']}\nAnswer: {p['answer']}"
                for p in context_problems
            ])

        return context

    def _prepare_input(self, item, device, method='zero_shot'):
        """
        Prepare input with optional RAG/graphRAG context
        """

        if method in ['rag', 'graph_rag']:
            context = self._get_relevant_context(
                item['question'],
                item['subject'],
                item['level'],
                item['options'],
                method=method
            )
            question_prompt = f"Here are some similar problems and their solutions:\n{context}\n\n"
        else:
            question_prompt = ""

        if 'CoT' in method:
            # question_prompt = f"""Analyze this math problem carefully.
            #     Problem: {item['question']}

            #     Solve this step-by-step:
            #     1. First, identify what the problem is asking
            #     2. Break down the necessary calculations
            #     3. Work through each step methodically
            #     4. Conclude with your final answer enclosed in quotes like this: "answer"
            #     {
            #         '. For the answer, choose one option from: ' + ', '.join(item['options']) \
            #           if item.get('options') and len(item['options']) > 0 else \
            #         ', where the answer must a single number or alphabet'
            #     }"""
            question_prompt += f"""Please solve the problem step by step considering the image provided.
                Write your final answer using a single word or phrase enclosed in quotes like this: "answer".
                Problem: {item['question']}
                {'. Choose one option from: ' + ', '.join(item['options']) + '.' if item.get('options') and len(item['options']) > 0 else ''}"""
            # question_prompt = f"""
            #     You are an expert in mathematical reasoning. Analyze the given image and solve the problem using logical steps.

            #     ### **Problem Statement:**
            #     {item['question']}

            #     {'. Choose one option from: ' + ', '.join(item['options']) + '.' if item.get('options') and len(item['options']) > 0 else ''}

            #     ### **Solution Approach:**
            #     1. **Understand the problem**: Extract relevant information from the image.
            #     2. **Break it down step by step**: Clearly outline each step in the reasoning process.
            #     3. **Perform calculations logically**: Show all intermediate steps.
            #     4. **Verify the answer**: Ensure consistency and correctness.

            #     ### **Final Answer Format:**
            #     - Write your final answer in **quotes** like this: `"answer"`.
            #     - Ensure the answer is **precise and matches the expected format**.

            #     ### **Step-by-Step Solution:**
            #     """

        else:
            question_prompt += "Answer the following question using a single word or phrase." + item['question']
        # question_prompt = f"""Please solve the problem step by step and put your answer and the end of the solution in one " ". If it is a multiple choice question, only one letter is allowed in the " ". \n {item['question']}."""

            if item.get('options') and len(item['options']) > 0:
                question_prompt += f" Choose one option from {', '.join(item['options'])}."


        conversation = [
            # {
            #     "role": "system",
            #     "content": "You are a math tutor who always solves problems step-by-step with clear reasoning. Never provide just the answer - always show your work." #+ instruction
            # },
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question_prompt}
                ]
            },
        ]
        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Process image if available
        if 'image' in item and item['image']:
            images = [Image.open(f"{IMAGE_PATH}/{item['image']}")]
        else:
            images = None

        inputs = self.processor(
            images=images,
            text=[prompt],
            padding=True,
            return_tensors="pt"
        ).to(device, torch.float16)

        return inputs

    def extract_answer(self, model_response):
        # Extract the answer from the response (assuming it's in quotes)
        match = re.search(r"'([^']+)'(\.|\s)*$", model_response)
        match2 = re.search(r'ASSISTANT:\s*(?:The answer is\s*)?(.+?)(?:\s*\.*)?$', model_response)
        return match.group(1).strip() if match else match2.group(1).strip() if match2 else None

    # def post_process_output(self, output_text):
    #     """Ensure the output follows the expected format"""
    #     # Check if there's already a quoted answer at the end
    #     if not re.search(r'"[^"]*"$', output_text.strip()):
    #         # If no answer in quotes found, try to extract it and reformat
    #         potential_answer = output_text.strip().split("\n")[-1]
    #         if potential_answer.isalnum():  # Simple check if it could be an answer
    #             return potential_answer
    #     return None

    def evaluate(self, methods):
        """
        Evaluate using specified methods
        """
        for method in methods:
            model_results = []
            for i, example in enumerate(tqdm(self.dataset['testmini'],
                                          desc=f"Evaluating {self.model_id} - {method}")):
                # if i > 2:
                #   break
                if method == 'fine_tuned':
                    model = self.fine_tuned_model
                else:
                    model = self.model

                inputs = self._prepare_input(example, model.device, method=method)


                generate_ids = model.generate(
                    **inputs,
                    max_new_tokens=1024,  # Limit output length 100 if 'CoT' not in method else
                    # do_sample=False,  # Use greedy decoding (faster than sampling)
                    # num_beams=1,  # Disable beam search (use greedy decoding)
                    # temperature=0.7,  # Reduce randomness (keeps output stable)
                    # use_cache=True  # Enables KV caching for efficiency
                    # max_new_tokens=1024,
                    # temperature=0.2,  # Lower temperature for more logical responses
                    # num_beams=4,      # Beam search for better coherence
                    # do_sample=False   # Disable sampling for more consistent results
                )

                generated_text = self.processor.batch_decode(
                    generate_ids,
                    skip_special_tokens=True
                )[0]

                model_results.append({
                    'question': example['question'],
                    'ground_truth': example['answer'],
                    'model_prediction': generated_text,
                    'potential_answer': self.extract_answer(generated_text),
                    'id': example['id'],
                    'level': example['level'],
                    'subject': example['subject'],
                    'options': example['options'],
                    'method': method,
                })

                if i % 50 == 0:
                    directoryPath = os.path.dirname(f'{IMAGE_PATH}/{self.model_id}_{method}_results.csv')
                    os.makedirs(directoryPath, exist_ok=True)
                    pd.DataFrame(model_results).to_csv(f'{IMAGE_PATH}/{self.model_id}_{method}_results.csv')

            self.results[method] = model_results

        return self.results


# Run the evaluation for the selected methods.
# Available names: 'zero_shot', 'fine_tuned', 'rag', 'graph_rag'
methods = ['fine_tuned']
evaluator = MathVisionEvaluator(methods)
results = evaluator.evaluate(methods)

# Write the complete result list of every method to its CSV file.
for method, method_rows in results.items():
    results_csv = f'{IMAGE_PATH}/{evaluator.model_id}_{method}_results.csv'
    pd.DataFrame(method_rows).to_csv(results_csv)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path 'reasoning_multimodal_LLMs' already exists and is not an empty directory.
Downloading model from llava-1.5-7b-hf


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading processor from llava-hf/llava-1.5-7b-hf
Loading tokenizer from llava-hf/llava-1.5-7b-hf


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating llava-1.5-7b-hf - fine_tuned: 100%|██████████| 304/304 [02:57<00:00,  1.71it/s]


## Evaluate

In [4]:
from google.colab import drive
import pandas as pd
import re
drive.mount('/content/drive')
IMAGE_PATH = "/content/drive/MyDrive/MATH-V-main"

def extract_answer(text):
    """
    Return the answer that follows ``ASSISTANT:`` in a generated text.

    An optional "The answer is" prefix and trailing dots are dropped.
    Returns None when the marker is not found on the final line.
    """
    found = re.search(r'ASSISTANT:\s*(?:The answer is\s*)?(.+?)(?:\s*\.*)?$', text)
    if found is None:
        return None
    return found.group(1).strip()


def evaluate_pred(gt_answer, pred_answer):
    """
    Decide whether a predicted answer matches the ground truth.

    The comparison ignores case and surrounding whitespace. A missing value
    on either side (None, or the NaN pandas produces for an empty CSV cell)
    and an empty prediction never count as correct.

    Args:
        gt_answer: Ground-truth answer; any type accepted by ``str``.
        pred_answer: Predicted answer, or None/NaN when nothing was extracted.

    Returns:
        True when the normalised strings are equal, otherwise False.
    """
    def _is_missing(value):
        # NaN is the only float that differs from itself.
        return value is None or (isinstance(value, float) and value != value)

    # Check before str(): stringifying would turn None/NaN into the truthy
    # texts 'None'/'nan', which could even equal a literal ground truth.
    if _is_missing(gt_answer) or _is_missing(pred_answer):
        return False

    expected = str(gt_answer).strip().lower()
    predicted = str(pred_answer).strip().lower()
    return bool(predicted) and predicted == expected

# Reuse the values left in the kernel by the inference cell when they exist;
# otherwise fall back to defaults so this cell also works on its own.
# Only the "name is not defined" case is handled, so real errors still surface.
try:
    methods = methods
except NameError:
    methods = ['fine_tuned'] # 'zero_shot', 'fine_tuned', 'rag', 'graph_rag'

try:
    model_id = evaluator.model_id
except (NameError, AttributeError):
    model_id = "llava-1.5-7b-hf" # "qwen2-vl-7b-instruct" #

# Column of the results CSV that holds the extracted prediction.
answer_col = "potential_answer"
for method in methods:
    df = pd.read_csv(f'{IMAGE_PATH}/{model_id}_{method}_results.csv', index_col=0)
    # Row-wise comparison; bool() keeps the column strictly boolean so that
    # the mean below is the fraction of correct answers.
    df['correct_answer'] = [
        bool(evaluate_pred(gt, pred))
        for gt, pred in zip(df['ground_truth'], df[answer_col])
    ]
    print(f"{method} Accuracy: {df['correct_answer'].mean()*100}%")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fine_tuned Accuracy: 8.881578947368421%
