In [2]:
from google.colab import drive
drive.mount('/content/drive')

!pip install datasets
from datasets import load_dataset
import torch
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import openai
import re
import base64
import os

# NOTE(review): "x" looks like a redacted placeholder. A real key should be read
# from Colab secrets / the environment instead of being written into the notebook.
os.environ["OPENAI_API_KEY"] = "x"


# Drive folder where the converted JSON / CSV files are written and read back.
FOLDER_PATH = "/content/drive/MyDrive/reasoning_multimodal_LLMs/example_data"
# Prefix joined with each item's 'image' field to build the image path given to the model.
IMG_PATH = "/content/drive/MyDrive/MATH-V-main"

def load_qwen_model():
    """Load the Qwen2.5-VL-7B-Instruct checkpoint and its matching processor.

    The model is requested in bfloat16 with FlashAttention-2 and
    ``device_map="auto"``. The processor is created with its default settings.

    Returns:
        tuple: ``(model, processor)``.
    """
    checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"
    model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
        "device_map": "auto",
    }
    qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint, **model_kwargs)

    # By default the model uses between 4 and 16384 visual tokens per image.
    # To balance quality against cost, pass min_pixels / max_pixels to the
    # processor, e.g. min_pixels=256*28*28 and max_pixels=1280*28*28 for a
    # 256-1280 token range. Nothing is passed here, so the defaults apply.
    qwen_processor = AutoProcessor.from_pretrained(checkpoint)

    return qwen_model, qwen_processor

# Install the Qwen2.5-VL dependencies, import them, and load the model for this session.
# NOTE(review): `if True:` always runs; presumably a manual toggle for skipping this heavy setup.
if True:
    # transformers is installed from the GitHub repository (no version pin).
    !pip install git+https://github.com/huggingface/transformers accelerate
    !pip install transformers[qwen] --upgrade
    !pip install qwen-vl-utils[decord]==0.0.8
    # load_qwen_model() requests attn_implementation="flash_attention_2".
    !pip install flash-attn
    # These names are used by load_qwen_model() and query_qwen(), so they must be imported before either is called.
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
    from qwen_vl_utils import process_vision_info
    # `model` and `processor` become notebook-level globals that query_qwen() reads.
    model, processor = load_qwen_model()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-dvduxs6v
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-dvduxs6v
  Resolved https://github.com/huggingface/transformers to commit 92c5ca9dd70de3ade2af2eb835c96215cc50e815
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
def split_convert_mathvision_to_json(split):
    """Partition one MathVision split by testmini membership and save both parts as JSON.

    Every item of ``ds[split]`` whose ``id`` also appears in the ``testmini``
    split goes to the test set; all other items go to the train set. The train
    set is converted with ``generate_solution=True`` (the model is queried for a
    worked solution) and the test set with ``generate_solution=False``.

    The results are written to ``{FOLDER_PATH}/mathvision_train.json`` and
    ``{FOLDER_PATH}/mathvision_test.json``.

    Args:
        split: Name of the MathVision split to partition (e.g. ``'test'``).

    Returns:
        tuple: ``(train_json, test_json)``, the two lists of converted entries.
    """
    # Load the dataset
    ds = load_dataset("MathLLMs/MathVision")

    # A set gives O(1) membership checks; the previous list lookup rescanned
    # every testmini id for each item of the split.
    test_mini_ids = set(ds['testmini']['id'])

    train_data = []
    test_data = []
    for item in ds[split]:
        if item['id'] in test_mini_ids:
            test_data.append(item)
        else:
            train_data.append(item)

    # Process each part
    train_json = process_split(train_data, 'train', generate_solution=True)
    test_json = process_split(test_data, 'test', generate_solution=False)

    # Save to files
    with open(f"{FOLDER_PATH}/mathvision_train.json", 'w') as f:
        json.dump(train_json, f, indent=2)

    with open(f"{FOLDER_PATH}/mathvision_test.json", 'w') as f:
        json.dump(test_json, f, indent=2)

    print(f"Converted {len(train_json)} entries for train split")
    print(f"Converted {len(test_json)} entries for test split")

    return train_json, test_json

def convert_mathvision_to_json(split):
    """Convert a single MathVision split to the conversation JSON format and save it.

    The split is passed to ``process_split`` with its default
    ``generate_solution=False`` and the result is written to
    ``{FOLDER_PATH}/mathvision_{split}.json``.

    Args:
        split: Name of the MathVision split to convert (e.g. ``'testmini'``).

    Returns:
        list: The converted conversation entries.
    """
    # Load the dataset
    ds = load_dataset("MathLLMs/MathVision")
    data_json = process_split(ds[split], split)

    # Save to files
    with open(f"{FOLDER_PATH}/mathvision_{split}.json", 'w') as f:
        json.dump(data_json, f, indent=2)

    # Report the split that was actually converted (the message used to say "test" unconditionally).
    print(f"Converted {len(data_json)} entries for {split} split")

    return data_json

def find_solution(model_response, item):
    """Keep the model's response if its final quoted answer matches the reference.

    The answer is taken from the last single-quoted span of ``model_response``,
    which may only be followed by dots and whitespace. It is compared with
    ``item['answer']`` after stripping whitespace and ignoring case.

    Args:
        model_response: Full text generated by the model.
        item: Mapping for the dataset row; its ``'answer'`` entry (default ``''``)
            is the reference answer.

    Returns:
        str: ``model_response`` unchanged when the extracted answer matches,
        otherwise the fallback string ``"The correct answer is: '<answer>' "``.
    """
    # Answer is expected in single quotes at the very end of the response.
    quoted = re.search(r"'([^']+)'(\.|\s)*$", model_response)
    expected = item.get('answer', '').strip()

    if quoted:
        candidate = quoted.group(1)
        if candidate and candidate.strip().lower() == expected.lower():
            return model_response

    return f"""The correct answer is: '{expected}' """

def process_split(data, split, generate_solution = False, start_index = None):
    """Convert dataset items into conversation-style entries, optionally with model solutions.

    For every processed item a prompt is built from the question (plus its
    options, if any). With ``generate_solution=True`` the prompt and image are
    sent to ``query_qwen`` and the response is kept only if ``find_solution``
    judges it correct; otherwise the entry carries the plain reference answer.
    With ``generate_solution=False`` the reference answer is always used.

    The entries collected so far are checkpointed to
    ``{FOLDER_PATH}/mathvision_{split}.json`` whenever the item index is a
    multiple of 50.

    Args:
        data: Iterable of dataset rows (mappings with ``'question'`` and
            optionally ``'options'``, ``'image'``, ``'answer'``).
        split: Label used in the checkpoint file name.
        generate_solution: Whether to query the model for a worked solution.
        start_index: Index of the first item to process; earlier items are
            skipped. ``None`` (default) keeps the notebook's existing resume
            point of 1651 for the generation path, and starts at 0 for the
            label-only path. The skip used to apply unconditionally, which
            silently dropped the first 1651 rows of label-only splits (an
            entire split, when it has fewer rows than that).

    Returns:
        list: One conversation entry (dict) per processed item.
    """
    if start_index is None:
        # Resuming only makes sense for the expensive generation run; cheap
        # label-only conversions must cover every row.
        start_index = 1651 if generate_solution else 0

    count_correct_solution = 0
    converted_data = []
    instruction = "Answer the following question using a single word or phrase, by considering the image provided."

    for i, item in enumerate(tqdm(data)):
        if i < start_index:
            continue

        question_prompt = f"""Please solve the problem step by step and put your final answer and the end of the solution in single quotes. If it is a multiple choice question, only one letter is allowed in the quotes. \n {item['question']}"""

        if item.get('options') and len(item['options']) > 0:
            question_prompt += f". Choose from the options {', '.join(item['options'][:-1])}, or {item['options'][-1]}."

        image_path = f"{IMG_PATH}/{item.get('image')}"
        if generate_solution:
            # Ask the model, then keep its response only when the final quoted answer is correct.
            model_response = query_qwen(image_path, question_prompt, "")

            final_solution = find_solution(model_response, item)
            # find_solution falls back to a "The correct answer is ..." string on a wrong answer.
            if not final_solution.startswith("The correct answer is"):
                count_correct_solution += 1
        else:
            final_solution = f"""The correct answer is: '{item.get('answer', '').strip()}' """

        # For fine-tuning, include full solution if correct
        conversation_entry = {
            "system_prompt": "You are a helpful visual assistant that can understand images and answer questions about them accurately and concisely. " + instruction,
            "image": item.get("image"),
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{question_prompt}"
                },
                {
                    "from": "gpt",
                    "value": final_solution
                }
            ]
        }

        converted_data.append(conversation_entry)
        # Periodic checkpoint so a crashed session does not lose all generated solutions.
        if i%50 == 0:
            with open(f"{FOLDER_PATH}/mathvision_{split}.json", 'w') as f:
                json.dump(converted_data, f, indent=2)
            print("num correct solutions added = ", count_correct_solution)

    print("FINAL num correct solutions added = ", count_correct_solution)
    return converted_data


def query_qwen(image_path, prompt, instruction):
    """Ask the notebook-level Qwen2.5-VL model a question about one image.

    Uses the global ``model`` and ``processor`` as well as
    ``process_vision_info``.

    Args:
        image_path: Image reference placed in the user turn.
        prompt: Text of the user turn.
        instruction: Text appended directly (no separator) to the system prompt.

    Returns:
        str: The decoded completion for the single request, without the prompt tokens.
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful visual assistant that can understand images and answer questions about them accurately and concisely." + instruction,
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": prompt},
        ],
    }
    messages = [system_turn, user_turn]

    # Build the model inputs: chat-formatted text plus the extracted vision inputs.
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # Generate, then drop the prompt tokens that precede each completion.
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    completions = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]



# Helper function to query GPT4-V (you'll need to implement this based on your API access)
def query_gpt4v(image_path, prompt, model_name="gpt-4-vision-preview"):
    """Send an image and a text prompt to an OpenAI vision chat model.

    The image is read from disk and embedded in the request as a base64 data
    URL. The call works with both generations of the ``openai`` package: the
    client-based API of ``openai>=1.0`` and the legacy
    ``openai.ChatCompletion`` API, which was removed in 1.0.

    Args:
        image_path: Path of the image file to send.
        prompt: Text of the user message.
        model_name: Chat model to query; defaults to the model the notebook
            originally used.

    Returns:
        str: Content of the first returned choice.

    Raises:
        OSError: If the image file cannot be read.
    """
    import mimetypes  # local import: only this helper needs it

    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Declare the real image type in the data URL; fall back to JPEG (the
    # previous hard-coded value) when the extension is not recognised.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    messages = [
        {
            "role": "system",
            "content": "You are a helpful visual assistant that can understand images and answer questions about them accurately and concisely."
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
            ]
        }
    ]

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: ChatCompletion.create raises; requests go through a client
        # (which picks up OPENAI_API_KEY from the environment).
        response = openai.OpenAI().chat.completions.create(model=model_name, messages=messages)
    else:
        # Legacy openai<1.0 interface.
        response = openai.ChatCompletion.create(model=model_name, messages=messages)

    return response.choices[0].message.content


# Run the conversion
# Partitions the 'test' split by testmini membership and writes
# mathvision_train.json / mathvision_test.json under FOLDER_PATH.
# The train part queries the Qwen model for each processed item.
train_json, test_json = split_convert_mathvision_to_json('test')

# # Convert testmini split
# testmini_json = convert_mathvision_to_json('testmini')


  0%|          | 0/2736 [00:00<?, ?it/s]

## Dataset investigation

In [None]:
# List every testmini question that mentions an '<image2>' placeholder, then print how many there are.
ds = load_dataset("MathLLMs/MathVision")['testmini']
counter = 0
for item in tqdm(ds):
    if '<image2>' not in item['question']:
        continue
    print(item['id'], item['question'], item['image'], '----------', sep='\n')
    counter += 1
print(counter)


# Creating dataframes with solutions, where they exist

In [None]:
# Load the dataset
ds = load_dataset("MathLLMs/MathVision")

with open(f"{FOLDER_PATH}/mathvision_train.json", 'r') as f:
    train_json = json.load(f)

# Rows of the 'test' split are divided by testmini membership:
# members go to test_data, everything else to train_data.
train_data = []
test_data = []

# A set gives O(1) membership checks; the list lookup rescanned all testmini ids per row.
test_mini_ids = set(ds['testmini']['id'])
train_counter = 0
with_solution_counter = 0
for item in tqdm(ds['test']):
    if item['id'] in test_mini_ids:
        test_data.append(item)
        continue

    # train_json is matched positionally: entry k is expected to describe the
    # k-th non-testmini row. The question-text check below guards against a
    # misaligned file; such rows simply get no 'solution'.
    if train_counter < len(train_json):
        json_item = train_json[train_counter]
        has_generated_solution = not json_item['conversations'][1]['value'].startswith('The correct answer is:')
        if item['question'] in json_item.get('conversations')[0]['value'] and has_generated_solution:
            item['solution'] = json_item['conversations'][1]['value']
            with_solution_counter += 1
    train_data.append(item)
    train_counter += 1


print(f'There are {with_solution_counter} questions in the train split with solutions')
pd.DataFrame(train_data).to_csv(f"{FOLDER_PATH}/mathvision_train.csv", index=False)
pd.DataFrame(test_data).to_csv(f"{FOLDER_PATH}/mathvision_test.csv", index=False)

Merging two JSON files containing dictionaries

In [None]:
import json

filepath_1 = "/content/drive/MyDrive/reasoning_multimodal_LLMs/example_data/mathvision_train_1to1650.json"
filepath_2 = "/content/drive/MyDrive/reasoning_multimodal_LLMs/example_data/mathvision_train.json"
# Load first JSON file
with open(filepath_1, "r") as file:
    data1 = json.load(file)

# Load second JSON file
with open(filepath_2, "r") as file:
    data2 = json.load(file)

if isinstance(data1, list) and isinstance(data2, list):
    # process_split() dumps a JSON *list* of entries, and `{**a, **b}` raises
    # TypeError on lists, so the two parts are concatenated in order instead.
    # The result overwrites filepath_2, so re-running the cell must not prepend
    # data1 again: if data2 already starts with data1 it is left unchanged.
    already_merged = data2[:len(data1)] == data1
    merged_data = data2 if already_merged else data1 + data2
elif isinstance(data1, dict) and isinstance(data2, dict):
    # Merge dictionaries (data2 overwrites data1 in case of key conflicts)
    merged_data = {**data1, **data2}
else:
    raise TypeError(
        f"Cannot merge {type(data1).__name__} from {filepath_1} with "
        f"{type(data2).__name__} from {filepath_2}"
    )

# Save the merged JSON
with open(filepath_2, "w") as file:
    json.dump(merged_data, file, indent=4)

print("Merged JSON saved successfully!")
