In [1]:
!pip install datasets
from datasets import load_dataset
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import openai
import re
import base64
import os
from google.colab import drive
# Mount Google Drive so the Drive-backed paths below resolve in this runtime.
drive.mount('/content/drive')

# Directory where the generated mathvision_*.json / *.csv files are written.
FOLDER_PATH = "/content/drive/MyDrive/reasoning_multimodal_LLMs/example_data"
# Root directory that image paths handed to query_gpt4v are resolved against.
IMG_PATH = "/content/drive/MyDrive/MATH-V-main"

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
def split_convert_mathvision_to_json(split):
    """Partition one MathVision split by testmini membership and export both parts.

    Every item of ``ds[split]`` whose ``id`` also occurs in the ``testmini``
    split is placed in the test set; all other items form the train set. Both
    parts are converted with ``process_split`` and written to
    ``{FOLDER_PATH}/mathvision_train.json`` and
    ``{FOLDER_PATH}/mathvision_test.json`` (the file names do not depend on
    ``split``).

    Args:
        split: Name of the dataset split to partition (e.g. ``'test'``).

    Returns:
        Tuple ``(train_json, test_json)`` with the converted entry lists.
    """
    # Load the dataset
    ds = load_dataset("MathLLMs/MathVision")

    # A set makes each membership check O(1); testing against the raw id list
    # re-scanned the whole list for every item of the split.
    test_mini_ids = set(ds['testmini']['id'])

    # The partition is deterministic: no shuffling or random split is applied.
    train_data = []
    test_data = []
    for item in ds[split]:
        if item['id'] in test_mini_ids:
            test_data.append(item)
        else:
            train_data.append(item)

    # Process each part
    train_json = process_split(train_data)
    test_json = process_split(test_data)

    # Save to files
    with open(f"{FOLDER_PATH}/mathvision_train.json", 'w') as f:
        json.dump(train_json, f, indent=2)

    with open(f"{FOLDER_PATH}/mathvision_test.json", 'w') as f:
        json.dump(test_json, f, indent=2)

    print(f"Converted {len(train_json)} entries for train split")
    print(f"Converted {len(test_json)} entries for test split")

    return train_json, test_json

def convert_mathvision_to_json(split):
    """Convert a whole MathVision split and save it as JSON.

    The split is converted with ``process_split`` and written to
    ``{FOLDER_PATH}/mathvision_{split}.json``.

    Args:
        split: Name of the dataset split to convert (e.g. ``'testmini'``).

    Returns:
        The list of converted entries.
    """
    # Load the dataset
    ds = load_dataset("MathLLMs/MathVision")
    data_json = process_split(ds[split])

    # Save to file
    with open(f"{FOLDER_PATH}/mathvision_{split}.json", 'w') as f:
        json.dump(data_json, f, indent=2)

    # Report the split that was actually converted (the message used to
    # say "test" regardless of the argument).
    print(f"Converted {len(data_json)} entries for {split} split")

    return data_json

def process_split(data):
    """Turn MathVision items into conversation-style fine-tuning entries.

    For each item a question prompt is built (with the options appended when
    present), the model is queried through ``query_gpt4v``, and the quoted
    answer in the reply is compared case-insensitively with the item's
    ``answer``. If they match, the model's full reply becomes the assistant
    turn; otherwise the assistant turn only states the correct answer.

    Args:
        data: Iterable of dict-like items providing ``question`` and,
            optionally, ``options``, ``image`` and ``answer``.

    Returns:
        List of dicts with the keys ``system_prompt``, ``image`` and
        ``conversations``.
    """
    instruction = "Answer the following question using a single word or phrase, by considering the image provided."
    system_prompt = "You are a helpful visual assistant that can understand images and answer questions about them accurately and concisely. " + instruction
    converted_data = []

    for item in tqdm(data):
        # NOTE(review): "and the end" below looks like a typo for "at the end";
        # left unchanged because the prompt text is stored in the output data.
        question_prompt = f"""Please solve the problem step by step and put your answer and the end of the solution in one " ". If it is a multiple choice question, only one letter is allowed in the " ". \n {item['question']}"""
        if item.get('options') and len(item['options']) > 0:
            question_prompt += f". Choose from the options {', '.join(item['options'][:-1])}, or {item['options'][-1]}."

        # First, get the model's answer. Normalise a missing reply to an empty
        # string so the regex below cannot fail on None.
        gpt4v_response = query_gpt4v(item.get("image"), question_prompt) or ""

        # The prompt asks for the answer in quotes at the END of the solution,
        # so take the last quoted span rather than the first one.
        quoted_spans = re.findall(r'"([^"]*)"', gpt4v_response)
        model_answer = quoted_spans[-1] if quoted_spans else None

        # Check if the answer is correct (tolerate a missing/None answer field).
        correct_answer = (item.get('answer') or '').strip()
        is_correct = bool(model_answer) and model_answer.strip().lower() == correct_answer.lower()

        # For fine-tuning, include the full solution only if it was correct.
        final_answer = gpt4v_response if is_correct else f"""The correct answer is: "{correct_answer}" """

        conversation_entry = {
            "system_prompt": system_prompt,
            "image": item.get("image"),
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{question_prompt}"
                },
                {
                    "from": "gpt",
                    "value": final_answer
                }
            ]
        }

        converted_data.append(conversation_entry)

    return converted_data

# Helper that sends one image plus a text prompt to an OpenAI vision chat model.
def query_gpt4v(image_path, prompt, model="gpt-4-vision-preview"):
    """Ask an OpenAI vision-capable chat model a question about one image.

    The image is read from ``{IMG_PATH}/{image_path}``, base64-encoded and
    sent inline as a data URL together with ``prompt``.

    Args:
        image_path: Image location relative to ``IMG_PATH``.
        prompt: Text of the user message.
        model: Chat model name. NOTE(review): the default is a preview model
            and may no longer be served; confirm and pass a current model.

    Returns:
        The text content of the first choice in the response.

    Raises:
        OSError: If the image file cannot be opened.
    """
    import mimetypes  # local import: only needed to label the data URL

    full_path = f"{IMG_PATH}/{image_path}"
    with open(full_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Label the payload by file extension; fall back to the previously
    # hard-coded JPEG type when the extension is unknown.
    mime_type = mimetypes.guess_type(full_path)[0] or "image/jpeg"

    messages = [
        {
            "role": "system",
            "content": "You are a helpful visual assistant that can understand images and answer questions about them accurately and concisely."
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
            ]
        }
    ]

    # openai>=1.0 removed openai.ChatCompletion; its module-level client is
    # used there so a globally configured openai.api_key keeps working.
    if hasattr(openai, "OpenAI"):
        response = openai.chat.completions.create(model=model, messages=messages)
    else:
        response = openai.ChatCompletion.create(model=model, messages=messages)
    return response.choices[0].message.content

# Run the conversion
# Partitions the 'test' split by testmini membership and writes
# mathvision_train.json / mathvision_test.json under FOLDER_PATH.
train_json, test_json = split_convert_mathvision_to_json('test')

# Convert testmini split
# Writes mathvision_testmini.json under FOLDER_PATH.
# NOTE(review): items whose ids are in both 'test' and 'testmini' are sent to
# the model by both calls — confirm the duplicate API queries are intended.
testmini_json = convert_mathvision_to_json('testmini')


Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

(…)-00000-of-00001-3532b8d3f1b4047a.parquet:   0%|          | 0.00/57.0M [00:00<?, ?B/s]

(…)-00000-of-00001-f8ff70fcb2f29b1d.parquet:   0%|          | 0.00/6.99M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3040 [00:00<?, ? examples/s]

Generating testmini split:   0%|          | 0/304 [00:00<?, ? examples/s]

In [None]:
# Print every testmini problem whose question text references a second image
# placeholder, then report how many such problems were found.
ds = load_dataset("MathLLMs/MathVision")['testmini']
counter = 0
for item in tqdm(ds):
    if '<image2>' not in item['question']:
        continue
    print(item['id'], item['question'], item['image'], sep='\n')
    print('----------')
    counter += 1
print(counter)


 24%|██▎       | 72/304 [00:00<00:00, 356.92it/s]

33
When the ant <image1> walks from home <image2> along the arrows $\rightarrow 3, \uparrow 3, \rightarrow 3, \uparrow 1$, he gets to the ladybird <image3>.
Which animal does the ant <image1> get to when he walks from home <image2> along the following arrows: $\rightarrow 2, \downarrow 2, \rightarrow 3, \uparrow 3, \rightarrow 2, \uparrow 2$?
<image6>
<image7>
images/33.jpg
----------
34
Max has cut a rectangle into two pieces. One piece looks like:
<image1>
What does the other piece look like?
<image2>
images/34.jpg
----------
44
Florian has 10 equally long metal strips with equally many holes.
<image1>
He bolts the metal strips together in pairs. Now he has five long strips (see the diagram).
<image2>
Which of the long strips is the shortest?
images/44.jpg
----------
45
Which of the kangaroo cards shown below can be turned around so that it then looks the same as the card shown on the right?
<image1>
<image2>
images/45.jpg
----------
46
What do you see if you look at the tower, which

 69%|██████▉   | 210/304 [00:00<00:00, 564.44it/s]

273
A drinking glass is made in the shape of a truncated cone. The outside of the glass (without the upper or lower circle) should be covered with coloured paper. How do you need to cut the paper to completely cover the glass without an overlap?
<image1>
<image2>
images/273.jpg
----------
292
A rectangular piece of paper $A B C D$ is $5 \mathrm{~cm}$ wide and $50 \mathrm{~cm}$ long. The paper is white on one side and grey on the other. Christina folds the strip as shown so that the vertex $B$ coincides with $M$ the midpoint of the edge $C D$. Then she folds it so that the vertex $D$ coincides with $N$ the midpoint of the edge $A B$. How big is the area of the visible white part in the diagram?
<image1>
<image2>
<image3>
images/292.jpg
----------
325
Three triangles are connected to each other as shown. In which of the following pictures are the three triangles connected in the same way?
<image1>
<image2>
images/325.jpg
----------
351
Paula's weather app shows a diagram of the predicted

100%|██████████| 304/304 [00:00<00:00, 476.32it/s]

1451
Jenny looks at her weather app that shows the predicted weather and maximum temperatures for the next five days. Which of the following represents the corresponding graph of maximum temperatures?
<image1>
<image2>
images/1451.jpg
----------
1889
The trapezium shown in the diagram is rotated anti-clockwise by $90^{\circ}$ around the origin $O$, and then reflected in the $x$-axis. Which of the following shows the end result of these transformations? <image1>
<image2>
images/1889.jpg
----------
1969
On Nadya's smartphone, the diagram shows how much time she spent last week on four of her apps. This week she halved the time spent on two of these apps, but spent the same amount of time as the previous week on the other two apps.
<image1>
Which of the following could be the diagram for this week?
<image2>
images/1969.jpg
----------
2773
The letter M in the figure below is first reflected over the line $q$ and then reflected over the line $p$. What is the resulting image?
<image1>

<imag




In [3]:
# Partition the MathVision 'test' split by testmini membership and export
# both parts as CSV files under FOLDER_PATH.
ds = load_dataset("MathLLMs/MathVision")

train_data = []
test_data = []

test_mini_ids = ds['testmini']['id']
# Look ids up in a set: checking against the list re-scanned it for every
# row of the 'test' split. The list itself is kept under its original name.
test_mini_id_set = set(test_mini_ids)
for item in tqdm(ds['test']):
    if item['id'] in test_mini_id_set:
        test_data.append(item)
    else:
        train_data.append(item)

pd.DataFrame(train_data).to_csv(f"{FOLDER_PATH}/mathvision_train.csv", index=False)
pd.DataFrame(test_data).to_csv(f"{FOLDER_PATH}/mathvision_test.csv", index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

(…)-00000-of-00001-3532b8d3f1b4047a.parquet:   0%|          | 0.00/57.0M [00:00<?, ?B/s]

(…)-00000-of-00001-f8ff70fcb2f29b1d.parquet:   0%|          | 0.00/6.99M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3040 [00:00<?, ? examples/s]

Generating testmini split:   0%|          | 0/304 [00:00<?, ? examples/s]