In [None]:
from datasets import load_dataset
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Directory that every converted JSON file in this script is written into.
FOLDER_PATH = "/content/drive/MyDrive/reasoning_multimodal_LLMs/example_data"
def split_convert_mathvision_to_json(split):
    """Re-split one MathVision split into train/test parts and save both as JSON.

    The records of ``split`` are divided 80/20 with ``train_test_split`` using
    a fixed ``random_state`` of 42, each part is converted by
    ``process_split``, and the results are written to
    ``mathvision_train.json`` and ``mathvision_test.json`` under
    ``FOLDER_PATH``.

    Args:
        split: Name of the split of "MathLLMs/MathVision" to draw records from.

    Returns:
        A ``(train_json, test_json)`` tuple holding the converted entries.
    """
    dataset = load_dataset("MathLLMs/MathVision")

    # Materialize the chosen split as a plain list so it can be partitioned.
    records = list(dataset[split])

    # 80% of the records go to train, the remaining 20% to test.
    train_data, test_data = train_test_split(records, test_size=0.2, random_state=42)

    train_json = process_split(train_data)
    test_json = process_split(test_data)

    converted = (("train", train_json), ("test", test_json))

    # Write every part first, then report the counts.
    for part_name, entries in converted:
        with open(f"{FOLDER_PATH}/mathvision_{part_name}.json", 'w') as f:
            json.dump(entries, f, indent=2)

    for part_name, entries in converted:
        print(f"Converted {len(entries)} entries for {part_name} split")

    return train_json, test_json

def convert_mathvision_to_json(split):
    """Convert one MathVision split to conversation JSON and save it.

    Loads "MathLLMs/MathVision", converts every record of ``split`` with
    ``process_split`` and writes the result to
    ``mathvision_{split}.json`` under ``FOLDER_PATH``.

    Args:
        split: Name of the dataset split to convert (e.g. ``'testmini'``).

    Returns:
        The list of converted entries that was written to disk.
    """
    # Load the dataset
    ds = load_dataset("MathLLMs/MathVision")
    data_json = process_split(ds[split])

    # Save to file
    with open(f"{FOLDER_PATH}/mathvision_{split}.json", 'w') as f:
        json.dump(data_json, f, indent=2)

    # Report the split that was actually converted; the message used to
    # say "test" no matter which split was requested.
    print(f"Converted {len(data_json)} entries for {split} split")

    return data_json

def process_split(data):
    """Turn an iterable of dataset records into conversation-style entries.

    Each record must provide a ``'question'`` key and a ``get`` method. When
    the record has a non-empty ``'options'`` value, the options are appended
    to the question text as a comma-separated hint. The record's ``'image'``
    value is passed through unchanged, and a missing ``'answer'`` key is
    replaced by ``'No answer available.'``.

    Args:
        data: Iterable of mapping-like records; progress is shown via ``tqdm``.

    Returns:
        A list with one dict per record, holding the keys ``system_prompt``,
        ``image`` and ``conversations`` (a human turn followed by a gpt turn).
    """
    instruction = "Solve the following math problem step by step, given the image attached. Write the final answer after <Answer>"

    def build_entry(record):
        # Build the human prompt, appending the option list when there is one.
        prompt = record['question']
        options = record.get('options')
        if options:
            prompt += f" The answer is one of the options {', '.join(options)}"

        human_turn = {"from": "human", "value": f"<image>{prompt}"}
        gpt_turn = {
            "from": "gpt",
            "value": record.get('answer', 'No answer available.'),
        }
        return {
            "system_prompt": instruction,
            "image": record.get("image"),
            "conversations": [human_turn, gpt_turn],
        }

    return [build_entry(record) for record in tqdm(data)]

# Re-split the dataset's 'test' split 80/20 and write the two parts to
# mathvision_train.json and mathvision_test.json.
train_json, test_json = split_convert_mathvision_to_json('test')

# Convert the 'testmini' split as-is into mathvision_testmini.json.
testmini_json = convert_mathvision_to_json('testmini')
