In [1]:
import os
import re
from datasets import load_dataset
from PIL import Image

# Create a directory to save images
os.makedirs('saved_images', exist_ok=True)

# Load and preprocess the dataset
ds = load_dataset('HuggingFaceM4/VQAv2', split="train[:10%]")
cols_keep = ["question_type", "answers", "answer_type", "image", "image_id", "question_id", "question"]
ds = ds.remove_columns([col for col in ds.column_names if col not in cols_keep])
split_ds = ds.train_test_split(test_size=0.05)
train_ds = split_ds["test"]

# Helper function to sanitize filenames
def sanitize_filename(text):
    text = re.sub(r'\W+', '_', text)
    return text[:50]  # Truncate to 50 characters for safety

# Save 10 images from the dataset with filenames based on metadata
for i in range(10):
    example = train_ds[i]
    image = example["image"]
    question_type = sanitize_filename(example["question_type"])
    question = sanitize_filename(example["question"])
    # Combine unique answers, truncating and sanitizing the string
    answers = "_".join(sorted(set([sanitize_filename(ans['answer']) for ans in example["answers"]])))
    answers = answers[:50]  # Truncate to 50 characters for safety
    answer_type = sanitize_filename(example["answer_type"])
    
    filename = f"{question_type}_{question}_{answers}_{answer_type}.jpg"
    filepath = os.path.join('saved_images', filename)
    
    image.save(filepath)

print("10 images saved successfully.")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


10 images saved successfully.
