In [None]:
!pip install pandas pillow tqdm datasets -q

Figure_path,Caption,Question,Choice A,Choice B,Choice C,Choice D,Answer

In [12]:
import pandas as pd
from PIL import Image
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def load_image(image_name, image_folder):
    image_path = os.path.join(image_folder, image_name)
    try:
        with Image.open(image_path) as img:
            return img.copy()  # Copy the image object to avoid closing
    except Exception as e:
        return None, str(e)

def create_dataset(csv_file, image_folder, batch_size=1000, max_workers=4):
    # Read the CSV file
    df = pd.read_csv(csv_file)
    total_rows = len(df)
    dataset = []
    error_log = []

    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch_df = df[start_idx:end_idx]

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_row = {executor.submit(load_image, row['Figure_path'], image_folder): row for _, row in batch_df.iterrows()}
            
            for future in tqdm(as_completed(future_to_row), total=len(future_to_row), desc=f"Processing batch {start_idx // batch_size + 1}"):
                row = future_to_row[future]
                try:
                    image = future.result()
                    if isinstance(image, tuple):  # Checking if an error occurred
                        error_log.append((row['Figure_path'], image[1]))
                    else:
                        dataset.append({
                            "image": image,
                            "caption": row['Caption'],
                            "question": row['Question'],
                            "choice a": row['Choice A'],
                            "choice b": row['Choice B'],
                            "choice c": row['Choice C'],
                            "choice d": row['Choice D'],
                            "answer": row['Answer'] 
                        })
                except Exception as e:
                    error_log.append((row['Figure_path'], str(e)))

    # Save error log to a file
    with open('error_log.txt', 'w') as f:
        for error in error_log:
            f.write(f"{error[0]}: {error[1]}\n")

    return dataset

In [13]:
# Example usage
csv_file = 'test_2.csv'
image_folder = 'images_2'
test_dataset = create_dataset(csv_file, image_folder)

Processing batch 1: 100%|██████████| 1000/1000 [00:00<00:00, 1091.38it/s]
Processing batch 2: 100%|██████████| 1000/1000 [00:00<00:00, 1341.53it/s]
Processing batch 3: 100%|██████████| 1000/1000 [00:00<00:00, 1151.59it/s]
Processing batch 4: 100%|██████████| 1000/1000 [00:00<00:00, 1254.08it/s]
Processing batch 5: 100%|██████████| 1000/1000 [00:00<00:00, 1029.32it/s]
Processing batch 6: 100%|██████████| 1000/1000 [00:01<00:00, 964.77it/s] 
Processing batch 7: 100%|██████████| 1000/1000 [00:00<00:00, 1191.94it/s]
Processing batch 8: 100%|██████████| 1000/1000 [00:00<00:00, 1058.89it/s]
Processing batch 9: 100%|██████████| 1000/1000 [00:00<00:00, 1100.48it/s]
Processing batch 10: 100%|██████████| 1000/1000 [00:01<00:00, 929.41it/s]
Processing batch 11: 100%|██████████| 1000/1000 [00:01<00:00, 973.58it/s]
Processing batch 12: 100%|██████████| 1000/1000 [00:01<00:00, 931.44it/s]
Processing batch 13: 100%|██████████| 1000/1000 [00:01<00:00, 844.90it/s]
Processing batch 14: 100%|██████████| 

In [1]:
import pandas as pd
from PIL import Image
import os
from tqdm import tqdm
from datasets import Dataset, DatasetDict, load_from_disk

def load_image(image_name, image_folder):
    image_path = os.path.join(image_folder, image_name)
    try:
        with Image.open(image_path) as img:
            return img.copy(), None  # Return image and no error
    except Exception as e:
        return None, str(e)  # Return None and the error message

def process_batch(batch_df, image_folder):
    batch_data = []
    error_log = []

    for _, row in batch_df.iterrows():
        image_name = row['Figure_path']
        caption = row['Caption']
        question = row['Question']
        answer = row['Answer']
        choice_a = row['Choice A']
        choice_b = row['Choice B']
        choice_c = row['Choice C']
        choice_d = row['Choice D']

        image, error = load_image(image_name, image_folder)
        if error:
            error_log.append((image_name, error))
        else:
            batch_data.append({
                "image": image,
                "caption": caption,
                "question": question,
                "answer": answer,
                "choice a": choice_a,
                "choice b": choice_b,
                "choice c": choice_c,
                "choice d":choice_d
            })

    return batch_data, error_log

def create_and_save_batches(csv_file, image_folder, batch_size=1000, output_dir='output_batches'):
    # Read the CSV file
    df = pd.read_csv(csv_file)
    total_rows = len(df)
    os.makedirs(output_dir, exist_ok=True)

    all_error_logs = []

    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch_df = df[start_idx:end_idx]
        
        batch_data, error_log = process_batch(batch_df, image_folder)
        all_error_logs.extend(error_log)

        # Convert batch_data to Dataset and save
        if batch_data:
            data_dict = {
                'image': [data['image'] for data in batch_data],
                'caption': [data['caption'] for data in batch_data],
                'question': [data['question'] for data in batch_data],
                'answer': [data['answer'] for data in batch_data],
                'choice a': [data['choice a'] for data in batch_data],
                'choice b': [data['choice b'] for data in batch_data],
                'choice c': [data['choice c'] for data in batch_data],
                'choice d': [data['choice d'] for data in batch_data]

            }
            batch_dataset = Dataset.from_dict(data_dict)
            batch_dataset.save_to_disk(os.path.join(output_dir, f'batch_{start_idx // batch_size}'))

    # Save error log to a file
    with open(os.path.join(output_dir, 'error_log.txt'), 'w') as f:
        for error in all_error_logs:
            f.write(f"{error[0]}: {error[1]}\n")

def load_combined_dataset(output_dir='output_batches'):
    batches = [os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith('batch_')]
    datasets = [load_from_disk(batch) for batch in batches]
    return datasets

In [2]:
# Example usage
csv_file = 'train_2.csv'
image_folder = 'images_2'
output_dir = 'C:/Users/hp pav/Downloads/PMC-VQA2/batch'
create_and_save_batches(csv_file, image_folder, batch_size=1000, output_dir=output_dir)

Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 2233.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 5208.85 examples/s]


KeyboardInterrupt: 

In [2]:
from datasets import Dataset, DatasetDict, load_from_disk, concatenate_datasets

def load_combined_dataset(output_dir='output_batches'):
    batches = [os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith('batch_')]
    datasets = [load_from_disk(batch) for batch in batches]
    combined_dataset = concatenate_datasets(datasets)
    return DatasetDict({'train': combined_dataset})

In [3]:
# # Load combined dataset
output_dir = 'C:/Users/hp pav/Downloads/PMC-VQA2/batch'
train_dataset = load_combined_dataset(output_dir)
# print(combined_dataset)

In [4]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'caption', 'question', 'answer', 'choice a', 'choice b', 'choice c', 'choice d'],
        num_rows: 152603
    })
})

In [5]:
train_dict = train_dataset['train']

In [14]:
train_dict

Dataset({
    features: ['image', 'caption', 'question', 'answer', 'choice a', 'choice b', 'choice c', 'choice d'],
    num_rows: 152603
})

In [8]:
train_dict

Dataset({
    features: ['image', 'caption', 'question', 'answer', 'choice a', 'choice b', 'choice c', 'choice d'],
    num_rows: 152603
})

In [3]:
# Example usage
csv_file = 'test_2.csv'
image_folder = 'images_2'
output_dir = 'C:/Users/hp pav/Downloads/PMC-VQA2/test_batch'
create_and_save_batches(csv_file, image_folder, batch_size=1000, output_dir=output_dir)

Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 3648.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 4078.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 3685.05 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 3991.17 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 3695.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 4852.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 4628.02 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 4468.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 5909.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 4437.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 

In [7]:
# # Load combined dataset
output_dir = 'C:/Users/hp pav/Downloads/PMC-VQA2/test_batch'
test_dataset = load_combined_dataset(output_dir)
# print(combined_dataset)

In [13]:
test_dict = test_dataset['train']
test_dict

Dataset({
    features: ['image', 'caption', 'question', 'answer', 'choice a', 'choice b', 'choice c', 'choice d'],
    num_rows: 33430
})

In [17]:
type(test_dict)

datasets.arrow_dataset.Dataset

In [2]:
# Example usage
csv_file = 'train_2.csv'
image_folder = 'images_2'
train_dataset = create_dataset(csv_file, image_folder)

In [9]:
from datasets import DatasetDict, Dataset
def convert_to_datasetdict(dataset_list):
    # Convert list of dictionaries to a format suitable for Dataset
    images = [data['image'] for data in dataset_list]
    captions = [data['caption'] for data in dataset_list]
    questions = [data['question'] for data in dataset_list]
    answers = [data['answer'] for data in dataset_list]
    choice_a = [data['choice a'] for data in dataset_list]
    choice_b = [data['choice b'] for data in dataset_list]
    choice_c = [data['choice c'] for data in dataset_list]
    choice_d = [data['choice d'] for data in dataset_list]

    # Create a dictionary suitable for Dataset.from_dict
    data_dict = {
        'image': images,
        'caption': captions,
        'question': questions,
        'answer': answers,
        'choice a': choice_a,
        'choice b': choice_b,
        'choice c': choice_c,
        'choice d': choice_d
    }

    # Create a Dataset from the dictionary
    dataset = Dataset.from_dict(data_dict)

    return dataset

In [5]:
from datasets import DatasetDict, Dataset
def convert_to_datasetdict(dataset_list):
    # Initialize lists to store data
    images, captions, questions, answers = [], [], [], []
    choice_a, choice_b, choice_c, choice_d = [], [], [], []

    # Iterate over dataset_list with progress bar
    for data in tqdm(dataset_list, desc="Converting to DatasetDict"):
        images.append(data['image'])
        captions.append(data['caption'])
        questions.append(data['question'])
        answers.append(data['answer'])
        choice_a.append(data['choice a'])
        choice_b.append(data['choice b'])
        choice_c.append(data['choice c'])
        choice_d.append(data['choice d'])

    # Create a dictionary suitable for Dataset.from_dict
    data_dict = {
        'image': images,
        'caption': captions,
        'question': questions,
        'answer': answers,
        'choice a': choice_a,
        'choice b': choice_b,
        'choice c': choice_c,
        'choice d': choice_d
    }

    # Create a Dataset from the dictionary
    dataset = Dataset.from_dict(data_dict)

    return dataset

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from datasets import Dataset
from tqdm import tqdm

def convert_to_datasetdict(dataset_list):
    # Initialize lists to store data
    images, captions, questions, answers = [], [], [], []
    choice_a, choice_b, choice_c, choice_d = [], [], [], []

    # Iterate over dataset_list with progress bar
    for data in tqdm(dataset_list, desc="Converting to DatasetDict", total=len(dataset_list)):
        images.append(data['image'])
        captions.append(data['caption'])
        questions.append(data['question'])
        answers.append(data['answer'])
        choice_a.append(data['choice a'])
        choice_b.append(data['choice b'])
        choice_c.append(data['choice c'])
        choice_d.append(data['choice d'])

    # Create a dictionary suitable for Dataset.from_dict
    data_dict = {
        'image': images,
        'caption': captions,
        'question': questions,
        'answer': answers,
        'choice a': choice_a,
        'choice b': choice_b,
        'choice c': choice_c,
        'choice d': choice_d
    }

    # Create a Dataset from the dictionary
    dataset = Dataset.from_dict(data_dict)

    return dataset


In [10]:
test_dict = convert_to_datasetdict(test_dataset)

Converting to DatasetDict:   0%|          | 0/1 [00:00<?, ?it/s]




TypeError: string indices must be integers, not 'str'

In [None]:
train_dict = convert_to_datasetdict(train_dataset)

In [24]:
dataset = DatasetDict({
    "train": newtrain_dict,
    "test": newtest_dict,
})

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'caption', 'question', 'answer', 'choice a', 'choice b', 'choice c', 'choice d', 'answer label'],
        num_rows: 152603
    })
    test: Dataset({
        features: ['image', 'caption', 'question', 'answer', 'choice a', 'choice b', 'choice c', 'choice d', 'answer label'],
        num_rows: 33430
    })
})

In [39]:
dataset['train'][0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=341x247>,
 'caption': 'A slightly altered cell . (c-c‴) A highly altered cell as seen from 4 different angles . Note mitochondria/mitochondrial networks (green), Golgi complexes (red), cell nuclei (light blue) and the cell outline (yellow).',
 'question': ' What color is used to label the Golgi complexes in the image?',
 'answer': 'Red',
 'choice a': ' A: Green ',
 'choice b': ' B: Red ',
 'choice c': ' C: Light blue ',
 'choice d': ' D: Yellow',
 'answer label': 'B'}

In [15]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
dataset.push_to_hub("mdwiratathya/PMC-VQA_small", commit_message="minor editing", commit_description="adding answer label and change then answer to not only contains alphabet")

Uploading the dataset shards:   0%|          | 0/26 [00:00<?, ?it/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Map:   0%|          | 0/5869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/7 [00:00<?, ?it/s]

Map:   0%|          | 0/4776 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Map:   0%|          | 0/4776 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Map:   0%|          | 0/4776 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Map:   0%|          | 0/4776 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Map:   0%|          | 0/4776 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Map:   0%|          | 0/4775 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Map:   0%|          | 0/4775 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/658 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/mdwiratathya/PMC-VQA_small/commit/e41ce0b311397b8c1bfe85c80048d9e0c0adcdfe', commit_message='minor editing', commit_description='adding answer label and change then answer to not only contains alphabet', oid='e41ce0b311397b8c1bfe85c80048d9e0c0adcdfe', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
from datasets import Dataset
from tqdm import tqdm

def transform_row(row):
    # Move the answer to answer_label
    row['answer label'] = row['answer']
    
    # Determine the full answer based on the answer label and update the answer field without the alphabet prefix
    if row['answer'] == 'A':
        row['answer'] = row['choice a'][3:].strip()  # Remove 'A: ' prefix
    elif row['answer'] == 'B':
        row['answer'] = row['choice b'][3:].strip()  # Remove 'B: ' prefix
    elif row['answer'] == 'C':
        row['answer'] = row['choice c'][3:].strip()  # Remove 'C: ' prefix
    elif row['answer'] == 'D':
        row['answer'] = row['choice d'][3:].strip()  # Remove 'D: ' prefix
    
    return row

def convert_to_datasetdict(dataset):
    # Apply the transformation to the dataset
    transformed_dataset = dataset.map(transform_row, desc="Converting to DatasetDict")
    return transformed_dataset

In [16]:
test_dict[0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=346x292>,
 'caption': 'CT pulmonary angiogram reveals encasement and displacement of the left anterior descending coronary artery ( blue arrows ).',
 'question': ' What is the name of the artery encased and displaced in the image? ',
 'answer': 'B',
 'choice a': ' A: Right Coronary Artery ',
 'choice b': ' B: Left Anterior Descending Coronary Artery ',
 'choice c': ' C: Circumflex Coronary Artery ',
 'choice d': ' D: Superior Mesenteric Artery '}

In [17]:
newtest_dict = convert_to_datasetdict(test_dict)

In [18]:
newtest_dict

Dataset({
    features: ['image', 'caption', 'question', 'answer', 'choice a', 'choice b', 'choice c', 'choice d', 'answer label'],
    num_rows: 33430
})

In [21]:
newtest_dict[5]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=646x390>,
 'caption': 'showing enlarged parotid glands with multi-loculated cystic formations, the septa are of intermediate signal enhancing after gadulinium injection.',
 'question': ' What type of imaging was used to visualize the parotid glands? ',
 'answer': 'MRI scan',
 'choice a': ' A: CT scan ',
 'choice b': ' B: MRI scan ',
 'choice c': ' C: X-ray ',
 'choice d': ' D: Ultrasound scan  ',
 'answer label': 'B'}

In [22]:
newtrain_dict = convert_to_datasetdict(train_dict)

In [23]:
newtrain_dict[5]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=360x217>,
 'caption': 'MR of brain (diffusion weighted image) demonstrating multiterritory acute infarct ; b Time of flight MR demonstrating bilateral ICA occlusion ; and c DSA demonstrating anterior circulation supplied by posterior circulation.',
 'question': ' What does the DSA image demonstrate?',
 'answer': 'Anterior circulation supplied by posterior circulation',
 'choice a': ' A: Acute infarct ',
 'choice b': ' B: Bilateral ICA occlusion ',
 'choice c': ' C: Anterior circulation supplied by posterior circulation ',
 'choice d': ' D: None of the above ',
 'answer label': 'C'}