# Load File

In [None]:
import pandas as pd
from PIL import Image
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def load_image(image_name, image_folder):
    """Open one image from ``image_folder`` and return an in-memory copy.

    Returns the copied image on success. If opening or copying raises, the
    exception is swallowed and ``(None, <error message>)`` is returned
    instead, so callers distinguish failure by checking for a tuple.
    """
    # The path is built outside the try block: a bad path component raises to
    # the caller rather than being converted into an error tuple.
    full_path = os.path.join(image_folder, image_name)
    try:
        with Image.open(full_path) as opened:
            # Copy so the returned object does not depend on the handle that
            # the context manager closes on exit.
            detached = opened.copy()
    except Exception as exc:
        return (None, str(exc))
    return detached

def create_dataset(csv_file, image_folder, batch_size=1000, max_workers=4,
                   error_log_path='error_log.txt'):
    """Load every image listed in a CSV file into a list of record dicts.

    The CSV is read with pandas and must provide the columns ``name`` (image
    file name, relative to ``image_folder``), ``id`` and ``caption``. Images
    are loaded through ``load_image`` on a thread pool, one batch of rows at
    a time, with a progress bar per batch.

    Args:
        csv_file: Path of the CSV file describing the images.
        image_folder: Directory that contains the image files.
        batch_size: Number of CSV rows submitted to the pool per batch.
        max_workers: Number of worker threads used to load images.
        error_log_path: File that receives one ``name: message`` line per
            image that could not be loaded. It is overwritten on every call,
            so pass a distinct path per split to keep all logs.

    Returns:
        A list of ``{"image", "image_id", "caption"}`` dicts for the images
        that loaded successfully, in the same order as the CSV rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the CSV lacks a ``name``, ``id`` or ``caption`` column.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    df = pd.read_csv(csv_file)
    total_rows = len(df)
    dataset = []
    error_log = []

    # A single pool serves all batches instead of being rebuilt per batch.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for start_idx in range(0, total_rows, batch_size):
            batch_df = df.iloc[start_idx:start_idx + batch_size]
            rows = [row for _, row in batch_df.iterrows()]

            # Map each future to the position of its row inside the batch so
            # results can be put back in CSV order after completion.
            future_to_pos = {
                executor.submit(load_image, row['name'], image_folder): pos
                for pos, row in enumerate(rows)
            }

            outcomes = [None] * len(rows)
            for future in tqdm(as_completed(future_to_pos), total=len(future_to_pos), desc=f"Processing batch {start_idx // batch_size + 1}"):
                pos = future_to_pos[future]
                try:
                    outcomes[pos] = future.result()
                except Exception as e:
                    # Same shape load_image uses to report a failure.
                    outcomes[pos] = (None, str(e))

            # as_completed yields in completion order; walking the batch by
            # position keeps the dataset and the error log deterministic.
            for row, outcome in zip(rows, outcomes):
                if isinstance(outcome, tuple):  # load_image reported an error
                    error_log.append((row['name'], outcome[1]))
                else:
                    dataset.append({
                        "image": outcome,
                        "image_id": row['id'],
                        "caption": row['caption']
                    })

    # Save error log to a file
    with open(error_log_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{name}: {message}\n" for name, message in error_log)

    return dataset

In [None]:
# Build the in-memory record list for each split (train / test / validation).
# NOTE(review): all three splits read a file named 'radiologytraindata.csv';
# confirm the test and validation folders really use that file name.
# NOTE: create_dataset writes 'error_log.txt' in 'w' mode, so each call below
# replaces the log produced by the previous one.

# Training split
csv_file, image_folder = 'train/radiologytraindata.csv', 'train/radiology/images'
train_dataset = create_dataset(csv_file, image_folder)

# Test split
csv_file, image_folder = 'test/radiologytraindata.csv', 'test/radiology/images'
test_dataset = create_dataset(csv_file, image_folder)

# Validation split
csv_file, image_folder = 'validation/radiologytraindata.csv', 'validation/radiology/images'
val_dataset = create_dataset(csv_file, image_folder)

# Convert to DatasetDict

In [None]:
from datasets import DatasetDict, Dataset
def convert_to_datasetdict(dataset_list):
    """Turn a list of record dicts into a single ``Dataset``.

    Each record must have the keys ``image``, ``image_id`` and ``caption``.
    The records are pivoted into one list per key and handed to
    ``Dataset.from_dict``. Despite the function name, the return value is a
    ``Dataset``, not a ``DatasetDict``.
    """
    # Columns are built one at a time, in this order, from the list of records.
    column_names = ('image', 'image_id', 'caption')
    columns = {
        column: [record[column] for record in dataset_list]
        for column in column_names
    }
    return Dataset.from_dict(columns)

In [None]:
# Convert the list of training records into a `Dataset`.
train_dict = convert_to_datasetdict(train_dataset)

In [None]:
# Convert the list of test records (`test_dataset`) into a `Dataset`.
test_dict = convert_to_datasetdict(test_dataset)

In [None]:
# Convert the list of validation records into a `Dataset`.
val_dict = convert_to_datasetdict(val_dataset)

In [None]:
# Group the three splits under their split names. Every value must be a
# converted `Dataset`; `train_dataset` is the raw list of records, so the
# converted `train_dict` is used for the "train" split.
dataset = DatasetDict({
    "train": train_dict,
    "validation": val_dict,
    "test": test_dict,
})

In [None]:
# Bare expression: shows the DatasetDict as the notebook cell output.
dataset

# Upload to Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook before pushing the dataset.
notebook_login()

In [None]:
# Upload all splits of `dataset` to the Hub repository named below.
dataset.push_to_hub(
    "mdwiratathya/ROCO-radiology",
    commit_message="first commit",
    commit_description="adding train/validation/test split, with image stored as PIL object",
)