Basic configuration. This assumes there is a `data` folder that contains the training and test image folders, along with the CSV files.

In [1]:
import os
import shutil
import pandas as pd
from PIL import Image
from datasets import load_from_disk, load_dataset, DatasetDict, Features, Image as ImageFeature, Value

# Root folder that holds the CSV files and the unzipped image folders.
BASE_DIR = "data"
# CSV describing the training images; the processing cell reads its
# "image_path" and "label" columns.
CSV_FILE = os.path.join(BASE_DIR, "train_images.csv")
# Folder containing the training image files.
IMAGE_DIR = os.path.join(BASE_DIR, "train_images")
# NOTE: the processing cell rebinds OUTPUT_PATH to "processed_bird_data_FULL"
# before saving, and the split cell writes to this same "processed_bird_data"
# folder through its own FINAL_OUTPUT_PATH constant.
OUTPUT_PATH = "processed_bird_data"

Load dataset:

In [2]:
# Fail fast with a readable message when the training images are not in place.
image_folder_found = os.path.exists(IMAGE_DIR)
if not image_folder_found:
    raise FileNotFoundError(f"Could not find folder: {IMAGE_DIR}. Make sure you unzipped the images there!")

# Read the training CSV through the generic "csv" loader, requesting its "train" split.
dataset = load_dataset("csv", split="train", data_files=CSV_FILE)

Generating train split: 0 examples [00:00, ? examples/s]

**Handling path correction, grayscale conversion, and label fixing:**

For path correction, we standardize the CSV paths to full local paths, e.g. "data/train_images/1.jpg".
Then, we convert the images to RGB. CNNs usually expect 3 channels, and some images might be grayscale, which is 1 channel. Fixing this now prevents crashes during training.
Finally, we shift the labels, which start at 1 in the CSV, so that they start at 0 (as PyTorch/TensorFlow expect).

In [3]:
# Progress tracker
counter = 0

def process_data(example):
    """Turn one training CSV row into an RGB image with a zero-based label.

    Args:
        example: One row of the training CSV; its "image_path" and "label"
            fields are read.

    Returns:
        A dict with:
          - "image": the picture converted to RGB, or a black 224x224
            placeholder when the file could not be opened or decoded.
          - "label": the CSV label minus one (1-based -> 0-based).
          - "valid": False for placeholder rows so they can be filtered out.
    """
    global counter

    # Print status every 100 images:
    if counter % 100 == 0:
        print(f"Processing image #{counter}...", end="\r")
    counter += 1

    # Fix path: keep only the file name from the CSV and look it up in IMAGE_DIR.
    filename = os.path.basename(example["image_path"])
    full_path = os.path.join(IMAGE_DIR, filename)

    try:
        # Image.open is lazy, so use a context manager to make sure the file
        # handle is closed. convert("RGB") returns an independent image, which
        # also normalises grayscale inputs to 3 channels.
        with Image.open(full_path) as source_image:
            img = source_image.convert("RGB")
        valid = True
    except Exception as error:
        # Deliberately best-effort: a broken file must not abort the whole run.
        # Report it, though, so dropped images are never silent. The leading
        # newline keeps the "\r" progress line intact.
        print(f"\nSkipping unreadable image {full_path}: {error!r}")
        # Placeholder with the same type as a real image; removed later via "valid".
        img = Image.new("RGB", (224, 224), (0, 0, 0))
        valid = False

    return {
        "image": img,
        "label": example["label"] - 1,
        "valid": valid
    }

print("Starting processing")

# Remember the row count before filtering so dropped rows can be reported.
rows_before = len(dataset)

# Run with cache disabled to force a fresh run.
# NOTE: this rebinds `dataset` and removes "image_path", so re-running this cell
# requires re-running the CSV loading cell first.
dataset = dataset.map(process_data, remove_columns=["image_path"], load_from_cache_file=False)

# Filter out the dummy/broken images. input_columns hands only the boolean flag
# to the predicate, so the images are not decoded a second time just to be ignored.
dataset = dataset.filter(lambda valid: valid, input_columns=["valid"])
dataset = dataset.remove_columns(["valid"])

print(f"\nDone! Processed {len(dataset)} images.")

rows_dropped = rows_before - len(dataset)
if rows_dropped:
    print(f"Warning: {rows_dropped} unreadable image(s) were dropped.")

# NOTE: rebinds the OUTPUT_PATH constant from the configuration cell; the
# loading cell further down reads this same folder name.
OUTPUT_PATH = "processed_bird_data_FULL"
print(f"Saving to {OUTPUT_PATH}...")

dataset.save_to_disk(OUTPUT_PATH)

print(f"Done! Saved to {OUTPUT_PATH}")

Starting processing


Map:   0%|          | 0/3926 [00:00<?, ? examples/s]

Processing image #3900...

Filter:   0%|          | 0/3926 [00:00<?, ? examples/s]


Done! Processed 3926 images.
Saving to processed_bird_data_FULL...


Saving the dataset (0/3 shards):   0%|          | 0/3926 [00:00<?, ? examples/s]

Done! Saved to processed_bird_data_FULL


Saving:

**Doing split:**

Run this cell to load the previously saved full dataset from disk:

In [4]:
# Reload the unsplit dataset from the folder the processing cell saved to.
# The folder name is spelled out here rather than read from OUTPUT_PATH, so this
# cell only needs the imports cell to have run.
print("Loading the full dataset.")
full_dataset = load_from_disk("processed_bird_data_FULL")

Loading the full dataset.


Perform the split (85% Train, 15% Validation), and save:

In [5]:
# Folder that receives the train/validation DatasetDict.
FINAL_OUTPUT_PATH = "processed_bird_data"

# 85% train / 15% validation; the fixed seed (42) keeps the partition reproducible.
print("Splitting into Train and Validation.")
splits = full_dataset.train_test_split(seed=42, test_size=0.15)

# train_test_split names its held-out part "test"; it is stored as "validation" here.
final_dataset = DatasetDict(train=splits["train"], validation=splits["test"])

print(f"Saving final split dataset to '{FINAL_OUTPUT_PATH}'...")
final_dataset.save_to_disk(FINAL_OUTPUT_PATH)

# Report the size of each part.
print("\nFinal Data Pipeline Summary")
print(f"Train samples:      {len(final_dataset['train'])}")
print(f"Validation samples: {len(final_dataset['validation'])}")

Splitting into Train and Validation.
Saving final split dataset to 'processed_bird_data'...


Saving the dataset (0/2 shards):   0%|          | 0/3337 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/589 [00:00<?, ? examples/s]


Final Data Pipeline Summary
Train samples:      3337
Validation samples: 589


Doing the same thing for the test data. First, the configuration:

In [6]:
# Test-set locations, all relative to BASE_DIR except the output folder.
TEST_OUTPUT_PATH = "processed_bird_test_data"
TEST_IMAGE_DIR = os.path.join(BASE_DIR, "test_images")
TEST_CSV = os.path.join(BASE_DIR, "test_images_path.csv")

# Stop early with a readable message when the test images are not in place.
test_folder_found = os.path.exists(TEST_IMAGE_DIR)
if not test_folder_found:
    raise FileNotFoundError(f"Could not find folder: {TEST_IMAGE_DIR}. Check your unzip!")

print(f"Loading test data from {TEST_CSV}.")
# Read the test CSV through the generic "csv" loader, requesting its "train" split.
test_dataset = load_dataset("csv", split="train", data_files=TEST_CSV)

Loading test data from data\test_images_path.csv.


Generating train split: 0 examples [00:00, ? examples/s]

Good luck to my computer

In [7]:
# Progress tracker (reset for the test run)
counter = 0

def process_test_data(example):
    """Turn one test CSV row into an RGB image paired with its id.

    Args:
        example: One row of the test CSV; its "image_path" and "id" fields
            are read.

    Returns:
        A dict with:
          - "image": the picture converted to RGB, or a black 224x224
            placeholder when the file could not be opened or decoded.
          - "id": the row's "id" value, passed through unchanged.
          - "valid": False for placeholder rows so they can be filtered out.
    """
    global counter

    # Print status every 100 images:
    if counter % 100 == 0:
        print(f"Processing test image #{counter}...", end="\r")
    counter += 1

    # Fix path: keep only the file name from the CSV and look it up in TEST_IMAGE_DIR.
    filename = os.path.basename(example["image_path"])
    full_path = os.path.join(TEST_IMAGE_DIR, filename)

    try:
        # Image.open is lazy, so use a context manager to make sure the file
        # handle is closed. convert("RGB") returns an independent image.
        with Image.open(full_path) as source_image:
            img = source_image.convert("RGB")
        valid = True
    except Exception as error:
        # Safety net for broken images: keep going, but say which file failed
        # and why. The leading newline keeps the "\r" progress line intact.
        print(f"\nSkipping unreadable test image {full_path}: {error!r}")
        img = Image.new("RGB", (224, 224), (0, 0, 0))
        valid = False

    return {
        "image": img,
        "id": example["id"],
        "valid": valid
    }

print("Starting test data processing...")

# Remember the row count before filtering so dropped rows can be reported.
rows_before = len(test_dataset)

# Remove 'image_path' and 'label'; the mapped rows keep "id" and gain "image"/"valid".
# NOTE: this rebinds `test_dataset`, so re-running this cell requires re-running
# the test CSV loading cell first.
test_dataset = test_dataset.map(
    process_test_data,
    remove_columns=["image_path", "label"],
    load_from_cache_file=False
)

# Filter out bad images. input_columns hands only the boolean flag to the
# predicate, so the images are not decoded a second time just to be ignored.
test_dataset = test_dataset.filter(lambda valid: valid, input_columns=["valid"])
test_dataset = test_dataset.remove_columns(["valid"])

print(f"\nDone! Processed {len(test_dataset)} test images.")

rows_dropped = rows_before - len(test_dataset)
if rows_dropped:
    # Dropped rows take their ids with them, so make the loss visible.
    print(f"Warning: {rows_dropped} unreadable test image(s) were dropped; their ids are missing from the saved test set.")

# Save to Disk
print(f"Saving to {TEST_OUTPUT_PATH}...")
test_dataset.save_to_disk(TEST_OUTPUT_PATH)

print(f"All Done! Test data saved to {TEST_OUTPUT_PATH}")

Starting test data processing...


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Processing test image #3900...

Filter:   0%|          | 0/4000 [00:00<?, ? examples/s]


Done! Processed 4000 test images.
Saving to processed_bird_test_data...


Saving the dataset (0/3 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

All Done! Test data saved to processed_bird_test_data
