Basic configuration (assuming there is a `data` folder that contains the training and test image folders and the rest of the CSV files).

In [1]:
import os
import shutil

import numpy as np
import pandas as pd
from datasets import load_from_disk, load_dataset, DatasetDict, Features, Image as ImageFeature, Value, ClassLabel
from PIL import Image

# Root folder that holds the image folders and the CSV files.
BASE_DIR = "data"

# Training inputs: the image folder and the CSV that sits next to it.
IMAGE_DIR = os.path.join(BASE_DIR, "train_images")
CSV_FILE = os.path.join(BASE_DIR, "train_images.csv")

# Default output folder name (the processing cell reassigns OUTPUT_PATH before saving).
OUTPUT_PATH = "processed_bird_data"

Load dataset:

In [3]:
# Only read the CSV once the image folder is known to be present;
# otherwise stop immediately with a readable error.
if os.path.exists(IMAGE_DIR):
    dataset = load_dataset(
        "csv",
        data_files=CSV_FILE,
        split="train",
    )
else:
    raise FileNotFoundError(f"Could not find folder: {IMAGE_DIR}. Make sure you unzipped the images there!")

**Handling path correction, grayscale conversion, and label fixing:**

For path correction, we standardize the paths from the CSV to a full local path, e.g. "data/train_images/1.jpg".
Then, we convert the images to RGB. CNNs usually expect 3 channels, and some images might be grayscale (1 channel). Fixing this now prevents crashes during training.
Finally, we shift the labels, which start at 1 in the CSV, so that they start at 0 (as PyTorch/TensorFlow expect).

In [4]:
# Progress tracker: number of rows handed to process_data so far.
counter = 0

def process_data(example):
    """Load one training image as RGB and shift its label to start at 0.

    Args:
        example: One dataset row; its "image_path" and "label" fields are read.

    Returns:
        dict with three keys:
            "image": the image converted to RGB, or a black 224x224
                placeholder when the file could not be opened/decoded;
            "label": ``example["label"] - 1``;
            "valid": ``False`` for placeholder rows so they can be filtered
                out afterwards, ``True`` otherwise.
    """
    global counter

    # Print status every 100 images:
    if counter % 100 == 0:
        print(f"Processing image #{counter}...", end="\r")
    counter += 1

    # Fix path: keep only the file name from the CSV and anchor it in IMAGE_DIR.
    filename = os.path.basename(example["image_path"])
    full_path = os.path.join(IMAGE_DIR, filename)

    try:
        # Convert to RGB to avoid grayscale errors. The context manager closes
        # the underlying file deterministically; convert() returns a separate
        # image object that stays usable after the source is closed.
        with Image.open(full_path) as source_image:
            img = source_image.convert("RGB")
        valid = True
    except Exception:
        # Deliberately broad: any unreadable image is replaced by a black
        # 224x224 placeholder and flagged so the caller can drop the row.
        img = Image.new("RGB", (224, 224), (0, 0, 0))
        valid = False

    return {
        "image": img,
        "label": example["label"] - 1,
        "valid": valid
    }

print("Starting processing")

# Run with cache disabled to force a fresh run:
dataset = dataset.map(process_data, remove_columns=["image_path"], load_from_cache_file=False)
rows_before_filter = len(dataset)

# Filter out the dummy/broken images. Only the "valid" column is passed to the
# predicate, so the image column does not have to be materialised for the check.
dataset = dataset.filter(lambda valid: valid, input_columns=["valid"])
dataset = dataset.remove_columns(["valid"])

# Make data loss visible instead of dropping unreadable images silently.
rows_dropped = rows_before_filter - len(dataset)
if rows_dropped:
    print(f"\nWarning: dropped {rows_dropped} unreadable image(s).")

print(f"\nDone! Processed {len(dataset)} images.")

# The full (unsplit) dataset goes to its own folder; the split cell reads it back from here.
OUTPUT_PATH = "processed_bird_data_FULL"
print(f"Saving to {OUTPUT_PATH}...")

dataset.save_to_disk(OUTPUT_PATH)

print(f"Done! Saved to {OUTPUT_PATH}")

Starting processing


Map: 100%|##########| 3926/3926 [00:00<?, ? examples/s]

Processing image #3900...
Done! Processed 3926 images.
Saving to processed_bird_data_FULL...


Saving the dataset (0/3 shards):   0%|          | 0/3926 [00:00<?, ? examples/s]

Done! Saved to processed_bird_data_FULL


**Doing split:**

Run this cell to load the previously saved full dataset from disk and split it:

In [4]:
# Just for safety, also easier to re-do it after a closed kernel
FULL_DATASET_PATH = "processed_bird_data_FULL"
FINAL_OUTPUT_PATH = "processed_bird_data"
CLASS_NAMES_PATH = os.path.join("data", "class_names.npy")
VALIDATION_FRACTION = 0.15  # 85% train, 15% validation
SPLIT_SEED = 42  # fixed seed so the split is the same on every run


def resolve_class_names(raw_names):
    """Return the class names as a list ordered by class id.

    Args:
        raw_names: Content of the class-names file after ``.tolist()``.
            Non-dict values are returned unchanged. A dict may map
            id -> name or name -> id.

    Returns:
        For an id -> name dict: the values (as ``str``) ordered by sorted key.
        For a name -> id dict (every key is a ``str`` and no value is):
        the keys ordered by their id, so the real names are kept instead of
        the stringified ids.
    """
    if not isinstance(raw_names, dict):
        return raw_names

    # NOTE(review): the orientation of the dict stored in class_names.npy is
    # not visible here, so both are supported - confirm against the file.
    keys_are_names = bool(raw_names) and all(
        isinstance(key, str) and not isinstance(value, str)
        for key, value in raw_names.items()
    )
    if keys_are_names:
        ordered_items = sorted(raw_names.items(), key=lambda item: item[1])
        return [name for name, _ in ordered_items]

    return [str(raw_names[key]) for key in sorted(raw_names)]


if os.path.exists(FULL_DATASET_PATH):
    print(f"Found saved data at '{FULL_DATASET_PATH}'. Loading.")

    # Only the load is guarded: a failure while splitting, casting or saving
    # must not be swallowed and reported as a loading error.
    try:
        full_dataset = load_from_disk(FULL_DATASET_PATH)
    except Exception as e:
        print(f"Error loading dataset: {e}")
    else:
        print(f"Loaded {len(full_dataset)} images from disk.")

        print("Resuming splitting process.")

        splits = full_dataset.train_test_split(test_size=VALIDATION_FRACTION, seed=SPLIT_SEED)

        # train_test_split names its second part "test"; it is used as validation here.
        final_dataset = DatasetDict({
            "train": splits["train"],
            "validation": splits["test"]
        })

        if os.path.exists(CLASS_NAMES_PATH):
            print("Attaching class names metadata.")
            # SECURITY: allow_pickle=True unpickles the file content, which can
            # run arbitrary code - only load a class_names.npy you trust.
            class_names = resolve_class_names(
                np.load(CLASS_NAMES_PATH, allow_pickle=True).tolist()
            )

            # Re-type the integer "label" column as ClassLabel so the names travel with the data.
            new_features = final_dataset["train"].features.copy()
            new_features["label"] = ClassLabel(names=class_names)
            final_dataset = final_dataset.cast(new_features)
        else:
            print("Warning: class_names.npy not found. Skipping metadata attachment.")

        print(f"Saving final split dataset to '{FINAL_OUTPUT_PATH}'.")
        final_dataset.save_to_disk(FINAL_OUTPUT_PATH)

        print("\nPipeline complete!")
        print(f"Train samples:      {len(final_dataset['train'])}")
        print(f"Validation samples: {len(final_dataset['validation'])}")
else:
    print(f"Could not find '{FULL_DATASET_PATH}'. You must run the image processing cell first.")

Found saved data at 'processed_bird_data_FULL'. Loading.
Loaded 3926 images from disk.
Resuming splitting process.
Attaching class names metadata.


Casting the dataset:   0%|          | 0/3337 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/589 [00:00<?, ? examples/s]

Saving final split dataset to 'processed_bird_data'.


Saving the dataset (0/2 shards):   0%|          | 0/3337 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/589 [00:00<?, ? examples/s]


Pipeline complete!
Train samples:      3337
Validation samples: 589


The cell above performs the split (85% train, 15% validation) and saves the result.

Now we do the same for the test data. First, the configuration:

In [2]:
# Test-set locations, resolved against the same BASE_DIR as the training files.
TEST_IMAGE_DIR = os.path.join(BASE_DIR, "test_images")
TEST_CSV = os.path.join(BASE_DIR, "test_images_path.csv")
TEST_OUTPUT_PATH = "processed_bird_test_data"

# Read the test CSV only when the image folder is present; otherwise stop with a clear error.
if os.path.exists(TEST_IMAGE_DIR):
    print(f"Loading test data from {TEST_CSV}.")
    test_dataset = load_dataset(
        "csv",
        data_files=TEST_CSV,
        split="train",
    )
else:
    raise FileNotFoundError(f"Could not find folder: {TEST_IMAGE_DIR}. Check your unzip!")

Loading test data from data/test_images_path.csv.


Good luck to my computer

In [3]:
def process_test_data(example):
    """Load one test image as RGB and shift its label to start at 0.

    Args:
        example: One dataset row; its "image_path", "id" and "label" fields are read.

    Returns:
        dict with four keys:
            "image": the image converted to RGB, or a black 224x224
                placeholder when the file could not be opened/decoded;
            "id": ``example["id"]`` passed through unchanged;
            "label": ``example["label"] - 1``;
            "valid": ``False`` for placeholder rows so they can be filtered
                out afterwards, ``True`` otherwise.
    """
    # Keep only the file name from the CSV and anchor it in TEST_IMAGE_DIR.
    filename = os.path.basename(example["image_path"])
    full_path = os.path.join(TEST_IMAGE_DIR, filename)

    try:
        # The context manager closes the underlying file deterministically;
        # convert() returns a separate image object that stays usable afterwards.
        with Image.open(full_path) as source_image:
            img = source_image.convert("RGB")
        valid = True
    except Exception:
        # Deliberately broad: any unreadable image becomes a flagged black placeholder.
        img = Image.new("RGB", (224, 224), (0, 0, 0))
        valid = False

    return {
        "image": img,
        "id": example["id"],
        "label": example["label"] - 1,
        "valid": valid
    }

print("Starting now.")

# The progress label lists only what process_test_data does: RGB conversion
# and the label shift (no resizing happens).
test_dataset = test_dataset.map(
    process_test_data,
    remove_columns=["image_path"],
    load_from_cache_file=False,
    desc="Processing (rgb, fix labels)"
)
test_rows_before_filter = len(test_dataset)

# Drop placeholder rows. Only the "valid" column is passed to the predicate,
# so the image column does not have to be materialised for the check.
test_dataset = test_dataset.filter(lambda valid: valid, input_columns=["valid"])
test_dataset = test_dataset.remove_columns(["valid"])

# Make data loss visible: a dropped test image means a missing id downstream.
test_rows_dropped = test_rows_before_filter - len(test_dataset)
if test_rows_dropped:
    print(f"\nWarning: dropped {test_rows_dropped} unreadable test image(s).")

print("\nDone!")

test_dataset.save_to_disk(TEST_OUTPUT_PATH)

print(f"Saved in: {TEST_OUTPUT_PATH}")

Starting now.


Processing (resize, rgb, fix labels):   0%|          | 0/4000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4000 [00:00<?, ? examples/s]


Done!


Saving the dataset (0/3 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved in: processed_bird_test_data
