In [None]:
# @title Notebook & Environment Setup
# NOTE: run this in a Colab / Jupyter notebook environment with GPU selected if you want RAPIDS/cuDF acceleration.
# The cudf.pandas extension is loaded in this first cell, ahead of the `import pandas` in the imports cell.
%load_ext cudf.pandas

# Package installs (not strictly necessary each run once environment is prepared)
# NOTE(review): versions are unpinned, so a fresh environment may resolve different releases
# than the ones this notebook was developed with.
!pip install datasets imagehash opencv-python-headless pillow fuzzywuzzy python-Levenshtein
# git-lfs is installed and initialised here; the dataset repositories are fetched with `git clone` in a later cell.
!apt-get install git-lfs -y
!git lfs install

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Git LFS initialized.


In [None]:
# @title Imports (Sorted by Purpose)
# ---------------------------------
# File & Path Management
# ---------------------------------
import os
import shutil
from pathlib import Path

# ---------------------------------
# Image Loading & Processing
# ---------------------------------
from PIL import Image, ImageOps, UnidentifiedImageError
import cv2

# ---------------------------------
# Image Hashing / Similarity
# ---------------------------------
import imagehash
from fuzzywuzzy import fuzz

# ---------------------------------
# Data Handling & Numerical Operations
# ---------------------------------
import numpy as np
import pandas as pd
from collections import defaultdict

# ---------------------------------
# Dataset I/O (HuggingFace)
# ---------------------------------
from datasets import load_dataset

In [None]:
# @title Configuration / Constants
# Minimum variance of the Laplacian for an image to be kept; filter_corrupted_and_blurry
# drops every image whose score falls below this value.
BLUR_THRESHOLD = 60.0
# `hash_size` passed to imagehash.phash when fingerprinting images in deduplicate.
DUPLICATE_HASH_SIZE = 16
# Two same-label images whose perceptual hashes differ by at most this much are treated
# as near-duplicates (the later one is removed).
DUP_HAMMING_DISTANCE = 4
# When True, images are passed through ImageOps.exif_transpose during filtering.
AUTO_ROTATE = True
# When True, a re-oriented image is also saved back to `img.filename` (if the image has one).
OVERWRITE_ORIENTED_IMAGE = True

In [None]:
# @title Output Base Directory
# Root directory that receives the CSV exports and the per-split image folder tree.
# NOTE(review): absolute Colab path; adjust when running outside Colab.
OUTPUT_BASE = Path("/content/cleaned_birds")

In [None]:
# @title Utility Function
def normalize_label(x):
    """Return a canonical label: stringified, trimmed, lower-cased, with
    hyphens and spaces each turned into an underscore."""
    separators_to_underscore = str.maketrans({"-": "_", " ": "_"})
    trimmed_lower = str(x).strip().lower()
    return trimmed_lower.translate(separators_to_underscore)

In [None]:
# @title Load and Prepare Dataset
def load_and_prepare(df, dataset_tag: str):
    """Prepare one raw split for cleaning.

    Steps, in order:
      1. discard the free-text columns (``text``, ``description``) that the
         rest of the pipeline never reads;
      2. drop rows with a missing value in any remaining column;
      3. normalize ``bird_name`` with ``normalize_label``;
      4. move ``image`` and ``bird_name`` to the front, keeping every other
         column after them in its existing order.

    The free-text columns are removed *before* ``dropna`` so that an image is
    not thrown away merely because a caption that is about to be discarded is
    missing.

    Args:
        df: Raw split; must contain ``image`` and ``bird_name`` columns.
        dataset_tag: Label used in the progress messages.

    Returns:
        A new DataFrame; the caller's frame is left untouched.

    Raises:
        KeyError: if ``image`` or ``bird_name`` is absent.
    """
    lbl_col = "bird_name"
    keep_cols = ["image", lbl_col]
    unused_cols = ["text", "description"]

    missing = [c for c in keep_cols if c not in df.columns]
    if missing:
        raise KeyError(f"{dataset_tag}: missing required column(s): {missing}")

    print(f"=== {dataset_tag} rows before cleaning: {len(df)} ===")
    # .copy() guarantees the label assignment below writes to an independent frame.
    df = df.drop(columns=unused_cols, errors="ignore").dropna().copy()
    print(f"\n=== {dataset_tag} | rows after dropna: {len(df)} ===")

    # Normalize labels
    df[lbl_col] = df[lbl_col].apply(normalize_label)

    other_cols = [c for c in df.columns if c not in keep_cols]
    return df[keep_cols + other_cols]

In [None]:
# @title Data Cleaning
# EXIF "Orientation" tag id, and the tag values that ask for a flip/rotation.
_EXIF_ORIENTATION_TAG = 0x0112
_EXIF_ROTATED_VALUES = frozenset(range(2, 9))


def _reorient_in_place(df, idx, img, img_col):
    """Best-effort EXIF auto-rotation of the image stored at ``df.at[idx, img_col]``.

    Orientation is an optional nicety, so any failure leaves the row as it was.
    """
    try:
        # Decide from the EXIF tag rather than by comparing image objects:
        # exif_transpose hands back a new object even when nothing is rotated,
        # so an inequality test would treat every image as re-oriented and
        # needlessly re-encode its source file.
        if img.getexif().get(_EXIF_ORIENTATION_TAG) not in _EXIF_ROTATED_VALUES:
            return
        oriented = ImageOps.exif_transpose(img)
        if oriented is None:
            return
        df.at[idx, img_col] = oriented
        source_path = getattr(img, "filename", None)
        if OVERWRITE_ORIENTED_IMAGE and source_path:
            try:
                oriented.save(source_path)
            except Exception as exc:
                print(f"  Could not overwrite {source_path}: {exc}")
    except Exception:
        return


def filter_corrupted_and_blurry(df: pd.DataFrame, dataset_tag: str):
    """Remove rows whose image is missing, unreadable, or too blurry.

    A row is dropped when its ``image`` value
      * is ``None`` or has no ``convert`` method,
      * fails ``convert("RGB")`` (treated as corrupted),
      * has a Laplacian variance below ``BLUR_THRESHOLD``, or
      * makes the blur computation itself fail.
    When ``AUTO_ROTATE`` is set, images carrying a rotating EXIF orientation
    are replaced in the frame by their upright version (and written back to
    their source file when ``OVERWRITE_ORIENTED_IMAGE`` is set and the image
    has a filename).

    Args:
        df: Frame with an ``image`` column. It is modified in place: rows are
            dropped and the index is always reset to 0..n-1.
        dataset_tag: Label used in the progress messages.

    Returns:
        The same (mutated) DataFrame, for call chaining.
    """
    print(f"\n=== {dataset_tag} | Filtering corrupted & blurry images ===")

    img_col = "image"
    total = len(df)
    indices_to_drop = []
    reason_counts = defaultdict(int)

    # Snapshot the (label, image) pairs first: the loop may write re-oriented
    # images back into `df`. Progress is counted by position so it works for
    # any index (gapped, non-integer, ...).
    for position, (idx, img) in enumerate(list(df[img_col].items())):
        if position % 500 == 0:
            print(f" Processing {position}/{total}...")

        reason = None

        if img is None or not hasattr(img, "convert"):
            reason = "not_an_image"
        else:
            # 1. Integrity check: a full RGB conversion forces the pixel data to decode.
            try:
                img.convert("RGB")
            except Exception:
                reason = "corrupted"

        # 2. Optional auto-orientation (only for images that passed step 1).
        if reason is None and AUTO_ROTATE:
            _reorient_in_place(df, idx, img, img_col)

        # 3. Blur check: variance of the Laplacian on the grayscale image.
        if reason is None:
            try:
                gray = np.array(img.convert("L"), dtype=np.uint8)
                if cv2.Laplacian(gray, cv2.CV_64F).var() < BLUR_THRESHOLD:
                    reason = "blurry"
            except Exception:
                reason = "blur_calc_fail"

        if reason:
            indices_to_drop.append(idx)
            reason_counts[reason] += 1

    # Batch drop for speed; reset unconditionally so the returned index does
    # not depend on whether anything happened to be dropped.
    if indices_to_drop:
        df.drop(index=indices_to_drop, inplace=True)
    df.reset_index(drop=True, inplace=True)

    print(f"{dataset_tag}: Kept {len(df)} images, dropped {len(indices_to_drop)}")
    if reason_counts:
        breakdown = ", ".join(f"{name}={count}" for name, count in sorted(reason_counts.items()))
        print(f"{dataset_tag}: drop reasons: {breakdown}")
    return df

def deduplicate(df: pd.DataFrame, dataset_tag: str):
    """Drop duplicate images, judged by perceptual hash, from ``df`` in place.

    Two passes are made over the hashed images:
      * identical hashes (any label): all but the lowest index are removed;
      * within one ``bird_name``: scanning in index order, an image is removed
        when its hash differs from an earlier surviving image by at most
        ``DUP_HAMMING_DISTANCE``.
    Rows whose image is ``None``, lacks a ``save`` attribute, or cannot be
    hashed are never removed. Returns the same (mutated) frame.
    """
    print(f"\n=== {dataset_tag} | Duplicate filtering) ===")

    hash_by_row = {}
    hex_by_row = {}
    for row_label, picture in df["image"].items():
        if picture is None or not hasattr(picture, "save"):
            continue
        try:
            digest = imagehash.phash(picture.convert("RGB"), hash_size=DUPLICATE_HASH_SIZE)
            hash_by_row[row_label] = digest
            hex_by_row[row_label] = str(digest)
        except Exception:
            continue

    doomed = set()

    # Pass 1: identical hashes -> keep only the smallest index of each group.
    rows_by_hex = defaultdict(list)
    for row_label, hex_digest in hex_by_row.items():
        rows_by_hex[hex_digest].append(row_label)
    for members in rows_by_hex.values():
        if len(members) > 1:
            doomed.update(sorted(members)[1:])

    # Pass 2: near-identical hashes among survivors that share a label.
    survivors_by_label = defaultdict(list)
    for row_label, digest in hash_by_row.items():
        if row_label in doomed:
            continue
        survivors_by_label[df.loc[row_label, "bird_name"]].append((row_label, digest))

    for candidates in survivors_by_label.values():
        candidates.sort(key=lambda pair: pair[0])
        for position, (anchor_label, anchor_hash) in enumerate(candidates):
            if anchor_label in doomed:
                continue
            for other_label, other_hash in candidates[position + 1:]:
                if other_label in doomed:
                    continue
                try:
                    distance = anchor_hash - other_hash
                except Exception:
                    continue
                if distance <= DUP_HAMMING_DISTANCE:
                    doomed.add(other_label)

    if doomed:
        df.drop(index=list(doomed), inplace=True)

    print(f"{dataset_tag}: processed {len(df)} images\n")
    return df

In [None]:
# @title Saving Data
def save_yolov8_structure(df, dataset_name, output_base):
    """Save images as ``output_base/dataset_name/<split>/<class>/<file>``.

    Args:
        df: Frame with ``image`` (PIL-like objects), ``split`` and
            ``bird_name_normalized`` columns.
        dataset_name: Name of the folder created under ``output_base``.
        output_base: Root output directory (``Path`` or path string); created
            together with any missing parents.

    Returns:
        Number of images written. A row that fails to save is reported and
        skipped rather than aborting the export.

    File names come from the image's own ``filename`` when it has one,
    otherwise ``img_<index>.jpg``; a name without an extension gets ``.jpg``.
    If the target already exists, ``_1``, ``_2``, ... is appended to the
    original stem. Note that this also applies to files left by an earlier
    run, so re-running into the same folder adds suffixed copies.
    """
    dataset_dir = Path(output_base) / dataset_name
    dataset_dir.mkdir(parents=True, exist_ok=True)
    total_saved = 0

    print(f"\nSaving images for {dataset_name}...")
    for split_name, split_df in df.groupby('split'):
        print(f"  {split_name}: {len(split_df)} images")
        for bird_name, bird_df in split_df.groupby('bird_name_normalized'):
            target_dir = dataset_dir / str(split_name) / str(bird_name)
            target_dir.mkdir(parents=True, exist_ok=True)

            for idx, row in bird_df.iterrows():
                try:
                    image = row['image']
                    source_name = getattr(image, 'filename', None)
                    base_name = Path(source_name).name if source_name else f"img_{idx}.jpg"

                    stem = Path(base_name).stem
                    suffix = Path(base_name).suffix or ".jpg"

                    # Always derive candidates from the original stem so the
                    # suffix does not pile up (a_1.jpg, a_2.jpg, not a_1_2.jpg).
                    target_path = target_dir / f"{stem}{suffix}"
                    counter = 1
                    while target_path.exists():
                        target_path = target_dir / f"{stem}_{counter}{suffix}"
                        counter += 1

                    img_rgb = image if image.mode == 'RGB' else image.convert('RGB')
                    img_rgb.save(target_path, quality=95)
                    total_saved += 1
                except Exception as e:
                    print(f"  Error at index {idx}: {e}")

    print(f"{dataset_name}: {total_saved} images saved")
    return total_saved

In [None]:
# @title Dataset Load (repositories)
# Clone repos if not present (not duplicated)
# Each repository is cloned only when its target directory is missing, so re-running
# this cell reuses the existing checkout instead of cloning again.
if not os.path.exists("/content/bird-species"):
    !git clone https://huggingface.co/datasets/Ez-Clap/bird-species /content/bird-species/

# ds2: the bird-species checkout, read through the generic "imagefolder" builder.
ds2 = load_dataset("imagefolder", data_dir="/content/bird-species")

if not os.path.exists("/content/birds"):
    !git clone https://huggingface.co/datasets/JotDe/birds /content/birds/

# ds1: the JotDe/birds checkout, loaded directly from its local directory.
ds1 = load_dataset("/content/birds")

Resolving data files:   0%|          | 0/6775 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/215 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/215 [00:00<?, ?it/s]

In [None]:
ds1

DatasetDict({
    train: Dataset({
        features: ['image', 'description', 'label', 'file_name', 'text'],
        num_rows: 5994
    })
    test: Dataset({
        features: ['image', 'description', 'label', 'file_name', 'text'],
        num_rows: 5794
    })
})

In [None]:
ds2

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 6775
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 215
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 215
    })
})

In [None]:
print("Sample dataset entries (train split):")
print(f" Dataset 1 example: {ds1['train'][0]}")
print(f" Dataset 2 example: {ds2['train'][0]}")

Sample dataset entries (train split):
 Dataset 1 example: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x301 at 0x7E17CCC62870>, 'description': 'this bird is brown with a lighter brown crest.\naquatic large bird with long hooked bill, white face, and brown body.\nbird has brown body feathers, brown breast feathers, and brown beak\nthis bird has a white superciliary and brown all around its body with a long bill\nthis is a brown bird with a white face and a long downward pointing beak.\nthis bird is brown with white and has a long, pointy beak.\nbrown duck playing on the lake making a poodle\nthis bird has wings that are brown and has a long bill\nthis bird has long brown bill, with a brown body.\nthis is a medium sized brown bird, with a long pointed bill.\n', 'label': 0, 'file_name': 'Black_Footed_Albatross_0007_796138.jpg', 'text': 'this bird is brown with a lighter brown crest'}
 Dataset 2 example: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB siz

In [None]:
# @title Converting Incorrect Labels
def _bird_name_mapper(class_label):
    """Build a per-example mapper that adds ``bird_name`` from the integer ``label``."""
    def _add_name(example):
        return {"bird_name": class_label.int2str(example["label"])}
    return _add_name


# The label feature of each train split supplies the id -> name lookup used for every split.
label_feature = ds1["train"].features["label"]
ds1 = ds1.map(_bird_name_mapper(label_feature))

label_feature2 = ds2["train"].features["label"]
ds2 = ds2.map(_bird_name_mapper(label_feature2))

Map:   0%|          | 0/6775 [00:00<?, ? examples/s]

Map:   0%|          | 0/215 [00:00<?, ? examples/s]

Map:   0%|          | 0/215 [00:00<?, ? examples/s]

In [None]:
# @title Convert to Pandas DataFrames

# Materialise each split as its own DataFrame (dataset 1 has no validation split).
df1_train, df1_test = (
    pd.DataFrame(ds1[split]) for split in ("train", "test")
)
df2_train, df2_test, df2_validate = (
    pd.DataFrame(ds2[split]) for split in ("train", "test", "validation")
)

In [None]:
# @title Inspect First Rows
# Only a cell's last expression is rendered automatically, so the earlier
# values are shown explicitly with display() instead of being silently discarded.
display(df1_train.iloc[0]['image'], df2_train.iloc[0]['image'])
display(df1_train.head())
df2_train.head()

Unnamed: 0,image,label,bird_name
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,BAR-TAILED GODWIT
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,BAR-TAILED GODWIT
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,BAR-TAILED GODWIT
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,BAR-TAILED GODWIT
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,BAR-TAILED GODWIT


In [None]:
# @title Train Splits
# Every split goes through the same stages, in this order:
# prepare -> drop corrupted/blurry -> de-duplicate.
_train_stages = (load_and_prepare, filter_corrupted_and_blurry, deduplicate)

for _stage in _train_stages:
    df1_train = _stage(df1_train, "DF1_Train")

for _stage in _train_stages:
    df2_train = _stage(df2_train, "DF2_Train")

=== DF1_Train rows before cleaning: 5994 ===

=== DF1_Train | rows after dropna: 5994 ===

=== DF1_Train | Filtering corrupted & blurry images ===
 Processing 0/5994...
 Processing 500/5994...
 Processing 1000/5994...
 Processing 1500/5994...
 Processing 2000/5994...
 Processing 2500/5994...
 Processing 3000/5994...
 Processing 3500/5994...
 Processing 4000/5994...
 Processing 4500/5994...
 Processing 5000/5994...
 Processing 5500/5994...
DF1_Train: Kept 5952 images, dropped 42

=== DF1_Train | Duplicate filtering) ===
DF1_Train: processed 5950 images

=== DF2_Train rows before cleaning: 6775 ===

=== DF2_Train | rows after dropna: 6775 ===

=== DF2_Train | Filtering corrupted & blurry images ===
 Processing 0/6775...
 Processing 500/6775...
 Processing 1000/6775...
 Processing 1500/6775...
 Processing 2000/6775...
 Processing 2500/6775...
 Processing 3000/6775...
 Processing 3500/6775...
 Processing 4000/6775...
 Processing 4500/6775...
 Processing 5000/6775...
 Processing 5500/6775..

In [None]:
# @title Test splits
# Same three stages as the train splits: prepare -> drop corrupted/blurry -> de-duplicate.
_test_stages = (load_and_prepare, filter_corrupted_and_blurry, deduplicate)

for _stage in _test_stages:
    df1_test = _stage(df1_test, "DF1_Test")

for _stage in _test_stages:
    df2_test = _stage(df2_test, "DF2_Test")

=== DF1_Test rows before cleaning: 5794 ===

=== DF1_Test | rows after dropna: 5794 ===

=== DF1_Test | Filtering corrupted & blurry images ===
 Processing 0/5794...
 Processing 500/5794...
 Processing 1000/5794...
 Processing 1500/5794...
 Processing 2000/5794...
 Processing 2500/5794...
 Processing 3000/5794...
 Processing 3500/5794...
 Processing 4000/5794...
 Processing 4500/5794...
 Processing 5000/5794...
 Processing 5500/5794...
DF1_Test: Kept 5746 images, dropped 48

=== DF1_Test | Duplicate filtering) ===
DF1_Test: processed 5744 images

=== DF2_Test rows before cleaning: 215 ===

=== DF2_Test | rows after dropna: 215 ===

=== DF2_Test | Filtering corrupted & blurry images ===
 Processing 0/215...
DF2_Test: Kept 215 images, dropped 0

=== DF2_Test | Duplicate filtering) ===
DF2_Test: processed 215 images



In [None]:
# @title Validation Split (df2 only)
# Only dataset 2 ships a validation split; it goes through the usual three stages.
for _stage in (load_and_prepare, filter_corrupted_and_blurry, deduplicate):
    df2_validate = _stage(df2_validate, "DF2_Validate")

=== DF2_Validate rows before cleaning: 215 ===

=== DF2_Validate | rows after dropna: 215 ===

=== DF2_Validate | Filtering corrupted & blurry images ===
 Processing 0/215...
DF2_Validate: Kept 215 images, dropped 0

=== DF2_Validate | Duplicate filtering) ===
DF2_Validate: processed 215 images



In [None]:
# @title CSV Exports & Combination
OUTPUT_BASE.mkdir(parents=True, exist_ok=True)


def normalize_folder_label(x):
    """Normalize a bird name for folder naming: trimmed, lower-cased, spaces removed.

    Deliberately given its own name: re-using `normalize_label` here would
    replace the differently-behaving helper that load_and_prepare calls, so
    re-running earlier cells afterwards would silently change their results.
    """
    return str(x).strip().lower().replace(" ", "")


def _write_metadata_csv(frame, csv_path):
    """Write `frame` to `csv_path` without the in-memory `image` column.

    The image objects would only be serialised as their text repr, which
    carries no usable data, so the column is left out of the CSV.
    """
    frame.drop(columns=["image"], errors="ignore").to_csv(csv_path, index=False)


# Create jotde CSV
print("Creating jotde_birds.csv...")
df1_train['split'] = 'train'
df1_test['split'] = 'test'
jotde_combined = pd.concat([df1_train, df1_test], ignore_index=True)
jotde_combined['bird_name_normalized'] = jotde_combined['bird_name'].apply(normalize_folder_label)
jotde_csv_path = OUTPUT_BASE / "jotde_birds.csv"
_write_metadata_csv(jotde_combined, jotde_csv_path)

# Create bird_species CSV
print("Creating bird_species.csv...")
df2_train['split'] = 'train'
df2_test['split'] = 'test'
df2_validate['split'] = 'validation'
bird_species_combined = pd.concat([df2_train, df2_test, df2_validate], ignore_index=True)
bird_species_combined['bird_name_normalized'] = bird_species_combined['bird_name'].apply(normalize_folder_label)
bird_species_csv_path = OUTPUT_BASE / "bird_species.csv"
_write_metadata_csv(bird_species_combined, bird_species_csv_path)

# Combine both datasets (the in-memory frame keeps its images for the export step)
print("Combining datasets...")
df_combined = pd.concat([jotde_combined, bird_species_combined], ignore_index=True)
combined_csv_path = OUTPUT_BASE / "birds_combined_cleaned.csv"
_write_metadata_csv(df_combined, combined_csv_path)
print(f"Combined dataset saved to {combined_csv_path}")

Creating jotde_birds.csv...
Creating bird_species.csv...
Combining datasets...
Combined dataset saved to /content/cleaned_birds/birds_combined_cleaned.csv


In [None]:
# @title Save Images to YOLOv8 Structure
save_yolov8_structure(df_combined, "birds-combined", OUTPUT_BASE)


Saving images for birds-combined...
  test: 5959 images
  train: 12714 images
  validation: 215 images
birds-combined: 18888 images saved


18888

In [None]:
# @title Final verification
print("\n" + "="*60)
print("FINAL VERIFICATION")
print("="*60)

dataset_dir = OUTPUT_BASE / "birds-combined"
print(f"\nBIRDS-COMBINED:")
if not dataset_dir.is_dir():
    # Report the missing export once, instead of raising from iterdir().
    print("  birds-combined directory not found.")
else:
    # Saved files keep the extension of their source name, so count the common
    # image extensions case-insensitively rather than only "*.jpg".
    image_suffixes = {".jpg", ".jpeg", ".png"}
    for split_dir in sorted(dataset_dir.iterdir()):
        if not split_dir.is_dir():
            continue  # stray files next to the split folders are not splits
        num_classes = sum(1 for d in split_dir.iterdir() if d.is_dir())
        num_images = sum(
            1 for f in split_dir.rglob("*")
            if f.is_file() and f.suffix.lower() in image_suffixes
        )
        print(f"  {split_dir.name}/: {num_classes} classes, {num_images} images")

print(f"\nProcessing complete. All data saved to: {OUTPUT_BASE}")
print("CSV files:")
print(f"  {jotde_csv_path}")
print(f"  {bird_species_csv_path}")
print(f"  {combined_csv_path}")


FINAL VERIFICATION

BIRDS-COMBINED:
  test/: 239 classes, 5959 images
  train/: 239 classes, 12714 images
  validation/: 43 classes, 215 images

Processing complete. All data saved to: /content/cleaned_birds
CSV files:
  /content/cleaned_birds/jotde_birds.csv
  /content/cleaned_birds/bird_species.csv
  /content/cleaned_birds/birds_combined_cleaned.csv
