# Pipeline de préparation des données pour la détection d’oiseaux morts

Ce notebook décrit l’ensemble du workflow de préparation des données utilisé pour l’entraînement du modèle de détection d’oiseaux morts à partir d’images aériennes de haute résolution.

En raison de la grande taille des images originales et de la très petite taille relative des oiseaux, un apprentissage direct sur les images brutes est peu efficace. Un pipeline de pré-traitement en plusieurs étapes est donc mis en place, comprenant :

- l’analyse statistique du jeu de données initial,
- le découpage des images en tuiles de 512×512 pixels,
- l’augmentation de données,
- la fusion des jeux de données générés,
- l’analyse finale du dataset utilisé pour l’apprentissage.

L’objectif est d’améliorer la visibilité des objets, d’augmenter la diversité des échantillons et de renforcer la capacité de généralisation du modèle.


In [None]:
import os
import glob
import cv2
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


## 1. Analyse du jeu de données initial

In [None]:
def analyze_dataset(root):
    """Summarise image and label counts for each split of a dataset.

    For each of the splits ``train``, ``valid`` and ``test`` the function
    looks at ``<root>/<split>/images`` and ``<root>/<split>/labels``. A split
    whose ``images`` folder does not exist is left out of the result.

    Args:
        root: Path of the dataset root folder.

    Returns:
        A ``pandas.DataFrame`` with one row per split found and the columns
        ``Split``, ``Images`` (number of ``.jpg``/``.png`` files), ``Labels``
        (number of ``.txt`` files), ``Boxes`` (number of non-blank label
        lines, each counted as one box) and ``Empty`` (number of label files
        without any non-blank line).
    """
    stats = []
    for split in ["train", "valid", "test"]:
        img_dir = os.path.join(root, split, "images")
        lbl_dir = os.path.join(root, split, "labels")

        if not os.path.exists(img_dir):
            continue

        images = glob.glob(os.path.join(img_dir, "*.jpg")) + glob.glob(os.path.join(img_dir, "*.png"))
        labels = glob.glob(os.path.join(lbl_dir, "*.txt"))

        n_boxes = 0
        n_empty = 0
        for label_path in labels:
            with open(label_path) as file:
                # Blank or whitespace-only lines carry no annotation: they
                # must neither be counted as boxes nor stop a file from being
                # reported as empty.
                n_lines = sum(1 for line in file if line.strip())
            if n_lines == 0:
                n_empty += 1
            n_boxes += n_lines

        stats.append([split, len(images), len(labels), n_boxes, n_empty])

    return pd.DataFrame(stats, columns=["Split", "Images", "Labels", "Boxes", "Empty"])


In [None]:
import os

# Dataset root whose split folders are scanned by the loop at the end of this cell
# (relative to the current working directory).
DATASET_PATH = "augmented_dataset"

def count_labels(split_path):
    """Print how many label files of a split are empty and how many are not.

    Only ``.txt`` files located directly in ``<split_path>/labels`` are
    examined. A file counts as empty when it contains nothing but
    whitespace. The figure printed as "Images" is the number of label files
    examined; the ``images`` folder itself is not read.

    Args:
        split_path: Path of a split folder expected to hold a ``labels``
            subfolder.

    Returns:
        None. The three counters are written to standard output.
    """
    label_dir = os.path.join(split_path, "labels")

    total_imgs = 0
    empty_labels = 0
    non_empty_labels = 0

    # A split without a labels folder is reported with every counter at zero
    # rather than aborting the whole report with FileNotFoundError.
    label_files = os.listdir(label_dir) if os.path.isdir(label_dir) else []

    for file in label_files:
        if not file.endswith(".txt"):
            continue

        total_imgs += 1
        path = os.path.join(label_dir, file)

        with open(path, "r") as f:
            content = f.read().strip()

        if content == "":
            empty_labels += 1
        else:
            non_empty_labels += 1

    print(f"\nðŸ“‚ {os.path.basename(split_path)}")
    print(f"Images        : {total_imgs}")
    print(f"Labels vides  : {empty_labels}")
    print(f"Labels pleins : {non_empty_labels}")


# Print the label report for every split folder that exists under DATASET_PATH.
for split in ("train", "valid", "test"):
    split_path = os.path.join(DATASET_PATH, split)
    if not os.path.exists(split_path):
        continue  # this split is absent from the dataset
    count_labels(split_path)


In [None]:
# Root folder of the dataset analysed in this "initial dataset" section.
ORIGINAL_DATASET = "exports"

# Per-split statistics table; the bare name on the last line is the cell's displayed value.
df_original = analyze_dataset(ORIGINAL_DATASET)
df_original

## 2. Découpage des images (Tiling)


In [None]:
import subprocess
import sys

print("Lancement du script de dÃ©coupage...")
# Run the tiling script with the interpreter that executes this notebook
# instead of whatever "python" resolves to on PATH. check=True raises
# CalledProcessError on a non-zero exit status, so the completion message
# below is never printed after a failed run.
subprocess.run([sys.executable, "test_split.py"], check=True)
print("DÃ©coupage terminÃ©.")


In [None]:
# Dataset root analysed in the tiling section.
# NOTE(review): presumably the output folder of test_split.py - confirm against that script.
TILED_DATASET = "PIC_2_test_tiled_512_v2"
# Per-split statistics table; the bare name on the last line is the cell's displayed value.
df_tiled = analyze_dataset(TILED_DATASET)
df_tiled


## 3. Augmentation de données

In [None]:
import subprocess
import sys

print("Lancement du script dâ€™augmentation...")
# Run the augmentation script with the interpreter that executes this
# notebook instead of whatever "python" resolves to on PATH. check=True
# raises CalledProcessError on a non-zero exit status, so the completion
# message below is never printed after a failed run.
subprocess.run([sys.executable, "data_augmentation.py"], check=True)
print("Augmentation terminÃ©e.")


In [None]:
# Absolute, machine-specific location of the augmented dataset.
# NOTE(review): this path only exists on the original author's Windows machine, and the
# next cell rebinds AUG_DATASET to the relative folder "augmented_dataset" - confirm
# whether this cell is still needed.
AUG_DATASET = "C:\\Users\\DELL\\Documents\\S9\\Projet_Pic\\dead-bird-detection\\augmented_dataset"

# Per-split statistics table; the bare name on the last line is the cell's displayed value.
df_aug = analyze_dataset(AUG_DATASET)
df_aug


In [None]:
# Augmented dataset root, relative to the current working directory.
# This rebinds AUG_DATASET and df_aug, replacing any value set by an earlier cell.
AUG_DATASET = "augmented_dataset"
df_aug = analyze_dataset(AUG_DATASET)
df_aug


## 4. Fusion des datasets

In [None]:
# Destination root of the merged dataset (relative to the current working directory).
COMBINED_ROOT = "final_dataset"

def merge_datasets(src_roots, dst_root):
    """Copy the images and labels of several datasets into a single dataset.

    For every split (``train``, ``test``, ``valid``) and every source root,
    the regular files found in ``<root>/<split>/images`` and
    ``<root>/<split>/labels`` are copied into the matching folders under
    ``dst_root``; those folders are created when missing. Sources are
    processed in the order given, so when two sources contain a file with the
    same name the copy from the later source overwrites the earlier one.

    Args:
        src_roots: Iterable of dataset root folders to merge.
        dst_root: Root folder of the merged dataset.

    Raises:
        OSError: If a file cannot be copied.
    """
    # Imported here so the function does not depend on a file-level import.
    import shutil

    # The sources are walked once per split, so a one-shot iterator must be
    # materialised first.
    src_roots = list(src_roots)

    for split in ["train", "test", "valid"]:
        for root in src_roots:
            out_img = os.path.join(dst_root, split, "images")
            out_lbl = os.path.join(dst_root, split, "labels")

            os.makedirs(out_img, exist_ok=True)
            os.makedirs(out_lbl, exist_ok=True)

            for sub_dir, out_dir in (("images", out_img), ("labels", out_lbl)):
                for src_file in glob.glob(os.path.join(root, split, sub_dir, "*")):
                    # shutil replaces the Windows-only shell "copy" command:
                    # it works on every platform, needs no quoting of file
                    # names and raises on failure instead of failing silently.
                    if os.path.isfile(src_file):
                        shutil.copy2(src_file, out_dir)

# Merge the tiled and augmented datasets; both names must have been defined by earlier cells.
merge_datasets([TILED_DATASET, AUG_DATASET], COMBINED_ROOT)
print("Fusion terminÃ©e.")


In [None]:
# Per-split statistics of the merged dataset; the bare name on the last line is the cell's displayed value.
df_final = analyze_dataset(COMBINED_ROOT)
df_final


In [None]:
import os
from pathlib import Path
from PIL import Image

# Path to the dataset
dataset_path = Path("final_dataset")

splits = ["train", "valid", "test"]

# For each split: report the number of images, the images that are not
# 512x512, and label statistics (annotation count, empty label files,
# annotations whose coordinates fall outside [0, 1]).
for split in splits:
    images_path = dataset_path / split / "images"
    labels_path = dataset_path / split / "labels"

    image_files = list(images_path.glob("*.jpg")) + list(images_path.glob("*.png"))
    label_files = list(labels_path.glob("*.txt"))

    print(f"\n=== Split: {split} ===")
    print(f"Nombre d'images: {len(image_files)}")

    # Check the image dimensions
    wrong_dim = []
    for img_file in image_files:
        with Image.open(img_file) as img:
            if img.size != (512, 512):
                wrong_dim.append(img_file.name)
    if wrong_dim:
        print(f"Images avec mauvaise dimension (pas 512x512): {len(wrong_dim)}")
        print(wrong_dim[:5], "..." if len(wrong_dim) > 5 else "")
    else:
        print("Toutes les images sont de 512x512 âœ…")

    # Label analysis
    total_annotations = 0
    empty_labels = 0
    out_of_bounds = 0

    for lbl_file in label_files:
        with open(lbl_file, "r") as f:
            # Blank lines (e.g. a trailing newline) carry no annotation, so
            # they are neither counted nor reported as malformed.
            rows = [line.split() for line in f if line.strip()]

        if not rows:
            empty_labels += 1

        for parts in rows:
            total_annotations += 1
            if len(parts) != 5:
                print(f"Format incorrect: {lbl_file.name}")
                continue
            try:
                _, x, y, w, h = map(float, parts)
            except ValueError:
                # A non-numeric field is a format error, not a reason to
                # abort the whole report.
                print(f"Format incorrect: {lbl_file.name}")
                continue
            if not (0 <= x <= 1 and 0 <= y <= 1 and 0 <= w <= 1 and 0 <= h <= 1):
                out_of_bounds += 1

    print(f"Nombre total d'annotations: {total_annotations}")
    print(f"Images sans annotations: {empty_labels}")
    print(f"Annotations hors limites [0,1]: {out_of_bounds}")


In [None]:
import os
from pathlib import Path

dataset_path = Path("final_dataset")
splits = ["train", "valid", "test"]


def is_valid_label_file(label_path):
    """Return True when every annotation line of a label file is well formed.

    A line is well formed when it holds exactly five whitespace-separated
    numeric fields and the last four all lie in [0, 1]. Blank lines are
    ignored, so an empty or whitespace-only file is valid.
    """
    with open(label_path, "r") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # A blank line is not an annotation; it must not cause the
                # sample to be deleted.
                continue
            if len(parts) != 5:
                return False
            try:
                _, x, y, w, h = map(float, parts)
            except ValueError:
                # Non-numeric field: malformed annotation.
                return False
            if not (0 <= x <= 1 and 0 <= y <= 1 and
                    0 <= w <= 1 and 0 <= h <= 1):
                return False
    return True


deleted_images = 0

# Delete every sample (label file and matching image) whose label file
# contains a malformed or out-of-range annotation.
for split in splits:
    images_dir = dataset_path / split / "images"
    labels_dir = dataset_path / split / "labels"
    deleted_in_split = 0

    print(f"\n=== Nettoyage du split {split} ===")

    # Take a snapshot of the listing: files are deleted inside the loop.
    for label_file in list(labels_dir.glob("*.txt")):
        if is_valid_label_file(label_file):
            continue

        image_name = label_file.stem
        image_path = None

        for ext in [".jpg", ".png", ".jpeg"]:
            candidate = images_dir / f"{image_name}{ext}"
            if candidate.exists():
                image_path = candidate
                break

        # Deletion: the label always goes; the image only when one was found.
        label_file.unlink()
        if image_path:
            image_path.unlink()
            deleted_images += 1
            deleted_in_split += 1

    print(f"Images supprimÃ©es dans {split} : {deleted_in_split}")

print(f"\nTotal images supprimÃ©es : {deleted_images}")


In [None]:
import os
import random

# Root folder of the dataset processed by clean_split (relative to the current working directory).
DATASET_PATH = "final_dataset"

# Per-split multiplier applied to the number of empty label files to decide how many
# are kept; with a value of 1 every empty label file is kept.
KEEP_RATIOS = {
    "train": 1,
    "valid": 1
}


def clean_split(split, dataset_path=None, keep_ratio=None):
    """Randomly drop a share of the empty-label samples of one split.

    Label files (``.txt``) that contain nothing but whitespace are collected
    from ``<dataset_path>/<split>/labels``. ``int(count * keep_ratio)`` of
    them, chosen at random, are kept; the others are deleted together with
    the image of the same name in ``<dataset_path>/<split>/images``. A
    summary is printed at the end.

    Args:
        split: Name of the split folder, e.g. ``"train"``.
        dataset_path: Dataset root folder. Defaults to the module-level
            ``DATASET_PATH``.
        keep_ratio: Fraction of empty-label samples to keep. Defaults to
            ``KEEP_RATIOS[split]``.

    Returns:
        None.
    """
    if dataset_path is None:
        dataset_path = DATASET_PATH
    if keep_ratio is None:
        keep_ratio = KEEP_RATIOS[split]

    img_dir = os.path.join(dataset_path, split, "images")
    lbl_dir = os.path.join(dataset_path, split, "labels")

    empty_files = []

    for lbl_file in os.listdir(lbl_dir):
        if not lbl_file.endswith(".txt"):
            continue

        path = os.path.join(lbl_dir, lbl_file)
        with open(path) as f:
            content = f.read().strip()

        if content == "":
            empty_files.append(lbl_file)

    total_empty = len(empty_files)
    keep_n = int(total_empty * keep_ratio)

    # After shuffling, the first keep_n entries are kept and the rest removed.
    random.shuffle(empty_files)
    remove_files = empty_files[keep_n:]

    for lbl_file in remove_files:
        stem = os.path.splitext(lbl_file)[0]

        lbl_path = os.path.join(lbl_dir, lbl_file)
        if os.path.exists(lbl_path):
            os.remove(lbl_path)

        # The image is not necessarily a .jpg: try every extension used in
        # the dataset so that no image is left behind without its label.
        for ext in (".jpg", ".png", ".jpeg"):
            img_path = os.path.join(img_dir, stem + ext)
            if os.path.exists(img_path):
                os.remove(img_path)

    print(f"\nðŸ“‚ {split}")
    print(f"Empty labels total : {total_empty}")
    print(f"Kept              : {keep_n}")
    print(f"Removed           : {len(remove_files)}")


# Apply the empty-label cleanup to the train and valid splits; the test split is not processed.
for split in ["train", "valid"]:
    clean_split(split)
