In [None]:
# ==========================================================
# 🐾 ANIMAL DETECTION DATASET PREP (YOLO FORMAT - 9 CLASSES)
# Dataset Type: Pascal VOC XML (annotations) + images
#
# Output Folder:
# /kaggle/working/AnimalDataset/
#     train/images + train/labels
#     valid/images + valid/labels
#     test/images  + test/labels
#     data.yaml
#
# NOTE: This code uses splitfolders (NO recursive glob)
# Run this cell only ONCE
# ==========================================================

!pip install -q split-folders pyyaml lxml

import os
import glob
import shutil
import yaml
import splitfolders
import xml.etree.ElementTree as ET


# ==========================================================
# INPUT LOCATIONS (edit these to match your dataset)
# ==========================================================
DATASET_DIR = "/kaggle/input/animal-detection-dataset"   # <-- CHANGE THIS
IMG_DIR = os.path.join(DATASET_DIR, "images")            # <-- CHANGE THIS if needed
ANN_DIR = os.path.join(DATASET_DIR, "annotations")       # <-- CHANGE THIS if needed

print("Images folder:", IMG_DIR)
print("Annotations folder:", ANN_DIR)


# ==========================================================
# OUTPUT + TEMP LOCATIONS
# ==========================================================
FINAL_DIR = "/kaggle/working/AnimalDataset"
TEMP_DIR = "/kaggle/working/temp_split"
TEMP_IMAGES = os.path.join(TEMP_DIR, "images")

# Wipe whatever a previous run left behind so both folders start empty.
for stale_dir in (FINAL_DIR, TEMP_DIR):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

os.makedirs(FINAL_DIR, exist_ok=True)
os.makedirs(TEMP_IMAGES, exist_ok=True)   # creates TEMP_DIR as well


# ==========================================================
# STAGE IMAGES IN THE TEMP FOLDER
# ==========================================================
# Only top-level *.jpg and *.png files of IMG_DIR are picked up (.jpg first).
image_files = [
    found
    for pattern in ("*.jpg", "*.png")
    for found in glob.glob(os.path.join(IMG_DIR, pattern))
]

for img_path in image_files:
    shutil.copy(img_path, TEMP_IMAGES)

print(f"‚úÖ Copied {len(image_files)} images into temp_split/images")


# ==========================================================
# SPLIT DATASET USING SPLITFOLDERS
# ==========================================================
# splitfolders treats every sub-directory of its input as one "class" and
# does nothing useful with a folder that only holds loose files, so it has to
# be pointed at TEMP_DIR (whose single "class" is the images/ folder) rather
# than at TEMP_IMAGES itself.
splitfolders.ratio(
    TEMP_DIR,
    output=FINAL_DIR,
    seed=42,
    ratio=(0.8, 0.1, 0.1),
)

# Rename val -> valid
val_dir = os.path.join(FINAL_DIR, "val")
if os.path.exists(val_dir):
    os.rename(val_dir, os.path.join(FINAL_DIR, "valid"))

print("‚úÖ Dataset split completed!")


# ==========================================================
# CREATE REQUIRED YOLO STRUCTURE
# ==========================================================
splits = ["train", "valid", "test"]

for split in splits:
    split_dir = os.path.join(FINAL_DIR, split)
    nested_dir = os.path.join(split_dir, "images")

    os.makedirs(nested_dir, exist_ok=True)
    os.makedirs(os.path.join(split_dir, "labels"), exist_ok=True)

    # splitfolders writes to <output>/<split>/<class>/, i.e. <split>/images/.
    # The label-generation loop further down globs each split folder directly
    # and moves what it finds into images/ while writing the matching label,
    # so lift the files up one level for it to pick them up.
    for entry in os.listdir(nested_dir):
        shutil.move(os.path.join(nested_dir, entry), os.path.join(split_dir, entry))

print("‚úÖ YOLO folders created!")


# ==========================================================
# CLASS LIST (9 CLASSES) - EDIT TO MATCH YOUR DATASET
# ==========================================================
# The position of a name in this list is its YOLO class id.
# NOTE(review): the Colab cell in this notebook lists the same names in a
# different order (dog/cow and tiger/lion swapped), so class ids produced by
# the two cells are not interchangeable.
CLASS_NAMES = [
    "cat",
    "cow",
    "dog",
    "horse",
    "sheep",
    "goat",
    "elephant",
    "lion",
    "tiger",
]

# name -> id lookup used while converting annotations
CLASS_MAP = dict(zip(CLASS_NAMES, range(len(CLASS_NAMES))))

print("Class Mapping:", CLASS_MAP)


# ==========================================================
# XML -> YOLO FUNCTION (MULTI-CLASS)
# ==========================================================
def convert_xml_to_yolo(xml_file, output_txt_file, class_map=None):
    """Convert one Pascal VOC XML annotation into a YOLO label file.

    The image width/height come from the XML ``<size>`` element. Every
    ``<object>`` whose name is found in the class map becomes one line
    ``class_id x_center y_center width height`` with the four box values
    normalised by the image size and written with six decimals. The lines
    are joined with newlines and written to ``output_txt_file``; when no
    object qualifies the file is written empty.

    Args:
        xml_file: Path of the Pascal VOC ``.xml`` annotation to read.
        output_txt_file: Path of the YOLO ``.txt`` label file to write.
        class_map: Optional ``{class name: class id}`` mapping. Defaults to
            the module-level ``CLASS_MAP``.

    Returns:
        None. If the XML carries no usable (positive) image size, a message
        is printed and no label file is written.
    """
    if class_map is None:
        class_map = CLASS_MAP

    root = ET.parse(xml_file).getroot()

    # A missing <size> element or a 0 width/height cannot be normalised
    # against; detect it here instead of failing on a division by zero.
    try:
        img_width = int(float(root.findtext("size/width")))
        img_height = int(float(root.findtext("size/height")))
    except (TypeError, ValueError, OverflowError):
        img_width = img_height = 0

    if img_width <= 0 or img_height <= 0:
        print("Skipping annotation without a usable image size:", xml_file)
        return

    yolo_lines = []

    for obj in root.findall("object"):
        label = (obj.findtext("name") or "").strip()

        # Exact match first, then a lower-cased match so "Dog" still maps to "dog".
        class_id = class_map.get(label)
        if class_id is None:
            class_id = class_map.get(label.lower())
        if class_id is None:
            continue

        bbox = obj.find("bndbox")
        if bbox is None:
            continue
        try:
            xmin, ymin, xmax, ymax = (
                float(bbox.findtext(tag)) for tag in ("xmin", "ymin", "xmax", "ymax")
            )
        except (TypeError, ValueError):
            # A corner is missing or not numeric: drop this object only.
            continue

        # Clamp to the image and tolerate swapped corners so every normalised
        # value stays inside [0, 1].
        xmin, xmax = sorted(min(max(v, 0.0), img_width) for v in (xmin, xmax))
        ymin, ymax = sorted(min(max(v, 0.0), img_height) for v in (ymin, ymax))
        if xmax <= xmin or ymax <= ymin:
            continue  # zero-area box after clamping

        x_center = ((xmin + xmax) / 2) / img_width
        y_center = ((ymin + ymax) / 2) / img_height
        w = (xmax - xmin) / img_width
        h = (ymax - ymin) / img_height

        yolo_lines.append(f"{class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}")

    with open(output_txt_file, "w") as f:
        f.write("\n".join(yolo_lines))


# ==========================================================
# FILE IMAGES UNDER images/ AND WRITE ONE LABEL PER IMAGE
# ==========================================================
# Images are expected to sit directly inside each split folder at this point.
for split in splits:

    split_root = os.path.join(FINAL_DIR, split)
    images_dir = os.path.join(split_root, "images")
    labels_dir = os.path.join(split_root, "labels")

    # .jpg matches first, then .png
    img_list = [
        found
        for pattern in ("*.jpg", "*.png")
        for found in glob.glob(os.path.join(split_root, pattern))
    ]

    print(f"{split.upper()} images found:", len(img_list))

    for img_path in img_list:

        img_name = os.path.basename(img_path)
        base_name, _ext = os.path.splitext(img_name)

        # relocate the image to <split>/images/
        dst_img_path = os.path.join(images_dir, img_name)
        shutil.move(img_path, dst_img_path)

        # annotation and label share the image's base name
        xml_path = os.path.join(ANN_DIR, f"{base_name}.xml")
        label_path = os.path.join(labels_dir, f"{base_name}.txt")

        if not os.path.exists(xml_path):
            # no annotation for this image: leave an empty label file
            with open(label_path, "w"):
                pass
            continue

        convert_xml_to_yolo(xml_path, label_path)

print("‚úÖ Images moved + Labels created successfully!")


# ==========================================================
# WRITE data.yaml
# ==========================================================
DATA_YAML_PATH = os.path.join(FINAL_DIR, "data.yaml")

# data.yaml key -> split folder on disk (the "val" key points at "valid/")
data_yaml = {
    key: os.path.join(FINAL_DIR, f"{folder}/images")
    for key, folder in (("train", "train"), ("val", "valid"), ("test", "test"))
}
data_yaml["nc"] = len(CLASS_NAMES)
data_yaml["names"] = CLASS_NAMES

with open(DATA_YAML_PATH, "w") as f:
    yaml.dump(data_yaml, f)

print("‚úÖ data.yaml created at:", DATA_YAML_PATH)


# ==========================================================
# REPORT FILE COUNTS PER SPLIT
# ==========================================================
for split in splits:
    images_glob = os.path.join(FINAL_DIR, split, "images", "*.*")
    labels_glob = os.path.join(FINAL_DIR, split, "labels", "*.txt")

    img_count = len(glob.glob(images_glob))
    lbl_count = len(glob.glob(labels_glob))

    print(f"{split.upper()} -> Images: {img_count} | Labels: {lbl_count}")

print("\nüéØ FINAL YOLO DATASET READY AT:", FINAL_DIR)


# ==========================================================
# REMOVE THE TEMP STAGING FOLDER (OPTIONAL)
# ==========================================================
shutil.rmtree(TEMP_DIR)
print("‚úÖ temp_split folder deleted!")


In [None]:
# ==========================================================
# 🐾 ANIMAL DETECTION DATASET PREP (YOLO FORMAT)
# Google Colab Version (splitfolders + cv2 image size)
#
# Output Folder:
# /content/AnimalDataset/
#     train/images + train/labels
#     valid/images + valid/labels
#     test/images  + test/labels
#     data.yaml
#
# NOTE: Works for Multiple Classes (9 classes)
# NOTE: Uses splitfolders (NO recursive glob)
# NOTE: Uses cv2.imread() to get actual image width & height
# Run this cell only ONCE
# ==========================================================

!pip install -q split-folders pyyaml lxml opencv-python

import os
import glob
import shutil
import yaml
import splitfolders
import cv2
import xml.etree.ElementTree as ET

from google.colab import drive
drive.mount("/content/drive")


# ==========================================================
# INPUT LOCATIONS (dataset stored on Google Drive)
# ==========================================================
DATASET_DIR = "/content/drive/MyDrive/Datasets/Animal_Detection"   # <-- CHANGE THIS
IMG_DIR = os.path.join(DATASET_DIR, "images")
ANN_DIR = os.path.join(DATASET_DIR, "annotations")

print("Images folder:", IMG_DIR)
print("Annotations folder:", ANN_DIR)


# ==========================================================
# OUTPUT + TEMP LOCATIONS
# ==========================================================
FINAL_DIR = "/content/AnimalDataset"
TEMP_DIR = "/content/temp_split"
TEMP_IMAGES = os.path.join(TEMP_DIR, "images")

# Wipe whatever a previous run left behind so both folders start empty.
for stale_dir in (FINAL_DIR, TEMP_DIR):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

os.makedirs(FINAL_DIR, exist_ok=True)
os.makedirs(TEMP_IMAGES, exist_ok=True)   # creates TEMP_DIR as well


# ==========================================================
# STAGE IMAGES IN THE TEMP FOLDER
# ==========================================================
# Only top-level *.jpg and *.png files of IMG_DIR are picked up (.jpg first).
image_files = [
    found
    for pattern in ("*.jpg", "*.png")
    for found in glob.glob(os.path.join(IMG_DIR, pattern))
]

for img_path in image_files:
    shutil.copy(img_path, TEMP_IMAGES)

print(f"‚úÖ Copied {len(image_files)} images into temp_split/images")


# ==========================================================
# SPLIT DATASET USING SPLITFOLDERS
# ==========================================================
# splitfolders treats every sub-directory of its input as one "class" and
# does nothing useful with a folder that only holds loose files, so it has to
# be pointed at TEMP_DIR (whose single "class" is the images/ folder) rather
# than at TEMP_IMAGES itself.
splitfolders.ratio(
    TEMP_DIR,
    output=FINAL_DIR,
    seed=42,
    ratio=(0.8, 0.1, 0.1),
)

# Rename val -> valid
val_dir = os.path.join(FINAL_DIR, "val")
if os.path.exists(val_dir):
    os.rename(val_dir, os.path.join(FINAL_DIR, "valid"))

print("‚úÖ Dataset split completed!")


# ==========================================================
# CREATE REQUIRED YOLO STRUCTURE
# ==========================================================
splits = ["train", "valid", "test"]

for split in splits:
    split_dir = os.path.join(FINAL_DIR, split)
    nested_dir = os.path.join(split_dir, "images")

    os.makedirs(nested_dir, exist_ok=True)
    os.makedirs(os.path.join(split_dir, "labels"), exist_ok=True)

    # splitfolders writes to <output>/<split>/<class>/, i.e. <split>/images/.
    # The label-generation loop further down globs each split folder directly
    # and moves what it finds into images/ while writing the matching label,
    # so lift the files up one level for it to pick them up.
    for entry in os.listdir(nested_dir):
        shutil.move(os.path.join(nested_dir, entry), os.path.join(split_dir, entry))

print("‚úÖ YOLO folders created!")


# ==========================================================
# CLASS LIST (9 CLASSES) - EDIT TO MATCH YOUR DATASET
# ==========================================================
# The position of a name in this list is its YOLO class id.
# NOTE(review): the Kaggle cell in this notebook lists the same names in a
# different order (cow/dog and lion/tiger swapped), so class ids produced by
# the two cells are not interchangeable.
CLASS_NAMES = [
    "cat",
    "dog",
    "cow",
    "horse",
    "sheep",
    "goat",
    "elephant",
    "tiger",
    "lion",
]

# name -> id lookup used while converting annotations
CLASS_MAP = dict(zip(CLASS_NAMES, range(len(CLASS_NAMES))))

print("Class Mapping:", CLASS_MAP)


# ==========================================================
# XML -> YOLO FUNCTION (MULTI CLASS SUPPORT)
# ==========================================================
def convert_xml_to_yolo(xml_file, image_file, output_txt_file, class_map=None):
    """Convert one Pascal VOC XML annotation into a YOLO label file.

    The image is loaded with ``cv2.imread`` and its actual width/height are
    used for normalisation (the XML ``<size>`` element is not consulted).
    Every ``<object>`` whose lower-cased name is found in the class map
    becomes one line ``class_id x_center y_center width height`` with the
    four box values normalised by the image size and written with six
    decimals. The lines are joined with newlines and written to
    ``output_txt_file``; when no object qualifies the file is written empty.

    Args:
        xml_file: Path of the Pascal VOC ``.xml`` annotation to read.
        image_file: Path of the image the annotation belongs to.
        output_txt_file: Path of the YOLO ``.txt`` label file to write.
        class_map: Optional ``{class name: class id}`` mapping. Defaults to
            the module-level ``CLASS_MAP``.

    Returns:
        None. If the image cannot be read, an error is printed and no label
        file is written.
    """
    if class_map is None:
        class_map = CLASS_MAP

    img = cv2.imread(image_file)
    if img is None:
        print("‚ùå Error reading image:", image_file)
        return

    img_height, img_width = img.shape[:2]

    root = ET.parse(xml_file).getroot()

    yolo_lines = []

    for obj in root.findall("object"):
        # A missing or empty <name> yields "" and is skipped as an unknown class.
        label = (obj.findtext("name") or "").strip().lower()

        # Skip unknown classes
        class_id = class_map.get(label)
        if class_id is None:
            continue

        bbox = obj.find("bndbox")
        if bbox is None:
            continue
        try:
            xmin, ymin, xmax, ymax = (
                float(bbox.findtext(tag)) for tag in ("xmin", "ymin", "xmax", "ymax")
            )
        except (TypeError, ValueError):
            # A corner is missing or not numeric: drop this object only.
            continue

        # Clamp to the image and tolerate swapped corners so every normalised
        # value stays inside [0, 1].
        xmin, xmax = sorted(min(max(v, 0.0), img_width) for v in (xmin, xmax))
        ymin, ymax = sorted(min(max(v, 0.0), img_height) for v in (ymin, ymax))
        if xmax <= xmin or ymax <= ymin:
            continue  # zero-area box after clamping

        # YOLO normalisation
        x_center = ((xmin + xmax) / 2) / img_width
        y_center = ((ymin + ymax) / 2) / img_height
        w = (xmax - xmin) / img_width
        h = (ymax - ymin) / img_height

        yolo_lines.append(f"{class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}")

    with open(output_txt_file, "w") as f:
        f.write("\n".join(yolo_lines))


# ==========================================================
# FILE IMAGES UNDER images/ AND WRITE ONE LABEL PER IMAGE
# ==========================================================
# Images are expected to sit directly inside each split folder at this point.
for split in splits:

    split_root = os.path.join(FINAL_DIR, split)
    images_dir = os.path.join(split_root, "images")
    labels_dir = os.path.join(split_root, "labels")

    # .jpg matches first, then .png
    img_list = [
        found
        for pattern in ("*.jpg", "*.png")
        for found in glob.glob(os.path.join(split_root, pattern))
    ]

    print(f"{split.upper()} images found:", len(img_list))

    for img_path in img_list:

        img_name = os.path.basename(img_path)
        base_name, _ext = os.path.splitext(img_name)

        # relocate the image to <split>/images/
        dst_img_path = os.path.join(images_dir, img_name)
        shutil.move(img_path, dst_img_path)

        # annotation and label share the image's base name
        xml_path = os.path.join(ANN_DIR, f"{base_name}.xml")
        label_path = os.path.join(labels_dir, f"{base_name}.txt")

        if not os.path.exists(xml_path):
            # no annotation for this image: leave an empty label file
            with open(label_path, "w"):
                pass
            continue

        # the converter reads the image at its new location to get its size
        convert_xml_to_yolo(xml_path, dst_img_path, label_path)

print("‚úÖ Images moved + Labels created successfully!")


# ==========================================================
# WRITE data.yaml
# ==========================================================
DATA_YAML_PATH = os.path.join(FINAL_DIR, "data.yaml")

# data.yaml key -> split folder on disk (the "val" key points at "valid/")
data_yaml = {
    key: os.path.join(FINAL_DIR, f"{folder}/images")
    for key, folder in (("train", "train"), ("val", "valid"), ("test", "test"))
}
data_yaml["nc"] = len(CLASS_NAMES)
data_yaml["names"] = CLASS_NAMES

with open(DATA_YAML_PATH, "w") as f:
    yaml.dump(data_yaml, f)

print("‚úÖ data.yaml created at:", DATA_YAML_PATH)


# ==========================================================
# REPORT FILE COUNTS PER SPLIT
# ==========================================================
for split in splits:
    images_glob = os.path.join(FINAL_DIR, split, "images", "*.*")
    labels_glob = os.path.join(FINAL_DIR, split, "labels", "*.txt")

    img_count = len(glob.glob(images_glob))
    lbl_count = len(glob.glob(labels_glob))

    print(f"{split.upper()} -> Images: {img_count} | Labels: {lbl_count}")

print("\nüéØ FINAL YOLO DATASET READY AT:", FINAL_DIR)


# ==========================================================
# REMOVE THE TEMP STAGING FOLDER (OPTIONAL)
# ==========================================================
shutil.rmtree(TEMP_DIR)
print("‚úÖ temp_split folder deleted!")
