In [None]:
import os
import shutil
from pathlib import Path

# Root paths: RAW_ROOT holds the per-dataset raw folders, MERGED_ROOT receives
# the combined dataset and lives inside RAW_ROOT.
RAW_ROOT = "/Users/sc/Desktop/GP/Arva-Autonomous-Robotic-System-for-Smart-Agriculture/Python/Merged Yolo Pipeline/agridatasets"
MERGED_ROOT = os.path.join(RAW_ROOT, "merged datasets")

# Dataset name -> folder name (relative to RAW_ROOT) of that dataset.
# The dataset name is also used as the filename prefix in the merged output.
DATASETS = dict(
    plant="plant_raw",
    pest="pest_raw",
    road="road_raw",
)

# Dataset name -> class ID written into every label row of that dataset.
# None means the class IDs found in the label files are kept unchanged
# (plant labels are expected to use IDs 0-10 already).
CLASS_MAP = dict(
    plant=None,
    pest=11,
    road=12,
)

# Split sub-folders looked up inside every raw dataset.
SPLITS = [
    "train",
    "valid",
    "test",
]

# File suffixes (lower-case, with the leading dot) treated as images.
IMG_EXTS = [
    ".jpg",
    ".jpeg",
    ".png",
]

# Create the merged layout: <MERGED_ROOT>/<split>/{images,labels}
for split in SPLITS:
    os.makedirs(os.path.join(MERGED_ROOT, split, "images"), exist_ok=True)
    os.makedirs(os.path.join(MERGED_ROOT, split, "labels"), exist_ok=True)

# Merge images and labels.
# Each image is copied under a "<dataset>_" prefix, and a label file with the
# same stem is written next to it. When CLASS_MAP[dataset] is not None, the
# first field (class ID) of every label row is replaced by that value.
for ds_name, ds_folder in DATASETS.items():
    ds_path = os.path.join(RAW_ROOT, ds_folder)
    forced_class = CLASS_MAP[ds_name]  # None -> keep the IDs from the file

    for split in SPLITS:
        img_dir = Path(ds_path) / split / "images"
        lbl_dir = Path(ds_path) / split / "labels"

        if not img_dir.exists():
            print(f"Warning: {ds_name} {split} images folder not found: {img_dir}")
            continue

        for img in img_dir.iterdir():
            if img.suffix.lower() not in IMG_EXTS:
                continue

            new_name = f"{ds_name}_{img.name}"
            shutil.copy(img, os.path.join(MERGED_ROOT, split, "images", new_name))

            src_lbl = lbl_dir / img.with_suffix(".txt").name
            # Swap only the final extension. str.replace() would rewrite every
            # occurrence of the suffix inside the name (e.g. "a.jpg.jpg" ->
            # "a.txt.txt"), leaving the label with a different stem than its image.
            dst_lbl = os.path.join(
                MERGED_ROOT, split, "labels", Path(new_name).with_suffix(".txt").name
            )

            if src_lbl.exists():
                # Read label and remap class IDs if needed
                with open(src_lbl, "r") as f:
                    lines = f.readlines()

                new_lines = []
                for line in lines:
                    parts = line.split()
                    if not parts:
                        # blank / whitespace-only rows are dropped
                        continue
                    if forced_class is not None:
                        parts[0] = str(forced_class)
                    new_lines.append(" ".join(parts))

                with open(dst_lbl, "w") as f:
                    f.write("\n".join(new_lines))
            else:
                # No source label: write an empty label file for the image
                open(dst_lbl, "w").close()

print("✅ Merge and class remapping complete")

# Validation: every non-blank row of every merged label file must start with
# a class ID from VALID_IDS; the first offending row aborts with AssertionError.
VALID_IDS = set(range(13))  # 0-12
for split in SPLITS:
    lbl_dir = os.path.join(MERGED_ROOT, split, "labels")
    txt_names = [entry for entry in os.listdir(lbl_dir) if entry.endswith(".txt")]

    for file in txt_names:
        with open(os.path.join(lbl_dir, file)) as f:
            rows = [row for row in f if row.strip()]

        for line in rows:
            # First whitespace-separated field of a label row is the class ID
            cls_id = int(line.split()[0])
            assert cls_id in VALID_IDS, f"Invalid class ID {cls_id} in file {file}"

print("✅ All class IDs are valid. Merged dataset is ready for YOLOv8.")


In [2]:
import os
import shutil
import random
import yaml
from pathlib import Path
from sklearn.model_selection import KFold

# --- CONFIGURATION ---
# MERGED_ROOT: dataset that is scanned (expects <split>/images and <split>/labels).
# OUTPUT_ROOT: where the final test set and the per-fold folders are written.
MERGED_ROOT = "/Users/sc/Desktop/GP/Arva-Autonomous-Robotic-System-for-Smart-Agriculture/Python/Merged Yolo Pipeline/combinded_dataset"
OUTPUT_ROOT = "/Users/sc/Desktop/GP/Arva-Autonomous-Robotic-System-for-Smart-Agriculture/Python/Merged Yolo Pipeline/Final_Balanced_Project_Dataset"

K_FOLDS = 5        # number of cross-validation folds
TEST_SPLIT = 0.10  # fraction of the balanced data held out as the final test set
IMG_EXTS = [".jpg", ".jpeg", ".png"]

# Class ID -> class name; the ID is the position in this list (0-12).
class_names = dict(enumerate([
    "Apple_Scab_Leaf",
    "Apple_leaf",
    "Bell_pepper_leaf_spot",
    "Bell_pepper_leaf",
    "Corn_leaf_blight",
    "Potato_leaf_late_blight",
    "Potato_leaf",
    "Squash_Powdery_mildew_leaf",
    "Strawberry_leaf",
    "Tomato_mold_leaf",
    "Tomato_leaf",
    "pest",
    "soil",
]))

# 1. POOL ALL DATA
# Each (image, label) pair is filed under one category according to the class
# ID on the FIRST row of its label file: 0-10 -> plant, 11 -> pest, 12 -> road.
# Images without a label file, with an empty first row, or with an
# unparsable / out-of-range class ID are skipped.
SPLIT_SEED = 42
# Dedicated RNG: makes the balanced subset and the held-out test set identical
# on every run without touching the global `random` state.
split_rng = random.Random(SPLIT_SEED)

pool = {"plant": [], "pest": [], "road": []}

print(f"Scanning source: {MERGED_ROOT}...")
for split in ["train", "valid", "test"]:
    img_dir = Path(MERGED_ROOT) / split / "images"
    lbl_dir = Path(MERGED_ROOT) / split / "labels"
    if not img_dir.exists():
        continue

    # sorted(): iterdir() yields entries in arbitrary order, which would make
    # the seeded shuffles below irreproducible.
    for img in sorted(img_dir.iterdir()):
        if img.suffix.lower() not in IMG_EXTS:
            continue

        lbl = lbl_dir / img.with_suffix(".txt").name
        if not lbl.exists():
            continue

        with open(lbl, "r") as f:
            line = f.readline().strip()
        if not line:
            continue

        try:
            cid = int(line.split()[0])
        except (ValueError, IndexError):
            continue

        # Lower bound is explicit so a negative ID is not counted as a plant class.
        if 0 <= cid <= 10:
            pool["plant"].append((img, lbl))
        elif cid == 11:
            pool["pest"].append((img, lbl))
        elif cid == 12:
            pool["road"].append((img, lbl))

# Store RAW counts for the report
raw_counts = {cat: len(pool[cat]) for cat in pool}

empty_cats = [cat for cat in pool if not pool[cat]]
if empty_cats:
    print(f"Warning: no labelled images found for {empty_cats}; the balanced dataset will be empty")

# 2. BALANCE & INITIAL SPLIT
# Down-sample every category to the size of the smallest one.
min_count = min(len(pool[cat]) for cat in pool)
balanced_list = []
for cat in pool:
    split_rng.shuffle(pool[cat])
    balanced_list.extend(pool[cat][:min_count])

# Mix the categories, then cut off the leading TEST_SPLIT fraction as the
# final test set; the remainder feeds the K-fold split.
split_rng.shuffle(balanced_list)
test_idx = int(len(balanced_list) * TEST_SPLIT)
final_test_data = balanced_list[:test_idx]
kfold_data = balanced_list[test_idx:]

# 3. SAVE FINAL TEST SET
test_root = Path(OUTPUT_ROOT) / "final_test_set"
# Start from an empty folder: files left behind by an earlier run (which may
# have drawn a different split) would otherwise stay in the held-out set.
if test_root.exists():
    shutil.rmtree(test_root)

test_img_dir = test_root / "images"
test_lbl_dir = test_root / "labels"
test_img_dir.mkdir(parents=True, exist_ok=True)
test_lbl_dir.mkdir(parents=True, exist_ok=True)

# Copy every held-out (image, label) pair, keeping the original file names.
for img, lbl in final_test_data:
    shutil.copy(img, test_img_dir / img.name)
    shutil.copy(lbl, test_lbl_dir / lbl.name)

# 4. PERFORM K-FOLD SPLIT & GENERATE YAML
# For every fold: copy its train/val pairs into <OUTPUT_ROOT>/fold_<n>/ and
# write a data.yaml describing that fold.
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
fold_stats = []

for fold, (train_idx, val_idx) in enumerate(kf.split(kfold_data)):
    fold_num = fold + 1
    fold_path = Path(OUTPUT_ROOT) / f"fold_{fold_num}"
    fold_stats.append({"train": len(train_idx), "val": len(val_idx)})

    # Rebuild the fold from scratch: leftovers from an earlier run could put
    # the same image into both train/ and val/ of this fold.
    if fold_path.exists():
        shutil.rmtree(fold_path)

    for name, indices in [("train", train_idx), ("val", val_idx)]:
        img_d = fold_path / name / "images"
        lbl_d = fold_path / name / "labels"
        img_d.mkdir(parents=True, exist_ok=True)
        lbl_d.mkdir(parents=True, exist_ok=True)

        for i in indices:
            img, lbl = kfold_data[i]
            shutil.copy(img, img_d / img.name)
            shutil.copy(lbl, lbl_d / lbl.name)

    data_config = {
        'path': str(fold_path),
        'train': 'train/images',
        'val': 'val/images',
        # Steps out of the fold folder to the shared test set; presumably
        # resolved against 'path' by the consumer of this file - confirm.
        'test': '../final_test_set/images',
        # Derived from the name table so the two cannot drift apart.
        'nc': len(class_names),
        'names': class_names
    }
    with open(fold_path / 'data.yaml', 'w') as yaml_file:
        yaml.dump(data_config, yaml_file, default_flow_style=False, sort_keys=False)

# --- FINAL SUMMARY REPORT ---
# Percentages are derived from TEST_SPLIT and K_FOLDS so the report stays
# truthful when the configuration changes (each fold validates on 1/K_FOLDS
# of the K-fold pool and trains on the rest).
val_share = 1 / K_FOLDS
train_share = 1 - val_share
rule = "-" * 40
banner = "=" * 40

print("\n" + banner)
print("       DATASET STATISTICS REPORT")
print(banner)
print(f"{'Category':<10} | {'Raw Count':<10} | {'Balanced Count':<15}")
print(rule)
for cat in pool:
    # Every category is cut down to min_count by the balancing step.
    print(f"{cat.capitalize():<10} | {raw_counts[cat]:<10} | {min_count:<15}")

print(rule)
print(f"Total Balanced Images: {len(balanced_list)}")
print(f"Final Test Set ({TEST_SPLIT:.0%}):  {len(final_test_data)}")
print(f"Total K-Fold Pool:     {len(kfold_data)}")
print(rule)

for i, stat in enumerate(fold_stats):
    print(f"Fold {i+1}: Train={stat['train']} images, Val={stat['val']} images")
print(banner)

print("\n✅ SUCCESS!")
print(f"Location: {OUTPUT_ROOT}")
print(
    f"Structure: {TEST_SPLIT:.0%} Final Test set and {K_FOLDS} balanced folds "
    f"({train_share:.0%} Train / {val_share:.0%} Val)."
)

Scanning source: /Users/sc/Desktop/GP/Arva-Autonomous-Robotic-System-for-Smart-Agriculture/Python/Merged Yolo Pipeline/combinded_dataset...

       DATASET STATISTICS REPORT
Category   | Raw Count  | Balanced Count 
----------------------------------------
Plant      | 1600       | 389            
Pest       | 18664      | 389            
Road       | 389        | 389            
----------------------------------------
Total Balanced Images: 1167
Final Test Set (10%):  116
Total K-Fold Pool:     1051
----------------------------------------
Fold 1: Train=840 images, Val=211 images
Fold 2: Train=841 images, Val=210 images
Fold 3: Train=841 images, Val=210 images
Fold 4: Train=841 images, Val=210 images
Fold 5: Train=841 images, Val=210 images

✅ SUCCESS!
Location: /Users/sc/Desktop/GP/Arva-Autonomous-Robotic-System-for-Smart-Agriculture/Python/Merged Yolo Pipeline/Final_Balanced_Project_Dataset
Structure: 10% Final Test set and 5 balanced folds (80% Train / 20% Val).
