### Pictures Describing the Structure of the Dataset Directory

In [3]:
ls

 Volume in drive C is Windows-SSD
 Volume Serial Number is F0F7-C1CE

 Directory of C:\Users\Ibrahim_Hegazi\Desktop\Graduation_Project

04/10/2025  11:55 AM    <DIR>          .
03/26/2025  05:02 AM    <DIR>          ..
04/10/2025  11:54 AM    <DIR>          .ipynb_checkpoints
04/10/2025  11:55 AM               628 Augmenting the Data.ipynb
03/14/2025  03:26 AM    <DIR>          dataset
03/19/2025  12:41 AM    <DIR>          healthy
03/15/2025  10:04 AM            31,969 Notebook.ipynb
03/15/2025  12:44 AM    <DIR>          runs
03/14/2025  03:25 AM         6,182,636 yolo11n-seg.pt
03/13/2025  07:15 PM         7,071,756 yolov8n-seg.pt
               4 File(s)     13,286,989 bytes
               6 Dir(s)  276,233,117,696 bytes free


### Run in case you dont have the segmentaion model downloaded on your device

In [None]:
!wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n-seg.pt

### Downloading the needed libraries

In [7]:
pip install ultralytics

Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip install ultralytics albumentations -q

## Create Custom Augmentation Pipeline

In [37]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

def get_augmentation_pipeline():
    return A.Compose([
        # Geometric transformations (applied to each image)
        A.HorizontalFlip(p=1.0),  # Changed p=0.5 to p=1.0 to always apply
        A.VerticalFlip(p=1.0),    # Changed p=0.5 to p=1.0 to always apply
        A.RandomRotate90(p=1.0),  # Changed p=0.5 to p=1.0 to always apply
        A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15, p=1.0),
        
        # Color transformations (applied to each image)
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=1.0),
        A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=1.0),
        A.CLAHE(p=1.0),          # Changed p=0.3 to p=1.0 to always apply
        A.RandomGamma(p=1.0),     # Changed p=0.3 to p=1.0 to always apply
        
        # Blur/noise (applied to each image)
        A.GaussianBlur(blur_limit=(3, 7), p=0.2),  # Changed p=0.2 to p=1.0
        A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),  # Changed p=0.3 to p=1.0
        
        # Removed RandomShadow and RandomSunFlare
        
        # Always convert to tensor at the end
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

## Create Custom Dataset Class

In [35]:
import os
import cv2
import numpy as np
from PIL import Image
from torch.utils.data import Dataset

class SegmentationDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None, save_dir=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform
        self.save_dir = save_dir
        self.image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png', '.jpeg'))])
        
        if self.save_dir:
            os.makedirs(os.path.join(self.save_dir, 'images'), exist_ok=True)
            os.makedirs(os.path.join(self.save_dir, 'labels'), exist_ok=True)
        
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.image_files[idx].replace('.jpg', '.txt').replace('.png', '.txt'))
        
        # Load image
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Load YOLO format segmentation masks
        with open(label_path, 'r') as f:
            lines = f.readlines()
        
        masks = []
        class_labels = []
        original_polygons = []
        for line in lines:
            parts = line.strip().split()
            class_id = int(parts[0])
            polygon = list(map(float, parts[1:]))
            original_polygons.append((class_id, polygon))
            
            # Convert YOLO polygon to mask
            height, width = image.shape[:2]
            polygon = np.array(polygon).reshape(-1, 2) * np.array([width, height])
            polygon = polygon.astype(np.int32)
            
            mask = np.zeros((height, width), dtype=np.uint8)
            cv2.fillPoly(mask, [polygon], 1)
            masks.append(mask)
            class_labels.append(class_id)
        
        # Apply augmentations
        if self.transform:
            transformed = self.transform(image=image, masks=masks, class_labels=class_labels)
            image = transformed['image']
            masks = transformed['masks']
            class_labels = transformed['class_labels']
            
            # Convert masks back to YOLO format
            new_height, new_width = image.shape[:2] if isinstance(image, np.ndarray) else image.size()[1:]
            yolo_lines = []
            
            for mask, class_id in zip(masks, class_labels):
                # Convert mask to polygon
                contours, _ = cv2.findContours(mask.numpy().astype(np.uint8), 
                                             cv2.RETR_EXTERNAL, 
                                             cv2.CHAIN_APPROX_SIMPLE)
                
                for contour in contours:
                    # Simplify polygon if it has too many points
                    epsilon = 0.002 * cv2.arcLength(contour, True)
                    approx = cv2.approxPolyDP(contour, epsilon, True)
                    
                    if len(approx) >= 3:  # Need at least 3 points to form a polygon
                        # Normalize coordinates to YOLO format
                        polygon = approx.reshape(-1, 2) / np.array([new_width, new_height])
                        polygon = polygon.flatten().tolist()
                        
                        # Add to YOLO format lines
                        yolo_lines.append(f"{class_id} " + " ".join(map(str, polygon)))
        
        # Save augmented data if save_dir is provided
        if self.save_dir and self.transform:
            # Save augmented image
            aug_img_path = os.path.join(self.save_dir, 'images', f"aug_{self.image_files[idx]}")
            cv2.imwrite(aug_img_path, cv2.cvtColor(image.numpy().transpose(1, 2, 0), cv2.COLOR_RGB2BGR))
            
            # Save augmented labels
            aug_label_path = os.path.join(self.save_dir, 'labels', f"aug_{self.image_files[idx].replace('.jpg', '.txt').replace('.png', '.txt')}")
            with open(aug_label_path, 'w') as f:
                f.write("\n".join(yolo_lines))
            
            return image, yolo_lines
        
        return image, masks, class_labels

In [30]:
transform = get_augmentation_pipeline()
save_dir = 'C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Augmented Dataset\\Train_Augmented'  # Where to save augmented data

# Create dataset with augmentation and saving
dataset = SegmentationDataset(
    image_dir='C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Base Dataset\\train\\images',
    label_dir='C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Base Dataset\\train\\labels',
    transform=transform,
    save_dir=save_dir
)

# Process all images (this will save augmented versions)
for i in range(len(dataset)):
    _ = dataset[i]  # This triggers the augmentation and saving

  A.GaussNoise(var_limit=(10.0, 50.0), p=1.0),  # Changed p=0.3 to p=1.0


# Didnt like the output of the first augmentation pipeline , therefore plan B

### Starting point code, this code will need modification, DONT RUN  THIS CELL

In [None]:
import os
import cv2
import numpy as np
from torch.utils.data import Dataset

# Define separate augmentation pipelines
def get_flip_augmentation():
    return A.Compose([
        A.HorizontalFlip(p=1.0),  # Always apply
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

def get_rotate_augmentation():
    return A.Compose([
        A.RandomRotate90(p=1.0),  # Always apply
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

def get_shift_scale_rotate_augmentation():
    return A.Compose([
        A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15, p=1.0),
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

# Add more pipelines as needed (e.g., blur, color changes)

class AugmentationDataset(Dataset):
    def __init__(self, image_dir, label_dir, save_dir):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.save_dir = save_dir
        self.image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png', '.jpeg'))])
        
        # Define augmentation types and their save paths
        self.augmentations = {
            "flip_data": get_flip_augmentation(),
            "rotate_data": get_rotate_augmentation(),
            "shift_scale_rotate_data": get_shift_scale_rotate_augmentation(),
            # Add more as needed
        }
        
        # Create subdirectories for each augmentation type
        for aug_name in self.augmentations.keys():
            os.makedirs(os.path.join(self.save_dir, aug_name, 'images'), exist_ok=True)
            os.makedirs(os.path.join(self.save_dir, aug_name, 'labels'), exist_ok=True)
    
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.image_files[idx].replace('.jpg', '.txt').replace('.png', '.txt'))
        
        # Load image and masks (same as before)
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Load YOLO labels (same as before)
        with open(label_path, 'r') as f:
            lines = f.readlines()
        
        masks = []
        class_labels = []
        for line in lines:
            parts = line.strip().split()
            class_id = int(parts[0])
            polygon = list(map(float, parts[1:]))
            
            height, width = image.shape[:2]
            polygon = np.array(polygon).reshape(-1, 2) * np.array([width, height])
            polygon = polygon.astype(np.int32)
            
            mask = np.zeros((height, width), dtype=np.uint8)
            cv2.fillPoly(mask, [polygon], 1)
            masks.append(mask)
            class_labels.append(class_id)
        
        # Apply each augmentation and save separately
        for aug_name, transform in self.augmentations.items():
            transformed = transform(image=image, masks=masks, class_labels=class_labels)
            aug_image = transformed['image']
            aug_masks = transformed['masks']
            aug_class_labels = transformed['class_labels']
            
            # Convert masks back to YOLO format (same as before)
            new_height, new_width = aug_image.shape[1:]  # Tensor shape (C, H, W)
            yolo_lines = []
            
            for mask, class_id in zip(aug_masks, aug_class_labels):
                contours, _ = cv2.findContours(mask.numpy().astype(np.uint8), 
                                             cv2.RETR_EXTERNAL, 
                                             cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    epsilon = 0.002 * cv2.arcLength(contour, True)
                    approx = cv2.approxPolyDP(contour, epsilon, True)
                    if len(approx) >= 3:
                        polygon = approx.reshape(-1, 2) / np.array([new_width, new_height])
                        polygon = polygon.flatten().tolist()
                        yolo_lines.append(f"{class_id} " + " ".join(map(str, polygon)))
            
            # Save augmented image
            aug_img_filename = f"{aug_name}_{self.image_files[idx]}"
            aug_img_path = os.path.join(self.save_dir, aug_name, 'images', aug_img_filename)
            cv2.imwrite(aug_img_path, cv2.cvtColor(aug_image.numpy().transpose(1, 2, 0), cv2.COLOR_RGB2BGR))
            
            # Save augmented labels
            aug_label_filename = f"{aug_name}_{self.image_files[idx].replace('.jpg', '.txt').replace('.png', '.txt')}"
            aug_label_path = os.path.join(self.save_dir, aug_name, 'labels', aug_label_filename)
            with open(aug_label_path, 'w') as f:
                f.write("\n".join(yolo_lines))
        
        return len(self.augmentations)  # Number of augmentations applied per image

### Define separate augmentation pipelines

In [44]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

# --- Individual Augmentation Pipelines ---
def get_flip_pipeline():
    """Only horizontal/vertical flips"""
    return A.Compose([
        A.HorizontalFlip(p=1.0),
        A.VerticalFlip(p=1.0),
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

def get_rotate_pipeline():
    """Only 90-degree rotations"""
    return A.Compose([
        A.RandomRotate90(p=1.0),
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

def get_shift_scale_pipeline():
    """Only shift/scale/rotate (affine transforms)"""
    return A.Compose([
        A.ShiftScaleRotate(
            shift_limit=0.0625, 
            scale_limit=0.1, 
            rotate_limit=15, 
            p=1.0
        ),
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

def get_color_pipeline():
    """Only color transformations"""
    return A.Compose([
        A.RandomBrightnessContrast(
            brightness_limit=0.2, 
            contrast_limit=0.2, 
            p=1.0
        ),
        A.HueSaturationValue(
            hue_shift_limit=10, 
            sat_shift_limit=20, 
            val_shift_limit=10, 
            p=1.0
        ),
        A.CLAHE(p=1.0),
        A.RandomGamma(p=1.0),
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

def get_blur_noise_pipeline():
    """Only blur and noise"""
    return A.Compose([
        A.GaussianBlur(blur_limit=(3, 7), p=1.0),
        A.GaussNoise(var_limit=(10.0, 50.0), p=1.0),
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

# I wont use it
# --- Combined Pipeline (if needed) ---
def get_full_augmentation_pipeline():
    """All augmentations combined (for reference)"""
    return A.Compose([
        *get_flip_pipeline().transforms,  # Unpack flip transforms
        *get_rotate_pipeline().transforms,
        *get_shift_scale_pipeline().transforms,
        *get_color_pipeline().transforms,
        *get_blur_noise_pipeline().transforms,
        ToTensorV2()
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

### Create Custom Dataset Class

In [49]:
import os
import cv2
import numpy as np
from torch.utils.data import Dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2



class AugmentationDataset(Dataset):
    def __init__(self, image_dir, label_dir, save_dir):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.save_dir = save_dir
        self.image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png', '.jpeg'))])
        
        # Define all augmentation pipelines
        self.augmentations = {
            "flip_data": get_flip_pipeline(),
            "rotate_data": get_rotate_pipeline(),
            "shift_scale_data": get_shift_scale_pipeline(),
            "color_data": get_color_pipeline(),
            "blur_data": get_blur_noise_pipeline()
        }
        
        # Create directory structure
        self._create_dirs()
    
    def _create_dirs(self):
        """Create all required subdirectories"""
        for aug_name in self.augmentations.keys():
            os.makedirs(os.path.join(self.save_dir, aug_name, 'images'), exist_ok=True)
            os.makedirs(os.path.join(self.save_dir, aug_name, 'labels'), exist_ok=True)
    
    def __len__(self):
        return len(self.image_files)
    
    def _load_image_and_masks(self, img_path, label_path):
        """Helper to load image and YOLO masks"""
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        with open(label_path, 'r') as f:
            lines = f.readlines()
        
        masks, class_labels = [], []
        for line in lines:
            parts = line.strip().split()
            class_id = int(parts[0])
            polygon = list(map(float, parts[1:]))
            
            height, width = image.shape[:2]
            polygon = np.array(polygon).reshape(-1, 2) * np.array([width, height])
            polygon = polygon.astype(np.int32)
            
            mask = np.zeros((height, width), dtype=np.uint8)
            cv2.fillPoly(mask, [polygon], 1)
            masks.append(mask)
            class_labels.append(class_id)
            
        return image, masks, class_labels
    
    def _masks_to_yolo(self, masks, class_labels, new_width, new_height):
        """Convert masks back to YOLO format"""
        yolo_lines = []
        for mask, class_id in zip(masks, class_labels):
            contours, _ = cv2.findContours(mask.numpy().astype(np.uint8), 
                                         cv2.RETR_EXTERNAL, 
                                         cv2.CHAIN_APPROX_SIMPLE)
            for contour in contours:
                epsilon = 0.002 * cv2.arcLength(contour, True)
                approx = cv2.approxPolyDP(contour, epsilon, True)
                if len(approx) >= 3:
                    polygon = approx.reshape(-1, 2) / np.array([new_width, new_height])
                    polygon = polygon.flatten().tolist()
                    yolo_lines.append(f"{class_id} " + " ".join(map(str, polygon)))
        return yolo_lines
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        base_name = os.path.splitext(self.image_files[idx])[0]
        label_path = os.path.join(self.label_dir, f"{base_name}.txt")
        
        # Load original data once
        image, masks, class_labels = self._load_image_and_masks(img_path, label_path)
        
        # Apply all augmentations
        for aug_name, transform in self.augmentations.items():
            try:
                transformed = transform(image=image.copy(), masks=[m.copy() for m in masks], 
                                      class_labels=class_labels.copy())
                aug_image = transformed['image']
                aug_masks = transformed['masks']
                aug_class_labels = transformed['class_labels']
                
                # Get new dimensions (handles both numpy and tensor formats)
                if isinstance(aug_image, np.ndarray):
                    new_height, new_width = aug_image.shape[:2]
                else:
                    new_height, new_width = aug_image.shape[1:3]
                
                # Convert to YOLO format
                yolo_lines = self._masks_to_yolo(aug_masks, aug_class_labels, new_width, new_height)
                
                # Save augmented image
                aug_img_path = os.path.join(self.save_dir, aug_name, 'images', f"{base_name}.jpg")
                if isinstance(aug_image, np.ndarray):
                    cv2.imwrite(aug_img_path, cv2.cvtColor(aug_image, cv2.COLOR_RGB2BGR))
                else:
                    cv2.imwrite(aug_img_path, 
                               cv2.cvtColor(aug_image.numpy().transpose(1, 2, 0), cv2.COLOR_RGB2BGR))
                
                # Save labels
                aug_label_path = os.path.join(self.save_dir, aug_name, 'labels', f"{base_name}.txt")
                with open(aug_label_path, 'w') as f:
                    f.write("\n".join(yolo_lines))
                    
            except Exception as e:
                print(f"Error applying {aug_name} to {self.image_files[idx]}: {str(e)}")
        
        return len(self.augmentations)

### Augmenting the Training Data

In [52]:
if __name__ == "__main__":
    dataset = AugmentationDataset(
        image_dir="C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Base Dataset\\train\\images",
        label_dir="C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Base Dataset\\train\\labels",
        save_dir="C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Augmented Dataset\\Train_Augmented"
    )
    
    # This will process all images and save augmentations in subfolders
    for i in range(len(dataset)):
        dataset[i]  # No need to store output, just trigger processing

  original_init(self, **validated_kwargs)
  self._set_keys()
  A.GaussNoise(var_limit=(10.0, 50.0), p=1.0),


### Augmenting the Validation Data

In [58]:
if __name__ == "__main__":
    dataset = AugmentationDataset(
        image_dir="C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Base Dataset\\valid\\images",
        label_dir="C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Base Dataset\\valid\\labels",
        save_dir="C:\\Users\\Ibrahim_Hegazi\\Desktop\\Augmented Data\\Augmented Dataset\\Valid_Augmented"
    )
    
    # This will process all images and save augmentations in subfolders
    for i in range(len(dataset)):
        dataset[i]  # No need to store output, just trigger processing

  A.GaussNoise(var_limit=(10.0, 50.0), p=1.0),


Error applying flip_data to healthy_192_jpg.rf.062384861d509ba68bd64ce63774eaca.jpg: masks cannot be empty
Error applying rotate_data to healthy_192_jpg.rf.062384861d509ba68bd64ce63774eaca.jpg: masks cannot be empty
Error applying shift_scale_data to healthy_192_jpg.rf.062384861d509ba68bd64ce63774eaca.jpg: masks cannot be empty
Error applying color_data to healthy_192_jpg.rf.062384861d509ba68bd64ce63774eaca.jpg: masks cannot be empty
Error applying blur_data to healthy_192_jpg.rf.062384861d509ba68bd64ce63774eaca.jpg: masks cannot be empty
Error applying flip_data to healthy_39_jpg.rf.0844bd05c59385ea391ec133f7fccac1.jpg: masks cannot be empty
Error applying rotate_data to healthy_39_jpg.rf.0844bd05c59385ea391ec133f7fccac1.jpg: masks cannot be empty
Error applying shift_scale_data to healthy_39_jpg.rf.0844bd05c59385ea391ec133f7fccac1.jpg: masks cannot be empty
Error applying color_data to healthy_39_jpg.rf.0844bd05c59385ea391ec133f7fccac1.jpg: masks cannot be empty
Error applying blur_d

## Changing the names of the images and labels before collecting them all in one place

In [6]:
import os

def rename_files_with_prefix(folder_path, prefix):
    """
    Renames all files in the specified folder by adding a prefix to their names.
    
    Args:
        folder_path (str): Path to the folder containing the files to rename
        prefix (str): The prefix to add to each file name
    """
    # Get list of files in the folder
    try:
        files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    except FileNotFoundError:
        print(f"Error: Folder '{folder_path}' not found.")
        return
    
    # Rename each file
    for filename in files:
        new_name = prefix + filename
        old_path = os.path.join(folder_path, filename)
        new_path = os.path.join(folder_path, new_name)
        
        try:
            os.rename(old_path, new_path)
            print(f"Renamed: '{filename}' -> '{new_name}'")
        except OSError as e:
            print(f"Error renaming '{filename}': {e}")

if __name__ == "__main__":
    
    # Train Augmented images
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\blur_data\images", "blur_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\color_data\images", "color_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\flip_data\images", "flip_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\rotate_data\images", "rotate_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\shift_scale_data\images", "shift_scale_")
    

    # Train Augmented labels
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\blur_data\labels", "blur_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\color_data\labels", "color_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\flip_data\labels", "flip_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\rotate_data\labels", "rotate_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Train_Augmented\shift_scale_data\labels", "shift_scale_")
    

    
    # Valid Augmented images
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\blur_data\images", "blur_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\color_data\images", "color_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\flip_data\images", "flip_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\rotate_data\images", "rotate_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\shift_scale_data\images", "shift_scale_")
    

    # Vakid Augmented labels
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\blur_data\labels", "blur_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\color_data\labels", "color_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\flip_data\labels", "flip_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\rotate_data\labels", "rotate_")
    rename_files_with_prefix(r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset\Valid_Augmented\shift_scale_data\labels", "shift_scale_")
    


    print("\nRenaming complete!")

Renamed: 'blur_angular_leafspot10.jpg' -> 'blur_blur_angular_leafspot10.jpg'
Renamed: 'blur_angular_leafspot100.jpg' -> 'blur_blur_angular_leafspot100.jpg'
Renamed: 'blur_angular_leafspot101.jpg' -> 'blur_blur_angular_leafspot101.jpg'
Renamed: 'blur_angular_leafspot102.jpg' -> 'blur_blur_angular_leafspot102.jpg'
Renamed: 'blur_angular_leafspot103.jpg' -> 'blur_blur_angular_leafspot103.jpg'
Renamed: 'blur_angular_leafspot104.jpg' -> 'blur_blur_angular_leafspot104.jpg'
Renamed: 'blur_angular_leafspot106.jpg' -> 'blur_blur_angular_leafspot106.jpg'
Renamed: 'blur_angular_leafspot109.jpg' -> 'blur_blur_angular_leafspot109.jpg'
Renamed: 'blur_angular_leafspot11.jpg' -> 'blur_blur_angular_leafspot11.jpg'
Renamed: 'blur_angular_leafspot111.jpg' -> 'blur_blur_angular_leafspot111.jpg'
Renamed: 'blur_angular_leafspot112.jpg' -> 'blur_blur_angular_leafspot112.jpg'
Renamed: 'blur_angular_leafspot113.jpg' -> 'blur_blur_angular_leafspot113.jpg'
Renamed: 'blur_angular_leafspot114.jpg' -> 'blur_blur_an

In [8]:
import os
import shutil


def collect_files(source_folders, target_folder, file_type="images"):
    """Collects files from multiple sources into target folder"""
    os.makedirs(target_folder, exist_ok=True)
    
    for source in source_folders:
        try:
            files = [f for f in os.listdir(source) if os.path.isfile(os.path.join(source, f))]
            for filename in files:
                source_path = os.path.join(source, filename)
                target_path = os.path.join(target_folder, filename)
                
                # Handle duplicate filenames
                counter = 1
                while os.path.exists(target_path):
                    name, ext = os.path.splitext(filename)
                    target_path = os.path.join(target_folder, f"{name}_{counter}{ext}")
                    counter += 1
                
                shutil.copy2(source_path, target_path)
                print(f"Copied {file_type}: {filename} -> {os.path.basename(target_path)}")
                
        except FileNotFoundError:
            print(f"Warning: Source folder not found - {source}")
        except Exception as e:
            print(f"Error copying files from {source}: {e}")

if __name__ == "__main__":
    # Base directory
    base_dir = r"C:\Users\Ibrahim_Hegazi\Desktop\Augmented Strawberry Data\Augmented Dataset"
    
    # Create collected data folder structure
    collected_dir = os.path.join(os.path.dirname(base_dir), "Collected Data")
    images_dir = os.path.join(collected_dir, "images")
    labels_dir = os.path.join(collected_dir, "labels")
    
    
    
    print("\n=== Collecting Files ===")
    
    # Prepare source folders
    image_sources = [
        os.path.join(base_dir, "Train_Augmented", "blur_data", "images"),
        os.path.join(base_dir, "Train_Augmented", "color_data", "images"),
        os.path.join(base_dir, "Train_Augmented", "flip_data", "images"),
        os.path.join(base_dir, "Train_Augmented", "rotate_data", "images"),
        os.path.join(base_dir, "Train_Augmented", "shift_scale_data", "images"),
        os.path.join(base_dir, "Valid_Augmented", "blur_data", "images"),
        os.path.join(base_dir, "Valid_Augmented", "color_data", "images"),
        os.path.join(base_dir, "Valid_Augmented", "flip_data", "images"),
        os.path.join(base_dir, "Valid_Augmented", "rotate_data", "images"),
        os.path.join(base_dir, "Valid_Augmented", "shift_scale_data", "images")
    ]
    
    label_sources = [
        os.path.join(base_dir, "Train_Augmented", "blur_data", "labels"),
        os.path.join(base_dir, "Train_Augmented", "color_data", "labels"),
        os.path.join(base_dir, "Train_Augmented", "flip_data", "labels"),
        os.path.join(base_dir, "Train_Augmented", "rotate_data", "labels"),
        os.path.join(base_dir, "Train_Augmented", "shift_scale_data", "labels"),
        os.path.join(base_dir, "Valid_Augmented", "blur_data", "labels"),
        os.path.join(base_dir, "Valid_Augmented", "color_data", "labels"),
        os.path.join(base_dir, "Valid_Augmented", "flip_data", "labels"),
        os.path.join(base_dir, "Valid_Augmented", "rotate_data", "labels"),
        os.path.join(base_dir, "Valid_Augmented", "shift_scale_data", "labels")
    ]
    
    # Collect files
    collect_files(image_sources, images_dir, "images")
    collect_files(label_sources, labels_dir, "labels")
    
    print("\nOperation complete!")
    print(f"All files collected in: {collected_dir}")


=== Collecting Files ===
Copied images: blur_blur_angular_leafspot10.jpg -> blur_blur_angular_leafspot10.jpg
Copied images: blur_blur_angular_leafspot100.jpg -> blur_blur_angular_leafspot100.jpg
Copied images: blur_blur_angular_leafspot101.jpg -> blur_blur_angular_leafspot101.jpg
Copied images: blur_blur_angular_leafspot102.jpg -> blur_blur_angular_leafspot102.jpg
Copied images: blur_blur_angular_leafspot103.jpg -> blur_blur_angular_leafspot103.jpg
Copied images: blur_blur_angular_leafspot104.jpg -> blur_blur_angular_leafspot104.jpg
Copied images: blur_blur_angular_leafspot106.jpg -> blur_blur_angular_leafspot106.jpg
Copied images: blur_blur_angular_leafspot109.jpg -> blur_blur_angular_leafspot109.jpg
Copied images: blur_blur_angular_leafspot11.jpg -> blur_blur_angular_leafspot11.jpg
Copied images: blur_blur_angular_leafspot111.jpg -> blur_blur_angular_leafspot111.jpg
Copied images: blur_blur_angular_leafspot112.jpg -> blur_blur_angular_leafspot112.jpg
Copied images: blur_blur_angular

# Splitting the data

In [2]:
import os
import shutil
from collections import defaultdict

def stratified_split(source_images_dir, source_labels_dir, train_dir, valid_dir, valid_ratio=0.2):
    """
    Performs stratified split of data into train/valid while maintaining class proportions
    for each augmentation type.
    """
    # Create directories if they don't exist
    os.makedirs(os.path.join(train_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(train_dir, "labels"), exist_ok=True)
    os.makedirs(os.path.join(valid_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(valid_dir, "labels"), exist_ok=True)
    
    # Group files by their augmentation type (prefix)
    aug_groups = defaultdict(list)
    
    # Get all image files and group by augmentation prefix
    for filename in os.listdir(source_images_dir):
        if filename.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
            # Extract augmentation prefix (blur_, color_, etc.)
            prefix = filename.split('_')[0] + '_'
            aug_groups[prefix].append(filename)
    
    # Process each augmentation group separately
    for prefix, files in aug_groups.items():
        print(f"\nProcessing {prefix} files ({len(files)} items)")
        
        # Calculate split sizes
        total_files = len(files)
        valid_size = int(total_files * valid_ratio)
        train_size = total_files - valid_size
        
        # Sort files to ensure consistent ordering (non-random)
        files.sort()
        
        # Split into train and valid
        train_files = files[:train_size]
        valid_files = files[train_size:]
        
        print(f"  Train: {len(train_files)} files")
        print(f"  Valid: {len(valid_files)} files")
        
        # Copy images and corresponding labels
        for filename in train_files:
            # Process image
            src_img = os.path.join(source_images_dir, filename)
            dst_img = os.path.join(train_dir, "images", filename)
            shutil.copy2(src_img, dst_img)
            
            # Process corresponding label
            label_name = os.path.splitext(filename)[0] + '.txt'
            src_lbl = os.path.join(source_labels_dir, label_name)
            dst_lbl = os.path.join(train_dir, "labels", label_name)
            if os.path.exists(src_lbl):
                shutil.copy2(src_lbl, dst_lbl)
        
        for filename in valid_files:
            # Process image
            src_img = os.path.join(source_images_dir, filename)
            dst_img = os.path.join(valid_dir, "images", filename)
            shutil.copy2(src_img, dst_img)
            
            # Process corresponding label
            label_name = os.path.splitext(filename)[0] + '.txt'
            src_lbl = os.path.join(source_labels_dir, label_name)
            dst_lbl = os.path.join(valid_dir, "labels", label_name)
            if os.path.exists(src_lbl):
                shutil.copy2(src_lbl, dst_lbl)

if __name__ == "__main__":
    # Path configuration
    base_dir = r"C:\Users\Ibrahim_Hegazi\Desktop"
    collected_dir = os.path.join(base_dir, "Collected Data")
    
    # Source directories (from previous collection)
    source_images = os.path.join(collected_dir, "images")
    source_labels = os.path.join(collected_dir, "labels")
    
    # Destination directories
    train_dir = os.path.join(base_dir, "Stratified Dataset", "train")
    valid_dir = os.path.join(base_dir, "Stratified Dataset", "valid")
    
    print("Starting stratified dataset split...")
    stratified_split(source_images, source_labels, train_dir, valid_dir)
    print("\nStratified split completed successfully!")
    print(f"Train data saved to: {train_dir}")
    print(f"Valid data saved to: {valid_dir}")

Starting stratified dataset split...

Processing angular_ files (435 items)
  Train: 348 files
  Valid: 87 files

Processing anthracnose_ files (97 items)
  Train: 78 files
  Valid: 19 files

Processing blossom_ files (208 items)
  Train: 167 files
  Valid: 41 files

Processing blur_ files (2748 items)
  Train: 2199 files
  Valid: 549 files

Processing color_ files (2748 items)
  Train: 2199 files
  Valid: 549 files

Processing flip_ files (2748 items)
  Train: 2199 files
  Valid: 549 files

Processing gray_ files (477 items)
  Train: 382 files
  Valid: 95 files

Processing healthy_ files (248 items)
  Train: 199 files
  Valid: 49 files

Processing leaf_ files (615 items)
  Train: 492 files
  Valid: 123 files

Processing powdery_ files (668 items)
  Train: 535 files
  Valid: 133 files

Processing rotate_ files (2748 items)
  Train: 2199 files
  Valid: 549 files

Processing shift_ files (2748 items)
  Train: 2199 files
  Valid: 549 files

Stratified split completed successfully!
Train d