In [1]:
import yaml
from pathlib import Path
import shutil

# First dataset
dataset_path = '/kaggle/input/bsed-datasets-v1/kaggle/working/Data Labeling/yolo_classification_dataset'

print("="*60)
print("üìä ANALYZING: YOLO CLASSIFICATION DATASET")
print("="*60)

# Create a working copy in writable directory
working_dir = Path('/kaggle/working')
dataset_copy_dir = working_dir / 'yolo_classification_dataset'

# Copy the dataset to working directory if not already copied
if not dataset_copy_dir.exists():
    print(f"üìÅ Copying dataset to {dataset_copy_dir}...")
    shutil.copytree(dataset_path, dataset_copy_dir)
    print("‚úÖ Dataset copied to working directory")
else:
    print("‚úÖ Using existing copy in working directory")

# Now use the copy
data_yaml_path = dataset_copy_dir / 'data.yaml'

# Load data.yaml
with open(data_yaml_path, 'r') as file:
    data_config = yaml.safe_load(file)

print(f"\nNumber of classes: {data_config['nc']}")
print(f"Class names: {data_config['names']}")

# Update paths in the working copy
if 'train' in data_config:
    # Fix path relative to current location
    train_relative = data_config['train'].replace('../', '')
    train_path = dataset_copy_dir / train_relative
    data_config['train'] = str(train_path)
    
if 'val' in data_config:
    val_relative = data_config['val'].replace('../', '')
    val_path = dataset_copy_dir / val_relative
    data_config['val'] = str(val_path)
    
if 'test' in data_config:
    test_relative = data_config['test'].replace('../', '')
    test_path = dataset_copy_dir / test_relative
    data_config['test'] = str(test_path)

# Count images
train_img_path = Path(data_config['train'])
val_img_path = Path(data_config['val'])

train_images = len(list(train_img_path.glob('*.jpg'))) + len(list(train_img_path.glob('*.png')))
val_images = len(list(val_img_path.glob('*.jpg'))) + len(list(val_img_path.glob('*.png')))

if 'test' in data_config:
    test_img_path = Path(data_config['test'])
    test_images = len(list(test_img_path.glob('*.jpg'))) + len(list(test_img_path.glob('*.png')))
else:
    test_images = 0

print(f"\nüìà Dataset Statistics:")
print(f"Training images: {train_images}")
print(f"Validation images: {val_images}")
print(f"Test images: {test_images}")

# Update and save yaml - Now it's writable!
data_config['path'] = str(dataset_copy_dir)
with open(data_yaml_path, 'w') as f:
    yaml.dump(data_config, f, default_flow_style=False)

print(f"\n‚úÖ Dataset configuration saved to: {data_yaml_path}")
print(f"Working directory: {dataset_copy_dir}")

üìä ANALYZING: YOLO CLASSIFICATION DATASET
üìÅ Copying dataset to /kaggle/working/yolo_classification_dataset...
‚úÖ Dataset copied to working directory

Number of classes: 3
Class names: {0: 'Stage1', 1: 'Stage2', 2: 'Stage3'}

üìà Dataset Statistics:
Training images: 3052
Validation images: 94
Test images: 94

‚úÖ Dataset configuration saved to: /kaggle/working/yolo_classification_dataset/data.yaml
Working directory: /kaggle/working/yolo_classification_dataset


In [2]:
import yaml
from pathlib import Path
import shutil

# First dataset
dataset_path = '/kaggle/input/roboflow-datasets/kaggle/working/Black-Sigatoka-4'

print("="*60)
print("üìä ANALYZING: YOLO CLASSIFICATION DATASET")
print("="*60)

# Create a working copy in writable directory
working_dir = Path('/kaggle/working')
dataset_copy_dir = working_dir / 'Black-Sigatoka-4'

# Copy the dataset to working directory if not already copied
if not dataset_copy_dir.exists():
    print(f"üìÅ Copying dataset to {dataset_copy_dir}...")
    shutil.copytree(dataset_path, dataset_copy_dir)
    print("‚úÖ Dataset copied to working directory")
else:
    print("‚úÖ Using existing copy in working directory")

# Now use the copy
data_yaml_path = dataset_copy_dir / 'data.yaml'

# Load data.yaml
with open(data_yaml_path, 'r') as file:
    data_config = yaml.safe_load(file)

print(f"\nNumber of classes: {data_config['nc']}")
print(f"Class names: {data_config['names']}")

# Update paths in the working copy
if 'train' in data_config:
    # Fix path relative to current location
    train_relative = data_config['train'].replace('../', '')
    train_path = dataset_copy_dir / train_relative
    data_config['train'] = str(train_path)
    
if 'val' in data_config:
    val_relative = data_config['val'].replace('../', '')
    val_path = dataset_copy_dir / val_relative
    data_config['val'] = str(val_path)
    
if 'test' in data_config:
    test_relative = data_config['test'].replace('../', '')
    test_path = dataset_copy_dir / test_relative
    data_config['test'] = str(test_path)

# Count images
train_img_path = Path(data_config['train'])
val_img_path = Path(data_config['val'])

train_images = len(list(train_img_path.glob('*.jpg'))) + len(list(train_img_path.glob('*.png')))
val_images = len(list(val_img_path.glob('*.jpg'))) + len(list(val_img_path.glob('*.png')))

if 'test' in data_config:
    test_img_path = Path(data_config['test'])
    test_images = len(list(test_img_path.glob('*.jpg'))) + len(list(test_img_path.glob('*.png')))
else:
    test_images = 0

print(f"\nüìà Dataset Statistics:")
print(f"Training images: {train_images}")
print(f"Validation images: {val_images}")
print(f"Test images: {test_images}")

# Update and save yaml - Now it's writable!
data_config['path'] = str(dataset_copy_dir)
with open(data_yaml_path, 'w') as f:
    yaml.dump(data_config, f, default_flow_style=False)

print(f"\n‚úÖ Dataset configuration saved to: {data_yaml_path}")
print(f"Working directory: {dataset_copy_dir}")

üìä ANALYZING: YOLO CLASSIFICATION DATASET
üìÅ Copying dataset to /kaggle/working/Black-Sigatoka-4...
‚úÖ Dataset copied to working directory

Number of classes: 4
Class names: ['0', '1', '2', '3']

üìà Dataset Statistics:
Training images: 5115
Validation images: 472
Test images: 239

‚úÖ Dataset configuration saved to: /kaggle/working/Black-Sigatoka-4/data.yaml
Working directory: /kaggle/working/Black-Sigatoka-4


# **COMBINE DATASETS WITH CLASS MAPPING**

In [3]:
import yaml
from pathlib import Path
import shutil
from tqdm import tqdm
import os
import re

print("="*80)
print("üîÑ COMBINING BSED AND ROBoFLOW DATASETS")
print("="*80)

# ============================================================================
# CLASS MAPPING CONFIGURATION
# ============================================================================
# Final classification scheme (from data-labeling-classification.ipynb)
FINAL_CLASS_ORDER = ['Healthy', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Stage5', 'Stage6']

# Roboflow class mapping: actual class names ‚Üí final class names
# Based on actual Roboflow dataset class names
ROBOFLOW_CLASS_NAME_MAPPING = {
    'Functional': 'Healthy',   # Functional ‚Üí Healthy
    'Mild': 'Stage4',           # Mild ‚Üí Stage4
    'Moderate': 'Stage5',       # Moderate ‚Üí Stage5
    'Severe': 'Stage6'          # Severe ‚Üí Stage6
}

# Also map numeric IDs (used in label files) ‚Üí class names
# Label files use numeric IDs (0, 1, 2, 3) which correspond to:
ROBOFLOW_CLASS_ID_MAPPING = {
    '0': 'Functional',   # Label file ID 0 ‚Üí Functional ‚Üí Healthy
    '1': 'Mild',          # Label file ID 1 ‚Üí Mild ‚Üí Stage4
    '2': 'Moderate',       # Label file ID 2 ‚Üí Moderate ‚Üí Stage5
    '3': 'Severe'          # Label file ID 3 ‚Üí Severe ‚Üí Stage6
}

# Create mapping: class_name ‚Üí final_class_id
FINAL_CLASS_TO_ID = {cls: idx for idx, cls in enumerate(FINAL_CLASS_ORDER)}

print("\nüìã Class Mapping Configuration:")
print(f"   Final class order: {FINAL_CLASS_ORDER}")
print(f"\n   Roboflow class name mapping:")
for old_name, new_name in ROBOFLOW_CLASS_NAME_MAPPING.items():
    new_id = FINAL_CLASS_TO_ID[new_name]
    print(f"      {old_name} ‚Üí {new_name} (Final ID: {new_id})")
print(f"\n   Roboflow label file ID mapping:")
for old_id, class_name in ROBOFLOW_CLASS_ID_MAPPING.items():
    final_name = ROBOFLOW_CLASS_NAME_MAPPING[class_name]
    final_id = FINAL_CLASS_TO_ID[final_name]
    print(f"      ID {old_id} ({class_name}) ‚Üí {final_name} (Final ID: {final_id})")

# ============================================================================
# DATASET PATHS
# ============================================================================
working_dir = Path('/kaggle/working')

# BSED dataset (already copied)
bsed_dataset_dir = working_dir / 'yolo_classification_dataset'

# Roboflow dataset (already copied)
roboflow_dataset_dir = working_dir / 'Black-Sigatoka-4'

# Combined dataset output
combined_dataset_dir = working_dir / 'combined_yolo_dataset'

print(f"\nüìÅ Dataset Paths:")
print(f"   BSED: {bsed_dataset_dir}")
print(f"   Roboflow: {roboflow_dataset_dir}")
print(f"   Combined Output: {combined_dataset_dir}")

# Validate datasets exist
if not bsed_dataset_dir.exists():
    raise FileNotFoundError(f"BSED dataset not found: {bsed_dataset_dir}")
if not roboflow_dataset_dir.exists():
    raise FileNotFoundError(f"Roboflow dataset not found: {roboflow_dataset_dir}")

# Create combined dataset structure
splits = ['train', 'valid', 'test']
for split in splits:
    (combined_dataset_dir / split / 'images').mkdir(parents=True, exist_ok=True)
    (combined_dataset_dir / split / 'labels').mkdir(parents=True, exist_ok=True)

print(f"\n‚úÖ Combined dataset directory structure created")


üîÑ COMBINING BSED AND ROBoFLOW DATASETS

üìã Class Mapping Configuration:
   Final class order: ['Healthy', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Stage5', 'Stage6']

   Roboflow class name mapping:
      Functional ‚Üí Healthy (Final ID: 0)
      Mild ‚Üí Stage4 (Final ID: 4)
      Moderate ‚Üí Stage5 (Final ID: 5)
      Severe ‚Üí Stage6 (Final ID: 6)

   Roboflow label file ID mapping:
      ID 0 (Functional) ‚Üí Healthy (Final ID: 0)
      ID 1 (Mild) ‚Üí Stage4 (Final ID: 4)
      ID 2 (Moderate) ‚Üí Stage5 (Final ID: 5)
      ID 3 (Severe) ‚Üí Stage6 (Final ID: 6)

üìÅ Dataset Paths:
   BSED: /kaggle/working/yolo_classification_dataset
   Roboflow: /kaggle/working/Black-Sigatoka-4
   Combined Output: /kaggle/working/combined_yolo_dataset

‚úÖ Combined dataset directory structure created


In [4]:
# ============================================================================
# FUNCTION: Update label file with new class mapping
# ============================================================================
def update_label_file(label_path, old_to_new_class_mapping):
    """
    Update YOLO label file with new class IDs.
    
    Args:
        label_path: Path to .txt label file
        old_to_new_class_mapping: Dict mapping old_class_id ‚Üí new_class_id
    
    Returns:
        True if updated, False if no changes needed
    """
    try:
        with open(label_path, 'r') as f:
            lines = f.readlines()
        
        updated_lines = []
        changed = False
        
        for line in lines:
            parts = line.strip().split()
            if len(parts) >= 5:  # YOLO format: class_id x y w h
                old_class_id = parts[0]
                
                # Check if this class needs mapping
                if old_class_id in old_to_new_class_mapping:
                    new_class_id = old_to_new_class_mapping[old_class_id]
                    parts[0] = str(new_class_id)
                    changed = True
                
                updated_lines.append(' '.join(parts) + '\n')
            else:
                updated_lines.append(line)
        
        if changed:
            with open(label_path, 'w') as f:
                f.writelines(updated_lines)
        
        return changed
    except Exception as e:
        print(f"Error updating {label_path}: {e}")
        return False

# ============================================================================
# FUNCTION: Rename image file with new class name
# ============================================================================
def rename_image_with_class(filename, class_name_mapping):
    """
    Rename image filename by replacing old class names with new ones.
    
    Args:
        filename: Original filename (e.g., "Functional-1-_png.rf.xxx.jpg")
        class_name_mapping: Dict mapping old_class_name ‚Üí new_class_name
    
    Returns:
        New filename with updated class name
    """
    new_filename = filename
    
    # Replace class names in filename (case-insensitive)
    for old_name, new_name in class_name_mapping.items():
        # Replace both capitalized and lowercase versions
        if old_name in new_filename:
            new_filename = new_filename.replace(old_name, new_name)
        elif old_name.lower() in new_filename.lower():
            # Case-insensitive replacement
            pattern = re.compile(re.escape(old_name), re.IGNORECASE)
            new_filename = pattern.sub(new_name, new_filename)
    
    return new_filename

# ============================================================================
# PROCESS ROBoFLOW DATASET: Map classes and copy files
# ============================================================================
print("\n" + "="*80)
print("üì¶ PROCESSING ROBoFLOW DATASET")
print("="*80)

# Load Roboflow data.yaml to understand structure
roboflow_yaml_path = roboflow_dataset_dir / 'data.yaml'
with open(roboflow_yaml_path, 'r') as f:
    roboflow_config = yaml.safe_load(f)

roboflow_class_names = roboflow_config.get('names', [])
print(f"\nüìã Roboflow Original Classes from YAML: {roboflow_class_names}")

# Create mapping: old_class_id (from label files) ‚Üí new_class_id (final)
# Label files use numeric IDs (0, 1, 2, 3), we need to map these to final IDs
roboflow_class_id_mapping = {}

# Handle different YAML formats
if isinstance(roboflow_class_names, dict):
    # Format: {0: 'Functional', 1: 'Mild', 2: 'Moderate', 3: 'Severe'}
    for old_id, class_name in roboflow_class_names.items():
        # Map class name to final class name
        if class_name in ROBOFLOW_CLASS_NAME_MAPPING:
            final_class_name = ROBOFLOW_CLASS_NAME_MAPPING[class_name]
            final_id = FINAL_CLASS_TO_ID[final_class_name]
            roboflow_class_id_mapping[str(old_id)] = str(final_id)
        else:
            # Fallback: try to use the ID mapping
            if str(old_id) in ROBOFLOW_CLASS_ID_MAPPING:
                mapped_name = ROBOFLOW_CLASS_ID_MAPPING[str(old_id)]
                final_class_name = ROBOFLOW_CLASS_NAME_MAPPING[mapped_name]
                final_id = FINAL_CLASS_TO_ID[final_class_name]
                roboflow_class_id_mapping[str(old_id)] = str(final_id)
elif isinstance(roboflow_class_names, list):
    # Format: ['Functional', 'Mild', 'Moderate', 'Severe'] or ['0', '1', '2', '3']
    for idx, class_name in enumerate(roboflow_class_names):
        # Check if it's a class name or just an ID string
        if isinstance(class_name, str) and class_name in ROBOFLOW_CLASS_NAME_MAPPING:
            # It's a class name like 'Functional'
            final_class_name = ROBOFLOW_CLASS_NAME_MAPPING[class_name]
            final_id = FINAL_CLASS_TO_ID[final_class_name]
            roboflow_class_id_mapping[str(idx)] = str(final_id)
        elif str(idx) in ROBOFLOW_CLASS_ID_MAPPING:
            # Use ID mapping (for cases where YAML has ['0', '1', '2', '3'])
            mapped_name = ROBOFLOW_CLASS_ID_MAPPING[str(idx)]
            final_class_name = ROBOFLOW_CLASS_NAME_MAPPING[mapped_name]
            final_id = FINAL_CLASS_TO_ID[final_class_name]
            roboflow_class_id_mapping[str(idx)] = str(final_id)

# Also add direct ID mappings (in case YAML doesn't have names)
for old_id, class_name in ROBOFLOW_CLASS_ID_MAPPING.items():
    if old_id not in roboflow_class_id_mapping:
        final_class_name = ROBOFLOW_CLASS_NAME_MAPPING[class_name]
        final_id = FINAL_CLASS_TO_ID[final_class_name]
        roboflow_class_id_mapping[old_id] = str(final_id)

print(f"\nüîÑ Label File Class ID Mapping (for .txt files):")
for old_id, new_id in sorted(roboflow_class_id_mapping.items(), key=lambda x: int(x[0])):
    old_name = ROBOFLOW_CLASS_ID_MAPPING.get(old_id, 'Unknown')
    final_name = ROBOFLOW_CLASS_NAME_MAPPING.get(old_name, 'Unknown')
    print(f"   Label ID {old_id} ({old_name}) ‚Üí Final ID {new_id} ({final_name})")

# Process each split
roboflow_stats = {'train': 0, 'valid': 0, 'test': 0}

for split in splits:
    print(f"\nüìÇ Processing Roboflow {split.upper()} split...")
    
    # Source paths
    src_images_dir = roboflow_dataset_dir / split / 'images'
    src_labels_dir = roboflow_dataset_dir / split / 'labels'
    
    # Destination paths
    dst_images_dir = combined_dataset_dir / split / 'images'
    dst_labels_dir = combined_dataset_dir / split / 'labels'
    
    if not src_images_dir.exists() or not src_labels_dir.exists():
        print(f"   ‚ö†Ô∏è  Split {split} not found, skipping...")
        continue
    
    # Get all image files
    image_files = list(src_images_dir.glob('*.jpg')) + list(src_images_dir.glob('*.png')) + \
                  list(src_images_dir.glob('*.JPG')) + list(src_images_dir.glob('*.PNG'))
    
    copied_count = 0
    updated_count = 0
    
    for img_path in tqdm(image_files, desc=f"  Copying {split} images", leave=False):
        # Rename image file: replace class names in filename
        # Example: "Functional-1-_png.rf.xxx.jpg" ‚Üí "Healthy-1-_png.rf.xxx.jpg"
        renamed_img_name = rename_image_with_class(img_path.name, ROBOFLOW_CLASS_NAME_MAPPING)
        
        # Use renamed filename directly (no prefix)
        dst_img_name = renamed_img_name
        dst_img_path = dst_images_dir / dst_img_name
        
        # Copy image with new name
        shutil.copy2(img_path, dst_img_path)
        
        # Find and update corresponding label file
        # Label file name should match the renamed image name
        # Get the stem (filename without extension) from the renamed image
        renamed_img_stem = Path(renamed_img_name).stem
        label_name = img_path.stem + '.txt'  # Original label name
        src_label_path = src_labels_dir / label_name
        
        # Label file should match the renamed image name
        dst_label_name = renamed_img_stem + '.txt'  # Match the new image name exactly
        dst_label_path = dst_labels_dir / dst_label_name
        
        if src_label_path.exists():
            # Copy label file with new name
            shutil.copy2(src_label_path, dst_label_path)
            
            # Update class IDs in label file
            if update_label_file(dst_label_path, roboflow_class_id_mapping):
                updated_count += 1
            
            copied_count += 1
        else:
            # If no label file, create one with default (whole image classification)
            with open(dst_label_path, 'w') as f:
                # YOLO format: class_id center_x center_y width height
                # For whole image: 0.5 0.5 1.0 1.0
                # Use first class as default (shouldn't happen, but safety)
                default_class = list(roboflow_class_id_mapping.values())[0] if roboflow_class_id_mapping else '0'
                f.write(f"{default_class} 0.5 0.5 1.0 1.0\n")
            copied_count += 1
    
    roboflow_stats[split] = copied_count
    print(f"   ‚úÖ Copied {copied_count} images, updated {updated_count} label files")

print(f"\n‚úÖ Roboflow dataset processing complete!")
print(f"   Train: {roboflow_stats['train']} images")
print(f"   Valid: {roboflow_stats['valid']} images")
print(f"   Test: {roboflow_stats['test']} images")



üì¶ PROCESSING ROBoFLOW DATASET

üìã Roboflow Original Classes from YAML: ['0', '1', '2', '3']

üîÑ Label File Class ID Mapping (for .txt files):
   Label ID 0 (Functional) ‚Üí Final ID 0 (Healthy)
   Label ID 1 (Mild) ‚Üí Final ID 4 (Stage4)
   Label ID 2 (Moderate) ‚Üí Final ID 5 (Stage5)
   Label ID 3 (Severe) ‚Üí Final ID 6 (Stage6)

üìÇ Processing Roboflow TRAIN split...


                                                                             

   ‚úÖ Copied 5115 images, updated 5115 label files

üìÇ Processing Roboflow VALID split...


                                                                           

   ‚úÖ Copied 472 images, updated 472 label files

üìÇ Processing Roboflow TEST split...


                                                                          

   ‚úÖ Copied 239 images, updated 239 label files

‚úÖ Roboflow dataset processing complete!
   Train: 5115 images
   Valid: 472 images
   Test: 239 images




In [5]:
# ============================================================================
# PROCESS BSED DATASET: Map classes and copy files
# ============================================================================
print("\n" + "="*80)
print("üì¶ PROCESSING BSED DATASET")
print("="*80)

# Load BSED data.yaml
bsed_yaml_path = bsed_dataset_dir / 'data.yaml'
with open(bsed_yaml_path, 'r') as f:
    bsed_config = yaml.safe_load(f)

print(f"\nüìã BSED Original Classes: {bsed_config.get('names', 'N/A')}")

# BSED classes: Stage1, Stage2, Stage3
# These map directly to final classes (no remapping needed, just need to get correct IDs)
bsed_class_id_mapping = {}
bsed_class_names = bsed_config.get('names', {})

if isinstance(bsed_class_names, dict):
    # Format: {0: 'Stage1', 1: 'Stage2', 2: 'Stage3'}
    for old_id, class_name in bsed_class_names.items():
        new_id = FINAL_CLASS_TO_ID[class_name]
        bsed_class_id_mapping[str(old_id)] = str(new_id)
elif isinstance(bsed_class_names, list):
    # Format: ['Stage1', 'Stage2', 'Stage3']
    for old_id, class_name in enumerate(bsed_class_names):
        new_id = FINAL_CLASS_TO_ID[class_name]
        bsed_class_id_mapping[str(old_id)] = str(new_id)

print(f"\nüîÑ Class ID Mapping:")
for old_id, new_id in bsed_class_id_mapping.items():
    class_name = bsed_class_names[int(old_id)] if isinstance(bsed_class_names, dict) else bsed_class_names[int(old_id)]
    print(f"   BSED Class {old_id} ({class_name}) ‚Üí Final Class ID {new_id}")

# Process each split
bsed_stats = {'train': 0, 'valid': 0, 'test': 0}

for split in splits:
    print(f"\nüìÇ Processing BSED {split.upper()} split...")
    
    # Source paths
    src_images_dir = bsed_dataset_dir / split / 'images'
    src_labels_dir = bsed_dataset_dir / split / 'labels'
    
    # Destination paths
    dst_images_dir = combined_dataset_dir / split / 'images'
    dst_labels_dir = combined_dataset_dir / split / 'labels'
    
    if not src_images_dir.exists() or not src_labels_dir.exists():
        print(f"   ‚ö†Ô∏è  Split {split} not found, skipping...")
        continue
    
    # Get all image files
    image_files = list(src_images_dir.glob('*.jpg')) + list(src_images_dir.glob('*.png')) + \
                  list(src_images_dir.glob('*.JPG')) + list(src_images_dir.glob('*.PNG'))
    
    copied_count = 0
    updated_count = 0
    
    for img_path in tqdm(image_files, desc=f"  Copying {split} images", leave=False):
        # Copy image (use original name, no prefix)
        dst_img_name = img_path.name
        dst_img_path = dst_images_dir / dst_img_name
        shutil.copy2(img_path, dst_img_path)
        
        # Find and update corresponding label file
        label_name = img_path.stem + '.txt'
        src_label_path = src_labels_dir / label_name
        dst_label_name = label_name  # Use original name, no prefix
        dst_label_path = dst_labels_dir / dst_label_name
        
        if src_label_path.exists():
            # Copy label file
            shutil.copy2(src_label_path, dst_label_path)
            
            # Update class IDs in label file
            if update_label_file(dst_label_path, bsed_class_id_mapping):
                updated_count += 1
            
            copied_count += 1
        else:
            # If no label file, create one
            with open(dst_label_path, 'w') as f:
                default_class = list(bsed_class_id_mapping.values())[0] if bsed_class_id_mapping else '0'
                f.write(f"{default_class} 0.5 0.5 1.0 1.0\n")
    
    bsed_stats[split] = copied_count
    print(f"   ‚úÖ Copied {copied_count} images, updated {updated_count} label files")

print(f"\n‚úÖ BSED dataset processing complete!")
print(f"   Train: {bsed_stats['train']} images")
print(f"   Valid: {bsed_stats['valid']} images")
print(f"   Test: {bsed_stats['test']} images")



üì¶ PROCESSING BSED DATASET

üìã BSED Original Classes: {0: 'Stage1', 1: 'Stage2', 2: 'Stage3'}

üîÑ Class ID Mapping:
   BSED Class 0 (Stage1) ‚Üí Final Class ID 1
   BSED Class 1 (Stage2) ‚Üí Final Class ID 2
   BSED Class 2 (Stage3) ‚Üí Final Class ID 3

üìÇ Processing BSED TRAIN split...


                                                                             

   ‚úÖ Copied 3052 images, updated 3052 label files

üìÇ Processing BSED VALID split...


                                                              

   ‚úÖ Copied 94 images, updated 94 label files

üìÇ Processing BSED TEST split...


                                                             

   ‚úÖ Copied 94 images, updated 94 label files

‚úÖ BSED dataset processing complete!
   Train: 3052 images
   Valid: 94 images
   Test: 94 images




In [6]:
# ============================================================================
# CREATE COMBINED DATA.YAML
# ============================================================================
print("\n" + "="*80)
print("üìù CREATING COMBINED DATA.YAML")
print("="*80)

# Create final class mapping for YAML
final_class_names = {idx: name for idx, name in enumerate(FINAL_CLASS_ORDER)}

# Create combined data.yaml
combined_yaml_config = {
    'path': str(combined_dataset_dir.absolute()),
    'train': 'train/images',
    'val': 'valid/images',
    'test': 'test/images',
    'names': final_class_names,
    'nc': len(FINAL_CLASS_ORDER)
}

# Save combined data.yaml
combined_yaml_path = combined_dataset_dir / 'data.yaml'
with open(combined_yaml_path, 'w') as f:
    yaml.dump(combined_yaml_config, f, default_flow_style=False, sort_keys=False)

print(f"\n‚úÖ Combined data.yaml created: {combined_yaml_path}")
print(f"\nüìã Configuration:")
print(f"   Path: {combined_yaml_config['path']}")
print(f"   Train: {combined_yaml_config['train']}")
print(f"   Validation: {combined_yaml_config['val']}")
print(f"   Test: {combined_yaml_config['test']}")
print(f"   Number of classes: {combined_yaml_config['nc']}")
print(f"\n   Class mapping:")
for idx, name in final_class_names.items():
    print(f"      {idx}: {name}")

# ============================================================================
# FINAL STATISTICS
# ============================================================================
print("\n" + "="*80)
print("üìä FINAL COMBINED DATASET STATISTICS")
print("="*80)

total_stats = {
    'train': bsed_stats['train'] + roboflow_stats['train'],
    'valid': bsed_stats['valid'] + roboflow_stats['valid'],
    'test': bsed_stats['test'] + roboflow_stats['test']
}

print(f"\nüìà Total Images:")
print(f"   Training: {total_stats['train']} images")
print(f"   Validation: {total_stats['valid']} images")
print(f"   Test: {total_stats['test']} images")
print(f"   TOTAL: {sum(total_stats.values())} images")

print(f"\nüì¶ Source Breakdown:")
print(f"   BSED Dataset:")
print(f"      Train: {bsed_stats['train']}, Valid: {bsed_stats['valid']}, Test: {bsed_stats['test']}")
print(f"   Roboflow Dataset:")
print(f"      Train: {roboflow_stats['train']}, Valid: {roboflow_stats['valid']}, Test: {roboflow_stats['test']}")

# Verify label files match
print(f"\nüîç Verifying label files...")
for split in splits:
    images_dir = combined_dataset_dir / split / 'images'
    labels_dir = combined_dataset_dir / split / 'labels'
    
    if images_dir.exists() and labels_dir.exists():
        images = list(images_dir.glob('*.jpg')) + list(images_dir.glob('*.png')) + \
                 list(images_dir.glob('*.JPG')) + list(images_dir.glob('*.PNG'))
        labels = list(labels_dir.glob('*.txt'))
        
        print(f"   {split.upper()}: {len(images)} images, {len(labels)} labels", end="")
        if len(images) == len(labels):
            print(" ‚úÖ")
        else:
            print(f" ‚ö†Ô∏è  (mismatch!)")

print("\n" + "="*80)
print("‚úÖ DATASET COMBINATION COMPLETE!")
print("="*80)
print(f"\nüìÅ Combined dataset location: {combined_dataset_dir}")
print(f"üìÑ Configuration file: {combined_yaml_path}")

# ============================================================================
# CLEANUP: Delete copied datasets to save space
# ============================================================================
print("\n" + "="*80)
print("üßπ CLEANING UP COPIED DATASETS")
print("="*80)

# Option to skip cleanup (set to False if you want to keep the copies)
DELETE_COPIED_DATASETS = True

if DELETE_COPIED_DATASETS:
    print("\nüóëÔ∏è  Deleting copied datasets (already merged)...")
    
    # Delete BSED copied dataset
    if bsed_dataset_dir.exists():
        try:
            shutil.rmtree(bsed_dataset_dir)
            print(f"   ‚úÖ Deleted: {bsed_dataset_dir}")
        except Exception as e:
            print(f"   ‚ö†Ô∏è  Error deleting {bsed_dataset_dir}: {e}")
    else:
        print(f"   ‚ÑπÔ∏è  BSED dataset not found (already deleted or not copied)")
    
    # Delete Roboflow copied dataset
    if roboflow_dataset_dir.exists():
        try:
            shutil.rmtree(roboflow_dataset_dir)
            print(f"   ‚úÖ Deleted: {roboflow_dataset_dir}")
        except Exception as e:
            print(f"   ‚ö†Ô∏è  Error deleting {roboflow_dataset_dir}: {e}")
    else:
        print(f"   ‚ÑπÔ∏è  Roboflow dataset not found (already deleted or not copied)")
    
    print(f"\n‚úÖ Cleanup complete! Only combined dataset remains.")
    print(f"   Combined dataset: {combined_dataset_dir}")
else:
    print("\nüí° Cleanup skipped (DELETE_COPIED_DATASETS = False)")
    print(f"   BSED dataset: {bsed_dataset_dir}")
    print(f"   Roboflow dataset: {roboflow_dataset_dir}")
    print(f"   Combined dataset: {combined_dataset_dir}")

print("\n" + "="*80)
print("üí° Next step: Use this dataset for training:")
print(f"   from ultralytics import YOLO")
print(f"   model = YOLO('yolov8n.pt')")
print(f"   model.train(data='{combined_yaml_path}', epochs=100)")
print("="*80)



üìù CREATING COMBINED DATA.YAML

‚úÖ Combined data.yaml created: /kaggle/working/combined_yolo_dataset/data.yaml

üìã Configuration:
   Path: /kaggle/working/combined_yolo_dataset
   Train: train/images
   Validation: valid/images
   Test: test/images
   Number of classes: 7

   Class mapping:
      0: Healthy
      1: Stage1
      2: Stage2
      3: Stage3
      4: Stage4
      5: Stage5
      6: Stage6

üìä FINAL COMBINED DATASET STATISTICS

üìà Total Images:
   Training: 8167 images
   Validation: 566 images
   Test: 333 images
   TOTAL: 9066 images

üì¶ Source Breakdown:
   BSED Dataset:
      Train: 3052, Valid: 94, Test: 94
   Roboflow Dataset:
      Train: 5115, Valid: 472, Test: 239

üîç Verifying label files...
   TRAIN: 8167 images, 8167 labels ‚úÖ
   VALID: 566 images, 566 labels ‚úÖ
   TEST: 333 images, 333 labels ‚úÖ

‚úÖ DATASET COMBINATION COMPLETE!

üìÅ Combined dataset location: /kaggle/working/combined_yolo_dataset
üìÑ Configuration file: /kaggle/working/comb

In [7]:
!zip -r /kaggle/working/BananaEarlyDataSets.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/combined_yolo_dataset/ (stored 0%)
  adding: kaggle/working/combined_yolo_dataset/train/ (stored 0%)
  adding: kaggle/working/combined_yolo_dataset/train/images/ (stored 0%)
  adding: kaggle/working/combined_yolo_dataset/train/images/Stage3_Stage3_IMG_20251122_094045_tile_0_1_aug1.jpg (deflated 1%)
  adding: kaggle/working/combined_yolo_dataset/train/images/Healthy-215-_png.rf.06265c698e1d3c8e30a34eb50c21eea6.jpg (deflated 5%)
  adding: kaggle/working/combined_yolo_dataset/train/images/Stage5-202-_jpg.rf.439e23d7020553b6e8c3cb53bc946ad3.jpg (deflated 2%)
  adding: kaggle/working/combined_yolo_dataset/train/images/Stage2_Stage2_IMG_20251122_091531_tile_2_2_aug2.jpg (deflated 1%)
  adding: kaggle/working/combined_yolo_dataset/train/images/Healthy-140-_png.rf.f552605f981a98b5510495be9efb1982.jpg (deflated 5%)
  adding: kaggle/working/combined_yolo_dataset/train/images/Stage4-399-_jpg.rf.dd6148cbf9c9543f490f942607acfe72.jpg (de