# Dataset Cleaning Notebook
## Drone vs. Bird YOLO Dataset
This notebook cleans a YOLO-formatted dataset by:
1. Filtering out corrupt images (the raw dataset itself is never modified)
2. Verifying label files exist
3. Creating cleaned folders with valid pairs


# =============================================
# 1: Imports and Setup
# =============================================

In [1]:
import os
from PIL import Image
import cv2
import shutil
from tqdm import tqdm

print("All modules are installed correctly!")


All modules are installed correctly!



# =============================================
# 2: Path configuration 
# =============================================


In [None]:
# Dataset locations (EDIT base_path SO IT POINTS AT YOUR LOCAL COPY)
base_path = r"C:\Users\josep\Documents\My_Data_Science\Projects\School_projects\phase_4\Dataset"

# Each split name maps to its raw folder directly underneath base_path
raw_folders = {
    split_name: os.path.join(base_path, split_name)
    for split_name in ('train', 'test', 'valid')
}

# Root of the cleaned output; cleaned_train, cleaned_test and cleaned_valid live inside it
cleaned_base = os.path.join(base_path, 'cleaned_dataset')

# Echo the configuration so a wrong path is spotted before any work starts
print(
    "✅ Paths configured:",
    f"Base dataset path: {base_path}",
    f"Cleaned data will be saved to: {cleaned_base}",
    sep="\n",
)


✅ Paths configured:
Base dataset path: C:\Users\josep\Documents\My_Data_Science\Projects\School_projects\phase_4\Dataset
Cleaned data will be saved to: C:\Users\josep\Documents\My_Data_Science\Projects\School_projects\phase_4\Dataset\cleaned_dataset


# =============================================
# 3: CREATE CLEANED FOLDER STRUCTURE
# =============================================

In [None]:
def create_cleaned_structure(base_dir=None):
    """
    Create the folder tree that receives the cleaned dataset.

    For each of the train/test/valid splits this creates
    ``<base_dir>/cleaned_<split>/images`` and
    ``<base_dir>/cleaned_<split>/labels``. Folders that already exist are
    left untouched, so the function is safe to re-run.

    Args:
        base_dir: Root folder of the cleaned dataset. When omitted, the
            notebook-level ``cleaned_base`` is used.
    """
    if base_dir is None:
        # Resolved at call time so a re-run of the path cell is picked up
        base_dir = cleaned_base

    os.makedirs(base_dir, exist_ok=True)

    for split in ['train', 'test', 'valid']:
        split_dir = os.path.join(base_dir, f'cleaned_{split}')
        # Every split gets one folder for images and one for YOLO label files
        for subfolder in ('images', 'labels'):
            os.makedirs(os.path.join(split_dir, subfolder), exist_ok=True)
        print(f"Created: {split_dir}")

create_cleaned_structure()

Created: C:\Users\josep\Documents\My_Data_Science\Projects\School_projects\phase_4\Dataset\cleaned_dataset\cleaned_train
Created: C:\Users\josep\Documents\My_Data_Science\Projects\School_projects\phase_4\Dataset\cleaned_dataset\cleaned_test
Created: C:\Users\josep\Documents\My_Data_Science\Projects\School_projects\phase_4\Dataset\cleaned_dataset\cleaned_valid


# =============================================
# 3b: IMAGE VALIDATION FUNCTION
# =============================================

In [8]:
def validate_drone_bird_image(img_path, expected_size=(640, 640)):
    """
    Validate one dataset image by filename, integrity and resolution.

    Accepted filename patterns (case-sensitive):
    - Bird images:  BT(<number>).jpg  (e.g., BT(1).jpg, BT(2).jpg)
    - Drone images: DT(<number>).jpg  (e.g., DT(1).jpg, DT(2).jpg)

    Only the ``BT(`` / ``DT(`` prefix and the ``).jpg`` suffix are checked;
    the text between the parentheses is not inspected.

    Args:
        img_path: Path to the image file.
        expected_size: Required ``(width, height)`` of the image. Pass
            ``None`` to skip the resolution check.

    Returns:
        tuple: (is_valid: bool, category: str or None, error: str or None)
            ``category`` is 'bird' or 'drone', or None when the filename
            does not match. ``error`` is None only when ``is_valid`` is True.
    """
    filename = os.path.basename(img_path)

    # 1. Check filename pattern
    if not (filename.startswith(('BT(', 'DT(')) and filename.endswith(').jpg')):
        return False, None, f"Invalid filename pattern: {filename}"

    # 2. Extract category
    category = 'bird' if filename.startswith('BT(') else 'drone'

    try:
        # 3. Verify image integrity (checks the file without decoding pixel data)
        with Image.open(img_path) as img:
            img.verify()

        # verify() leaves the image object unusable, so the file has to be
        # reopened before the pixel data can be decoded.
        with Image.open(img_path) as img:
            img.load()
            size = img.size

        # 4. Check resolution (optional)
        if expected_size is not None and size != tuple(expected_size):
            return False, category, f"Incorrect size: {size}"

        # 5. OpenCV validation (imread returns None instead of raising)
        if cv2.imread(img_path) is None:
            return False, category, "OpenCV read failed"

        return True, category, None

    except Exception as e:
        # Any failure to open or decode is reported as corruption rather than
        # raised, so one bad file cannot abort a whole dataset scan.
        return False, category, f"Corruption: {str(e)[:50]}"

# Example usage: the sample paths are derived from the configured train folder
# so they follow base_path instead of repeating an absolute path.
sample_images = [
    os.path.join(raw_folders['train'], 'images', sample_name)
    for sample_name in ("BT(1).jpg", "DT(1).jpg", "invalid.jpg")
]

print("🔍 Image Validation Results:")
for img_path in sample_images:
    is_valid, category, error = validate_drone_bird_image(img_path)

    if is_valid:
        print(f"✅ {os.path.basename(img_path)} - Valid {category} image")
    else:
        # error carries the reason: bad filename, wrong size or unreadable file
        print(f"❌ {os.path.basename(img_path)} - {error}")

# Full processing function
def process_all_images(base_path):
    """
    Walk the image folder of every split and report each image that validates.

    Args:
        base_path: Dataset root containing ``train``, ``test`` and ``valid``
            folders, each with an ``images`` subfolder.

    Files rejected by ``validate_drone_bird_image`` are skipped silently.
    """
    for split in ('train', 'test', 'valid'):
        img_dir = os.path.join(base_path, split, 'images')
        print(f"\nProcessing {split} images...")

        for img_file in os.listdir(img_dir):
            is_valid, category, _error = validate_drone_bird_image(
                os.path.join(img_dir, img_file)
            )

            if is_valid:
                # Placeholder: real per-image processing belongs here
                print(f"Processing {category} image: {img_file}")

# Run on the dataset configured in the path-configuration cell
process_all_images(base_path)

🔍 Image Validation Results:
❌ BT(1).jpg - Corruption: [Errno 2] No such file or directory: 'C:\\Users\\j
❌ DT(1).jpg - Corruption: [Errno 2] No such file or directory: 'C:\\Users\\j
❌ invalid.jpg - Invalid filename pattern: invalid.jpg

Processing train images...

Processing test images...

Processing valid images...


# =============================================
# CELL 4: MAIN CLEANING PROCESS
# =============================================

In [9]:
def clean_dataset():
    """
    Copy every valid image/label pair from the raw splits into the cleaned folders.

    For each of the train/test/valid splits, every .jpg/.jpeg/.png file
    (extension matched case-insensitively) in ``<raw split>/images`` is
    checked with ``validate_image``. Images that pass and have a same-named
    ``.txt`` file in ``<raw split>/labels`` are copied with ``shutil.copy2``
    into ``cleaned_<split>/images`` and ``cleaned_<split>/labels`` under
    ``cleaned_base``. The raw folders are only read, never modified.

    Returns:
        dict: maps each split name to a dict with the counters 'total',
            'valid', 'corrupt' and 'missing_label'.
    """
    stats = {}  # To track processing statistics

    for split in ['train', 'test', 'valid']:
        print(f"\n🔍 Processing {split} set...")
        stats[split] = {
            'total': 0,
            'valid': 0,
            'corrupt': 0,
            'missing_label': 0
        }

        # Path setup
        img_dir = os.path.join(raw_folders[split], 'images')
        label_dir = os.path.join(raw_folders[split], 'labels')
        cleaned_img_dir = os.path.join(cleaned_base, f'cleaned_{split}', 'images')
        cleaned_label_dir = os.path.join(cleaned_base, f'cleaned_{split}', 'labels')

        # Make sure the destinations exist so copying does not depend on the
        # folder-creation cell having been run first.
        os.makedirs(cleaned_img_dir, exist_ok=True)
        os.makedirs(cleaned_label_dir, exist_ok=True)

        # Get all image files (JPG, JPEG, PNG)
        image_files = [f for f in os.listdir(img_dir)
                       if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        stats[split]['total'] = len(image_files)

        # Process each image with progress bar
        for img_file in tqdm(image_files, desc=f"Validating {split} images"):
            img_path = os.path.join(img_dir, img_file)
            # YOLO convention: the label shares the image's base name, with .txt
            label_file = os.path.splitext(img_file)[0] + '.txt'
            label_path = os.path.join(label_dir, label_file)

            # Skip if image is invalid
            # NOTE(review): validate_image is not defined in the visible part of
            # this notebook; it is presumably a helper returning a plain bool.
            # Confirm it is defined before this cell runs.
            if not validate_image(img_path):
                stats[split]['corrupt'] += 1
                continue

            # Skip if label is missing
            if not os.path.exists(label_path):
                stats[split]['missing_label'] += 1
                continue

            # Copy valid pairs to cleaned folders
            shutil.copy2(img_path, os.path.join(cleaned_img_dir, img_file))
            shutil.copy2(label_path, os.path.join(cleaned_label_dir, label_file))
            stats[split]['valid'] += 1

        # Print summary for this split
        print(f"✔️ {split.upper()} Results:")
        print(f"  - Total images: {stats[split]['total']}")
        print(f"  - Valid pairs copied: {stats[split]['valid']}")
        print(f"  - Corrupt images: {stats[split]['corrupt']}")
        print(f"  - Missing labels: {stats[split]['missing_label']}")

    return stats

# Run the cleaning process
cleaning_stats = clean_dataset()


🔍 Processing train set...


Validating train images: 100%|██████████| 18323/18323 [14:54<00:00, 20.47it/s]  


✔️ TRAIN Results:
  - Total images: 18323
  - Valid pairs copied: 18323
  - Corrupt images: 0
  - Missing labels: 0

🔍 Processing test set...


Validating test images: 100%|██████████| 889/889 [00:23<00:00, 37.68it/s]


✔️ TEST Results:
  - Total images: 889
  - Valid pairs copied: 889
  - Corrupt images: 0
  - Missing labels: 0

🔍 Processing valid set...


Validating valid images: 100%|██████████| 1740/1740 [01:18<00:00, 22.23it/s]

✔️ VALID Results:
  - Total images: 1740
  - Valid pairs copied: 1740
  - Corrupt images: 0
  - Missing labels: 0





# =============================================
# CELL 5: VERIFICATION AND SUMMARY
# =============================================

In [None]:
print("\n🎉 FINAL CLEANING SUMMARY")

# Calculate percentages
for split in cleaning_stats:
    stats = cleaning_stats[split]
    # A split with no images would otherwise raise ZeroDivisionError
    valid_pct = (stats['valid'] / stats['total']) * 100 if stats['total'] else 0.0
    print(f"\n{split.upper()}:")
    print(f"  - Valid pairs: {stats['valid']}/{stats['total']} ({valid_pct:.1f}%)")
    print(f"  - Issues detected: {stats['corrupt'] + stats['missing_label']}")

# Verify file counts match: every copied image should have exactly one label
print("\n📂 Final folder contents:")
for split in ['train', 'test', 'valid']:
    img_dir = os.path.join(cleaned_base, f'cleaned_{split}', 'images')
    label_dir = os.path.join(cleaned_base, f'cleaned_{split}', 'labels')

    # Counts every directory entry, not only image/label files
    img_count = len(os.listdir(img_dir))
    label_count = len(os.listdir(label_dir))

    print(f"{split.upper()}:")
    print(f"  - Images: {img_count}")
    print(f"  - Labels: {label_count}")
    print(f"  - {'✅ Matched' if img_count == label_count else '❌ Mismatch'}")



🎉 FINAL CLEANING SUMMARY

TRAIN:
  - Valid pairs: 18323/18323 (100.0%)
  - Issues detected: 0

TEST:
  - Valid pairs: 889/889 (100.0%)
  - Issues detected: 0

VALID:
  - Valid pairs: 1740/1740 (100.0%)
  - Issues detected: 0

📂 Final folder contents:
TRAIN:
  - Images: 18323
  - Labels: 18323
  - ✅ Matched
TEST:
  - Images: 889
  - Labels: 889
  - ✅ Matched
VALID:
  - Images: 1740
  - Labels: 1740
  - ✅ Matched
