In [8]:
!pip install imagehash


Looking in indexes: http://pulp-web-svc.pulp-system.svc.cluster.local:24880/pypi/pypi/simple/
Collecting imagehash
  Downloading http://pulp-web-svc.pulp-system.svc.cluster.local:24880/pulp/content/pypi/ImageHash-4.3.2-py2.py3-none-any.whl?redirect=https://files.pythonhosted.org/packages/31/2c/5f0903a53a62029875aaa3884c38070cc388248a2c1b9aa935632669e5a7/ImageHash-4.3.2-py2.py3-none-any.whl
[2K     [32m\[0m [32m296.7 kB[0m [31m2.5 MB/s[0m [33m0:00:00[0m
Installing collected packages: imagehash
Successfully installed imagehash-4.3.2


In [13]:


import os
import imagehash
from PIL import Image
from collections import defaultdict

def find_dataset_conflicts(dataset_root_path, show_leakage=False):
    """
    Scan a dataset for duplicate images and report label conflicts and split leakage.

    Every image file under ``dataset_root_path`` is hashed with
    ``imagehash.phash``; files whose hash strings are identical are treated as
    the same image and grouped together.

    Assumes folder structure: dataset_root_path/split/class/image.jpg
    Example: data/Training/glioma/img1.jpg

    Args:
        dataset_root_path: Directory that is walked recursively.
        show_leakage: When True, also list the groups that share one label but
            sit in more than one split. Groups with conflicting labels are
            always listed. Defaults to False (summary count only).

    Returns:
        None. Findings and a summary are printed to stdout.

    Notes:
        A group counts as a *conflict* when its files carry more than one
        label, and as *leakage* when its files sit in more than one split.
        The two counters are independent, so a group that has both problems
        is counted in both.

        The comparison is on perceptual hashes, so images that merely look
        alike may be grouped as well -- review the listed files before acting
        on the report.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tif')

    # hash string -> list of {'path': ..., 'split': ..., 'label': ...}
    hashes = defaultdict(list)

    print(f"Scanning images in {dataset_root_path}...")

    image_count = 0

    for root, _dirs, files in os.walk(dataset_root_path):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            file_path = os.path.join(root, file)

            try:
                # Label and split are the two directories directly above the
                # file, e.g. .../Training/glioma/image.jpg.
                # Adjust these indices if your folder structure differs.
                parts = file_path.split(os.sep)
                label = parts[-2]  # e.g., 'glioma'
                split = parts[-3]  # e.g., 'Training' or 'Testing'

                with Image.open(file_path) as img:
                    img_hash = str(imagehash.phash(img))
            except Exception as e:
                # Best effort: report the unreadable / misplaced file and move on.
                print(f"Error processing {file_path}: {e}")
                continue

            hashes[img_hash].append({
                'path': file_path,
                'split': split,
                'label': label
            })

            image_count += 1
            if image_count % 500 == 0:
                print(f"Processed {image_count} images...")

    print(f"\nFinished scanning {image_count} images.")
    print("-" * 50)

    # Analyze hash groups for conflicts and leakage
    duplicates = 0
    conflicts = 0
    leakage = 0

    print("DETECTED ISSUES:\n")

    for info_list in hashes.values():
        if len(info_list) < 2:
            continue

        duplicates += 1

        unique_labels = {item['label'] for item in info_list}
        unique_splits = {item['split'] for item in info_list}

        is_conflict = len(unique_labels) > 1
        is_leakage = len(unique_splits) > 1

        # Counted independently of the conflict check so that a group with
        # both problems is not missing from the leakage total.
        if is_leakage:
            leakage += 1

        if is_conflict:
            conflicts += 1
            print(f"[CONFLICT] Same image found with DIFFERENT labels: {unique_labels}")
            for item in info_list:
                print(f"   - {item['split']}/{item['label']}: {item['path']}")
            print("")
        elif is_leakage and show_leakage:
            # Same label, different split. Conflict groups already list
            # their splits above, so they are not repeated here.
            print(f"[LEAKAGE] Same image found in multiple splits: {unique_splits}")
            for item in info_list:
                print(f"   - {item['split']}/{item['label']}: {item['path']}")
            print("")

    print("-" * 50)
    print("Summary:")
    print(f"Total Duplicate Groups Found: {duplicates}")
    print(f"Label Conflicts (Same img, different class): {conflicts}")
    print(f"Data Leakage (Same img, different split): {leakage}")

# --- USAGE ---
# Root directory of the dataset to scan; replace with the actual path to your
# downloaded dataset. The call below opens each image to hash it and prints a
# report of duplicate groups, label conflicts and split leakage.
dataset_path = "/home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean"
find_dataset_conflicts(dataset_path)


Scanning images in /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean...
Processed 500 images...
Processed 1000 images...
Processed 1500 images...
Processed 2000 images...
Processed 2500 images...
Processed 3000 images...
Processed 3500 images...
Processed 4000 images...
Processed 4500 images...
Processed 5000 images...
Processed 5500 images...

Finished scanning 5910 images.
--------------------------------------------------
DETECTED ISSUES:

--------------------------------------------------
Summary:
Total Duplicate Groups Found: 0
Label Conflicts (Same img, different class): 0
Data Leakage (Same img, different split): 0


In [12]:
import os
import imagehash
from PIL import Image
from collections import defaultdict

# --- CONFIGURATION ---
# Root directory that clean_duplicates() walks.
dataset_root_path = "/home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean" # <--- UPDATE THIS PATH
# True  -> only list the files that would be removed.
# False -> remove them with os.remove (LIVE deletion; this is the current setting).
DRY_RUN = False  # Set to True to preview without deleting any files
# ---------------------

def clean_duplicates(dataset_root, dry_run=None):
    """
    Remove images whose perceptual hash exactly matches another image's hash.

    Walks ``dataset_root``, hashes every image file with ``imagehash.phash``
    and groups the files by hash string. In every group with more than one
    file, one file is kept and the others are deleted (or, in dry-run mode,
    only listed).

    The kept file is chosen by split-folder name (the directory two levels
    above the file, e.g. ``.../Training/glioma/image.jpg``):
    a name containing 'train' is preferred, then 'test' / 'val', then anything
    else. Ties are broken by path so the choice does not depend on directory
    listing order.

    Args:
        dataset_root: Directory that is walked recursively.
        dry_run: True to only print what would be deleted, False to delete.
            When None (default) the module-level ``DRY_RUN`` flag is used;
            if that flag is not defined, dry-run mode is assumed.

    Returns:
        None. Progress and results are printed to stdout.

    Notes:
        Labels are not compared: duplicates that live in different class
        folders are collapsed to one file like any other group.
        Matching is on perceptual hashes, so images that merely look alike
        may be grouped -- run with ``dry_run=True`` and review the list first.
    """
    if dry_run is None:
        # Fall back to the notebook-level flag; if it was never defined,
        # stay on the safe side and delete nothing.
        dry_run = globals().get('DRY_RUN', True)

    image_extensions = ('.png', '.jpg', '.jpeg', '.tif')

    def split_priority(split_name):
        # 0 if Training, 1 if Testing/Validation, 2 otherwise (lower = keep first)
        s = split_name.lower()
        if 'train' in s:
            return 0
        if 'test' in s or 'val' in s:
            return 1
        return 2

    hashes = defaultdict(list)
    print(f"Scanning {dataset_root} for duplicates to clean...")

    # 1. HASH ALL IMAGES
    for root, _dirs, files in os.walk(dataset_root):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            full_path = os.path.join(root, file)
            try:
                # Split name is needed to prioritize keeping 'Training' over 'Testing'.
                # Assuming structure: .../Training/glioma/image.jpg
                split_name = full_path.split(os.sep)[-3]

                with Image.open(full_path) as img:
                    h = str(imagehash.phash(img))
            except Exception as e:
                # Best effort: unreadable files are reported and left untouched.
                print(f"Skipping corrupt file {full_path}: {e}")
                continue

            hashes[h].append({
                'path': full_path,
                'split': split_name,
                'file': file
            })

    # 2. DETERMINE DELETIONS
    files_to_delete = []

    for items in hashes.values():
        if len(items) < 2:
            continue

        # After sorting, the first item is the one to keep; the rest are duplicates.
        items.sort(key=lambda item: (split_priority(item['split']), item['path']))
        files_to_delete.extend(dup['path'] for dup in items[1:])

    # 3. EXECUTE DELETION
    print(f"\nFound {len(files_to_delete)} duplicate files to remove.")

    if not files_to_delete:
        print("Dataset is clean! No duplicates found.")
        return

    print(f"Mode: {'DRY RUN (No files will be deleted)' if dry_run else 'LIVE DELETION'}")
    print("-" * 40)

    for file_path in files_to_delete:
        if dry_run:
            print(f"[Would Delete]: {file_path}")
        else:
            try:
                os.remove(file_path)
                print(f"[Deleted]: {file_path}")
            except OSError as e:
                print(f"[Error Deleting]: {file_path} - {e}")

    print("-" * 40)
    if dry_run:
        print("To actually delete these files, change DRY_RUN = False in the script.")
    else:
        print("Cleanup complete.")

# Run the cleanup on the configured dataset root.
# NOTE: with DRY_RUN = False this call deletes files from disk.
clean_duplicates(dataset_root_path)

Scanning /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean for duplicates to clean...

Found 1113 duplicate files to remove.
Mode: LIVE DELETION
----------------------------------------
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Training/notumor/Tr-no_1158.jpg
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Training/notumor/Tr-no_1241.jpg
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Training/notumor/Tr-no_0488.jpg
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Training/notumor/Tr-no_1053.jpg
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Training/notumor/Tr-no_1025.jpg
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Training/notumor/Tr-no_0565.jpg
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Testing/notumor/Te-no_0277.jpg
[Deleted]: /home/jovyan//nnunet2-mig-7g-80gb-datavol-1/data2_clean/Testing/notumor/Te-no_0311.jpg
[Deleted]: /home/j