In [None]:
import os
import imagehash
import random
import shutil
from PIL import Image
from pathlib import Path

In [14]:
""" GLOBAL VARIABLES """
# Root of the raw dataset; the cells below walk it as
# <DATA_DIR>/<subfolder>/<class folder>/<image file>.
DATA_DIR: Path = Path("C:/capstone/dataset/raw")

# File suffixes treated as images. Candidates are lower-cased before the
# membership test, so every entry here must be lower-case as well.
EXTENSIONS: list[str] = [
    ".png",
    ".jpg",
    ".jpeg",
    ".bmp",
    ".gif",
]

## CHECK DUPLICATES FOR EACH CLASS

This only checks for duplicates within each class. It assumes `DATA_DIR` points to a folder containing subfolders, each of which holds the class folders, for example:
```
dataset         
├───subfolder_1 
│   ├───class_1 
│   ├───...     
│   └───class_n 
└───subfolder_n 
    ├───class_1 
    ├─── ...    
    └───class_n 
```
This is typically what's available on Kaggle.

In [20]:
def print_log(duplicates_log: dict[str, list[tuple[str, str]]],
              total_duplicates: dict[str, int]) -> None:
    """Print the duplicate pairs per class, then the duplicate count per subfolder.

    Args:
        duplicates_log: Maps a class name to a list of
            ``(duplicate_path, original_path)`` pairs.
        total_duplicates: Maps a subfolder name to its number of duplicates.

    When ``duplicates_log`` is empty a single "nothing found" line is printed
    and ``total_duplicates`` is not shown.
    """
    if not duplicates_log:
        # Keep the message in one literal: a backslash continuation inside the
        # string would embed the source indentation in the printed text.
        print("No duplicates found in log; Duplicates log not found")
        return

    print("\nDuplicate images found:")
    for class_name, duplicate_pairs in duplicates_log.items():
        print(f"\nClass: {class_name}")
        for duplicate, original in duplicate_pairs:
            print(f"Duplicate: {duplicate} | Original: {original}")

    print("\nTotal duplicates found per subfolder:")
    for subfolder, count in total_duplicates.items():
        print(f"{subfolder}: {count}")
    
def check_for_dupes(ds_dir: Path) -> None:
    """Report images that share a perceptual hash inside the same class folder.

    The dataset is walked as ``ds_dir/<subfolder>/<class folder>/<image>``.
    Hashes are compared only within one class folder of one subfolder; the
    first file seen for a hash counts as the original and every later file
    with the same hash is logged as its duplicate. Nothing on disk is changed.

    Args:
        ds_dir: Dataset root containing the subfolders.

    The findings are printed through ``print_log``. Log entries are keyed by
    class folder name, so equally named classes from different subfolders are
    listed under the same heading.
    """
    duplicates_log: dict[str, list[tuple[str, str]]] = {}
    total_duplicates: dict[str, int] = {}

    for subfolder in ds_dir.iterdir():
        # Stray files next to the subfolders (README, archive, ...) are not
        # part of the dataset and cannot be iterated; skip them.
        if not subfolder.is_dir():
            continue
        total_duplicates[subfolder.name] = 0
        print(f"In subfolder {subfolder}:")

        for class_folder in subfolder.iterdir():
            if not class_folder.is_dir():
                continue
            print(f"Processing {class_folder.name}")

            # hash -> path of the first image that produced it; reset for
            # every class folder so classes are never compared to each other.
            first_seen = {}

            for image_file in class_folder.iterdir():
                if image_file.suffix.lower() not in EXTENSIONS:
                    continue

                try:
                    with Image.open(image_file) as img:
                        img_hash = imagehash.phash(img)
                except Exception as e:
                    # Best effort: an unreadable image is reported and skipped
                    # so one bad file does not abort the whole scan.
                    print(f"Error processing {image_file}: {e}")
                    continue

                if img_hash in first_seen:
                    duplicates_log.setdefault(class_folder.name, []).append(
                        (str(image_file), first_seen[img_hash])
                    )
                    total_duplicates[subfolder.name] += 1
                else:
                    first_seen[img_hash] = str(image_file)

    print_log(duplicates_log, total_duplicates)


In [21]:
# Scan every class folder under DATA_DIR and print the duplicate report.
check_for_dupes(DATA_DIR)

In subfolder C:\capstone\dataset\raw\images_default:
Processing aerosol_cans
Processing aluminum_food_cans
Processing aluminum_soda_cans
Processing cardboard_boxes
Processing cardboard_packaging
Processing clothing
Processing coffee_grounds
Processing disposable_plastic_cutlery
Processing eggshells
Processing food_waste
Processing glass_beverage_bottles
Processing glass_cosmetic_containers
Processing glass_food_jars
Processing magazines
Processing newspaper
Processing office_paper
Processing paper_cups
Processing plastic_cup_lids
Processing plastic_detergent_bottles
Processing plastic_food_containers
Processing plastic_shopping_bags
Processing plastic_soda_bottles
Processing plastic_straws
Processing plastic_trash_bags
Processing plastic_water_bottles
Processing shoes
Processing steel_food_cans
Processing styrofoam_cups
Processing styrofoam_food_containers
Processing tea_bags
In subfolder C:\capstone\dataset\raw\images_real:
Processing aerosol_cans
Processing aluminum_food_cans
Process

## DELETE SAID DUPLICATES
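As you can see, the algorithm is the same as the duplicate check above. Instead of logging duplicates, only the first image found for each hash is copied into a new `dupe_cleaned` folder; the raw files themselves are not deleted.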

In [None]:
def copy_if_not_exist(source: Path, destination: Path) -> None:
    """Copy ``source`` to ``destination`` unless ``destination`` already exists.

    The copy is made with ``shutil.copy2``. An existing destination is left
    untouched, whatever its content.
    """
    if destination.exists():
        return  # never overwrite something that is already in place
    shutil.copy2(source, destination)

def delete_dupes(ds_dir: Path) -> None:
    """Write a de-duplicated copy of the dataset next to ``ds_dir``.

    Despite its name this function deletes nothing. It walks
    ``ds_dir/<subfolder>/<class folder>/<image>`` and copies the first image
    seen for each perceptual hash to
    ``ds_dir.parent/dupe_cleaned/<subfolder>/<class folder>/``. Later images
    with an already-seen hash are simply not copied. Hashes are tracked per
    class folder, so images are only compared within their own class.

    Args:
        ds_dir: Dataset root containing the subfolders.

    Files already present at the destination are left as they are, and images
    that cannot be processed are reported and skipped.
    """
    cleaned_root: Path = ds_dir.parent / 'dupe_cleaned'

    for subfolder in ds_dir.iterdir():
        # Only directories are dataset subfolders; a stray file at this level
        # cannot be iterated and is ignored.
        if not subfolder.is_dir():
            continue
        print(f"In subfolder {subfolder}:")

        for class_folder in subfolder.iterdir():
            if not class_folder.is_dir():
                continue
            print(f"Processing {class_folder.name}")

            cleaned_class_path: Path = cleaned_root / subfolder.name / class_folder.name
            # exist_ok=True makes re-runs safe without a separate exists() check.
            cleaned_class_path.mkdir(parents=True, exist_ok=True)

            # Hashes already copied for this class; reset for every class folder.
            seen_hashes = set()

            for image_file in class_folder.iterdir():
                if image_file.suffix.lower() not in EXTENSIONS:
                    continue

                try:
                    with Image.open(image_file) as img:
                        img_hash = imagehash.phash(img)

                    if img_hash in seen_hashes:
                        continue  # duplicate of an image that was already copied

                    copy_if_not_exist(image_file, cleaned_class_path / image_file.name)
                    # Recorded only after the copy, so a failed copy does not
                    # block a later image with the same hash.
                    seen_hashes.add(img_hash)
                except Exception as e:
                    # Best effort: report the file and carry on with the rest.
                    print(f"Error processing {image_file}: {e}")


In [24]:
# Copy one image per perceptual hash into the 'dupe_cleaned' folder beside
# DATA_DIR; the files under DATA_DIR itself are left in place.
delete_dupes(DATA_DIR)

In subfolder C:\capstone\dataset\raw\images_default:
Processing aerosol_cans
Processing aluminum_food_cans
Processing aluminum_soda_cans
Processing cardboard_boxes
Processing cardboard_packaging
Processing clothing
Processing coffee_grounds
Processing disposable_plastic_cutlery
Processing eggshells
Processing food_waste
Processing glass_beverage_bottles
Processing glass_cosmetic_containers
Processing glass_food_jars
Processing magazines
Processing newspaper
Processing office_paper
Processing paper_cups
Processing plastic_cup_lids
Processing plastic_detergent_bottles
Processing plastic_food_containers
Processing plastic_shopping_bags
Processing plastic_soda_bottles
Processing plastic_straws
Processing plastic_trash_bags
Processing plastic_water_bottles
Processing shoes
Processing steel_food_cans
Processing styrofoam_cups
Processing styrofoam_food_containers
Processing tea_bags
In subfolder C:\capstone\dataset\raw\images_real:
Processing aerosol_cans
Processing aluminum_food_cans
Process

## SPLIT DATASET INTO (TRAIN, VALIDATION AND TEST)

* **WIP: STILL EXPERIMENTAL**

**ONLY USE THIS CODE IF YOU WANT YOUR DATASET TO LOOK LIKE THIS:** <br>
```
dataset             
├───subfolder_1     
│   ├───train       
│   ├───test        
│   └───validation  
└───subfolder_n     
    ├───train       
    ├───test        
    └───validation  
```
**Note:** The classes inside your split dataset will remain the same <br>
Example: `dataset/subfolder/train/class_n`

In [None]:
""" GLOBAL VARIABLES FOR SPLITTING """
# Input of the split step: the 'dupe_cleaned' folder that sits beside DATA_DIR.
CLEAN_DS_DIR: Path = DATA_DIR.parent / 'dupe_cleaned'

# Fraction of each class assigned to every split.
SPLIT_RATIOS: dict[str, float] = dict(train=0.7, validation=0.2, test=0.1)

In [50]:
# NOTE(review): this repeats the copy_if_not_exist defined in the
# de-duplication cell above; consider keeping a single definition.
def copy_if_not_exist(source: Path, destination: Path) -> None:
    """Copy ``source`` to ``destination`` with ``shutil.copy2``, skipping existing targets."""
    target_missing: bool = not destination.exists()
    if target_missing:
        shutil.copy2(source, destination)

def split_files(files: list[Path],
                ratios: "dict[str, float] | None" = None,
                seed: int = 42) -> dict[str, list[Path]]:
    """Shuffle ``files`` reproducibly and cut them into train/validation/test.

    Args:
        files: Items to distribute. The list itself is not modified.
        ratios: Fractions under the keys ``'train'`` and ``'validation'``;
            whatever is left over becomes ``'test'``. Defaults to the
            module-level ``SPLIT_RATIOS``.
        seed: Seed of the private random generator used for shuffling.

    Returns:
        A dict with the keys ``'train'``, ``'validation'`` and ``'test'``,
        each holding its share of ``files``. Every input item appears in
        exactly one of the three lists.
    """
    if ratios is None:
        ratios = SPLIT_RATIOS

    # Sort a copy first: the result then depends only on which files exist,
    # not on the order the file system listed them in, and the caller's list
    # is left untouched.
    shuffled = sorted(files)
    # A private generator gives the same shuffle as seeding the module-level
    # one, without resetting the global `random` state for other code.
    random.Random(seed).shuffle(shuffled)

    # Sizes are truncated towards zero; the remainder ends up in 'test'.
    train_i_end: int = int(len(shuffled) * ratios['train'])
    val_i_end: int = train_i_end + int(len(shuffled) * ratios['validation'])
    return {
        'train': shuffled[:train_i_end],
        'validation': shuffled[train_i_end:val_i_end],
        'test': shuffled[val_i_end:],
    }

def split_dataset(clean_ds_dir: Path) -> None:
    """Copy a cleaned dataset into train/validation/test folders.

    The input is walked as ``clean_ds_dir/<subfolder>/<class folder>/<image>``
    and every class is split with ``split_files``. The copies are written to
    ``clean_ds_dir.parent/split/<subfolder>/<split>/<class folder>/``.

    Args:
        clean_ds_dir: Root of the dataset to split. All paths are derived from
            this argument rather than from a module-level constant.

    Files already present at the destination are not overwritten.
    """
    split_root: Path = clean_ds_dir.parent / 'split'

    for subfolder in clean_ds_dir.iterdir():
        # Skip stray files at the top level; only directories are subfolders.
        if not subfolder.is_dir():
            continue
        print(f"In subfolder {subfolder}:")

        for class_folder in subfolder.iterdir():
            if not class_folder.is_dir():
                continue
            print(f"Processing {class_folder.name}")

            images = [
                candidate for candidate in class_folder.iterdir()
                if candidate.suffix.lower() in EXTENSIONS
            ]

            for split_name, split_members in split_files(images).items():
                destination_dir: Path = (
                    split_root / subfolder.name / split_name / class_folder.name
                )
                # exist_ok=True makes re-runs safe without a separate exists() check.
                destination_dir.mkdir(parents=True, exist_ok=True)
                for file_path in split_members:
                    copy_if_not_exist(file_path, destination_dir / file_path.name)


In [49]:
# Write the train/validation/test copies into the 'split' folder beside CLEAN_DS_DIR.
split_dataset(CLEAN_DS_DIR)

In subfolder C:\capstone\dataset\dupe_cleaned\images_default:
Processing aerosol_cans
Processing aluminum_food_cans
Processing aluminum_soda_cans
Processing cardboard_boxes
Processing cardboard_packaging
Processing clothing
Processing coffee_grounds
Processing disposable_plastic_cutlery
Processing eggshells
Processing food_waste
Processing glass_beverage_bottles
Processing glass_cosmetic_containers
Processing glass_food_jars
Processing magazines
Processing newspaper
Processing office_paper
Processing paper_cups
Processing plastic_cup_lids
Processing plastic_detergent_bottles
Processing plastic_food_containers
Processing plastic_shopping_bags
Processing plastic_soda_bottles
Processing plastic_straws
Processing plastic_trash_bags
Processing plastic_water_bottles
Processing shoes
Processing steel_food_cans
Processing styrofoam_cups
Processing styrofoam_food_containers
Processing tea_bags
In subfolder C:\capstone\dataset\dupe_cleaned\images_real:
Processing aerosol_cans
Processing aluminum

# CHECK TRAIN, TEST, VALIDATION FOR DUPES
* **WIP: STILL EXPERIMENTAL**
* **MANUAL**

In [53]:
def check_for_data_leakage(ds_dir: "Path | None" = None,
                           sub_dirs: "list[str] | None" = None) -> None:
    """Look for images that share a perceptual hash across different sets.

    Each ``ds_dir/<set>/<class folder>/<image>`` is hashed. The first file
    seen for a hash is remembered together with its set; a later file with the
    same hash is reported only when it lives in a *different* set. Matches are
    found across class folders too, because one hash table covers everything.

    Args:
        ds_dir: Folder that contains the set folders. Defaults to the
            module-level ``DATA_DIR``.
        sub_dirs: Names of the set folders, checked in the given order.
            Defaults to the module-level ``SUB_DIRS``.

    NOTE(review): ``SUB_DIRS`` is not defined in the visible cells; pass
    ``sub_dirs`` explicitly or make sure it is defined before calling this
    without arguments.

    Every match is printed as it is found, followed by a per-set summary.
    """
    if ds_dir is None:
        ds_dir = DATA_DIR
    if sub_dirs is None:
        sub_dirs = SUB_DIRS

    # hash -> where the first image with that hash was found
    hash_dict = {}
    # set name -> (file in this set, earlier file from another set) pairs
    data_leakage_log = {subfolder: [] for subfolder in sub_dirs}

    for subfolder in sub_dirs:
        subfolder_path = ds_dir / subfolder
        print(f"Checking for data leakage in '{subfolder}' set...")

        for class_folder in subfolder_path.iterdir():
            if not class_folder.is_dir():
                continue

            for image_file in class_folder.iterdir():
                if image_file.suffix.lower() not in EXTENSIONS:
                    continue

                try:
                    with Image.open(image_file) as img:
                        img_hash = imagehash.phash(img)
                except Exception as e:
                    # Best effort: report the unreadable file and keep going.
                    print(f"Error processing {image_file}: {e}")
                    continue

                first_hit = hash_dict.get(img_hash)
                if first_hit is None:
                    # Unique so far: remember its path and the set it is in.
                    hash_dict[img_hash] = {'path': image_file, 'subfolder': subfolder}
                elif first_hit['subfolder'] != subfolder:
                    # Same hash in another set -> leakage between the sets.
                    data_leakage_log[subfolder].append((image_file, first_hit['path']))
                    print(f"Duplicates: {image_file} (in '{subfolder}') | {first_hit['path']} (in '{first_hit['subfolder']}')")

    # Summarise, so that a clean run is distinguishable from a silent one.
    if any(data_leakage_log.values()):
        print("\nCross-set duplicates found per set:")
        for subfolder, leakage_pairs in data_leakage_log.items():
            print(f"{subfolder}: {len(leakage_pairs)}")
    else:
        print("No data leakage found between the checked sets.")

# Run the cross-set duplicate check; every match is printed as it is found.
# NOTE(review): the function reads the globals SUB_DIRS and DATA_DIR. SUB_DIRS
# is not defined in the visible cells, and DATA_DIR must point at the folder
# holding the train/validation/test sets — confirm both before running.
check_for_data_leakage()

Checking for data leakage in 'test' set...
Checking for data leakage in 'train' set...
Duplicates: C:\Users\Keny\Downloads\virtualenv\dataset\cleaned\images_default_dupe_cleaned_set\train\aluminum_food_cans\Image_103.png (in 'train') | C:\Users\Keny\Downloads\virtualenv\dataset\cleaned\images_default_dupe_cleaned_set\test\steel_food_cans\Image_113.png (in 'test')
Duplicates: C:\Users\Keny\Downloads\virtualenv\dataset\cleaned\images_default_dupe_cleaned_set\train\aluminum_food_cans\Image_115.png (in 'train') | C:\Users\Keny\Downloads\virtualenv\dataset\cleaned\images_default_dupe_cleaned_set\test\steel_food_cans\Image_104.png (in 'test')
Duplicates: C:\Users\Keny\Downloads\virtualenv\dataset\cleaned\images_default_dupe_cleaned_set\train\aluminum_food_cans\Image_6.png (in 'train') | C:\Users\Keny\Downloads\virtualenv\dataset\cleaned\images_default_dupe_cleaned_set\test\steel_food_cans\Image_5.png (in 'test')
Duplicates: C:\Users\Keny\Downloads\virtualenv\dataset\cleaned\images_default_du