# In [8]:
import os
from PIL import Image
import imagehash
import numpy as np
import cv2

def get_perceptual_hash(image_path, hash_size=8):
    """Return the perceptual hash of the image stored at ``image_path``.

    The file is opened with PIL and handed to ``imagehash.phash`` together
    with ``hash_size``. The image is opened in a context manager, so it is
    closed again before the hash is returned to the caller.
    """
    with Image.open(image_path) as source_image:
        return imagehash.phash(source_image, hash_size=hash_size)

def get_color_histogram_hsv(image_path):
    """Return a normalized, flattened HSV color histogram for an image.

    The image is loaded with PIL, converted to RGB, and then to HSV with
    OpenCV. A 3-D histogram with 8 bins per channel is computed over the
    ranges H: [0, 180), S: [0, 256), V: [0, 256), normalized in place with
    ``cv2.normalize`` (default settings), and flattened.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A 1-D array of 8 * 8 * 8 = 512 histogram values.
    """
    # Open inside a context manager so the file handle is released as soon
    # as the pixels have been copied out. The caller may delete this file
    # right afterwards, which fails on Windows while a handle is still open.
    with Image.open(image_path) as img:
        rgb_pixels = np.array(img.convert('RGB'))

    img_cv = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2HSV)
    histogram = cv2.calcHist([img_cv], [0, 1, 2], None, [8, 8, 8], [0, 180, 0, 256, 0, 256])
    cv2.normalize(histogram, histogram)
    return histogram.flatten()

def compare_histograms(hist1, hist2):
    """Return the correlation score between two histograms.

    Delegates to ``cv2.compareHist`` using the ``cv2.HISTCMP_CORREL`` method.
    """
    method = cv2.HISTCMP_CORREL
    similarity = cv2.compareHist(hist1, hist2, method)
    return similarity

def find_and_delete_duplicates(directory, hash_diff_threshold=5, color_threshold=0.3):
    """Delete near-duplicate images in ``directory`` and report what was removed.

    Every image file directly inside ``directory`` is compared against the
    images kept so far. A file counts as a duplicate of a kept image when
    both conditions hold:

    * the difference between their perceptual hashes
      (``img_hash - stored_hash``) is at most ``hash_diff_threshold``, and
    * the correlation of their HSV color histograms is strictly greater
      than ``color_threshold``.

    A duplicate is deleted from disk immediately; the image it matched is
    kept. Progress is printed for each processed file.

    Args:
        directory: Directory to scan (not recursive).
        hash_diff_threshold: Largest hash difference still treated as a match.
        color_threshold: Histogram correlation that must be exceeded.

    Returns:
        A list of ``(deleted_filename, kept_filename)`` tuples, in the order
        the duplicates were found.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            removed.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # (filename, hash, histogram) for every image kept so far. A list is used
    # rather than a dict keyed by hash: two different images can share a hash,
    # and a dict would silently drop the earlier one from later comparisons.
    kept_images = []
    confirmed_duplicates = []

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of which file in a duplicate pair survives deterministic.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not filename.lower().endswith(image_extensions):
            continue
        if not os.path.isfile(file_path):
            # Skip directories that merely have an image-like name.
            continue

        img_hash = get_perceptual_hash(file_path)
        img_histogram = get_color_histogram_hsv(file_path)
        print(f"Processing: {filename}, Hash: {img_hash}")

        duplicate_of = None
        for stored_filename, stored_hash, stored_histogram in kept_images:
            # Check the hash first so the histogram comparison only runs for
            # candidates that already look alike.
            if img_hash - stored_hash > hash_diff_threshold:
                continue
            if compare_histograms(stored_histogram, img_histogram) > color_threshold:
                duplicate_of = stored_filename
                break

        if duplicate_of is None:
            kept_images.append((filename, img_hash, img_histogram))
            continue

        confirmed_duplicates.append((filename, duplicate_of))
        print(f"Duplicate found: {filename} and {duplicate_of}")
        os.remove(file_path)
        print(f"Deleted: {filename}")

    return confirmed_duplicates

# Usage example
# Guarded so that merely importing this file never deletes anything; the
# example still runs when the file is executed as a script or notebook cell.
if __name__ == "__main__":
    directory = r'C:\Users\Admin\Desktop\beach sofas'
    duplicates = find_and_delete_duplicates(directory)
    print("Duplicates Found:", duplicates)


# Output:
# Processing: 1924eclectic-sofas.jpg, Hash: bf6cc1934aca3961
# Processing: 1925eclectic-sofas.jpg, Hash: bf48c1b34a5a39a5
# Duplicate found: 1925eclectic-sofas.jpg and 1924eclectic-sofas.jpg
# Deleted: 1925eclectic-sofas.jpg
# Processing: 2534beach-style-sofas.jpg, Hash: fe4780f8916ad19c
# Processing: 2674beach-style-sofas.jpg, Hash: fe4780f8916ad0bc
# Duplicate found: 2674beach-style-sofas.jpg and 2534beach-style-sofas.jpg
# Deleted: 2674beach-style-sofas.jpg
# Duplicates Found: [('1925eclectic-sofas.jpg', '1924eclectic-sofas.jpg'), ('2674beach-style-sofas.jpg', '2534beach-style-sofas.jpg')]
