In [1]:
import os
import hashlib

def get_file_hash(file_path, chunk_size=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The digest is used as a content fingerprint for duplicate detection, not
    for security: MD5 is not collision-resistant, so it should not be trusted
    for files that may have been crafted deliberately.

    Args:
        file_path: Path of the file to hash; it is opened in binary mode.
        chunk_size: Number of bytes requested per read. Must be at least 1.
            It bounds how much of the file is held in memory at once and
            never changes the resulting digest.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        # read(0) returns b"" straight away, which would end the loop below and
        # yield the digest of empty input; a negative size would read the whole
        # file in one call and defeat the chunking.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    hash_md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        # Read the file in chunks to avoid memory overload with large files;
        # iter() stops at the first empty read, i.e. end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def delete_duplicates_in_folder(folder_path):
    """Find PDFs under a folder whose content duplicates an earlier PDF.

    Despite its name, this function does not delete anything: it only prints
    and returns the duplicate paths so the caller can decide what to remove.

    The tree is walked in sorted order (directories and file names), so for
    each distinct content hash the first file in that order is treated as the
    original and every later file with the same hash is reported. This keeps
    the choice of which copy survives stable between runs.

    Args:
        folder_path: Root directory to scan recursively. Only files whose
            name ends in ``.pdf`` (case-insensitive) are considered.

    Returns:
        A list of paths to duplicate PDF files, in the order they were found.
        The list is empty when there are no duplicates or nothing to scan.
    """
    file_hashes = {}  # content hash -> path of the first file seen with it
    duplicates = []

    # Traverse through the folder
    for root, dirs, files in os.walk(folder_path):
        # Sorting dirs in place steers os.walk's top-down descent, making the
        # traversal order independent of the order the OS lists entries in.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith('.pdf'):
                continue

            file_path = os.path.join(root, file)
            try:
                file_hash = get_file_hash(file_path)
            except OSError as e:
                # A locked or unreadable file should not abort the whole scan.
                # It is neither recorded nor reported, so it can never end up
                # in the list of files to delete.
                print(f"Skipping unreadable file {file_path}: {e}")
                continue

            # If the file hash is already seen, it's a duplicate
            if file_hash in file_hashes:
                duplicates.append(file_path)
                print(f"Duplicate found: {file_path}")
            else:
                file_hashes[file_hash] = file_path
    print(len(duplicates))
    return duplicates


# Usage example
folder_path = r"C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs"

# os.walk yields nothing for a directory that does not exist, which would be
# reported as "0" duplicates. Stop here instead, so a mistyped path is not
# mistaken for a folder without duplicates.
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f"Folder to scan is not an existing directory: {folder_path}")

# Nothing is deleted by this call: it only collects the duplicate paths, which
# the deletion loop in the next cell then consumes.
duplicates = delete_duplicates_in_folder(folder_path)


Duplicate found: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 2 diploma in field and laboratory geotechnical activities v1.pdf
Duplicate found: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 2 nvq certificate in plant operations construction v1.pdf
Duplicate found: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 2 nvq diploma in plant operations construction v1.pdf
Duplicate found: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 3 award in advanced first aid for remote and mining environments v1.pdf
Duplicate found: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 3 award in supervision of energy isolation and lock-out activities v1.pdf
Duplicate found: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 3 diploma in field and laboratory geotechnical activities v1.pdf
Duplicate found: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 4 diploma in safety health and environmental management v10.pdf
Duplicate found: C:\Users\amith\Kenpath\OFQU

In [2]:
# Delete duplicate files
# `duplicates` is the list of paths produced by the scan in the previous cell.
# os.remove deletes each file permanently, so review that list before running.
for duplicate in duplicates:
    try:
        os.remove(duplicate)
        print(f"Deleted duplicate: {duplicate}")
    except OSError as e:
        # os.remove reports file-system failures (file already gone, permission
        # denied, file in use) as OSError; report them and keep going. Any other
        # exception type would be a programming error and is left to surface.
        print(f"Failed to delete {duplicate}: {e}")

Deleted duplicate: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 2 diploma in field and laboratory geotechnical activities v1.pdf
Deleted duplicate: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 2 nvq certificate in plant operations construction v1.pdf
Deleted duplicate: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 2 nvq diploma in plant operations construction v1.pdf
Deleted duplicate: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 3 award in advanced first aid for remote and mining environments v1.pdf
Deleted duplicate: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 3 award in supervision of energy isolation and lock-out activities v1.pdf
Deleted duplicate: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 3 diploma in field and laboratory geotechnical activities v1.pdf
Deleted duplicate: C:\Users\amith\Kenpath\OFQUAL\mpawards_pdfs\mpqc level 4 diploma in safety health and environmental management v10.pdf
Deleted duplicate: C:\Users\am