In [None]:
import os
import hashlib
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  # For progress bar visualization

In [None]:
def hash_file(file_path):
    """Compute the SHA-256 digest of a file's contents.

    The file is read in 8192-byte blocks so that large files are never
    loaded into memory all at once.

    Args:
        file_path: Path of the file to hash.

    Returns:
        A ``(file_path, hex_digest)`` tuple. When the file cannot be opened
        or read for any reason, ``hex_digest`` is ``None`` so the caller can
        tell the hashing failed without having to catch an exception.
    """
    digest = hashlib.sha256()
    try:
        with open(file_path, 'rb') as stream:
            # iter() with a sentinel keeps calling read() until it returns b'' (EOF).
            for block in iter(lambda: stream.read(8192), b''):
                digest.update(block)
        return file_path, digest.hexdigest()
    except Exception:
        # Best-effort: signal failure with None instead of raising.
        return file_path, None

def handle_file_conflict(dst):
    """Pick a destination path that does not collide with an existing file.

    Args:
        dst: The desired destination path.

    Returns:
        ``dst`` unchanged when nothing exists at that path; otherwise the
        first free path of the form ``<base>_<n><extension>`` with ``n``
        counting up from 1.
    """
    if not os.path.exists(dst):
        return dst

    stem, suffix = os.path.splitext(dst)
    attempt = 1
    candidate = f"{stem}_{attempt}{suffix}"
    # Keep bumping the numeric suffix until an unused name is found.
    while os.path.exists(candidate):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"
    return candidate

def _is_inside_folder(path, folder):
    """Return True when ``path`` is ``folder`` itself or lies anywhere beneath it."""
    folder = os.path.normcase(os.path.abspath(folder))
    path = os.path.normcase(os.path.abspath(path))
    try:
        return os.path.commonpath([path, folder]) == folder
    except ValueError:
        # commonpath raises when the paths are on different drives (Windows).
        return False


def process_files(files_to_process, duplicate_folder):
    """Hash files concurrently and move duplicates into ``duplicate_folder``.

    Files are hashed in a thread pool, but the results are consumed in the
    order the paths were given. The first path seen with a given hash is
    therefore always the one that stays in place, and every later path with
    the same hash is moved, so the outcome does not depend on thread timing.

    Paths that already live inside ``duplicate_folder`` are ignored, as are
    repeated occurrences of the same path. Without this, re-running the scan
    could move a real original into the duplicates folder or move a file
    as a "duplicate" of itself.

    Args:
        files_to_process: Sequence of file paths to examine.
        duplicate_folder: Directory that receives duplicate files; it is
            created when missing.

    Returns:
        A ``(duplicates, total_size_saved)`` tuple: the original paths of the
        files that were moved, and the combined size in bytes of those files.
    """
    file_hashes = {}
    duplicates = []
    total_size_saved = 0

    os.makedirs(duplicate_folder, exist_ok=True)

    # dict.fromkeys() drops repeated paths while preserving the input order.
    candidates = [
        path
        for path in dict.fromkeys(files_to_process)
        if not _is_inside_folder(path, duplicate_folder)
    ]

    with ThreadPoolExecutor() as executor, tqdm(total=len(candidates), desc="Processing files", unit="file") as pbar:
        # executor.map() hashes in parallel but yields results in input order.
        for file_path, file_hash in executor.map(hash_file, candidates):
            if file_hash is None:
                # hash_file signals an unreadable file with None; report it
                # so the user knows it was not checked for duplicates.
                print(f"\nCould not read file, skipped: {file_path}")
            elif file_hash in file_hashes:
                try:
                    # Measure before moving so a failure here cannot leave a
                    # file that was moved but never recorded.
                    file_size = os.path.getsize(file_path)
                    dst_path = os.path.join(duplicate_folder, os.path.basename(file_path))
                    dst_path = handle_file_conflict(dst_path)
                    shutil.move(file_path, dst_path)  # Move instead of copy
                except PermissionError:
                    print(f"\nPermission denied: {file_path}")
                except Exception as e:
                    print(f"\nError moving file {file_path}: {e}")
                else:
                    total_size_saved += file_size
                    duplicates.append(file_path)
            else:
                file_hashes[file_hash] = file_path
            pbar.update(1)  # Update progress bar for each file processed
    return duplicates, total_size_saved

def main():
    """Prompt for a directory, move its duplicate files aside and print a summary.

    Duplicates are moved into a ``duplicated`` subfolder of the chosen
    directory. That subfolder is excluded from the scan so files moved there
    by an earlier run are not picked up as candidates again.
    """
    directory = input("Enter the directory to scan for duplicates: ").strip()

    # Validate before creating anything: os.makedirs() below would otherwise
    # silently build a brand-new tree for a mistyped or empty path.
    if not os.path.isdir(directory):
        print(f"'{directory}' is not an existing directory.")
        return

    duplicate_folder = os.path.join(directory, 'duplicated')
    os.makedirs(duplicate_folder, exist_ok=True)
    duplicate_folder_key = os.path.normcase(os.path.abspath(duplicate_folder))

    # Collect all files to process
    files_to_process = []
    for root, dirs, files in os.walk(directory):
        # Editing dirs in place (top-down walk) stops os.walk from descending
        # into the duplicates folder.
        dirs[:] = [
            name
            for name in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, name))) != duplicate_folder_key
        ]
        files_to_process.extend(os.path.join(root, filename) for filename in files)

    print("Processing files...")
    duplicates, total_size_saved = process_files(files_to_process, duplicate_folder)

    print("\nProcess Complete!")
    print(f"Duplicates moved to '{duplicate_folder}'.")
    print(f"Total duplicates found: {len(duplicates)}")
    print(f"Total space saved: {total_size_saved / (1024 * 1024):.2f} MB")

# Start the interactive scan only when this code runs as the main program,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


In [None]:
# TODO: Next, add a basic UI and improve the logic of the duplicate-file handler.

In [None]:
# TODO: New task: handle filtering of images that contain humans.