In [7]:
!

zsh:cd:1: no such file or directory: new_stim


In [16]:
import os
import hashlib
from PIL import Image
from collections import defaultdict
import argparse
import pandas as pd
import json
from datetime import datetime

In [14]:

def get_image_hash(filepath):
    """
    Calculate hash of an image file.
    Returns None if file cannot be opened as image.
    """
    try:
        with Image.open(filepath) as img:
            # Convert to bytes for hashing
            img_bytes = img.tobytes()
            return hashlib.md5(img_bytes).hexdigest()
    except Exception as e:
        print(f"Could not process {filepath}: {e}")
        return None

def find_duplicate_images(folder_path):
    """
    Find duplicate images in folder and all subfolders.
    Returns a dictionary where keys are hashes and values are lists of file paths.
    """
    # Dictionary to store hash -> list of file paths
    image_hashes = defaultdict(list)
    
    # Supported image extensions
    supported_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    
    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # Check if file has image extension
            _, ext = os.path.splitext(file.lower())
            if ext in supported_extensions:
                filepath = os.path.join(root, file)
                # print(f"Processing: {filepath}")
                
                # Calculate hash
                img_hash = get_image_hash(filepath)
                if img_hash:
                    image_hashes[img_hash].append(filepath)
    
    return image_hashes

def analyze_duplicates(image_hashes):
    """
    Analyze and print information about duplicate images.
    """
    total_files = sum(len(paths) for paths in image_hashes.values())
    duplicate_groups = 0
    duplicate_files = 0
    
    print(f"\n{'='*60}")
    print(f"SCAN RESULTS")
    print(f"{'='*60}")
    print(f"Total image files processed: {total_files}")
    
    duplicates_found = False
    
    for hash_value, file_paths in image_hashes.items():
        if len(file_paths) > 1:
            duplicates_found = True
            duplicate_groups += 1
            duplicate_files += len(file_paths) - 1  # Don't count the original
            
            print(f"\n{'-'*40}")
            print(f"Duplicate Group {duplicate_groups}:")
            print(f"Found {len(file_paths)} identical images:")
            
            for i, path in enumerate(file_paths, 1):
                # Extract just the filename and parent folder for cleaner display
                parts = path.split(os.sep)
                if len(parts) >= 2:
                    display_path = os.path.join("...", parts[-2], parts[-1])
                else:
                    display_path = parts[-1]
                print(f"  {i}. {display_path}")
                print(f"     Full path: {path}")
    
    print(f"\n{'='*60}")
    print(f"SUMMARY")
    print(f"{'='*60}")
    if duplicates_found:
        print(f"Found {duplicate_groups} groups of duplicate images")
        print(f"Total duplicate files: {duplicate_files}")
        print(f"Space could be saved by removing {duplicate_files} files")
    else:
        print("âœ“ No duplicate images found!")
    print(f"{'='*60}")

# Main execution cell
# Replace this path with your actual folder path
FOLDER_TO_SCAN = "stimuli"  # Change this to your folder path

# Verify the folder exists
if not os.path.exists(FOLDER_TO_SCAN):
    print(f"Error: Folder '{FOLDER_TO_SCAN}' does not exist!")
    print("Please update the FOLDER_TO_SCAN variable with the correct path.")
else:
    print(f"Scanning for duplicate images in: {FOLDER_TO_SCAN}")
    print("This may take a while for large folders...\n")
    
    # Find duplicates
    image_hashes = find_duplicate_images(FOLDER_TO_SCAN)
    
    # Analyze and display results
    analyze_duplicates(image_hashes)

# Optional: Display additional statistics
print("\nAdditional Statistics:")
print(f"Unique images found: {len(image_hashes)}")

# Count files by extension
extensions_count = defaultdict(int)
for hash_value, file_paths in image_hashes.items():
    for filepath in file_paths:
        _, ext = os.path.splitext(filepath.lower())
        extensions_count[ext] += 1

print("\nFile types found:")
for ext, count in sorted(extensions_count.items()):
    print(f"  {ext}: {count} files")



# Create a list of all duplicate files for DataFrame
duplicate_data = []
group_number = 1

for hash_value, file_paths in image_hashes.items():
    if len(file_paths) > 1:
        for i, path in enumerate(file_paths):
            duplicate_data.append({
                'Group': group_number,
                'File_Number': i + 1,
                'Filename': os.path.basename(path),
                'Directory': os.path.dirname(path),
                # 'Full_Path': path,
                # 'Hash': hash_value[:10] + '...'  # Show only first 10 chars of hash
            })
        group_number += 1

if duplicate_data:
    df_duplicates = pd.DataFrame(duplicate_data)
    print("\nDuplicate Files Summary (showing in tabular format):")
    print(df_duplicates.to_string(index=False))
    
    # Save to CSV if desired
    # df_duplicates.to_csv('duplicate_images_report.csv', index=False)
    # print("\nReport saved to 'duplicate_images_report.csv'")
else:
    print("\nNo duplicates to display in table format.")

Scanning for duplicate images in: stimuli
This may take a while for large folders...


SCAN RESULTS
Total image files processed: 2489

----------------------------------------
Duplicate Group 1:
Found 2 identical images:
  1. .../stimuli/219_1.jpg
     Full path: stimuli/219_1.jpg
  2. .../demo_stims/219_1.jpg
     Full path: stimuli/demo_stims/219_1.jpg

----------------------------------------
Duplicate Group 2:
Found 2 identical images:
  1. .../stimuli/219_2.jpg
     Full path: stimuli/219_2.jpg
  2. .../demo_stims/219_2.jpg
     Full path: stimuli/demo_stims/219_2.jpg

----------------------------------------
Duplicate Group 3:
Found 2 identical images:
  1. .../stimuli/219_3.jpg
     Full path: stimuli/219_3.jpg
  2. .../demo_stims/219_3.jpg
     Full path: stimuli/demo_stims/219_3.jpg

----------------------------------------
Duplicate Group 4:
Found 2 identical images:
  1. .../stimuli/281_2.jpg
     Full path: stimuli/281_2.jpg
  2. .../demo_stims/281_2.jpg
     Full path: sti

In [21]:
def get_image_hash(filepath):
    """Calculate hash of an image file."""
    try:
        with Image.open(filepath) as img:
            img_bytes = img.tobytes()
            return hashlib.md5(img_bytes).hexdigest()
    except Exception as e:
        print(f"Could not process {filepath}: {e}")
        return None

def find_duplicate_images(folder_path):
    """Find duplicate images in folder and all subfolders."""
    image_hashes = defaultdict(list)
    supported_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            _, ext = os.path.splitext(file.lower())
            if ext in supported_extensions:
                filepath = os.path.join(root, file)
                # print(f"Processing: {filepath}")
                
                img_hash = get_image_hash(filepath)
                if img_hash:
                    image_hashes[img_hash].append(filepath)
    
    return image_hashes

# 2. SAVE DUPLICATE LIST
def save_duplicate_list(image_hashes, output_path="duplicate_images_list.json"):
    """Save list of duplicate images to JSON file."""
    duplicates_data = {
        "scan_date": datetime.now().isoformat(),
        "duplicate_groups": []
    }
    
    group_id = 1
    all_duplicates = []
    
    for hash_value, file_paths in image_hashes.items():
        if len(file_paths) > 1:
            group_data = {
                "group_id": group_id,
                "hash": hash_value,
                "files": file_paths
            }
            duplicates_data["duplicate_groups"].append(group_data)
            
            # Add all files in this group to our flat list
            all_duplicates.extend(file_paths)
            group_id += 1
    
    # Save JSON file
    with open(output_path, 'w') as f:
        json.dump(duplicates_data, f, indent=2)
    
    print(f"\nDuplicate list saved to: {output_path}")
    print(f"Total duplicate files: {len(all_duplicates)}")
    
    return all_duplicates


In [22]:
def check_files_in_csv(file_list, csv_paths, folder_path):
    """Check if files appear in the specified CSV files."""
    print(f"\n{'='*60}")
    print("CHECKING DUPLICATES IN CSV FILES")
    print(f"{'='*60}")
    
    results = {}
    
    for csv_path in csv_paths:
        # CSV files are in the current working directory, not in folder_path
        full_csv_path = os.path.join(os.getcwd(), csv_path)
        
        if not os.path.exists(full_csv_path):
            print(f"\nWarning: {csv_path} not found in {folder_path}")
            continue
        
        try:
            # Read CSV file
            df = pd.read_csv(full_csv_path)
            print(f"\nChecking {csv_path}...")
            print(f"CSV shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")
            
            # Look for columns that might contain file paths
            potential_path_columns = []
            for col in df.columns:
                # Check if column contains file paths
                if col.lower() in ['path', 'file', 'filename', 'filepath', 'image', 'source']:
                    potential_path_columns.append(col)
                elif df[col].dtype == 'object':
                    # Check if values look like file paths
                    sample_values = df[col].dropna().head(5).astype(str)
                    if any('/' in str(val) or '\\' in str(val) or '.' in str(val) for val in sample_values):
                        potential_path_columns.append(col)
            
            if not potential_path_columns:
                print(f"No obvious path columns found in {csv_path}")
                print("All columns:", list(df.columns))
                # Let user choose columns to check
                continue
            
            # Check each potential path column
            found_files = set()
            for col in potential_path_columns:
                print(f"\nChecking column '{col}'...")
                
                for file_path in file_list:
                    filename = os.path.basename(file_path)
                    
                    # Check if filename appears in this column
                    matches = df[df[col].astype(str).str.contains(filename, na=False, case=False)]
                    
                    if not matches.empty:
                        found_files.add(file_path)
                        print(f"  Found: {filename}")
                        print(f"    Rows: {len(matches)}")
                        for idx, row in matches.head(3).iterrows():
                            print(f"    Example: {row[col]}")
            
            results[csv_path] = {
                'total_files_checked': len(file_list),
                'files_found': len(found_files),
                'found_files': list(found_files),
                'columns_checked': potential_path_columns
            }
            
            print(f"\nSummary for {csv_path}:")
            print(f"  Files found: {len(found_files)} out of {len(file_list)}")
            
        except Exception as e:
            print(f"Error reading {csv_path}: {e}")
            results[csv_path] = {'error': str(e)}
    
    return results


In [31]:

# 4. MAIN EXECUTION
# Configure paths
FOLDER_TO_SCAN = "new_stimuli"  # Change this to your folder path
# CSV_FILES = ["initial_practice.csv", "prac.csv", "demo.csv"]
CSV_FILES = ["selected_and_foils.csv"]
# Verify the folder exists
if not os.path.exists(FOLDER_TO_SCAN):
    print(f"Error: Folder '{FOLDER_TO_SCAN}' does not exist!")
    print("Please update the FOLDER_TO_SCAN variable with the correct path.")
else:
    print(f"Scanning for duplicate images in: {FOLDER_TO_SCAN}")
    print("This may take a while for large folders...\n")
    
    # Step 1: Find duplicates
    image_hashes = find_duplicate_images(FOLDER_TO_SCAN)
    
    # Step 2: Save duplicate list
    duplicate_files = save_duplicate_list(image_hashes)
    
    # Step 3: Check if duplicates appear in CSV files
    if duplicate_files:
        csv_results = check_files_in_csv(duplicate_files, CSV_FILES, FOLDER_TO_SCAN)
        
        # Save CSV check results
        csv_results_path = "csv_check_results.json"
        with open(csv_results_path, 'w') as f:
            json.dump(csv_results, f, indent=2)
        print(f"\nCSV check results saved to: {csv_results_path}")
        
        # Create summary report
        print(f"\n{'='*60}")
        print("FINAL SUMMARY")
        print(f"{'='*60}")
        
        for csv_file, result in csv_results.items():
            if 'error' not in result:
                print(f"\n{csv_file}:")
                print(f"  Duplicates found: {result['files_found']}/{result['total_files_checked']}")
                if result['files_found'] > 0:
                    print(f"  Found files:")
                    for file_path in result['found_files']:  # Show first 5
                        print(f"    - {os.path.basename(file_path)}")
                    # if len(result['found_files']) > 5:
                    #     print(f"    ... and {len(result['found_files']) - 5} more")
    else:
        print("\nNo duplicate images found, so nothing to check in CSV files.")

# # 5. OPTIONAL: More detailed analysis
# def analyze_csv_structure(csv_paths, folder_path):
#     """Analyze the structure of CSV files to help identify relevant columns."""
#     print(f"\n{'='*60}")
#     print("CSV STRUCTURE ANALYSIS")
#     print(f"{'='*60}")
    
#     for csv_path in csv_paths:
#         # CSV files are in the current working directory, not in folder_path
#         full_csv_path = os.path.join(os.getcwd(), csv_path)
        
#         if os.path.exists(full_csv_path):
#             try:
#                 df = pd.read_csv(full_csv_path, nrows=10)  # Read only first 10 rows for analysis
#                 print(f"\n{csv_path}:")
#                 print(f"  Shape: {df.shape}")
#                 print(f"  Columns: {list(df.columns)}")
#                 print(f"  Sample data:")
#                 for col in df.columns:
#                     print(f"    {col}: {df[col].iloc[0] if not df.empty else 'N/A'}")
#             except Exception as e:
#                 print(f"\nError analyzing {csv_path}: {e}")

# Run CSV structure analysis
# analyze_csv_structure(CSV_FILES, FOLDER_TO_SCAN)

# # 6. INTERACTIVE COLUMN SELECTION (if needed)
# def check_specific_columns(file_list, csv_paths, folder_path, columns_to_check):
#     """Check specific columns in CSV files."""
#     print(f"\n{'='*60}")
#     print("CHECKING SPECIFIC COLUMNS")
#     print(f"{'='*60}")
    
#     for csv_path in csv_paths:
#         # CSV files are in the current working directory, not in folder_path
#         full_csv_path = os.path.join(os.getcwd(), csv_path)
        
#         if os.path.exists(full_csv_path):
#             try:
#                 df = pd.read_csv(full_csv_path)
#                 print(f"\nChecking {csv_path}, columns: {columns_to_check}")
                
#                 for col in columns_to_check:
#                     if col in df.columns:
#                         print(f"\nColumn '{col}':")
#                         for file_path in file_list[:5]:  # Check first 5 files
#                             filename = os.path.basename(file_path)
#                             matches = df[df[col].astype(str).str.contains(filename, na=False, case=False)]
#                             if not matches.empty:
#                                 print(f"  Found {filename}: {len(matches)} matches")
#                     else:
#                         print(f"\nColumn '{col}' not found in {csv_path}")
#             except Exception as e:
#                 print(f"Error: {e}")

# # Example: Uncomment and modify to check specific columns
# # check_specific_columns(duplicate_files, CSV_FILES, FOLDER_TO_SCAN, ['filename', 'path'])

Scanning for duplicate images in: new_stimuli
This may take a while for large folders...


Duplicate list saved to: duplicate_images_list.json
Total duplicate files: 28

CHECKING DUPLICATES IN CSV FILES

Checking selected_and_foils.csv...
CSV shape: (240, 1)
Columns: ['ImagePath']

Checking column 'ImagePath'...
  Found: 97_3.jpg
    Rows: 1
    Example: ./stimuli/297_3.jpg

Summary for selected_and_foils.csv:
  Files found: 1 out of 28

CSV check results saved to: csv_check_results.json

FINAL SUMMARY

selected_and_foils.csv:
  Duplicates found: 1/28
  Found files:
    - 97_3.jpg
