## 1.1 Hash: move distinct images to one folder and group duplicates into per-image folders

In [None]:
import os
import hashlib
import shutil

# Paths.
# Raw strings keep the Windows backslashes literal; sequences such as "\C" or
# "\D" in a normal string are invalid escapes and trigger a warning.
source_folder = r"P:\CAD_Retrival\Images"  # Replace with your image folder path
output_folder = r"P:\CAD_Retrival\Duplicates"  # Folder where duplicate folders will be stored
distinct_folder = r"P:\CAD_Retrival\Distinct"  # Folder for unique images

# Ensure output folders exist
os.makedirs(output_folder, exist_ok=True)
os.makedirs(distinct_folder, exist_ok=True)

# Function to compute hash of an image
def compute_hash(image_path, chunk_size=1024 * 1024):
    """Return the MD5 hex digest of a file's bytes, or None if it cannot be read.

    The file is hashed in fixed-size chunks so that large files are never
    loaded into memory in one piece.

    Args:
        image_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The hexadecimal MD5 digest as a string, or None when the file could
        not be opened or read (the error is printed).
    """
    hasher = hashlib.md5()
    try:
        with open(image_path, "rb") as f:
            # iter() with a sentinel stops at the first empty read (end of file).
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hasher.update(chunk)
    except (OSError, ValueError) as e:
        print(f"Error processing {image_path}: {e}")
        return None
    return hasher.hexdigest()

# hash -> path of the first file seen with that content
hash_dict = {}
# hash -> paths of every later file with the same content
duplicates_dict = {}


def _unique_destination(folder, filename):
    """Return a path for `filename` inside `folder` that does not exist yet.

    os.walk() descends into sub-folders, so two different source files can
    share a base name. Moving both to the same destination would silently
    overwrite the first one, so a numeric suffix is appended when needed.
    """
    candidate = os.path.join(folder, filename)
    stem, ext = os.path.splitext(filename)
    suffix = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{suffix}{ext}")
        suffix += 1
    return candidate


# Scan folder for images
count = 0
for root, _, files in os.walk(source_folder):  # os.walk() also visits sub-folders
    for filename in files:
        file_path = os.path.join(root, filename)

        if os.path.isfile(file_path):  # Ensure it's a file
            img_hash = compute_hash(file_path)

            if img_hash:
                if img_hash in hash_dict:
                    # Same content as an earlier file: record it as a duplicate.
                    duplicates_dict.setdefault(img_hash, []).append(file_path)
                else:
                    hash_dict[img_hash] = file_path  # Store first occurrence

        count += 1
        if count % 500 == 0:
            print(f"Processed {count} images...")  # Print progress every 500 images

# Move distinct images to the distinct folder, remembering the name each one
# ends up with so the duplicate folders can be named after it.
distinct_names = {}
for img_hash, original_path in hash_dict.items():
    destination = _unique_destination(distinct_folder, os.path.basename(original_path))
    shutil.move(original_path, destination)
    distinct_names[img_hash] = os.path.basename(destination)

# Create folders and move duplicates
for img_hash, duplicate_paths in duplicates_dict.items():
    # splitext() removes only the extension; split('.')[0] would cut a name
    # like "part.v2.png" down to "part" and merge unrelated duplicate sets.
    distinct_image_name = os.path.splitext(distinct_names[img_hash])[0]
    duplicate_folder = os.path.join(output_folder, distinct_image_name)

    os.makedirs(duplicate_folder, exist_ok=True)  # Create folder for each distinct image

    # Move duplicate images into the respective folder
    for dup in duplicate_paths:
        shutil.move(dup, _unique_destination(duplicate_folder, os.path.basename(dup)))

print(f"\n Processed {count} images.")
print(f" Moved distinct images to {distinct_folder}.")
print(f" Moved duplicate images into separate folders in {output_folder}.")


## 1.2 Copy one image from each duplicate folder -> Distinct folder

In [None]:
import os
import shutil

# Paths (raw strings so the backslashes are never treated as escape sequences)
duplicates_folder = r"P:\CAD_Retrival\Duplicates"  # Folder containing duplicate directories
distinct_folder = r"P:\CAD_Retrival\Distinct"  # Folder where one image per duplicate set will be copied

# Ensure the distinct folder exists
os.makedirs(distinct_folder, exist_ok=True)

# Process each duplicate folder
for folder_name in os.listdir(duplicates_folder):
    folder_path = os.path.join(duplicates_folder, folder_name)

    if not os.path.isdir(folder_path):  # Skip anything that is not a directory
        continue

    # Keep regular files only (shutil.copy cannot copy a sub-directory) and
    # sort them so the chosen "first" image is the same on every run.
    files = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )
    if not files:  # Nothing to copy from an empty folder
        continue

    first_image = os.path.join(folder_path, files[0])  # Pick the first image
    destination_path = os.path.join(distinct_folder, files[0])

    shutil.copy(first_image, destination_path)  # Copy the file

    print(f"✅ Copied: {first_image} → {destination_path}")

print(f"\n✅ Successfully copied one image from each duplicate folder to {distinct_folder}.")


## 2. Create folders of at most 1000 images each

In [None]:
import os
import shutil

# Define source and destination directories
src = r"P:\CAD_Retrival\Filtered_2_N"
dest = r"P:\CAD_Retrival\b_a_Filtered_2_N_Sets(1000)"

# Create destination directory if it doesn't exist
os.makedirs(dest, exist_ok=True)

# List all entries in the source directory
files = os.listdir(src)

count = 0      # number of files already copied into the current set folder
foldernum = 1  # 1-based index of the current set folder
current_folder = os.path.join(dest, f"Set_{foldernum}")

for file in files:
    file_path = os.path.join(src, file)
    # Only regular files are copied (add image-type checks here if needed)
    if not os.path.isfile(file_path):
        continue

    # Roll over to the next set once the current one holds 1000 files.
    if count == 1000:
        foldernum += 1
        count = 0
        current_folder = os.path.join(dest, f"Set_{foldernum}")

    # Create a set folder only when a file is about to go into it, so no
    # empty trailing folder is left behind when the total is a multiple of 1000.
    if count == 0:
        os.makedirs(current_folder, exist_ok=True)

    shutil.copy(file_path, current_folder)
    count += 1


## 3. DBSCAN on Sets

In [None]:
import os
import shutil
import torch
from torchvision import transforms, models
from sklearn.cluster import DBSCAN
import numpy as np
import time
from PIL import Image

# ---- Directories ----
# Parent folder containing the sets (e.g., Set_1, Set_2, ...)
BASE_IMAGE_DIR = r"P:\CAD_Retrival\b_a_Filtered_2_N_Sets(1000)"
# Parent folder that receives the cluster folders of every set
BASE_TARGET_DIR = r"P:\CAD_Retrival\b_c_Filtered_2_N_Clusters"
# Folder where the per-set feature files are written
FEATURE_OUTPUT_DIR = r"P:\CAD_Retrival\b_b_Filtered_2_N_Features"
# Shared folder for the noise images of all sets
ALL_NOISE_DIR = r"P:\CAD_Retrival\b_d_Filtered_2_N_All_Noise"

# ---- Runtime ----
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
BATCH_SIZE = 20  # Currently unused; kept for future batch processing

# ---- DBSCAN parameters ----
EPSILON = 10     # Neighbourhood radius; tune to the image similarity
MIN_SAMPLES = 2  # Minimum images per cluster

# ---- Range of sets to process (1-indexed, both ends inclusive) ----
START_SET = 1
END_SET = 198

# Make sure the output folders are there before anything is written.
for _required_dir in (FEATURE_OUTPUT_DIR, ALL_NOISE_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# VGG16 used as a feature extractor: the last classifier layer is removed.
print("Loading VGG16 model...")
model = models.vgg16(pretrained=True)
model = model.to(DEVICE)
model.classifier = model.classifier[:-1]
model.eval()
print("Model loaded successfully.\n")

# Preprocessing pipeline applied to each image before it reaches the model.
_preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
preprocess = transforms.Compose(_preprocess_steps)

def extract_features(image_paths):
    """Extract one VGG16 feature vector per readable image.

    Each image is opened, converted to RGB, preprocessed and passed through
    `model` on `DEVICE`. Images that fail at any step are reported and
    skipped, so the two returned sequences always line up.

    Args:
        image_paths: Paths of the images to process.

    Returns:
        A tuple (features, valid_image_paths): a NumPy array with one
        flattened feature vector per successfully processed image, and the
        list of the corresponding paths in the same order.
    """
    print("Extracting features...")
    all_features = []
    valid_image_paths = []
    total = len(image_paths)

    with torch.no_grad():
        for i, img_path in enumerate(image_paths):
            try:
                # The context manager closes the file as soon as the pixels
                # have been converted.
                with Image.open(img_path) as raw_img:
                    img = raw_img.convert("RGB")
                img_tensor = preprocess(img).unsqueeze(0).to(DEVICE)
                # No explicit torch.cuda.synchronize(): it raises when CUDA is
                # unavailable (the CPU fallback of DEVICE), and .cpu() already
                # waits for the GPU result before copying it.
                features = model(img_tensor).cpu().numpy()
            except Exception as e:
                # Best effort: one unreadable image must not stop the set.
                print(f"Error processing {img_path}: {e}")
                continue
            all_features.append(features.flatten())
            valid_image_paths.append(img_path)
            print(f"Processed {i+1}/{total} images", end="\r")

    print("\nFeature extraction completed.\n")
    return np.array(all_features), valid_image_paths

def perform_clustering(features_array):
    """Run DBSCAN (Euclidean metric, EPSILON / MIN_SAMPLES) and return one label per row."""
    print("Performing DBSCAN clustering...")
    clusterer = DBSCAN(metric='euclidean', min_samples=MIN_SAMPLES, eps=EPSILON)
    cluster_labels = clusterer.fit_predict(features_array)
    print("DBSCAN clustering completed.\n")
    return cluster_labels

def organize_images(image_paths, labels, cluster_target_dir, noise_target_dir, cluster_prefix):
    """
    Copy every image into the folder that matches its cluster label.

    A label of -1 sends the image to noise_target_dir. Any other label sends
    it to "<cluster_prefix>_cluster<label>" inside cluster_target_dir
    (e.g., "S1_cluster0"), which is created on demand. The source files are
    copied with shutil.copy2 and left in place.
    """
    print("Organizing images into cluster folders...\n")
    for base_dir in (cluster_target_dir, noise_target_dir):
        os.makedirs(base_dir, exist_ok=True)

    total = len(image_paths)
    for position, (path, cluster) in enumerate(zip(image_paths, labels), start=1):
        if cluster == -1:
            bucket = 'noise'
            dest_dir = noise_target_dir
        else:
            bucket = f'{cluster_prefix}_cluster{cluster}'
            dest_dir = os.path.join(cluster_target_dir, bucket)
            os.makedirs(dest_dir, exist_ok=True)
        shutil.copy2(path, dest_dir)
        print(f"Image {position}/{total} moved to {bucket}", end="\r")
    print("\nImages successfully organized.\n")

def extract_set_number(folder_name):
    """Return the integer after the first underscore of 'Set_<number>', or None if that fails."""
    try:
        pieces = folder_name.split('_')
        number = int(pieces[1])
    except Exception as e:
        print(f"Error extracting number from folder '{folder_name}': {e}")
        return None
    return number

if __name__ == "__main__":
    total_start_time = time.time()

    # Every sub-directory of BASE_IMAGE_DIR is a candidate set folder.
    all_set_folders = [
        d for d in os.listdir(BASE_IMAGE_DIR)
        if os.path.isdir(os.path.join(BASE_IMAGE_DIR, d))
    ]

    # Keep the folders whose number lies in [START_SET, END_SET], in numeric order.
    valid_set_folders = []
    for folder in all_set_folders:
        num = extract_set_number(folder)
        if num is None or not (START_SET <= num <= END_SET):
            continue
        valid_set_folders.append(folder)
    valid_set_folders.sort(key=extract_set_number)

    print(f"Processing the following set folders: {valid_set_folders}\n")

    for set_folder in valid_set_folders:
        print(f"Processing set: {set_folder}")
        IMAGE_DIR = os.path.join(BASE_IMAGE_DIR, set_folder)
        # Clusters of this set go to "<set_folder>_output" (e.g., "Set_1_output").
        cluster_target_dir = os.path.join(BASE_TARGET_DIR, f"{set_folder}_output")
        # Noise of this set goes to its own sub-folder of ALL_NOISE_DIR.
        set_num = extract_set_number(set_folder)
        if set_num is None:
            noise_subfolder, cluster_prefix = f"noise_{set_folder}", set_folder
        else:
            noise_subfolder, cluster_prefix = f"noise_set{set_num}", f"S{set_num}"
        noise_target_dir = os.path.join(ALL_NOISE_DIR, noise_subfolder)
        feature_file_path = os.path.join(FEATURE_OUTPUT_DIR, f"{set_folder}_features.npz")

        # Ask before replacing results of an earlier run.
        has_previous_output = any(
            os.path.exists(out_dir) and os.listdir(out_dir)
            for out_dir in (cluster_target_dir, noise_target_dir)
        )
        if has_previous_output:
            answer = input(f"Cluster and/or noise folders for {set_folder} already exist. Do you want to delete them and replace with new clusters? (y/n): ")
            if answer.lower() != 'y':
                print(f"Skipping processing for {set_folder}.\n")
                continue
            for stale_dir in (cluster_target_dir, noise_target_dir):
                if os.path.exists(stale_dir):
                    shutil.rmtree(stale_dir)

        # Gather the regular files of the current set.
        image_paths = []
        for f in os.listdir(IMAGE_DIR):
            candidate = os.path.join(IMAGE_DIR, f)
            if os.path.isfile(candidate):
                image_paths.append(candidate)
        print(f"Found {len(image_paths)} images in {set_folder}.\n")

        if not os.path.exists(feature_file_path):
            # No cached features: extract them, cluster, then cache them.
            features_array, valid_image_paths = extract_features(image_paths)
            if features_array.size == 0:
                print(f"No valid features extracted for {set_folder}. Skipping clustering.\n")
                continue
            labels = perform_clustering(features_array)
            image_names = [os.path.basename(path) for path in valid_image_paths]
            np.savez_compressed(feature_file_path,
                                features=features_array,
                                image_names=image_names,
                                image_paths=valid_image_paths)
            print(f"Saved features to {feature_file_path}\n")
        else:
            # Cached features exist: reuse them and only redo the clustering.
            print(f"Feature file for {set_folder} exists. Loading features from file...\n")
            data = np.load(feature_file_path, allow_pickle=True)
            features_array = data["features"]
            valid_image_paths = list(data["image_paths"])
            image_names = list(data["image_names"])
            labels = perform_clustering(features_array)

        if len(labels) > 0:
            organize_images(valid_image_paths, labels, cluster_target_dir, noise_target_dir, cluster_prefix)
            print(f"Completed clustering for {set_folder}.\n")
        else:
            print(f"No clusters generated for {set_folder}.\n")

    print(f"Total execution time: {time.time() - total_start_time:.2f} seconds.\n")


## 4.1 Visualization

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def visualize_cluster_summary_all_sets(base_cluster_dir, base_noise_dir):
    """
    Aggregates and visualizes summary statistics for each set.

    Assumptions:
      - Each set's cluster output is in a subdirectory of base_cluster_dir (e.g., "Set_156_output").
      - Cluster folders inside a set are named like "S156_clusterX".
      - Noise images for a set are stored in base_noise_dir in a subfolder named "noise_set156".

    Returns a DataFrame, sorted by set, with the following columns:
      - set: The set number.
      - num_clusters: The number of cluster subfolders.
      - total_cluster_images: The total number of files in all clusters.
      - noise_count: The number of files in the noise folder (0 if it is missing).
      - avg_cluster_size: The average number of files per cluster.
      - median_cluster_size: The median number of files per cluster.

    Also displays the DataFrame and draws a bar chart of cluster images and
    noise images per set. When no set folder is found, an empty DataFrame with
    the same columns is returned and the chart is skipped.
    """
    columns = ["set", "num_clusters", "total_cluster_images",
               "noise_count", "avg_cluster_size", "median_cluster_size"]

    def count_files(folder):
        """Number of regular files directly inside `folder`."""
        return sum(1 for f in os.listdir(folder)
                   if os.path.isfile(os.path.join(folder, f)))

    # Get a list of set folders from base_cluster_dir
    set_folders = [d for d in os.listdir(base_cluster_dir)
                   if os.path.isdir(os.path.join(base_cluster_dir, d))]
    summary_list = []

    for set_folder in set_folders:
        set_path = os.path.join(base_cluster_dir, set_folder)
        # Expecting folder names like "Set_156_output"
        try:
            set_num = int(set_folder.split('_')[1])
        except (IndexError, ValueError) as e:
            print(f"Error extracting set number from {set_folder}: {e}")
            continue

        # One entry per cluster subfolder: how many files it holds
        cluster_counts = [
            count_files(os.path.join(set_path, subfolder))
            for subfolder in os.listdir(set_path)
            if os.path.isdir(os.path.join(set_path, subfolder))
        ]
        num_clusters = len(cluster_counts)
        total_cluster_images = sum(cluster_counts)
        # np.mean / np.median warn on empty input, hence the explicit 0
        avg_cluster_size = np.mean(cluster_counts) if cluster_counts else 0
        median_cluster_size = np.median(cluster_counts) if cluster_counts else 0

        # Noise count from the corresponding noise folder (e.g., "noise_set156")
        noise_folder = os.path.join(base_noise_dir, f"noise_set{set_num}")
        noise_count = count_files(noise_folder) if os.path.exists(noise_folder) else 0

        summary_list.append({
            "set": set_num,
            "num_clusters": num_clusters,
            "total_cluster_images": total_cluster_images,
            "noise_count": noise_count,
            "avg_cluster_size": avg_cluster_size,
            "median_cluster_size": median_cluster_size
        })

    # Naming the columns keeps them present even when summary_list is empty;
    # otherwise sort_values("set") raises KeyError on a column-less frame.
    df = pd.DataFrame(summary_list, columns=columns)
    df.sort_values("set", inplace=True)

    # Display the DataFrame
    display(df)

    if df.empty:
        print("No set folders found; skipping the bar chart.")
        return df

    # Bar chart for total cluster images vs. noise images per set.
    # Both series share the same x positions, so the bars are overlaid.
    plt.figure(figsize=(12, 6))
    x_labels = df["set"].astype(str)
    plt.bar(x_labels, df["total_cluster_images"], label="Cluster Images", alpha=0.7)
    plt.bar(x_labels, df["noise_count"], label="Noise Images", alpha=0.7)
    plt.xlabel("Set Number")
    plt.ylabel("Number of Images")
    plt.title("Cluster Images vs. Noise Images per Set")
    plt.xticks(rotation=90)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return df


In [None]:
# Build the per-set summary (also displays the table and draws the bar chart).
# NOTE(review): these two folders differ from BASE_TARGET_DIR / ALL_NOISE_DIR
# used by the clustering cell - confirm they point at the intended run.
df_summary = visualize_cluster_summary_all_sets(r"P:\CAD_Retrival\c_Clusters", r"P:\CAD_Retrival\d_All_Noise")


## 4.2 Detailed Information about Sets and Cluster -> csv

In [None]:
# Requires 'df_summary' from the visualization cell above.
# Writes the summary to "cluster_summary.csv" in the current working
# directory, without the DataFrame index column.
df_summary.to_csv("cluster_summary.csv", index=False)


In [None]:
import os
import pandas as pd

def get_detailed_cluster_counts(base_cluster_dir, base_noise_dir):
    """
    Creates a detailed DataFrame with the number of files in each cluster (and in the noise folder)
    for each set.

    Assumptions:
      - Each set's cluster output is stored in a subdirectory of base_cluster_dir,
        with names like "Set_156_output".
      - Inside each set folder, cluster subdirectories are named using a pattern like "S156_cluster0", "S156_cluster1", etc.
      - The corresponding noise folder for a set is stored in base_noise_dir, with a name like "noise_set156".

    Returns:
      A Pandas DataFrame sorted by (set, cluster) with columns:
        - set: The set number.
        - cluster: The cluster folder name (e.g., "S156_cluster0"), or "noise" for the noise folder.
        - num_images: The number of files in that folder (0 for a missing noise folder).
      The columns are present even when no set folder is found.
    """
    def extract_set_num(folder_name):
        """Number after the first underscore of "Set_156_output", or -1 if absent."""
        try:
            return int(folder_name.split('_')[1])
        except (IndexError, ValueError):
            return -1  # marks folders that do not follow the naming pattern

    def count_files(folder):
        """Number of regular files directly inside `folder`."""
        return sum(1 for f in os.listdir(folder)
                   if os.path.isfile(os.path.join(folder, f)))

    records = []

    # Get list of set folders in base_cluster_dir
    set_folders = [d for d in os.listdir(base_cluster_dir)
                   if os.path.isdir(os.path.join(base_cluster_dir, d))]

    for set_folder in sorted(set_folders, key=extract_set_num):
        set_num = extract_set_num(set_folder)
        if set_num == -1:
            continue  # not a "Set_<number>..." folder
        set_path = os.path.join(base_cluster_dir, set_folder)

        # One record per cluster folder inside this set folder
        for cluster_folder in os.listdir(set_path):
            cluster_folder_path = os.path.join(set_path, cluster_folder)
            if os.path.isdir(cluster_folder_path):
                records.append({"set": set_num,
                                "cluster": cluster_folder,
                                "num_images": count_files(cluster_folder_path)})

        # One record for the noise folder of this set
        noise_folder = os.path.join(base_noise_dir, f"noise_set{set_num}")
        noise_count = count_files(noise_folder) if os.path.exists(noise_folder) else 0
        records.append({"set": set_num, "cluster": "noise", "num_images": noise_count})

    # Naming the columns keeps them present when `records` is empty; without
    # them sort_values() raises KeyError on a column-less frame.
    df = pd.DataFrame(records, columns=["set", "cluster", "num_images"])
    df.sort_values(["set", "cluster"], inplace=True)
    return df

# Example usage: count the files of every cluster / noise folder under the
# two result directories below.
base_cluster_dir = r"P:\CAD_Retrival\c_Clusters"
base_noise_dir = r"P:\CAD_Retrival\d_All_Noise"

df_detailed = get_detailed_cluster_counts(base_cluster_dir, base_noise_dir)
# `display` is provided by the notebook environment (IPython).
display(df_detailed)

# Save the DataFrame to CSV (current working directory, no index column)
df_detailed.to_csv("detailed_cluster_counts.csv", index=False)


## 5. Copy a representative image from each cluster of each set and create metadata (set name, image name)

In [None]:
import os
import shutil
import numpy as np
import torch
from torchvision import transforms, models
from sklearn.cluster import DBSCAN
import time
from PIL import Image

# ----------------- Configuration -----------------
# Folder with the original sets (e.g., Set_156, Set_157, ...)
BASE_IMAGE_DIR = r"P:\CAD_Retrival\a_Set(1000_images)"
# Folder containing each set's cluster output (e.g., Set_156_output, ...)
BASE_TARGET_DIR = r"P:\CAD_Retrival\a_c_Filtered_2_C_Clusters"
# Folder where the individual feature files are stored
FEATURE_OUTPUT_DIR = r"P:\CAD_Retrival\a_b_Filtered_2_C_Features"
# Single destination folder for the representative images
DEST_FOLDER = r"P:\CAD_Retrival\Filtered_3_C"
OUTPUT_META_FILE = os.path.join(FEATURE_OUTPUT_DIR, "f_C_Filtered_DBscan1.npz")
# Range of set numbers to process (both ends inclusive)
START_SET = 1
END_SET = 359
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ----------------- Model Setup (VGG16 example) -----------------
# The final classification layer is dropped (comment in the original notes
# an output feature size of 4096).
print("Loading VGG16 model...")
model = models.vgg16(pretrained=True)
model = model.to(DEVICE)
model.classifier = model.classifier[:-1]
model.eval()
print("Model loaded successfully.\n")

# ----------------- Preprocessing -----------------
_preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
preprocess = transforms.Compose(_preprocess_steps)

# ----------------- Helper Functions -----------------

def extract_set_number(folder_name):
    """Return the integer after the first underscore (e.g., 156 for 'Set_156' or 'Set_156_output'), or None."""
    try:
        pieces = folder_name.split('_')
        number = int(pieces[1])
    except Exception as e:
        print(f"Error extracting number from folder '{folder_name}': {e}")
        return None
    return number

def copy_representative_images_and_save_metadata(base_cluster_dir, feature_output_dir, dest_folder, output_meta_file, start_set, end_set):
    """
    Copy one image per cluster into dest_folder and record where it came from.

    For each set folder in base_cluster_dir (e.g., "Set_156_output") whose
    number lies in [start_set, end_set], this function:
      - Loads the image names from the matching feature file in
        feature_output_dir ("Set_<num>_features.npz"); the set is skipped
        when that file is missing.
      - Iterates over each cluster subfolder whose name does not contain "noise".
      - Takes the first file listed in the cluster as its representative;
        clusters whose representative is not named in the feature file are
        reported and skipped.
      - Copies the representative to dest_folder under its original name.
      - Records the set folder name and the image name.
    Finally, all records are saved into a single NPZ file (output_meta_file)
    with keys "sets" and "image_names". Nothing is written when no image
    was copied.
    """
    os.makedirs(dest_folder, exist_ok=True)
    meta_list = []  # List of dicts with keys: "set" and "image_name"

    # Set folders of the cluster output directory, restricted to the requested range.
    valid_set_folders = []
    for folder in os.listdir(base_cluster_dir):
        if not os.path.isdir(os.path.join(base_cluster_dir, folder)):
            continue
        num = extract_set_number(folder)
        if num is not None and start_set <= num <= end_set:
            valid_set_folders.append(folder)
    valid_set_folders.sort(key=extract_set_number)

    for set_folder in valid_set_folders:
        print(f"Processing set: {set_folder}")
        set_num = extract_set_number(set_folder)
        set_cluster_dir = os.path.join(base_cluster_dir, set_folder)
        # Assume corresponding feature file is named "Set_<num>_features.npz"
        feat_file = os.path.join(feature_output_dir, f"Set_{set_num}_features.npz")
        if not os.path.exists(feat_file):
            print(f"Feature file for {set_folder} not found. Skipping.")
            continue
        # NOTE: allow_pickle=True executes pickled data; only load feature
        # files produced by this pipeline. The context manager closes the
        # archive instead of leaving one open handle per set.
        with np.load(feat_file, allow_pickle=True) as data:
            # Only the file names are needed; a set gives O(1) membership tests.
            known_image_names = set(data["image_names"].tolist())

        # Process each non-noise cluster folder (skip folders with "noise")
        cluster_folders = [d for d in os.listdir(set_cluster_dir)
                           if os.path.isdir(os.path.join(set_cluster_dir, d)) and "noise" not in d.lower()]
        for cl_folder in cluster_folders:
            cluster_path = os.path.join(set_cluster_dir, cl_folder)
            images = [f for f in os.listdir(cluster_path) if os.path.isfile(os.path.join(cluster_path, f))]
            if not images:
                continue
            rep_img_name = images[0]  # Use the first image as representative.
            if rep_img_name not in known_image_names:
                print(f"Representative image {rep_img_name} not found in feature file for {set_folder}.")
                continue
            # Copy representative image to dest_folder (without renaming)
            src_img = os.path.join(cluster_path, rep_img_name)
            dest_img = os.path.join(dest_folder, rep_img_name)
            shutil.copy2(src_img, dest_img)
            print(f"Copied {src_img} to {dest_img}")
            # Append metadata: save only the set folder name and image name
            meta_list.append({
                "set": set_folder,
                "image_name": rep_img_name
            })

    # Save the accumulated metadata into a single NPZ file.
    if meta_list:
        sets_arr = np.array([d["set"] for d in meta_list])
        image_names_arr = np.array([d["image_name"] for d in meta_list])
        np.savez_compressed(output_meta_file,
                            sets=sets_arr,
                            image_names=image_names_arr)
        print(f"\nSaved metadata to {output_meta_file}")
    else:
        print("No metadata collected.")

# ----------------- Main Execution -----------------

# Run the representative-image copy with the configuration constants defined
# at the top of this cell and report the wall-clock time it took.
if __name__ == "__main__":
    total_start_time = time.time()
    copy_representative_images_and_save_metadata(
        base_cluster_dir=BASE_TARGET_DIR,
        feature_output_dir=FEATURE_OUTPUT_DIR,
        dest_folder=DEST_FOLDER,
        output_meta_file=OUTPUT_META_FILE,
        start_set=START_SET,
        end_set=END_SET
    )
    print(f"\nTotal execution time: {time.time() - total_start_time:.2f} seconds.\n")


## 5.2 Copy noise images to one folder and save metadata

In [None]:
import os
import shutil
import numpy as np

def copy_all_noise_images_and_save_metadata(noise_base_dir, dest_folder, output_meta_file):
    r"""
    Copies all files from each noise folder in noise_base_dir to a single destination folder (dest_folder)
    and saves metadata for each copied file. The metadata contains:
      - "set": the name of the noise folder (e.g., "noise_set156")
      - "image_name": the final file name in dest_folder

    If a file name conflict occurs in dest_folder, "_<counter>" is inserted before the
    extension (counter starting at 1) until the name is free, so nothing is overwritten.

    Parameters:
      noise_base_dir (str): Base directory containing noise folders (e.g., "P:\CAD_Retrival\d_All_Noise")
      dest_folder (str): Destination folder where all noise images will be copied.
      output_meta_file (str): Path to the NPZ file where metadata will be saved.
        NumPy appends ".npz" when the name does not already end with it.
    """
    os.makedirs(dest_folder, exist_ok=True)
    meta_list = []  # To accumulate metadata for each copied image

    # List all noise folders in the noise base directory.
    noise_folders = [d for d in os.listdir(noise_base_dir)
                     if os.path.isdir(os.path.join(noise_base_dir, d))]

    for noise_folder in noise_folders:
        folder_path = os.path.join(noise_base_dir, noise_folder)
        for file_name in os.listdir(folder_path):
            src_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(src_path):
                continue  # sub-directories are not copied

            # Start from the original name and add a counter until it is free.
            base, ext = os.path.splitext(file_name)
            final_name = file_name
            dest_path = os.path.join(dest_folder, final_name)
            counter = 0
            while os.path.exists(dest_path):
                counter += 1
                final_name = f"{base}_{counter}{ext}"
                dest_path = os.path.join(dest_folder, final_name)

            shutil.copy2(src_path, dest_path)
            print(f"Copied {src_path} to {dest_path}")
            meta_list.append({
                "set": noise_folder,
                "image_name": final_name
            })

    if meta_list:
        sets_arr = np.array([d["set"] for d in meta_list])
        image_names_arr = np.array([d["image_name"] for d in meta_list])
        np.savez_compressed(output_meta_file, sets=sets_arr, image_names=image_names_arr)
        # Report the file NumPy really wrote: it appends ".npz" to path names
        # that lack the extension.
        saved_to = output_meta_file
        if isinstance(saved_to, (str, os.PathLike)):
            saved_to = os.fspath(saved_to)
            if not saved_to.endswith(".npz"):
                saved_to += ".npz"
        print(f"\nSaved metadata to {saved_to}")
    else:
        print("No noise images found to copy.")

# ----------------- Example Usage -----------------
# Adjust these paths to your environment:
# NOTE(review): output_meta_file is the same string as the feature *folder*
# (FEATURE_OUTPUT_DIR) of the previous cell, not a file name. np.savez_compressed
# appends ".npz" to names lacking it, so this presumably writes
# "a_b_Filtered_2_C_Features.npz" beside that folder - confirm the intended
# metadata file name.
copy_all_noise_images_and_save_metadata(
    noise_base_dir=r"P:\CAD_Retrival\a_d_Filtered_2_C_All_Noise",
    dest_folder=r"P:\CAD_Retrival\Filtered_3_N",
    output_meta_file=r"P:\CAD_Retrival\a_b_Filtered_2_C_Features"
)


In [None]:
import os
import numpy as np

def combine_feature_files(feature_dir, output_file):
    """
    Combines all NPZ feature files in the given directory into a single NPZ file.

    Each NPZ file is expected to contain:
      - 'features': a numpy array of shape (num_images, feature_dim)
      - 'image_names': a list (or array) of image filenames (strings)
      - Optionally, 'full_image_paths' or 'image_paths': a list of full image
        paths ('image_paths' is the key written by the feature-extraction cell).

    The function loads all files ending with '_features.npz' in feature_dir in
    sorted name order, concatenates the arrays and lists, and saves the combined
    data into output_file under the keys 'features', 'image_names' and
    'full_image_paths' (None for images whose file stores no path).

    The output file itself is skipped while scanning: a name such as
    "combined_features.npz" also ends with '_features.npz', and would otherwise
    be merged into itself on every re-run.

    Parameters:
      feature_dir (str): Directory containing individual NPZ feature files.
      output_file (str): Path where the combined NPZ file will be saved.
    """
    all_features = []
    all_image_names = []
    all_full_paths = []

    # Resolve the path NumPy will really write (".npz" is appended when
    # missing) so that an earlier combined file can be recognised and skipped.
    combined_path = None
    if isinstance(output_file, (str, os.PathLike)):
        combined_path = os.fspath(output_file)
        if not combined_path.endswith(".npz"):
            combined_path += ".npz"
        combined_path = os.path.normcase(os.path.abspath(combined_path))

    # Sorted so the row order of the combined file is reproducible.
    for fname in sorted(os.listdir(feature_dir)):
        if not fname.endswith("_features.npz"):
            continue
        fpath = os.path.join(feature_dir, fname)
        if os.path.normcase(os.path.abspath(fpath)) == combined_path:
            continue  # never feed a previous combined file back in

        print(f"Loading features from: {fpath}")
        # NOTE: allow_pickle=True executes pickled data; only load feature
        # files produced by this pipeline.
        with np.load(fpath, allow_pickle=True) as data:
            features = data["features"]
            image_names = list(data["image_names"])
            # Use the stored paths if any; otherwise None for each image.
            if "full_image_paths" in data.files:
                full_paths = list(data["full_image_paths"])
            elif "image_paths" in data.files:
                full_paths = list(data["image_paths"])
            else:
                full_paths = [None] * len(image_names)

        all_features.append(features)
        all_image_names.extend(image_names)
        all_full_paths.extend(full_paths)

    if all_features:
        combined_features = np.concatenate(all_features, axis=0)
    else:
        combined_features = np.array([])

    np.savez_compressed(output_file,
                        features=combined_features,
                        image_names=all_image_names,
                        full_image_paths=all_full_paths)
    print(f"Combined feature file saved to: {output_file}")

# Example usage: merge every "*_features.npz" file found in feature_dir and
# write the result into that same directory.
feature_dir = r"P:\CAD_Retrival\b_Features"
output_file = os.path.join(feature_dir, "combined_features.npz")
combine_feature_files(feature_dir, output_file)


## GROK

In [None]:
import os
import shutil
import numpy as np
import torch
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import faiss
from tqdm import tqdm

# Use the GPU when CUDA is available; otherwise fall back to the CPU so the
# model code can still run on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Feature extractor built from a pretrained VGG16
class VGG16Features(torch.nn.Module):
    """VGG16 with the final classifier layer removed.

    The convolutional stack and the average pool are reused as-is; the
    classifier keeps every layer except the last one, so the forward pass
    ends at the second-to-last classifier layer (4096 features).
    """

    def __init__(self):
        super().__init__()
        backbone = models.vgg16(pretrained=True).to(device)
        head_layers = list(backbone.classifier.children())
        self.features = backbone.features
        self.avgpool = backbone.avgpool
        # Everything but the last classifier layer.
        self.classifier = torch.nn.Sequential(*head_layers[:-1])

    def forward(self, x):
        pooled = self.avgpool(self.features(x))
        # One flat vector per sample before the fully connected layers.
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)

# Build the feature extractor on the selected device and switch it to
# inference mode (eval() returns the module, so the calls can be chained).
model = VGG16Features().to(device).eval()

# Preprocessing applied to every image before it is fed to the network
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),  # Resize to VGG16 input size
        transforms.ToTensor(),  # Convert to tensor
        transforms.Normalize(  # Normalize for VGG16
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        ),
    ]
)

# Custom Dataset class for loading images
class ImageDataset(Dataset):
    """Dataset that loads images from a list of file paths.

    Each item is an ``(image, path)`` pair: the image is converted to RGB and
    passed through ``transform`` when one was given; ``path`` is the entry of
    ``image_paths`` it was loaded from.

    Parameters:
      image_paths (list): Paths of the images to load.
      transform (callable, optional): Applied to the RGB image of every item.
    """

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        # Open inside a context manager so the file handle is released as soon
        # as the pixels are decoded; convert() returns an independent RGB copy.
        with Image.open(img_path) as source:
            image = source.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, img_path

# Specify your image directory.  A raw string keeps the Windows backslashes
# literal instead of relying on "\C" and "\T" being unrecognized escapes.
image_dir = r"P:\CAD_Retrival\Testing3"  # Replace with your directory
# Match the extension case-insensitively so files named "*.PNG" are not skipped.
image_paths = [
    os.path.join(image_dir, f)
    for f in os.listdir(image_dir)
    if f.lower().endswith('.png')
]

# Create dataset and dataloader for batch processing
dataset = ImageDataset(image_paths, transform=transform)
if __name__ == '__main__':
    if not image_paths:
        # Fail early with a clear message instead of an opaque torch.cat error.
        raise ValueError(f"No PNG images found in {image_dir}")

    # Pinned host memory is only useful when batches are copied to a CUDA device.
    dataloader = DataLoader(
        dataset,
        batch_size=4,
        shuffle=False,
        num_workers=0,
        pin_memory=(device.type == "cuda"),
    )

    # Extract features in batches
    features = []
    for batch_images, _ in tqdm(dataloader, desc="Extracting features"):
        batch_images = batch_images.to(device)
        with torch.no_grad():  # No gradient computation
            batch_features = model(batch_images)
        features.append(batch_features.cpu())  # Keep results on the CPU
    features = torch.cat(features, dim=0)  # Concatenate all features

    # L2-normalize so the inner product used below is the cosine similarity
    features = F.normalize(features, p=2, dim=1)

    # FAISS expects a contiguous float32 array
    features_np = np.ascontiguousarray(features.numpy(), dtype='float32')

    # Exact inner-product index on the CPU.  The range search below is not
    # implemented by FAISS GPU indexes, so the index is deliberately kept on
    # the CPU; the dimension is taken from the data instead of being hard-coded.
    index = faiss.IndexFlatIP(features_np.shape[1])
    index.add(features_np)

    # Range search: every pair whose similarity exceeds the threshold.
    # lim[i]:lim[i + 1] delimits the slice of D / I that belongs to query i.
    threshold = 0.9  # Similarity threshold (0 to 1)
    lim, D, I = index.range_search(features_np, threshold)

    # Union-find to group similar images
    parent = list(range(len(image_paths)))

    def find(x):
        # Iterative on purpose: a long chain of near-duplicates would otherwise
        # exceed Python's recursion limit.
        root = x
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path at the root.
        while parent[x] != root:
            next_node = parent[x]
            parent[x] = root
            x = next_node
        return root

    def union(x, y):
        px, py = find(x), find(y)
        if px != py:
            parent[px] = py  # Union operation

    # Group similar images
    for i in range(len(image_paths)):
        start, end = int(lim[i]), int(lim[i + 1])
        for j in I[start:end]:
            j = int(j)  # plain int keeps the parent list free of NumPy scalars
            if j != i:  # Skip self-similarity
                union(i, j)

    # Identify unique groups (keyed by the representative of each set)
    groups = {}
    for i, path in enumerate(image_paths):
        groups.setdefault(find(i), []).append(path)

    # Create subfolders and copy similar images
    similar_dir = r"P:\CAD_Retrival\Testing3(Output)"
    os.makedirs(similar_dir, exist_ok=True)
    group_counter = 1
    for img_list in groups.values():
        if len(img_list) > 1:  # Only copy groups with multiple images
            subfolder = os.path.join(similar_dir, f'group_{group_counter}')
            os.makedirs(subfolder, exist_ok=True)
            for img_path in img_list:
                shutil.copy(img_path, subfolder)  # Copy, not move
            group_counter += 1

    print(f"Done! Similar images are copied to subfolders in '{similar_dir}'.")

In [None]:
import os
import shutil
import numpy as np
import torch
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import cv2
import faiss
from tqdm import tqdm

# Pick the compute device: CUDA when it is available, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Embedding network built from a pretrained ResNet50
class ResNet50Features(torch.nn.Module):
    """ResNet50 with its last child module removed, returning flat vectors."""

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(pretrained=True).to(device)
        # Keep every child module except the last one.
        kept_layers = list(backbone.children())[:-1]
        self.features = torch.nn.Sequential(*kept_layers)

    def forward(self, x):
        activations = self.features(x)
        batch_size = activations.size(0)
        # Collapse all non-batch dimensions into one vector per sample.
        return activations.view(batch_size, -1)

# Dataset that reads images with OpenCV
class ImageDataset(Dataset):
    """Dataset yielding ``(image, file_name)`` pairs for a list of paths.

    The image is read with OpenCV, converted from BGR to RGB, wrapped in a PIL
    image and passed through ``transform`` when one is set.  Any failure while
    loading or transforming is printed and reported as ``(None, file_name)``
    instead of being raised.
    """

    def __init__(self, image_paths, transform=None):
        self.transform = transform
        self.image_paths = image_paths

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        try:
            bgr = cv2.imread(img_path)
            if bgr is None:
                # imread signals failure by returning None; route it through
                # the shared error handler below.
                raise ValueError("Image not loaded")
            pil_image = Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
            sample = self.transform(pil_image) if self.transform else pil_image
            return sample, os.path.basename(img_path)
        except Exception as e:
            print(f"Error loading {img_path}: {e}")
            return None, os.path.basename(img_path)

# Batch feature extraction
def extract_features_batch(image_paths, model, transform, device):
    """Run ``model`` over the images in ``image_paths`` in batches.

    Parameters:
      image_paths (list): Paths of the images to process.
      model: Network called on each batch; it is switched to eval mode.
      transform: Preprocessing handed to ImageDataset.
      device: Device each batch is moved to before the forward pass.

    Returns:
      tuple: ``(features, image_names)`` - the concatenated model outputs as a
      CPU tensor and the names reported by ImageDataset for the images that
      loaded successfully, in the same order.

    Raises:
      ValueError: If ``image_paths`` is empty or no image could be processed.
    """
    if not image_paths:
        # Do not reference the script-level ``image_folder_path`` here: it is
        # not a parameter and only exists when the file runs as a script.
        raise ValueError("No image files found: image_paths is empty")

    def collate_skip_failed(samples):
        # ImageDataset reports an unreadable file as (None, name) and the
        # default collate function cannot batch None, so drop those samples
        # here and hand the loop plain lists.
        kept = [(img, name) for img, name in samples if img is not None]
        return [img for img, _ in kept], [name for _, name in kept]

    dataset = ImageDataset(image_paths, transform)
    dataloader = DataLoader(
        dataset,
        batch_size=32,
        shuffle=False,
        num_workers=0,
        # Pinned memory only matters when batches are copied to a CUDA device.
        pin_memory=torch.device(device).type == "cuda",
        collate_fn=collate_skip_failed,
    )
    features = []
    image_names = []
    model.eval()
    for valid_images, valid_names in tqdm(dataloader, desc="Extracting features"):
        if not valid_images:
            print("Skipping batch: No valid images")
            continue
        batch_tensor = torch.stack(valid_images).to(device)
        with torch.no_grad():
            batch_features = model(batch_tensor)
        features.append(batch_features.cpu())
        image_names.extend(valid_names)
    if not features:
        raise ValueError("No features extracted. Check image files and paths.")
    return torch.cat(features, dim=0), image_names

# FAISS index construction
def build_faiss_index(feature_vectors):
    """Create a FAISS ``IndexFlatL2`` and add ``feature_vectors`` to it.

    The index dimension is read from the second axis of ``feature_vectors``.
    Returns the populated index.
    """
    dim = feature_vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dim)
    print("Adding vectors to FAISS index...")
    flat_index.add(feature_vectors)
    print("FAISS IndexFlatL2 built.")
    return flat_index

# Find similar images
def find_similar_images(index, feature_vectors, image_names, k=10, distance_threshold=0.5):
    """Collect, for every image, its near neighbours within a distance limit.

    Parameters:
      index: Object with a ``search(queries, k)`` method returning
        ``(distances, indices)`` (a FAISS index built over ``feature_vectors``).
      feature_vectors: 2-D array with one row per entry of ``image_names``.
      image_names (list): Name of the image behind each row.
      k (int): Maximum number of neighbours considered per image.
      distance_threshold (float): A neighbour is kept only when its reported
        distance is strictly below this value.

    Returns:
      dict: Maps an image name to the list of names of its similar images;
      images without any similar neighbour are left out.
    """
    similar_image_folders = {}
    for i, name in enumerate(image_names):
        query_vector = np.expand_dims(feature_vectors[i], axis=0)
        # One extra result is requested because the query is itself indexed.
        distances, indices = index.search(query_vector, k + 1)
        similar_images = []
        considered = 0
        for neighbor_idx, distance in zip(indices[0], distances[0]):
            neighbor_idx = int(neighbor_idx)
            # Identify the query by its index, not by its position: with exact
            # duplicates (distance 0 ties) the query is not guaranteed to come
            # first.  Negative ids are padding for missing results.
            if neighbor_idx == i or neighbor_idx < 0:
                continue
            if considered == k:
                break
            considered += 1
            if distance < distance_threshold:
                similar_images.append(image_names[neighbor_idx])
        if similar_images:
            similar_image_folders[name] = similar_images
            print(f"Found {len(similar_images)} similar images for {name}")
    return similar_image_folders

# Copy each group of similar images into its own folder
def copy_similar_images_to_folders(similar_image_folders, image_folder_path, output_base_folder):
    """Copy every representative image and its similar images into one folder.

    For each ``representative -> [similar, ...]`` entry a folder named
    ``similar_to_<representative without extension>`` is created under
    ``output_base_folder``; the representative is copied first, then each
    similar image.  All source files are read from ``image_folder_path``.
    """
    os.makedirs(output_base_folder, exist_ok=True)

    def copy_into(target_folder, file_name):
        shutil.copy(
            os.path.join(image_folder_path, file_name),
            os.path.join(target_folder, file_name),
        )

    for rep_image, sim_images in similar_image_folders.items():
        stem = os.path.splitext(rep_image)[0]
        group_folder = os.path.join(output_base_folder, f"similar_to_{stem}")
        os.makedirs(group_folder, exist_ok=True)
        copy_into(group_folder, rep_image)
        for sim_image in sim_images:
            copy_into(group_folder, sim_image)

# Feature validation helper
def filter_finite_features(features_np, image_names):
    """Drop feature rows containing NaN/inf together with their image names.

    Parameters:
      features_np: 2-D array with one row per entry of ``image_names``.
      image_names (list): Name belonging to each row.

    Returns:
      tuple: ``(kept_features, kept_names)`` with rows and names still aligned.
    """
    # Compute the mask once, on the unfiltered array, and apply it to both the
    # rows and the names so they cannot drift apart.
    finite_rows = np.isfinite(features_np).all(axis=1)
    kept_names = [name for name, keep in zip(image_names, finite_rows) if keep]
    return features_np[finite_rows], kept_names


# Main
if __name__ == "__main__":
    image_folder_path = "P:\\CAD_Retrival\\Testing3"
    output_base_folder = "P:\\CAD_Retrival\\Testing3(Output)"
    distance_threshold = 0.5
    num_neighbors = 10

    # Validate directory
    if not os.path.exists(image_folder_path):
        raise FileNotFoundError(f"Directory not found: {image_folder_path}")

    # Model and transform
    model = ResNet50Features().to(device)
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Extract features (extension matched case-insensitively so "*.PNG" counts)
    image_paths = [
        os.path.join(image_folder_path, f)
        for f in os.listdir(image_folder_path)
        if f.lower().endswith('.png')
    ]
    print(f"Found {len(image_paths)} PNG files in {image_folder_path}")
    features, image_names = extract_features_batch(image_paths, model, transform, device)

    # Validate features
    features_np = features.numpy().astype('float32')
    if np.any(~np.isfinite(features_np)):
        print("Warning: Feature vectors contain NaN or infinite values.")
        features_np, image_names = filter_finite_features(features_np, image_names)

    # Build FAISS index
    index = build_faiss_index(features_np)

    # Find similar images
    similar_image_folders = find_similar_images(index, features_np, image_names, k=num_neighbors, distance_threshold=distance_threshold)

    # Copy to folders
    copy_similar_images_to_folders(similar_image_folders, image_folder_path, output_base_folder)
    print("Done!")

Using device: cuda
Found 0 PNG files in P:\CAD_Retrival\Testing3


ValueError: No image files found in P:\CAD_Retrival\Testing3

In [3]:
import os
import shutil
import time
import torch
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
import faiss
import numpy as np
from tqdm import tqdm

class ImageFeatureExtractor:
    """
    Extracts features from images using a pre-trained PyTorch model.

    The torchvision model named by ``model_name`` is loaded without its
    classification head and every image is reduced to one flat feature vector.

    Parameters:
      model_name (str): torchvision model name; ``resnet*``, ``vgg*`` and
        ``efficientnet*`` are supported.
      pretrained (bool): Forwarded to the torchvision model constructor.
      device (str): Device the model runs on.
    """
    def __init__(self, model_name='resnet50', pretrained=True, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device
        self.model_name = model_name
        self.model = self.load_model(pretrained)
        self.transform = self.get_transform()

    def load_model(self, pretrained):
        """Loads a pre-trained model and sets it to evaluation mode.

        Raises:
          ValueError: If ``self.model_name`` is not a supported architecture.
        """
        supported_prefixes = ('resnet', 'vgg', 'efficientnet')
        if not self.model_name.startswith(supported_prefixes):
            # Reject unsupported names before any weights are loaded.
            raise ValueError(f"Unsupported model architecture: {self.model_name}")

        model = getattr(models, self.model_name)(pretrained=pretrained)
        if self.model_name.startswith('resnet'):
            # Every child except the last one (the fc layer).
            modules = list(model.children())[:-1]
        else:
            # VGG / EfficientNet: convolutional stack plus the model's own
            # pooling layer, without the classifier.  Without the pooling layer
            # the EfficientNet output would stay a spatial map instead of one
            # value per channel.
            modules = list(model.features) + [model.avgpool]

        model = torch.nn.Sequential(*modules)
        model = model.to(self.device)
        model.eval()  # Set to evaluation mode
        return model

    def get_transform(self):
        """Defines the image transformations."""
        return transforms.Compose([
            transforms.Resize((224, 224)),  # Resize to the model's input size
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet normalization
        ])

    def _embed(self, img):
        """Runs one preprocessed image tensor through the model.

        Returns the model output flattened to a 1-D NumPy array.
        """
        batch = img.unsqueeze(0).to(self.device)  # Add batch dimension
        with torch.no_grad():  # Disable gradient calculation
            features = self.model(batch)
        # reshape(-1) instead of squeeze(): squeeze() only drops size-1 axes,
        # so a (1, C, H, W) output with H, W > 1 would stay multi-dimensional
        # and could not be stacked into a 2-D matrix for indexing.
        return features.reshape(-1).cpu().numpy()

    def extract_features(self, image_path):
        """Extracts features from a single image.

        Returns a 1-D NumPy feature vector, or None when the image cannot be
        opened (the error is printed).
        """
        try:
            # The context manager releases the file handle once the pixels are
            # decoded; convert() returns an independent RGB image.
            with Image.open(image_path) as source:
                img = source.convert('RGB')  # Ensure RGB format
        except (FileNotFoundError, OSError) as e:
            print(f"Error opening image {image_path}: {e}")
            return None

        return self._embed(self.transform(img))



class ImageSimilaritySearch:
    """
    Uses FAISS to build an index and find similar images.

    Parameters:
      dimension (int): Length of every feature vector stored in the index.
    """
    def __init__(self, dimension):
        self.dimension = dimension
        # Flat L2 index wrapped in an ID map so that search results carry the
        # ids supplied to build_index.
        self.index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))

    def build_index(self, features, ids):
        """Builds the FAISS index.

        Parameters:
          features: Sequence of feature vectors, each of length ``dimension``.
          ids: One integer id per feature vector.

        Raises:
          ValueError: If the features are not ``(n, dimension)`` shaped or the
            number of ids differs from the number of vectors.
        """
        features = np.asarray(features, dtype='float32')
        # FAISS stores ids as 64-bit integers; the platform default integer
        # type of NumPy can be narrower, so convert explicitly.
        ids = np.asarray(ids, dtype='int64').reshape(-1)
        if features.size == 0:
            return  # nothing to add
        if features.ndim != 2 or features.shape[1] != self.dimension:
            raise ValueError(
                f"Expected features of shape (n, {self.dimension}), got {features.shape}"
            )
        if ids.shape[0] != features.shape[0]:
            raise ValueError(
                f"Got {ids.shape[0]} ids for {features.shape[0]} feature vectors"
            )
        self.index.add_with_ids(np.ascontiguousarray(features), ids)

    def search(self, query_features, k=10):
        """Searches for the k most similar images.

        Returns the ``(distances, indices)`` pair produced by the index for the
        single query vector; both have one row.
        """
        query = np.asarray(query_features, dtype='float32').reshape(1, -1)  # single query row
        distances, indices = self.index.search(query, k)
        return distances, indices


def organize_images(image_dir, output_dir, feature_extractor, similarity_search, threshold=10.0, k_neighbors=10):
    """
    Groups the images of ``image_dir`` by similarity and moves them.

    Features are extracted for every image file in ``image_dir`` and indexed.
    Each image that has not been moved yet then opens a new ``group_<n>``
    folder under ``output_dir``, is moved there, and takes along every
    not-yet-moved neighbour whose reported distance is below ``threshold``
    (at most ``k_neighbors`` neighbours are looked up per image).
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image_paths = [
        os.path.join(image_dir, entry)
        for entry in os.listdir(image_dir)
        if entry.lower().endswith(image_extensions)
    ]

    # Feature extraction; an id is the image's position in image_paths, and
    # only images that produced features are kept.
    all_features = []
    valid_image_ids = []
    print("Extracting features...")
    for position, path in enumerate(tqdm(image_paths)):
        vector = feature_extractor.extract_features(path)
        if vector is None:
            continue
        all_features.append(vector)
        valid_image_ids.append(position)

    print("Building FAISS index...")
    similarity_search.build_index(all_features, valid_image_ids)

    print("Organizing images...")
    processed_images = set()  # ids of images already moved into a group
    folder_counter = 0

    for feature_pos, image_id in enumerate(tqdm(valid_image_ids)):
        if image_id in processed_images:
            continue
        image_path = image_paths[image_id]

        # Open a new group folder seeded with the current image.
        folder_counter += 1
        new_folder_path = os.path.join(output_dir, f"group_{folder_counter}")
        os.makedirs(new_folder_path, exist_ok=True)
        shutil.move(image_path, os.path.join(new_folder_path, os.path.basename(image_path)))
        processed_images.add(image_id)

        # all_features is aligned with valid_image_ids, hence feature_pos.
        distances, indices = similarity_search.search(all_features[feature_pos], k=k_neighbors)

        for column in range(indices.shape[1]):
            neighbor_id = indices[0][column]
            is_candidate = (
                neighbor_id != image_id
                and neighbor_id not in processed_images
                and distances[0][column] < threshold
            )
            if not is_candidate:
                continue
            neighbor_path = image_paths[neighbor_id]
            try:
                shutil.move(neighbor_path, os.path.join(new_folder_path, os.path.basename(neighbor_path)))
                processed_images.add(neighbor_id)
            except FileNotFoundError:
                print(f"Warning: Image file not found during move: {neighbor_path}")
            except Exception as e:
                print(f"Error moving image: {neighbor_path}, {e}")


def main():
    """Group the images of the configured folder by visual similarity.

    Builds the feature extractor and the similarity index for ``model_name``,
    runs organize_images on the configured directories and prints the elapsed
    time.

    Raises:
      ValueError: If ``model_name`` is not a supported architecture.
    """
    # Raw strings keep the Windows backslashes literal instead of relying on
    # "\C" and "\T" being unrecognized escape sequences.
    image_dir = r"P:\CAD_Retrival\Testing3"  # Replace with your image directory
    output_dir = r"P:\CAD_Retrival\Testing3(Output)"  # Replace with your desired output directory
    model_name = 'resnet50'
    threshold = 250.0   # Adjust this threshold based on your images and model.  EXPERIMENT!
    k_neighbors = 20   # Number of neighbors considered, could affect your grouping

    # Length of the feature vector produced for each supported architecture
    if model_name.startswith('resnet'):
        # resnet18 / resnet34 end in 512 channels; the deeper ResNets in 2048.
        feature_dimension = 512 if model_name in ('resnet18', 'resnet34') else 2048
    elif model_name.startswith('vgg'):
        feature_dimension = 25088  # 512 x 7 x 7 flattened convolutional output
    elif model_name.startswith('efficientnet'):
        feature_dimension = 1280  # For efficientnet_b0
    else:
        raise ValueError(f"Unsupported model architecture: {model_name}")

    feature_extractor = ImageFeatureExtractor(model_name=model_name)
    similarity_search = ImageSimilaritySearch(dimension=feature_dimension)
    start_time = time.time()

    organize_images(image_dir, output_dir, feature_extractor, similarity_search, threshold=threshold, k_neighbors=k_neighbors)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main()



Extracting features...


100%|██████████| 1656/1656 [00:52<00:00, 31.43it/s]


Building FAISS index...
Organizing images...


  0%|          | 0/1656 [00:00<?, ?it/s]

: 