# For each of the metrics: VLAD, Fisher, SSIM, MS-SSIM:

- Load the pre-computed data (if any) or compute on the fly
- Cluster the images based on the computed data

#  Import libraries

In [1]:
import os
import time
import matplotlib
import matplotlib.pyplot as plt
import re

# Select the Agg backend so figures are rendered off-screen and saved to files
# rather than shown inline.
matplotlib.use('Agg')

from src.utils import *
from src.datasets import ExcavatorDataset
from src.metrics import VLAD, FisherVector
from src.config import ROOT

  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


In [2]:
# Project root (ROOT from src.config); every model and result path in this
# notebook is built relative to it.
root = ROOT

In [3]:
# Each dataset item is unpacked later as `img, *_, path`, i.e. the image comes
# first and the file path last (presumably image, mask, path per `return_type`).
# NOTE(review): `val_dataset` is built from purpose='test' -- confirm the naming is intended.
train_dataset = ExcavatorDataset(return_type='image+mask+path', purpose='train')
val_dataset = ExcavatorDataset(return_type='image+mask+path', purpose='test')

  key: torch.tensor(value / 255.0, dtype=torch.float32)


# Cluster images based on their VLAD vectors (DONE)

In [None]:
from src.utils import *
import tqdm

# Pickled k-means codebooks; one VLAD encoding is produced per codebook.
k_means_models = [
    model for model in os.listdir(rf'{root}/models/pickle_model_files') if 'k_means' in model
]

# Not read by this cell; kept so the notebook-level names stay defined.
vectors_dir = f'{root}/res/vlad'
output_dir = f'{root}/res/similarity_matrix'

# The VLAD vectors depend only on the codebook, not on how many image clusters
# are requested, so they are computed once per model and reused for every
# cluster count. (Pre-computed vectors could instead be read from
# f'{root}/res/vlad/train/<model>.h5' via load_hdf5.)
for model in k_means_models:
    # Load the pickled codebook once per model rather than once per image.
    k_means = load_model(rf'{root}/models/pickle_model_files/{model}')
    feature = 'root_sift' if 'root_sift' in model else 'sift'
    # The first number in the file name is taken as the codebook size.
    num_cls = int(re.findall(r'\d+', model)[0])

    image_paths = []
    vlad_rows = []
    for img, *_, path in train_dataset:
        vlad_rows.append(
            VLAD(image=img, k_means=k_means, flatten=True, feature=feature).vector.reshape(1, -1)
        )
        image_paths.append(path)

    # Single concatenation instead of a quadratic np.append loop. The empty
    # (0, 128 * num_cls) seed keeps the original dtype promotion and still
    # rejects vectors whose width does not match the codebook size.
    vectors = np.concatenate([np.empty((0, 128 * num_cls)), *vlad_rows], axis=0)

    for num_clusters in range(10, 61, 5):
        cluster_images_and_save(image_paths,
                                vectors,
                                n_clusters=num_clusters,
                                output_dir=f'{root}/res/clustered_datasets_{num_clusters}_clusters/vlad/{model}')

# Cluster images based on their Fisher vectors (DONE)

In [None]:
# Pickled GMMs; one Fisher-vector encoding is produced per GMM.
gmm_models = [
    model for model in os.listdir(rf'{root}/models/pickle_model_files') if 'gmm' in model
]

# Not read by this cell; kept so the notebook-level names stay defined.
vectors_dir = f'{root}/res/fisher'
output_dir = f'{root}/res/similarity_matrix'

# The Fisher vectors depend only on the GMM, not on how many image clusters are
# requested, so they are computed once per model and reused for every cluster
# count.
for model in gmm_models:
    # Load the pickled GMM once per model rather than once per image.
    gmm = load_model(rf'{root}/models/pickle_model_files/{model}')
    feature = 'root_sift' if 'root_sift' in model else 'sift'
    # The first number in the file name is taken as the number of components.
    num_cls = int(re.findall(r'\d+', model)[0])

    image_paths = []
    fisher_rows = []
    for img, *_, path in train_dataset:
        fisher_rows.append(
            FisherVector(image=img, gmm=gmm, flatten=True, feature=feature).vector.reshape(1, -1)
        )
        image_paths.append(path)

    # Single concatenation instead of a quadratic np.append loop. The empty
    # seed array keeps the original dtype promotion and still rejects vectors
    # whose width differs from 2 * 128 * num_cls + num_cls.
    vectors = np.concatenate([np.empty((0, 2 * 128 * num_cls + num_cls)), *fisher_rows], axis=0)

    for num_clusters in range(10, 61, 5):
        cluster_images_and_save(image_paths,
                                vectors,
                                n_clusters=num_clusters,
                                output_dir=f'{root}/res/clustered_datasets_{num_clusters}_clusters/fisher/{model}')

# Cluster images based on their SSIM and MS-SSIM values (TODO)

The SSIM and MS-SSIM matrices had to be pre-computed and saved as HDF5 files because computing them on the fly is extremely slow and resource-intensive. The problem is that the image paths were stored as basenames only, not as full paths, so the rows and columns of each matrix have to be reordered to match the dataset order before clustering.

In [7]:
# Map every training image to its position in `train_dataset`, keyed both by
# basename (the form stored in the HDF5 files) and by full path. Both tables
# are filled in a single pass, since each pass iterates the whole dataset.
SORTED_PATHS = {}
SORTED_FULLPATHS = {}
for i, (*_, path) in enumerate(train_dataset):
    SORTED_PATHS[os.path.basename(path)] = i
    SORTED_FULLPATHS[path] = i

def sort_ssim_data(ssim_data: dict[str, np.ndarray], path_order=None) -> dict[str, np.ndarray]:
    """
    Reorder the 'ssim' and 'ms_ssim' matrices (rows and columns) and the stored
    image names so that they follow the dataset order.

    :param ssim_data:
        A dictionary that should contain:
        'image_paths' : array of N image basenames (bytes or str) in stored order.
        'ssim'        : np.ndarray of shape (N, N)
        'ms_ssim'     : np.ndarray of shape (N, N)
    :param path_order:
        Optional mapping basename -> position in the desired order. Defaults to
        the notebook-level SORTED_PATHS table.

    :returns:
        A new dictionary with keys 'image_paths', 'ssim' and 'ms_ssim', all
        reordered consistently.

    :raises ValueError: if either matrix is not of shape (N, N).
    :raises KeyError: if a stored basename is missing from the order table.
    """
    train_paths = np.asarray(ssim_data['image_paths'])
    ssim = ssim_data['ssim']
    ms_ssim = ssim_data['ms_ssim']
    num_train = len(train_paths)
    if ssim.shape != (num_train, num_train):
        raise ValueError("'ssim' matrix shape does not match the number of train paths.")
    if ms_ssim.shape != (num_train, num_train):
        raise ValueError("'ms_ssim' matrix shape does not match the number of train paths.")

    order_lookup = SORTED_PATHS if path_order is None else path_order

    # Names read back from HDF5 are bytes; accept plain strings as well.
    target_index = [
        order_lookup[name.decode('utf-8') if isinstance(name, bytes) else str(name)]
        for name in train_paths
    ]

    # target_index[i] is where stored item i belongs. Indexing with it directly
    # would apply the permutation in the wrong direction; argsort gives the
    # stored positions listed in target order, which is what fancy indexing needs.
    order = np.argsort(target_index)

    return {
        'image_paths': train_paths[order],
        'ssim': ssim[np.ix_(order, order)],
        'ms_ssim': ms_ssim[np.ix_(order, order)],
    }

## Cluster images based on SSIM and MS-SSIM values

In [8]:
# Each HDF5 file depends only on sigma, so it is loaded and reordered once and
# then reused for every cluster count.
for sigma in range(0, 12, 2):
    hdf5_path = f'{root}/res/ssim/within_train/ssim_sigma{sigma}.h5'
    ssim_data = load_hdf5(hdf5_path)
    sorted_ssims = sort_ssim_data(ssim_data)
    sorted_paths = sorted_ssims['image_paths']
    num_imgs = len(SORTED_PATHS)
    for feat in ['ssim', 'ms_ssim']:
        # Use the reordered matrix: its rows/columns must line up with the
        # dataset-ordered full paths passed to the clustering helper below.
        data = sorted_ssims[feat]
        row, col = data.shape
        # SSIM and MS-SSIM matrices should be square with shape (N, N)
        if row != num_imgs or col != num_imgs:
            raise ValueError(f"Expected shape ({num_imgs}, {num_imgs}), got ({row}, {col})")
        for num_clusters in range(15, 61, 5):
            cluster_images_and_save(list(SORTED_FULLPATHS.keys()),
                                    data,
                                    n_clusters=num_clusters,
                                    output_dir=f'{root}/res/clustered_datasets_{num_clusters}_clusters/ssim/{feat}_sigma{sigma}')

2024-12-20 09:12:34,685 - root - INFO - Clustering 1782 images into 15 clusters...


Processing clusters:   0%|          | 0/15 [00:00<?, ?it/s]


TypeError: expected str, bytes or os.PathLike object, not int