# Import libraries

In [2]:
import os
import time
import matplotlib
import matplotlib.pyplot as plt
import re

# Use the non-interactive Agg backend: figures are rendered for saving to file,
# so plt.show() will not display anything
matplotlib.use('Agg')

from src.utils import *
from src.datasets import ExcavatorDataset
from src.metrics import VLAD, FisherVector
from src.config import ROOT

In [None]:
# Project root imported from src.config; the paths in the cells below are built from it
root = ROOT

# Cluster images based on their VLAD vectors (DONE)

In [None]:
from src.utils import *
import tqdm

# Model names; each one has a '<model>.h5' file of vectors under vectors_dir
k_means_models = [
    'k_means_model_k32_sift',
    'k_means_model_k32_root_sift',
    'k_means_model_k64_root_sift',
    'k_means_model_k64_sift',
    'k_means_model_k128_root_sift',
    'k_means_model_k128_sift',
    'k_means_model_k256_sift',
    'k_means_model_k256_root_sift',
]


vectors_dir = f'{root}/res/vlad'
output_dir = f'{root}/res/similarity_matrix'  # not used in this cell; kept so the name stays defined

# Models form the outer loop so that every HDF5 file is read once instead of
# once per cluster count.
# NOTE(review): assumes cluster_images_and_save does not modify image_paths or
# vectors in place, since they are now shared across cluster counts - confirm.
for model in k_means_models:
    kmeans_h5_paths = f'{vectors_dir}/{model}.h5'

    data = load_hdf5(kmeans_h5_paths)
    image_paths = list(data.keys())
    vectors = np.array([data[path] for path in image_paths])
    # Replace '|' with '/' in the keys to obtain the paths handed to the clustering helper
    image_paths = [path.replace('|', '/') for path in image_paths]

    for num_clusters in range(20, 61, 5):
        cluster_images_and_save(image_paths,
                                vectors,
                                n_clusters=num_clusters,
                                output_dir=f'{root}/res/clustered_datasets_{num_clusters}_clusters/vlad/{model}')

# Cluster images based on their Fisher vectors (DONE)

In [None]:
# Model names; each one has a '<model>.h5' file of vectors under vectors_dir
gmm_models = [
    'gmm_model_k32_sift',
    'gmm_model_k32_root_sift',
    'gmm_model_k64_root_sift',
    'gmm_model_k64_sift',
    'gmm_model_k128_root_sift',
    'gmm_model_k128_sift',
    'gmm_model_k256_sift',
    'gmm_model_k256_root_sift',
]

vectors_dir = f'{root}/res/fisher'
output_dir = f'{root}/res/similarity_matrix'  # not used in this cell; kept so the name stays defined

# Models form the outer loop so that every HDF5 file is read once instead of
# once per cluster count.
# NOTE(review): assumes cluster_images_and_save does not modify image_paths or
# vectors in place, since they are now shared across cluster counts - confirm.
for model in gmm_models:
    gmm_h5_paths = f'{vectors_dir}/{model}.h5'

    data = load_hdf5(gmm_h5_paths)
    image_paths = list(data.keys())
    vectors = np.array([data[path] for path in image_paths])
    # Replace '|' with '/' in the keys to obtain the paths handed to the clustering helper
    image_paths = [path.replace('|', '/') for path in image_paths]

    for num_clusters in range(20, 61, 5):
        cluster_images_and_save(image_paths,
                                vectors,
                                n_clusters=num_clusters,
                                output_dir=f'{root}/res/clustered_datasets_{num_clusters}_clusters/fisher/{model}')

# Evaluate overall similarity within each cluster

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

def create_and_save_boxplots_per_model(
    models_data: dict[str, dict[int, list[float]]],
    x_label: str = "Number of Clusters",
    y_label: str = "Average Similarity Score",
    show: bool = True,
    save_fig_path: str = None,
    plots_per_row: int = 4,
):
    """
    Create one boxplot subplot per model, all sharing the same y-axis range.

    Each subplot shows one box per cluster count (sorted ascending) and, when at
    least two cluster counts have data, a dashed regression line fitted to the
    per-count mean values.

    :param models_data: dict where keys are model names, values are dicts of {number of clusters: list of values}
    :param x_label: Label for x-axis
    :param y_label: Label for y-axis
    :param show: Whether to call ``plt.show()`` once the figure is built
    :param save_fig_path: Path to save the figure; nothing is saved when falsy
    :param plots_per_row: Number of plots per row
    :raises ValueError: If ``models_data`` is empty or ``plots_per_row`` is smaller than 1
    """
    if not models_data:
        raise ValueError("models_data must contain at least one model")
    if plots_per_row < 1:
        raise ValueError("plots_per_row must be at least 1")

    num_models = len(models_data)
    num_rows = (num_models + plots_per_row - 1) // plots_per_row  # Ceiling division
    # squeeze=False always returns a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(
        num_rows,
        plots_per_row,
        figsize=(plots_per_row * 5, num_rows * 5),
        squeeze=False,
    )
    axes = axes.flatten()

    # Compute global y-axis limits; there are none when every value list is empty
    all_values = [
        value
        for data_dict in models_data.values()
        for values in data_dict.values()
        for value in values
    ]
    y_limits = (min(all_values), max(all_values)) if all_values else None

    for idx, (model, data_dict) in enumerate(models_data.items()):
        ax = axes[idx]
        x_values = sorted(data_dict)
        positions = range(len(x_values))

        if x_values:
            ax.boxplot(
                [data_dict[x] for x in x_values],
                positions=positions,
                patch_artist=True,
            )
        # Label the evenly spaced box positions with the actual cluster counts
        ax.set_xticks(positions)
        ax.set_xticklabels(x_values)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(model)
        # Identical limits would only trigger a matplotlib warning, so skip them
        if y_limits is not None and y_limits[0] < y_limits[1]:
            ax.set_ylim(*y_limits)

        # Fit the regression only on cluster counts that have data: the mean of
        # an empty list is NaN and would turn the whole fit into NaN.
        fit_points = [
            (pos, x, float(np.mean(data_dict[x])))
            for pos, x in enumerate(x_values)
            if len(data_dict[x]) > 0
        ]
        if len(fit_points) >= 2:  # linregress needs at least two distinct x values
            fit_positions, fit_x, fit_means = zip(*fit_points)
            slope, intercept, _, _, _ = linregress(fit_x, fit_means)
            reg_line = [slope * x + intercept for x in fit_x]
            # The line is evaluated at the cluster counts but drawn at the box positions
            ax.plot(
                fit_positions,
                reg_line,
                color="red",
                linestyle="--",
                label=f"Slope: {slope:.2f}-Intercept: {intercept:.2f}",
            )
            ax.legend()

    # Remove empty subplots
    for idx in range(num_models, num_rows * plots_per_row):
        fig.delaxes(axes[idx])

    plt.tight_layout()

    if save_fig_path:
        fig.savefig(save_fig_path)

    if show:
        plt.show()

k_means_models = [
    'k_means_model_k32_sift',
    'k_means_model_k32_root_sift',
    'k_means_model_k64_root_sift',
    'k_means_model_k64_sift',
    'k_means_model_k128_root_sift',
    'k_means_model_k128_sift',
    'k_means_model_k256_sift',
    'k_means_model_k256_root_sift',
]

# {model name: {number of clusters: [average similarity of each cluster]}}
vlad_models_data = {}

for model in k_means_models:
    print("Processing model:", model)
    vlad_models_data[model] = {}
    for num_clusters in range(20, 61, 5):
        vlad_models_data[model][num_clusters] = []
        cluster_dir = f'{root}/res/clustered_datasets_{num_clusters}_clusters/vlad/{model}'
        print("Cluster dir:", cluster_dir)

        # A missing directory simply leaves the list for this cluster count empty
        if not os.path.isdir(cluster_dir):
            continue

        for cluster_subdir in os.listdir(cluster_dir):
            cluster_subdir_path = os.path.join(cluster_dir, cluster_subdir)
            json_file_path = os.path.join(cluster_subdir_path, f'{cluster_subdir}_info.json')
            # Only sub-directories holding '<name>_info.json' contribute a value
            if not os.path.isfile(json_file_path):
                continue

            with open(json_file_path, 'r') as f:
                data_json = json.load(f)

            avg_similarity = data_json.get('average_similarity')
            if avg_similarity is not None:
                vlad_models_data[model][num_clusters].append(avg_similarity)

# Now create the boxplots (the plotting function sorts the cluster counts itself).
# The figure gets a VLAD-specific name under the project root so that it cannot
# collide with the figure written for the Fisher-vector models.
create_and_save_boxplots_per_model(
    vlad_models_data,
    x_label="Number of Image Clusters",
    y_label="Average Similarity Score",
    show=True,
    save_fig_path=f'{root}/res/average_similarity_boxplots_vlad.png',
    plots_per_row=4
)


In [None]:
gmm_models = [
    'gmm_model_k32_sift',
    'gmm_model_k32_root_sift',
    'gmm_model_k64_root_sift',
    'gmm_model_k64_sift',
    'gmm_model_k128_root_sift',
    'gmm_model_k128_sift',
    'gmm_model_k256_sift',
    'gmm_model_k256_root_sift',
]

# {model name: {number of clusters: [average similarity of each cluster]}}
gmm_models_data = {}

for model in gmm_models:
    print("Processing model:", model)
    gmm_models_data[model] = {}
    for num_clusters in range(20, 61, 5):
        gmm_models_data[model][num_clusters] = []
        cluster_dir = f'{root}/res/clustered_datasets_{num_clusters}_clusters/fisher/{model}'
        print("Cluster dir:", cluster_dir)

        # A missing directory simply leaves the list for this cluster count empty
        if not os.path.isdir(cluster_dir):
            continue

        for cluster_subdir in os.listdir(cluster_dir):
            cluster_subdir_path = os.path.join(cluster_dir, cluster_subdir)
            json_file_path = os.path.join(cluster_subdir_path, f'{cluster_subdir}_info.json')
            # Only sub-directories holding '<name>_info.json' contribute a value
            if not os.path.isfile(json_file_path):
                continue

            with open(json_file_path, 'r') as f:
                data_json = json.load(f)

            avg_similarity = data_json.get('average_similarity')
            if avg_similarity is not None:
                gmm_models_data[model][num_clusters].append(avg_similarity)

# Now create the boxplots.
# The figure gets a Fisher-specific name under the project root so that it does
# not overwrite the figure written for the VLAD models.
create_and_save_boxplots_per_model(
    gmm_models_data,
    x_label="Number of Image Clusters",
    y_label="Average Similarity Score",
    show=True,
    save_fig_path=f'{root}/res/average_similarity_boxplots_fisher.png',
    plots_per_row=4
)
