# Notebook overview
Computes pairwise cosine distance matrices between image embeddings for different dataset splits and saves the results.

- Loads .pt embeddings via a CustomDataset and DataLoader
- Normalizes embeddings (L2) and computes cosine distances in batches using sklearn.pairwise_distances
- Saves distance matrices (.npy) and records calculation durations

The notebook was exported as a Python script and run in a console using Tmux to execute it. The notebook was used for both origin and resized dataset just adapte the paths.

# Preperation

### Import

In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

import torch
from torch.utils.data import Dataset, DataLoader
torch.cuda.empty_cache()  # GPU-Cache clear
torch.cuda.reset_peak_memory_stats()  # Reset statistics

from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import Normalizer

### Load Paths - df_dir_path, embedding_dir_path, result_dir_path

In [15]:
# df Folder
DF_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created'
df_dir_path = Path(DF_DIR_PATH)
if not df_dir_path.exists():
    raise FileNotFoundError(f"File does not exist: {DF_DIR_PATH}")

# Embeddings Folder
HIGH_EMBEDDING_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/embeddings/adapted/resized/high'
high_embedding_dir_path = Path(HIGH_EMBEDDING_DIR_PATH)
if not high_embedding_dir_path.exists():
    raise FileNotFoundError(f"Folder does not exist: {HIGH_EMBEDDING_DIR_PATH}")

LOW_EMBEDDING_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/embeddings/adapted/resized/low'
low_embedding_dir_path = Path(LOW_EMBEDDING_DIR_PATH)
if not low_embedding_dir_path.exists():
    raise FileNotFoundError(f"Folder does not exist: {LOW_EMBEDDING_DIR_PATH}")

# Results Folder
RESULT_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/model'
result_dir_path = Path(RESULT_DIR_PATH)
if not result_dir_path.exists():
    raise FileNotFoundError(f"Folder does not exist: {RESULT_DIR_PATH}")

# Function

### Function - CustomDataset

In [16]:
class CustomDataset(Dataset): # copy and refactored from CustomDatasetFineGrain Class availible in other files

    def __init__(self, df: pd.DataFrame, embedding_dir_path: Path, column_name_tensor: str):
        self.embedding_dir_path = embedding_dir_path
        self.df_reduced = df[[column_name_tensor]].copy() # create dataFrame with relevant columns

    def __len__(self):
        return len(self.df_reduced)

    def __getitem__(self, item: int):
        image_file_path = self.df_reduced.iloc[item, 0]
        tensor_file_path = Path(image_file_path).with_suffix('.pt')
        absolute_path = self.embedding_dir_path / tensor_file_path
        tensor = torch.load( absolute_path , weights_only=True, map_location='cpu')
        return tensor.squeeze() # squeeze to remove the first dimension

### Function - fill_distance_matrix

In [17]:
def fill_distance_matrix( dataloader_x:DataLoader, dataloader_y:DataLoader, distance_matrix: np.ndarray) -> tuple[np.ndarray, datetime]:
    start_time = datetime.now()
    scaler = Normalizer(norm='l2')

    row_count = 0
    for embedding_batch_x in dataloader_x:
        embedding_batch_x_scaled = scaler.transform(embedding_batch_x.detach().numpy())
        len(embedding_batch_x_scaled)
        col_count = 0
        for embedding_batch_y in dataloader_y:
            embedding_batch_y_scaled = scaler.transform(embedding_batch_y.detach().numpy())
            len(embedding_batch_y_scaled)
            distance_batch = pairwise_distances(embedding_batch_x_scaled, embedding_batch_y_scaled,  metric='cosine', n_jobs=-1)

            row_batch_y = distance_batch.shape[0]
            col_batch_y =  distance_batch.shape[1]
            
            distance_matrix[
                row_count: row_count + row_batch_y,
                col_count: col_count + col_batch_y
                ] = distance_batch

            col_count += col_batch_y
            print(f"> {col_count} columns in batch_y calculated")

        row_batch_x = embedding_batch_x_scaled.shape[0]
        row_count += row_batch_x

        batch_time_x = datetime.now()
        duration_batch_time_x = batch_time_x - start_time
        print(f">>> {row_count} rows in batch_y calculated in time: {duration_batch_time_x}")
    
    end_time = datetime.now()
    durcation_time = end_time - start_time
    return distance_matrix, durcation_time

# Optimize calculation: symmetric matrix - upper triangular matrix calculation sufficient.

### Function - calculate_distance_matrix

In [None]:
def calculate_distance_matrix( x_dataset_path: Path, col_name: str, embedding_dir_path: Path, save_path: Path):
    x_df = pd.read_csv( x_dataset_path , index_col=False, usecols=[col_name]) # nrows=10
    y_df = pd.read_csv( df_dir_path / 'high_id_train.csv', index_col=False, usecols=['image_path']) # nrows=10
    print("dfs loaded")

    x_dataset = CustomDataset(x_df, embedding_dir_path, col_name)
    y_dataset = CustomDataset(y_df, high_embedding_dir_path, 'image_path')
    print("datasets created")

    x_dataloader = DataLoader(x_dataset, batch_size=4096, shuffle=False) #batch_size=4096
    y_dataloader = DataLoader(y_dataset, batch_size=1024, shuffle=False) #batch_size=1024
    print("dataloader created")

    nr_rows_df = len(x_df)
    nr_cols_df = len(y_df)
    distance_matrix = np.empty((nr_rows_df, nr_cols_df), dtype=np.float32)
    print(f"empty matrix created: {nr_rows_df}, {nr_cols_df}")

    distance_matrix_calculated, duration_time = fill_distance_matrix( x_dataloader, y_dataloader, distance_matrix )
    print("distance matrix calculated")

    if nr_rows_df == nr_cols_df:
        np.fill_diagonal(distance_matrix_calculated, 0.0) # Correcting rounding errors on the diagonal
        print("Diagonal elements of the matrix set to 0")

    # save distance_matrix
    np.save( save_path , distance_matrix_calculated)

    # Save calculation time
    with open( save_path.parent / f"{save_path.name}_calculation_time.txt" , "w") as f:
        f.write(str(duration_time))
    print(f"results saved at: {save_path}")

# Apply

### Apply - calculate_distance_matrix

In [19]:
# calculate_distance_matrix( df_dir_path / 'high_id_train.csv', 'image_path', result_dir_path / "distance_matrix_high_id_train.npy")
calculate_distance_matrix( df_dir_path / 'high_id_val.csv', 'image_path', high_embedding_dir_path, result_dir_path / "distance_matrix_high_id_val.npy")

calculate_distance_matrix( df_dir_path / 'high_id_test.csv', 'image_path', high_embedding_dir_path, result_dir_path / "distance_matrix_high_id_test.npy")
calculate_distance_matrix( df_dir_path / 'high_ood_test.csv', 'image_path', high_embedding_dir_path, result_dir_path / "distance_matrix_high_ood_test.npy")

calculate_distance_matrix( df_dir_path / 'low_id_test.csv', 'identifier', low_embedding_dir_path, result_dir_path / "distance_matrix_low_id_test.npy")
calculate_distance_matrix( df_dir_path / 'low_ood_test.csv', 'identifier', low_embedding_dir_path, result_dir_path / "distance_matrix_low_ood_test.npy")

dfs loaded
datasets created
dataloader created
empty matrix created: 10, 10
> 10 columns in batch_y calculated
>>> 10 rows in batch_y calculated in time: 0:00:00.036897
distance matrix calculated
Diagonal elements of the matrix set to 0
results saved at: /home/jleick/masterArbeitProjekt/final_release/models/knn/resized/model/distance_matrix_high_id_val.npy
dfs loaded
datasets created
dataloader created
empty matrix created: 10, 10
> 10 columns in batch_y calculated
>>> 10 rows in batch_y calculated in time: 0:00:00.032817
distance matrix calculated
Diagonal elements of the matrix set to 0
results saved at: /home/jleick/masterArbeitProjekt/final_release/models/knn/resized/model/distance_matrix_high_id_test.npy
dfs loaded
datasets created
dataloader created
empty matrix created: 10, 10
> 10 columns in batch_y calculated
>>> 10 rows in batch_y calculated in time: 0:00:00.035508
distance matrix calculated
Diagonal elements of the matrix set to 0
results saved at: /home/jleick/masterArbeitP

# Review Result

In [20]:
# import sympy as sp

# M = sp.Matrix(distance_matrix[0:100,0:100])
# M

In [21]:
# embedding_0 = train_dataset[0][0].cpu().detach().numpy().reshape(1, -1)
# embedding_0_scaled = scaler.transform(embedding_0)
# embedding_1 = train_dataset[3][0].cpu().detach().numpy().reshape(1, -1)
# embedding_1_scaled = scaler.transform(embedding_1)

# pairwise_distances( embedding_0_scaled, embedding_1_scaled, metric = 'cosine')