# Notebook overview

Calculates various Out-of-Distribution (OOD) distance thresholds at global, per-species, and intra-species levels based on precomputed distance matrices and validation results.

- Loads validation predictions, label maps, and precomputed training distance matrices
- Implements percentile-based logic to determine thresholds for Out-of-Distribution detection
- Computes distance thresholds at three levels: globally over all positive examples, per individual species, and intra-species within the training set
- Saves the resulting threshold dictionaries as pickle files

The notebook was used for both datasets (original and resized); only the paths have to be adapted.

# Preparation

### Import

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import ast
import pickle

### Path - prediction_dir_path, result_dir_path

In [3]:
### The paths below must be adjusted depending on whether the thresholds are calculated
### for the original or the resized data (replace "origin" with "resized" and vice versa).
### NOTE: these are absolute, machine-specific paths; they have to be edited on any other machine.

def _existing_dir(raw_path: str) -> Path:
    """Return `raw_path` as a Path; raise FileNotFoundError if it does not exist."""
    candidate = Path(raw_path)
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Folder does not exist: {raw_path}")

### prediction folder (validation prediction CSV is loaded from here)
PREDICTION_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/models/knn/resized/prediction'
prediction_dir_path = _existing_dir(PREDICTION_DIR_PATH)

### dataset folder (train CSV and label map are loaded from here)
DF_SOURCE_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/data/datasets/created'
df_source_dir = _existing_dir(DF_SOURCE_PATH)

### distance matrix folder (precomputed training distance matrix is loaded from here)
DISTANC_MATRIX_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/models/knn/resized/model'
distanc_matrix_dir_path = _existing_dir(DISTANC_MATRIX_DIR_PATH)

### folder the threshold pickle files are written to
RESULT_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/models/ood/thresholds/resized'
result_dir_path = _existing_dir(RESULT_DIR_PATH)

# Function

### Function - calculate_distances_for_percentiles

In [32]:
def calculate_distances_for_percentiles(distances_sorted: np.ndarray, percentiles: pd.Series) -> dict:
    """Pick one distance threshold per percentile from a sorted sequence of distances.

    For every percentile ``p`` the 1-based rank ``round(n * p)`` is computed, where ``n``
    is the number of distances, and the distance at that rank is stored. Rank 0 is mapped
    to the first (smallest) distance, rank ``n`` to the last one.

    Note: Python's built-in ``round`` rounds exact halves to the nearest even integer
    (e.g. ``round(2.5) == 2``), so the rank is the nearest integer, not always the next one.

    Args:
        distances_sorted: 1-D sequence of distances. The caller is responsible for sorting
            it in ascending order; no sorting happens here.
        percentiles: iterable of fractions in the range [0, 1].

    Returns:
        dict mapping each percentile to the selected distance.

    Raises:
        IndexError: if ``distances_sorted`` is empty or a percentile leads to a rank
            outside ``0..n`` (i.e. the percentile is outside [0, 1]).
    """
    # Positional access regardless of the container type (list, ndarray, Series values).
    distances = np.asarray(distances_sorted)
    num_distances = distances.shape[0]
    if num_distances == 0:
        raise IndexError('distances_sorted is empty, no threshold can be selected')

    percentile_dict = {}
    for percentile in percentiles:
        threshold_index = round(num_distances * percentile)

        # A negative rank would silently wrap around to the end of the array, so reject it.
        if threshold_index < 0 or threshold_index > num_distances:
            raise IndexError(f'threshold_index {threshold_index} not expected')

        # Ranks are 1-based; rank 0 falls back to the smallest distance.
        percentile_dict[percentile] = distances[max(threshold_index, 1) - 1]

    return percentile_dict

### Function - calculate_thresholds_over_all_positiv_examples

1. Keep only the correctly classified examples of the validation dataset.
2. For each of them, take the k-th neighbour distance, i.e. the last entry of its `k_distances` list.
For example, if k = 5, this is the distance to the fifth of the five stored neighbours.
3. Sort these distances.
4. Read the threshold from the sorted list at the position given by the percentile (e.g. 95%).

In [33]:
def calculate_thresholds_over_all_positiv_examples( val_prediction:pd.DataFrame, percentiles: pd.Series) -> dict:
    """Compute global distance thresholds from all correctly classified validation examples.

    Args:
        val_prediction: frame with the columns 'prediction', 'label' and 'k_distances'
            (one equally long sequence of neighbour distances per row).
        percentiles: fractions for which a threshold is selected.

    Returns:
        dict mapping each percentile to a distance threshold.
    """
    # Rows where the predicted label matches the true label; boolean indexing returns
    # a new frame, so the input is left untouched.
    is_correct = val_prediction['prediction'] == val_prediction['label']
    correct_rows = val_prediction[is_correct]

    # One row per example, one column per stored neighbour distance.
    neighbour_distances = np.stack(correct_rows['k_distances'].values)

    # Last stored neighbour distance of every example, in ascending order.
    kth_distances = np.sort(neighbour_distances[:, -1])

    return calculate_distances_for_percentiles(kth_distances, percentiles)

### Function - calculate_thresholds_for_each_positiv_species_examples (on the largest distance, regardless of the neighbour's class)
- considers only the correctly predicted examples of a species, but neighbours of other species are still taken into account when the neighbours for classification are determined

1. Keep only the correctly classified examples of the validation dataset.
2. For each class, select all remaining examples of that class.
3. For each of them, take the k-th neighbour distance, i.e. the last entry of its `k_distances` list.
For example, if k = 5, this is the distance to the fifth of the five stored neighbours.
4. Sort these distances.
5. Read the threshold from the sorted list at the position given by the percentile (e.g. 95%).

In [34]:
def calculate_thresholds_for_each_positiv_species_examples( val_prediction:pd.DataFrame, percentiles: pd.Series, fallback_threshold: dict) -> dict:
    """Compute distance thresholds per species from its correctly classified validation examples.

    Every label that occurs in ``val_prediction['label']`` gets an entry. Species without
    a single correctly classified example receive ``fallback_threshold`` (previously such
    species were missing from the result, because grouping the already filtered frame
    never yields an empty group and the fallback branch could not be reached).

    Args:
        val_prediction: frame with the columns 'prediction', 'label' and 'k_distances'
            (one equally long sequence of neighbour distances per row).
        percentiles: fractions for which a threshold is selected.
        fallback_threshold: percentile -> threshold dict used for species without
            correctly classified examples. The same dict object is shared by all
            such species.

    Returns:
        dict mapping each label to a ``{percentile: threshold}`` dict.
    """
    # filter correctly classified examples (mask and frame come from the same object)
    correct_mask = val_prediction['prediction'] == val_prediction['label']
    correct_df = val_prediction[correct_mask]

    # Seed every species seen in the validation data with the fallback; species that
    # have correct predictions are overwritten below.
    known_labels = sorted(val_prediction['label'].dropna().unique())
    label_dict = {label: fallback_threshold for label in known_labels}

    # for each species with at least one correct prediction
    for label, group_df in correct_df.groupby('label'):
        if len(group_df) == 0:
            # Can only happen for unobserved categories of a categorical label column;
            # the fallback seeded above stays in place.
            label_dict[label] = fallback_threshold
            continue

        # last stored neighbour distance of every example, ascending
        distances = np.stack(group_df['k_distances'].values)
        last_distances = np.sort(distances[:, -1])

        # select percentile
        label_dict[label] = calculate_distances_for_percentiles(last_distances, percentiles)

    return label_dict

### Function - predict_ood_for_species (on the largest distance to an example of the predicted class)

In [35]:
# def get_last_label_distance(k_labels:pd.Series, k_distances:pd.Series, prediction:int) -> float:
#     for i in range(len(k_labels) - 1, -1, -1):  # search backwards
#         if k_labels[i] == prediction:
#             return k_distances[i]
#     return k_distances[-1]

In [36]:
# def calculate_thresholds_for_each_positiv_species_examples( val_prediction:pd.DataFrame, percentiles: pd.Series, fallback_threshold: dict) -> dict:

#     copy_df = val_prediction.copy()

#     copy_df['largest_distance'] = copy_df.apply(
#             lambda row: get_last_label_distance(
#                 row['k_image_labels'], 
#                 row['k_distances'], 
#                 row['prediction']
#             ), 
#             axis=1
#         ).values

#     # filter correct classified examples
#     copy_positiv_df = copy_df[copy_df['prediction'] == copy_df['label']]

#     # group df over species (label)
#     copy_positiv_grouped_df = copy_positiv_df.groupby('label')

#     # for each species
#     label_dict = {}
#     for label, group_df in copy_positiv_grouped_df:
#         if len(group_df) == 0:  # Safety check
#             label_dict[label] = fallback_threshold
#             continue

#         largest_distances = np.stack(group_df['largest_distance'].values)

#         # sort last_distances
#         largest_distances.sort()

#         # select Perzentil
#         percentile_dict = calculate_distances_for_percentiles(largest_distances, percentiles)
#         label_dict[label] = percentile_dict
        
#     return label_dict

### Function - calculate_thrasholds_in_species
- only the training distance matrix is used
- takes only the distances between examples of the same species; examples of other species are dropped, also when determining the neighbours

1. filter the distances so that only those between examples of one species (label) remain
2. take the k-th smallest distance of each example
3. sort the distances
4. determine the threshold with the percentile

In [37]:
def calculate_thrasholds_in_species( train_distance_matrix: np.ndarray, train_df: pd.DataFrame, labels: pd.Series, k_neighbours:int, percentiles: pd.Series ):
    """Compute intra-species distance thresholds from the training distance matrix.

    For every label only the distances between training examples of that label are
    kept. Per example the ``k_neighbours``-th smallest of these distances is taken,
    the resulting values are sorted and the thresholds are selected by percentile.

    Args:
        train_distance_matrix: 2-D distance matrix whose rows and columns are both
            indexed in the row order of ``train_df``.
        train_df: frame with a 'label' column, one row per training example.
        labels: labels for which thresholds are calculated.
        k_neighbours: rank of the neighbour distance taken per example.
        percentiles: fractions for which a threshold is selected.

    Returns:
        dict mapping each label to a ``{percentile: threshold}`` dict. Labels without
        any training example are skipped (with a warning).
    """
    import warnings  # local import: only needed for the edge-case warnings below

    label_dict = {}

    for label in labels:
        mask = (train_df['label'] == label).to_numpy()
        member_idx = np.flatnonzero(mask)
        num_members = member_idx.size

        if num_members == 0:
            # No distances exist, so no threshold can be derived for this label.
            warnings.warn(f"label {label}: no training examples, no threshold calculated")
            continue

        # np.partition requires kth < row length; with fewer than k examples fall back
        # to the largest available distance instead of raising.
        k_effective = min(k_neighbours, num_members)
        if k_effective < k_neighbours:
            warnings.warn(
                f"label {label}: only {num_members} training examples, "
                f"using k={k_effective} instead of k={k_neighbours}"
            )

        # np.ix_ selects the species sub-matrix in one step (no intermediate row slice).
        matrix_filtered = train_distance_matrix[np.ix_(member_idx, member_idx)]

        # NOTE(review): if the matrix holds train-vs-train distances, each row also
        # contains the example's distance to itself - confirm that this is intended.
        # np.partition does not sort; it only guarantees that column k_effective-1 holds
        # the value it would have in a fully sorted row.
        matrix_partitioned = np.partition(matrix_filtered, kth=k_effective - 1, axis=1)
        last_distances = matrix_partitioned[:, k_effective - 1]

        last_distances_sorted = np.sort(last_distances, axis=None)

        label_dict[label] = calculate_distances_for_percentiles(last_distances_sorted, percentiles)

    return label_dict

### Function - load_distance_matrix

In [38]:
def load_distance_matrix( distance_matrix_path:Path ) -> np.ndarray:
    """Load a distance matrix with ``np.load`` and return it in wide orientation.

    If the stored array has more rows than columns, its transpose is returned, so the
    result never has more rows than columns.

    Args:
        distance_matrix_path: location of the ``.npy`` file.

    Returns:
        The loaded (possibly transposed) array.
    """
    loaded = np.load( distance_matrix_path )
    num_rows, num_cols = loaded.shape[0], loaded.shape[1]

    # Transpose tall matrices; square and wide ones are returned unchanged.
    oriented = loaded.T if num_rows > num_cols else loaded

    print(f"distance matrix loaded: {distance_matrix_path}")
    return oriented

### Function - save_thresholds, load_thresholds

In [39]:
# Save results
def save_thresholds(file, save_path):
    """Pickle ``file`` (the threshold object) to ``save_path``.

    Errors while opening/writing the file or while pickling are not raised; they are
    reported with a printed message only, so the caller gets no failure signal.

    Args:
        file: object to serialize (here: a threshold dictionary).
        save_path: destination path of the pickle file.
    """
    try:
        with open(save_path, mode="wb") as out_handle:
            pickle.dump(file, out_handle)
        print(f"thresholds saved succesfully: {save_path} ")
    except (pickle.PickleError, OSError) as err:  # IOError is an alias of OSError
        print(f"error occure while save trasholds: {err}")

# Load results
def load_thresholds(load_path):
    """Unpickle and return the threshold object stored at ``load_path``.

    Errors while opening/reading the file or while unpickling are not raised; a message
    is printed and ``None`` is returned instead.

    Security note: ``pickle.load`` can execute arbitrary code, so only load files that
    were written by this project.

    Args:
        load_path: path of the pickle file.

    Returns:
        The unpickled object, or ``None`` if loading failed.
    """
    try:
        with open(load_path, mode="rb") as in_handle:
            loaded = pickle.load(in_handle)
        print("thresholds loaded succesfully")
        return loaded
    except (EOFError, pickle.PickleError, OSError) as err:  # IOError is an alias of OSError
        print(f"error occure while loaded trasholds: {err}")
    return None

# Apply

In [40]:
# Percentile grid 0.00, 0.01, ..., 1.00 (101 values). Rounding to two decimals removes
# floating point noise so the values can be used as clean dictionary keys.
percentiles = np.round(np.linspace(0.0, 1.0, num=101), 2)

### Load dfs and merge

In [41]:
# Number of neighbours of the kNN model. It selects the prediction file below and is
# passed to calculate_thrasholds_in_species, so both always refer to the same k.
k_neighbours = 13

# for calculate_thresholds_over_all_positiv_examples and calculate_thresholds_for_each_positiv_species_examples
# The list-valued columns are stored as text in the CSV and parsed back with ast.literal_eval.
val_prediction_df = pd.read_csv( prediction_dir_path / f'high_id_val_prediction_k_{k_neighbours}.csv', index_col=False, converters={"k_distances": ast.literal_eval, "k_image_labels": ast.literal_eval, "k_image_paths": ast.literal_eval})
label_map_df = pd.read_csv( df_source_dir / 'label_map_id.csv', usecols=['label', 'speciesKey'])


# for calculate_thrasholds_in_species
high_id_train_df = pd.read_csv( df_source_dir / 'high_id_train.csv', index_col=0, usecols=['speciesKey'])
label_map_id_df = pd.read_csv( df_source_dir / 'label_map_id.csv', index_col=0, usecols=['speciesKey','label'])

# Attach the label to every training row; the left merge keeps the row order of the training data.
high_id_train_df_label = (
    high_id_train_df
    .merge(label_map_id_df, how='left', on='speciesKey')
    .reset_index(drop=True)
)

# The boolean masks built from this frame are applied to the rows/columns of the training
# distance matrix, so the merge must not add rows (which happens when a speciesKey occurs
# more than once in the label map).
assert len(high_id_train_df_label) == len(high_id_train_df), "label map contains duplicate speciesKey entries"

### Apply - load_distance_matrix

In [42]:
# Load the precomputed training distance matrix used for the intra-species thresholds.
train_distance_matrix = load_distance_matrix( distanc_matrix_dir_path / 'distance_matrix_high_id_train.npy')

distance matrix loaded: /home/stud/jleick/masterArbeitProjekt/final_release/models/knn/resized/model/distance_matrix_high_id_train.npy


### Apply - calculate thresholds ...

In [43]:
# Global thresholds over all correctly classified validation examples.
thresholds_over_all = calculate_thresholds_over_all_positiv_examples( val_prediction_df, percentiles)
# Thresholds per species; the global thresholds are passed in as the fallback.
thresholds_for_each_species = calculate_thresholds_for_each_positiv_species_examples( val_prediction_df, percentiles, thresholds_over_all )
# Intra-species thresholds, calculated from the training distance matrix only.
thresholds_in_each_species = calculate_thrasholds_in_species(train_distance_matrix, high_id_train_df_label, label_map_id_df['label'], k_neighbours, percentiles)

### save results

In [44]:
# Write each threshold dictionary to its own pickle file in the result folder.
for pickle_name, threshold_dict in (
    ('thresholds_over_all_examples.pkl', thresholds_over_all),
    ('thresholds_for_each_species.pkl', thresholds_for_each_species),
    ('thresholds_in_each_species.pkl', thresholds_in_each_species),
):
    save_thresholds(threshold_dict, result_dir_path / pickle_name)

thresholds saved succesfully: /home/stud/jleick/masterArbeitProjekt/final_release/models/ood/thresholds/resized/thresholds_over_all_examples.pkl 
thresholds saved succesfully: /home/stud/jleick/masterArbeitProjekt/final_release/models/ood/thresholds/resized/thresholds_for_each_species.pkl 
thresholds saved succesfully: /home/stud/jleick/masterArbeitProjekt/final_release/models/ood/thresholds/resized/thresholds_in_each_species.pkl 
