# Notebook overview

Predicts Out-of-Distribution (OOD) status for all test sets by applying various thresholds to k-NN prediction results.

- Loads prediction data: Imports results containing k-distances and predicted labels.
- Threshold Application for OOD Classification: Applies three distinct OOD threshold strategies over_alle_species, for_each_species, and in_each_species to classify examples as "In-Distribution" or "Out-of-Distribution" based on their distance.
- Class-specific thresholds are derived based on predictions from different classifiers (MLP and k-NN).
- Saves the updated prediction DataFrames, including OOD flags.

The notebook was used for both datasets(original and resized) just adapte the paths.

# Preperation

### Import

In [66]:
import pandas as pd
import numpy as np
from pathlib import Path

import ast
import pickle

### Path - prediction_dir_path, result_dir_path

In [67]:
### The paths for the original and resized data must be adjusted for calculation of original or resized ood Prediction (Replace “origin” with “resized” and vice versa).

### prediction folder to load df (includes distances to predict ood)
PREDICTION_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/models/knn/resized/prediction'
prediction_dir_path = Path(PREDICTION_DIR_PATH)
if not prediction_dir_path.exists():
    raise FileNotFoundError(f"Folder does not exist: {PREDICTION_DIR_PATH}")

### prediction folder to load df
MLP_PREDICTION_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/models/mlp/resized/prediction'
mlp_prediction_dir_path = Path(MLP_PREDICTION_DIR_PATH)
if not mlp_prediction_dir_path.exists():
    raise FileNotFoundError(f"Folder does not exist: {MLP_PREDICTION_DIR_PATH}")

### threshold Folder to load df
THRESHOLD_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/models/ood/thresholds/resized'
threshold_dir_path = Path(THRESHOLD_DIR_PATH)
if not threshold_dir_path.exists():
    raise FileNotFoundError(f"Folder does not exist: {THRESHOLD_DIR_PATH}")

### Folder to save results
RESULT_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/models/ood/predictions/resized'
result_dir_path = Path(RESULT_DIR_PATH)
if not result_dir_path.exists():
    raise FileNotFoundError(f"Folder does not exist: {RESULT_DIR_PATH}")

### Load dfs

In [None]:
### The paths must be adjusted for high and low datasets to k_13.csv and k_10.csv, respectively.

# for ood prediction and for knn species prediction for ood prediction on species level
high_id_test_prediction_df = pd.read_csv( prediction_dir_path / 'high_id_test_prediction_k_13.csv', index_col=False, converters={"k_distances": ast.literal_eval})
high_ood_test_prediction_df = pd.read_csv( prediction_dir_path / 'high_ood_test_prediction_k_13.csv', index_col=False, converters={"k_distances": ast.literal_eval})

low_id_test_prediction_df = pd.read_csv( prediction_dir_path / 'low_id_test_prediction_k_13.csv', index_col=False, converters={"k_distances": ast.literal_eval})
low_ood_test_prediction_df = pd.read_csv( prediction_dir_path / 'low_ood_test_prediction_k_13.csv', index_col=False, converters={"k_distances": ast.literal_eval})

# for mlp species prediction for ood prediction on species level
mlp_high_id_test_prediction_df = pd.read_csv( mlp_prediction_dir_path / 'high_id_test_prediction.csv', index_col=False, converters={"k_distances": ast.literal_eval})
mlp_high_ood_test_prediction_df = pd.read_csv( mlp_prediction_dir_path / 'high_ood_test_prediction.csv', index_col=False, converters={"k_distances": ast.literal_eval})

mlp_low_id_test_prediction_df = pd.read_csv( mlp_prediction_dir_path / 'low_id_test_prediction.csv', index_col=False, converters={"k_distances": ast.literal_eval})
mlp_low_ood_test_prediction_df = pd.read_csv( mlp_prediction_dir_path / 'low_ood_test_prediction.csv', index_col=False, converters={"k_distances": ast.literal_eval})

# Functions

### Function - merge_knn_to_df

In [69]:
def merge_knn_to_df( main_df, to_merge_df):
    prepare_to_merge_df = to_merge_df.drop(['label','prediction'], axis=1)
    merged_df = main_df.merge(prepare_to_merge_df, how='left', on='image_path')
    return merged_df

### Functions - save_thresholds, load_thresholds

In [70]:
# Save resultes
def save_thresholds(file, save_path):
    try:
        with open(save_path, "wb") as f:
            pickle.dump(file, f)
        print(f"thresholds saved succesfully: {save_path} ")
    except (IOError, pickle.PickleError) as e:
        print(f"error occure while save trasholds: {e}")

# open resultes
def load_thresholds(load_path):
    try:
        with open(load_path, "rb") as f:
            threshold_dict = pickle.load(f)
        print("thresholds loaded succesfully")
        return threshold_dict
    except (IOError, pickle.PickleError, EOFError) as e:
        print(f"error occure while loaded trasholds: {e}")
        return None

### Functions - predict_ood

In [71]:
def predict_ood( df:pd.DataFrame, thresholds: dict) -> pd.DataFrame:
    df_copied = df.copy()

    distances = np.stack(df_copied['k_distances'].values)
    last_distances = distances[:,-1]

    for percentile, threshold in thresholds.items():
        is_ood = last_distances > threshold 
        df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
    
    return df_copied

### Function - get_thresholds_of_percentile

In [72]:
def get_thresholds_of_percentile(thresholds_dict:dict, percentile: float):
    result_dict = {}

    for label, thresholds in thresholds_dict.items():
        result_dict[label] = thresholds[percentile]
    
    return result_dict

### Function - predict_ood_for_species (on largest distance anyway which class)

In [None]:
def predict_ood_for_species(df:pd.DataFrame, thresholds_dict: dict, percentiles:float):
    df_copy = df.copy()

    for percentile in percentiles:
        distances = df_copy['k_distances']
        last_distances = np.stack( distances.values )[:,-1]

        threshold_percentile_dict = get_thresholds_of_percentile( thresholds_dict, percentile )
        thresholds = df_copy['prediction'].map( threshold_percentile_dict )

        is_ood = last_distances > thresholds
        df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
    
    return df_copy

### Function - predict_ood_for_species (on largest distance on exampel of predicted class)

In [74]:
# def get_last_label_distance(k_labels:pd.Series, k_distances:pd.Series, prediction:int) -> float:
#     for i in range(len(k_labels) - 1, -1, -1):  # rückwärts suchen
#         if k_labels[i] == prediction:
#             return k_distances[i]
#     return k_distances[-1]

In [75]:
# def predict_ood_for_species(df: pd.DataFrame, thresholds_dict: dict, percentiles: list):
#     df_copy = df.copy()

#     last_distances = df_copy.apply(
#         lambda row: get_last_label_distance(
#             row['k_image_labels'], 
#             row['k_distances'], 
#             row['prediction']
#         ), 
#         axis=1
#     ).values
    
#     for percentile in percentiles:
#         threshold_percentile_dict = get_thresholds_of_percentile(thresholds_dict, percentile)
#         thresholds = df_copy['prediction'].map( threshold_percentile_dict ).values

#         is_ood = last_distances > thresholds
#         df_copy[f'ood_{percentile}'] = np.where(is_ood, -1, df_copy['prediction'])
    
#     return df_copy

### Function - run_prediction_over_all_exampels_thresholds

In [76]:
def run_prediction_over_all_exampels_thresholds( threshold_load_path: Path, prediction_save_path: Path, high_id_test_prediction: pd.DataFrame, high_ood_test_prediction: pd.DataFrame, low_id_test_prediction: pd.DataFrame, low_ood_test_prediction: pd.DataFrame):
    thresholds_dict = load_thresholds( threshold_load_path )

    # predict ood
    pred_ood_for_high_id_test = predict_ood( high_id_test_prediction, thresholds_dict )
    pred_ood_for_high_ood_test = predict_ood( high_ood_test_prediction, thresholds_dict )
    
    pred_ood_for_low_id_test = predict_ood( low_id_test_prediction, thresholds_dict )
    pred_ood_for_low_ood_test = predict_ood( low_ood_test_prediction, thresholds_dict )

    # save results
    pred_ood_for_high_id_test.to_csv( prediction_save_path / 'high_id_test_prediction_ood.csv', index=False)
    pred_ood_for_high_ood_test.to_csv( prediction_save_path / 'high_ood_test_prediction_ood.csv', index=False)

    pred_ood_for_low_id_test.to_csv( prediction_save_path / 'low_id_test_prediction_ood.csv', index=False)
    pred_ood_for_low_ood_test.to_csv( prediction_save_path / 'low_ood_test_prediction_ood.csv', index=False)

### Function - run_prediction_for_species_thresholds

In [77]:


def run_prediction_for_species_thresholds( threshold_load_path: Path, prediction_save_path: Path, high_id_test_prediction: pd.DataFrame, high_ood_test_prediction: pd.DataFrame, low_id_test_prediction: pd.DataFrame, low_ood_test_prediction: pd.DataFrame, percentiles: np.array):
    thresholds_dict = load_thresholds( threshold_load_path )

    # predict ood
    pred_ood_for_high_id_test = predict_ood_for_species(high_id_test_prediction, thresholds_dict, percentiles)
    pred_ood_for_high_ood_test = predict_ood_for_species(high_ood_test_prediction, thresholds_dict, percentiles)

    pred_ood_for_low_id_test = predict_ood_for_species(low_id_test_prediction, thresholds_dict, percentiles)
    pred_ood_for_low_ood_test = predict_ood_for_species(low_ood_test_prediction, thresholds_dict, percentiles)

    # save results
    pred_ood_for_high_id_test.to_csv( prediction_save_path / 'high_id_test_prediction_ood.csv', index=False)
    pred_ood_for_high_ood_test.to_csv( prediction_save_path / 'high_ood_test_prediction_ood.csv', index=False)

    pred_ood_for_low_id_test.to_csv( prediction_save_path / 'low_id_test_prediction_ood.csv', index=False)
    pred_ood_for_low_ood_test.to_csv( prediction_save_path / 'low_ood_test_prediction_ood.csv', index=False)

    

# Apply

# Apply - over_all

In [78]:
run_prediction_over_all_exampels_thresholds(
    threshold_dir_path / 'thresholds_over_all_examples.pkl',
    result_dir_path / 'over_all_examples',
    high_id_test_prediction_df,
    high_ood_test_prediction_df,
    low_id_test_prediction_df,
    low_ood_test_prediction_df
)

thresholds loaded succesfully


  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])
  df_copied[f'ood_{percentile}'] = np.where( is_ood, -1, df_copied['prediction'])


# knn

In [79]:
percentiles = np.arange(0.0,1.001,0.01)
percentiles = np.round(percentiles, 2)

### Apply - for_each_species

In [80]:
run_prediction_for_species_thresholds(
    threshold_dir_path / 'thresholds_for_each_species.pkl',
    result_dir_path / 'knn/for_each_species',
    high_id_test_prediction_df,
    high_ood_test_prediction_df,
    low_id_test_prediction_df,
    low_ood_test_prediction_df,
    percentiles
)

thresholds loaded succesfully


  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )


### Apply - in_each_species

In [81]:
run_prediction_for_species_thresholds(
    threshold_dir_path / 'thresholds_in_each_species.pkl',
    result_dir_path / 'knn/in_each_species',
    high_id_test_prediction_df,
    high_ood_test_prediction_df,
    low_id_test_prediction_df,
    low_ood_test_prediction_df,
    percentiles
)

thresholds loaded succesfully


  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )


# mlp

### merge - mlp_df

In [82]:
mlp_high_id_test_prediction_df_merged = merge_knn_to_df(mlp_high_id_test_prediction_df, high_id_test_prediction_df)
mlp_high_ood_test_prediction_df_merged = merge_knn_to_df(mlp_high_ood_test_prediction_df, high_ood_test_prediction_df)

mlp_low_id_test_prediction_df_merged = merge_knn_to_df(mlp_low_id_test_prediction_df, low_id_test_prediction_df)
mlp_low_ood_test_prediction_df_merged = merge_knn_to_df(mlp_low_ood_test_prediction_df, low_ood_test_prediction_df)

### Apply - for_each_species

In [83]:
run_prediction_for_species_thresholds(
    threshold_dir_path / 'thresholds_for_each_species.pkl',
    result_dir_path / 'mlp/for_each_species',
    mlp_high_id_test_prediction_df_merged,
    mlp_high_ood_test_prediction_df_merged,
    mlp_low_id_test_prediction_df_merged,
    mlp_low_ood_test_prediction_df_merged,
    percentiles
)

thresholds loaded succesfully


  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )


### Apply - in_each_species

In [84]:
run_prediction_for_species_thresholds(
    threshold_dir_path / 'thresholds_in_each_species.pkl',
    result_dir_path / 'mlp/in_each_species',
    mlp_high_id_test_prediction_df_merged,
    mlp_high_ood_test_prediction_df_merged,
    mlp_low_id_test_prediction_df_merged,
    mlp_low_ood_test_prediction_df_merged,
    percentiles
)

thresholds loaded succesfully


  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
  df_copy[f'ood_{percentile}'] = np.where( is_ood, -1, df_copy['prediction'] )
