# Notebook overview
Predicts labels for multiple test splits using a precomputed k-NN classifier, and saves detailed prediction outputs.

- Loads train/val/test metadata and label maps
- Loads the selected k from hyperparameter search and the saved k-NN classifier
- Runs predictions and kneighbors to retrieve neighbor distances, labels, and image paths
- Saves CSVs containing predictions, true labels, neighbor distances, neighbor labels, and neighbor image paths

The notebook was exported as a Python script and executed in a console inside a tmux session. The same notebook was used for both datasets (original and resized); only the paths need to be adapted.

# Preparation

### Import

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

import joblib
import gc

### Path - df_dir_path, database_dir_path, df_score_dir_path, result_dir_path

In [None]:
def _require_dir(dir_path_str: str) -> Path:
    """Return `dir_path_str` as a Path, failing fast if it is not an existing folder.

    Raises
    ------
    FileNotFoundError
        If the path does not exist or is not a directory.
    """
    dir_path = Path(dir_path_str)
    # is_dir() (not exists()) so that a regular file at this location is rejected too
    if not dir_path.is_dir():
        raise FileNotFoundError(f"Folder does not exist: {dir_path_str}")
    return dir_path


# folder with the dataset CSVs (splits and label maps)
DF_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created'
df_dir_path = _require_dir(DF_DIR_PATH)

# folder with the saved k-NN classifier and the distance matrices
DATABASE_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/model'
database_dir_path = _require_dir(DATABASE_DIR_PATH)

# folder with the hyperparameter-search score CSVs
DF_SCORE_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/scores'
df_score_dir_path = _require_dir(DF_SCORE_DIR_PATH)

# folder the prediction CSVs are written to
RESULT_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/prediction'
result_dir_path = _require_dir(RESULT_DIR_PATH)

### Load df - high_id_test_df, ...

In [4]:
def _read_columns(file_name: str, columns: list[str], index_col=False) -> pd.DataFrame:
    """Read only `columns` from the CSV `file_name` located in `df_dir_path`."""
    return pd.read_csv(df_dir_path / file_name, index_col=index_col, usecols=columns)


# high_* splits reference their images through the 'image_path' column
high_id_train_df = _read_columns('high_id_train.csv', ['speciesKey', 'image_path'])
high_id_val_df = _read_columns('high_id_val.csv', ['speciesKey', 'image_path'])

high_id_test_df = _read_columns('high_id_test.csv', ['speciesKey', 'image_path'])
high_ood_test_df = _read_columns('high_ood_test.csv', ['speciesKey', 'image_path'])

# low_* splits reference their images through the 'identifier' column instead
low_id_test_df = _read_columns('low_id_test.csv', ['speciesKey', 'identifier'])
low_ood_test_df = _read_columns('low_ood_test.csv', ['speciesKey', 'identifier'])

# label maps: index_col=0 turns the first selected column into the index
# (presumably 'speciesKey' - confirm against the column order in the CSV)
label_map_id_df = _read_columns('label_map_id.csv', ['speciesKey', 'label'], index_col=0)
label_map_ood_df = _read_columns('label_map_ood.csv', ['speciesKey', 'label'], index_col=0)

### Merge df - high_id_test_df, ...

In [5]:
def _attach_labels(split_df: pd.DataFrame, label_map_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the 'label' of `label_map_df` onto `split_df` via 'speciesKey'.

    validate='many_to_one' makes pandas raise a MergeError when a speciesKey
    occurs more than once in the label map. Without it such a duplicate would
    silently add rows to the result, and the positional neighbor lookup on the
    training frame would then point at the wrong rows.
    """
    return split_df.merge(label_map_df, how='left', on='speciesKey', validate='many_to_one')


high_id_train_df_label = _attach_labels(high_id_train_df, label_map_id_df)
high_id_val_df_label = _attach_labels(high_id_val_df, label_map_id_df)

high_id_test_df_label = _attach_labels(high_id_test_df, label_map_id_df)
high_ood_test_df_label = _attach_labels(high_ood_test_df, label_map_ood_df)

low_id_test_df_label = _attach_labels(low_id_test_df, label_map_id_df)
low_ood_test_df_label = _attach_labels(low_ood_test_df, label_map_ood_df)

print("df loaded")

df loaded


### Load knn - knn_classifier

In [6]:
# Observed load times: 7m 19s, 13m 33s.
# joblib.load unpickles the file, so only load classifier files from a trusted source.
knn_classifier_path = database_dir_path / 'knn_classifier.joblib'
knn_classifier = joblib.load(knn_classifier_path)

print("distance matrix train - database loaded")

KeyboardInterrupt: 

### Load hyperparameter - k

In [None]:
# scores of the hyperparameter search on the validation split
high_id_val_k_df = pd.read_csv(df_score_dir_path / 'high_id_val_hyperparam_k_scores.csv')

# row with the highest balanced accuracy
best_row_index = high_id_val_k_df["balanced_accuracy"].idxmax()
best_row = high_id_val_k_df.loc[best_row_index]

# Unpacking the row only works when the CSV has exactly two columns;
# the first one is used as k, the second as its balanced accuracy.
val_k, high_id_val_balance_accuracy = best_row
val_k = int(val_k)

print(f"hyperparameter k: {val_k} loaded, with balanced accuracy: {high_id_val_balance_accuracy}")

hyperparameter k: 10 loaded, with balanced accuracy: 0.8599350845947602


# Function

### Function - load_distance_matrix

In [None]:
def load_distance_matrix( distance_matrix_path:Path ) -> np.ndarray:
    """Load a distance matrix from a .npy file and return it with rows <= columns.

    The matrix is transposed whenever it has more rows than columns, so the
    shorter axis always ends up first. This is a heuristic: it only gives the
    intended orientation if the axis meant to be first really is the shorter
    one (presumably test samples vs. training samples - confirm).

    Parameters
    ----------
    distance_matrix_path : Path
        Location of the .npy file.

    Returns
    -------
    np.ndarray
        The loaded matrix, transposed if it had more rows than columns.
    """
    matrix = np.load( distance_matrix_path )

    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    oriented = matrix.T if n_rows > n_cols else matrix

    print(f"distance matrix loaded: {distance_matrix_path}")
    return oriented

### Function - predict_knn_with_neighbors

In [None]:
def predict_knn_with_neighbors( knn_classifier, k:int, distance_matrix_test: np.ndarray, train_df:pd.DataFrame, test_df:pd.DataFrame, col_name: str, train_col_name: str = 'image_path') -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Predict labels for one split and look up the k nearest training samples.

    Parameters
    ----------
    knn_classifier
        Fitted classifier exposing `n_neighbors`, `predict` and `kneighbors`
        (presumably a scikit-learn KNeighborsClassifier - confirm).
        Its `n_neighbors` attribute is overwritten with `k`.
    k : int
        Number of neighbors to use.
    distance_matrix_test : np.ndarray
        Input passed unchanged to `predict` and `kneighbors`.
    train_df : pd.DataFrame
        Training frame with a 'label' column and the column `train_col_name`.
        The neighbor indices are applied positionally, so its row order must
        match the samples the classifier was fitted on.
    test_df : pd.DataFrame
        Frame of the evaluated split with a 'label' column and the column `col_name`.
    col_name : str
        Column of `test_df` that identifies each sample
        (e.g. 'image_path' or 'identifier').
    train_col_name : str, default 'image_path'
        Column of `train_df` that identifies each training sample.

    Returns
    -------
    tuple
        (predictions, labels, image_paths, k_distances, k_image_labels, k_image_paths).
        The last two have the same shape as the index array returned by `kneighbors`.
    """
    print(f'predict k: {k} ', end='')
    # NOTE: this mutates the classifier object passed in by the caller
    knn_classifier.n_neighbors = k

    # predictions together with the ground truth of the evaluated split
    predictions = knn_classifier.predict( distance_matrix_test )
    labels = test_df['label'].values
    image_paths = test_df[col_name].values

    # nearest neighbors: map the positional indices back to training labels / paths
    k_distances, k_indices = knn_classifier.kneighbors( distance_matrix_test )
    k_image_labels = train_df['label'].values.take(k_indices)
    k_image_paths = train_df[train_col_name].values.take(k_indices)

    print(f'- finished')
    return predictions, labels, image_paths, k_distances, k_image_labels, k_image_paths

### Function - save_results

In [None]:
def save_results( predictions: np.array, labels: np.array, image_paths: np.array, k_distances: np.ndarray, k_image_labels: np.ndarray, k_image_paths: np.ndarray, save_path: Path):
    """Write predictions and neighbor details of one split to a CSV file.

    The CSV has one row per sample and the columns 'prediction', 'label',
    'image_path', 'k_distances', 'k_image_labels' and 'k_image_paths'.
    The three neighbor arrays are converted with `.tolist()`, so each of
    their cells holds the list of values belonging to that sample.

    Parameters
    ----------
    predictions, labels, image_paths
        Per-sample values; stored as given.
    k_distances, k_image_labels, k_image_paths
        Neighbor arrays (presumably of shape (n_samples, k) - confirm).
    save_path : Path
        Target CSV file; written without the index column.
    """
    # per-sample columns
    columns = {
        "prediction": predictions,
        "label": labels,
        "image_path": image_paths,
    }

    # neighbor columns: one Python list per sample
    neighbor_arrays = {
        "k_distances": k_distances,
        "k_image_labels": k_image_labels,
        "k_image_paths": k_image_paths,
    }
    for column_name, neighbor_array in neighbor_arrays.items():
        columns[column_name] = neighbor_array.tolist()

    pd.DataFrame(columns).to_csv( save_path, index=False)

    print(f"results saved at: {save_path}")

### Function - run_knn

In [None]:
def run_knn( distance_matrix_path:Path, knn_classifier, k:int, train_df:pd.DataFrame, test_df:pd.DataFrame, col_name: str, save_path: Path):
    """Run the full prediction pipeline for one split.

    Loads the distance matrix from `distance_matrix_path`, predicts with
    `knn_classifier` using `k` neighbors, and writes the predictions together
    with the neighbor details to `save_path`.

    Parameters
    ----------
    distance_matrix_path : Path
        .npy file read by `load_distance_matrix`.
    knn_classifier
        Classifier forwarded to `predict_knn_with_neighbors`.
    k : int
        Number of neighbors.
    train_df, test_df : pd.DataFrame
        Labeled training frame and labeled frame of the evaluated split.
    col_name : str
        Column of `test_df` that identifies each sample.
    save_path : Path
        Target CSV file.
    """
    test_distances = load_distance_matrix( distance_matrix_path )

    # the six returned arrays are forwarded to save_results in the same order
    prediction_outputs = predict_knn_with_neighbors( knn_classifier, k, test_distances, train_df, test_df, col_name)
    save_results( *prediction_outputs, save_path)

    # drop the local reference to the matrix and trigger a collection before the next split
    del test_distances
    gc.collect()

# Apply - run_knn

### High - id val

In [None]:
# high-quality in-distribution validation split
run_knn(
    distance_matrix_path=database_dir_path / 'distance_matrix_high_id_val.npy',
    knn_classifier=knn_classifier,
    k=val_k,
    train_df=high_id_train_df_label,
    test_df=high_id_val_df_label,
    col_name='image_path',
    save_path=result_dir_path / f"high_id_val_prediction_k_{val_k}.csv",
)

distance matrix loaded: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/model/distance_matrix_high_id_val.npy
predict k: 10 - finished
results saved at: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/prediction/high_id_val_prediction_k_10.csv


### High - id test

In [None]:
# high-quality in-distribution test split
run_knn(
    distance_matrix_path=database_dir_path / 'distance_matrix_high_id_test.npy',
    knn_classifier=knn_classifier,
    k=val_k,
    train_df=high_id_train_df_label,
    test_df=high_id_test_df_label,
    col_name='image_path',
    save_path=result_dir_path / f"high_id_test_prediction_k_{val_k}.csv",
)

distance matrix loaded: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/model/distance_matrix_high_id_test.npy
predict k: 10 - finished
results saved at: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/prediction/high_id_test_prediction_k_10.csv


### High - ood test

In [None]:
# high-quality out-of-distribution test split (labels come from the OOD label map)
run_knn(
    distance_matrix_path=database_dir_path / 'distance_matrix_high_ood_test.npy',
    knn_classifier=knn_classifier,
    k=val_k,
    train_df=high_id_train_df_label,
    test_df=high_ood_test_df_label,
    col_name='image_path',
    save_path=result_dir_path / f"high_ood_test_prediction_k_{val_k}.csv",
)

distance matrix loaded: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/model/distance_matrix_high_ood_test.npy
predict k: 10 - finished
results saved at: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/prediction/high_ood_test_prediction_k_10.csv


### Low - id test

In [None]:
# low-quality in-distribution test split; samples are identified by 'identifier'
run_knn(
    distance_matrix_path=database_dir_path / 'distance_matrix_low_id_test.npy',
    knn_classifier=knn_classifier,
    k=val_k,
    train_df=high_id_train_df_label,
    test_df=low_id_test_df_label,
    col_name='identifier',
    save_path=result_dir_path / f"low_id_test_prediction_k_{val_k}.csv",
)

distance matrix loaded: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/model/distance_matrix_low_id_test.npy
predict k: 10 - finished
results saved at: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/prediction/low_id_test_prediction_k_10.csv


### Low - ood test

In [None]:
# low-quality out-of-distribution test split; samples are identified by 'identifier'
run_knn(
    distance_matrix_path=database_dir_path / 'distance_matrix_low_ood_test.npy',
    knn_classifier=knn_classifier,
    k=val_k,
    train_df=high_id_train_df_label,
    test_df=low_ood_test_df_label,
    col_name='identifier',
    save_path=result_dir_path / f"low_ood_test_prediction_k_{val_k}.csv",
)

distance matrix loaded: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/model/distance_matrix_low_ood_test.npy
predict k: 10 - finished
results saved at: /home/jleick/masterArbeitProjekt/final_release/models/knn/origin/prediction/low_ood_test_prediction_k_10.csv
