# Notebook overview
Searches for the best k hyperparameter for a precomputed k-NN classifier by evaluating balanced accuracy on a validation set.

- Loads validation labels, a saved k-NN classifier, and precomputed distance matrices
- Iterates over a range of k values, predicts with the classifier, and computes balanced accuracy for each k
- Records prediction durations and saves results (CSV + calculation time)

The notebook was exported as a Python script and executed in a console inside a tmux session. The same notebook was used for both datasets; only the paths have to be adapted.

# Preparation

### Imports

In [None]:
from datetime import datetime, timedelta
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn import metrics

### Path - df_dir_path, database_dir_path, result_dir_path

In [None]:
# Every location below must be an existing directory. The checks use
# is_dir() rather than exists() so that a regular file sitting at one of
# these paths is rejected here instead of failing later during loading.

# df folder
DF_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created'
df_dir_path = Path(DF_DIR_PATH)
if not df_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {DF_DIR_PATH}")

# knn_classifier, distance_matrix folder
DATABASE_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/model'
database_dir_path = Path(DATABASE_DIR_PATH)
if not database_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {DATABASE_DIR_PATH}")

# folder to save results
RESULT_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/scores'
result_dir_path = Path(RESULT_DIR_PATH)
if not result_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {RESULT_DIR_PATH}")

### Load df - high_id_val_df, label_map_id_df

In [None]:
# Validation samples: only the species key column is read.
high_id_val_df = pd.read_csv(
    df_dir_path / 'high_id_val.csv',
    index_col=False,
    usecols=['speciesKey'],
)

# Species key <-> label lookup table; the first of the two selected columns
# becomes the index (index_col=0).
label_map_id_df = pd.read_csv(
    df_dir_path / 'label_map_id.csv',
    index_col=0,
    usecols=['speciesKey', 'label'],
)

### Merge df - high_id_val_df_label

In [None]:
# Attach the label to every validation row. The left join keeps the row order
# of high_id_val_df. validate="many_to_one" makes pandas raise a MergeError if
# a speciesKey occurs more than once in the label map, which would otherwise
# silently duplicate validation rows.
high_id_val_df_label = high_id_val_df.merge(
    label_map_id_df,
    how='left',
    on='speciesKey',
    validate='many_to_one',
)

# A left join yields NaN for species keys missing from the label map; such
# rows cannot be scored, so stop before the expensive steps start.
if high_id_val_df_label['label'].isna().any():
    raise ValueError("Some validation rows have no label in label_map_id.csv")

print("df loaded")

df loaded


### Load knn - knn_classifier

In [5]:
# Deserialize the saved k-NN classifier from the model folder.
# Observed load time noted by the author: 7m s19
knn_classifier = joblib.load(
    database_dir_path / 'knn_classifier.joblib'
)

print("distance matrix train - database loaded")

distance matrix train - database loaded


# Function

### Function - load_distance_matrix

In [6]:
def load_distance_matrix( distance_matrix_path:Path ) -> np.ndarray:
    """Load a distance matrix from a ``.npy`` file and orient it.

    When the first axis of the stored array is longer than the second one,
    the transposed array is returned; otherwise the array is returned as
    loaded. A confirmation line with the path is printed in both cases.

    Parameters
    ----------
    distance_matrix_path : Path
        Location of the ``.npy`` file to read.

    Returns
    -------
    np.ndarray
        The loaded (and possibly transposed) matrix.
    """
    loaded = np.load(distance_matrix_path)

    first_axis_len = loaded.shape[0]
    second_axis_len = loaded.shape[1]
    # Keep the shorter of the two leading axes first.
    oriented = loaded.T if first_axis_len > second_axis_len else loaded

    print(f"distance matrix loaded: {distance_matrix_path}")
    return oriented

### Function - predict_balance_accuracy_score_over_K

In [None]:
def predict_balance_accuracy_score_over_K(knn_classifier, distance_matrix_test: np.ndarray, df:pd.DataFrame, k_range=None) -> tuple[dict[int,float], timedelta]:
    """Compute the balanced accuracy of ``knn_classifier`` for a range of k.

    For every k the classifier's ``n_neighbors`` attribute is set to k, the
    rows of ``distance_matrix_test`` are classified and the predictions are
    scored against ``df['label']`` with
    ``sklearn.metrics.balanced_accuracy_score``. Progress and the elapsed
    time are printed after each k.

    Parameters
    ----------
    knn_classifier
        Object with a writable ``n_neighbors`` attribute and a ``predict``
        method. Presumably a fitted scikit-learn k-NN classifier using
        precomputed distances - confirm against the training notebook.
        Its original ``n_neighbors`` value is restored before returning.
    distance_matrix_test : np.ndarray
        Distances passed to ``predict``; its first axis must have one entry
        per row of ``df``.
    df : pd.DataFrame
        Frame with a ``label`` column holding the true labels, in the same
        order as the rows of ``distance_matrix_test``.
    k_range : iterable of int, optional
        The k values to evaluate. Defaults to 1..528 (inclusive).

    Returns
    -------
    tuple[dict[int, float], timedelta]
        Mapping k -> balanced accuracy, and the total elapsed time.

    Raises
    ------
    ValueError
        If the number of matrix rows differs from the number of rows in
        ``df``.
    KeyError
        If ``df`` has no ``label`` column.
    """
    start_time = datetime.now()

    if k_range is None:
        # Original note: the species with the lowest number of examples in
        # the train dataset has 529 examples.
        # NOTE(review): the range stops at 528; confirm whether k=529 should
        # be evaluated as well.
        k_range = range(1, 529)

    # Fail before the first (expensive) prediction instead of after it.
    true_labels = df['label']
    n_queries = distance_matrix_test.shape[0]
    if n_queries != len(true_labels):
        raise ValueError(
            f"distance_matrix_test has {n_queries} rows but df has {len(true_labels)} rows"
        )

    scores_balanced_accuracy = {}
    # Remember the caller's setting so the shared classifier is not left
    # modified once the search is over (or if it fails midway).
    initial_n_neighbors = knn_classifier.n_neighbors
    try:
        for k in k_range:
            k = int(k)  # plain int keys, also when k_range is a NumPy array
            # flush so the progress shows up before the long predict call
            print(f'predict k: {k} ', end='', flush=True)
            knn_classifier.n_neighbors = k
            predictions = knn_classifier.predict(distance_matrix_test)

            score_ba = metrics.balanced_accuracy_score(true_labels, predictions)
            scores_balanced_accuracy[k] = score_ba
            print(f'> balanced accuracy score: {score_ba} ')

            print(f'> in time: {datetime.now() - start_time}')
    finally:
        knn_classifier.n_neighbors = initial_n_neighbors

    duration_time = datetime.now() - start_time

    return scores_balanced_accuracy, duration_time

# Possible optimization: sort the whole distance_matrix_test once and take the first k elements for each k; this avoids recomputing the prediction for every k - but pay attention to edge cases (example: a 2 species vs. 2 species tie in the prediction)

### Function - save_results

In [8]:
def save_results( scores_balanced_accuracy: dict, duration_time: timedelta, save_path: Path):
    """Write the per-k scores and the calculation time next to ``save_path``.

    Two files are created in ``save_path.parent``:

    * ``save_path`` with its suffix replaced by ``.csv`` - columns ``k`` and
      ``balanced_accuracy``, one row per entry of
      ``scores_balanced_accuracy``.
    * ``<save_path.name>_calculation_time.txt`` - ``str(duration_time)``.

    Parameters
    ----------
    scores_balanced_accuracy : dict
        Mapping k -> balanced accuracy.
    duration_time : timedelta
        Elapsed calculation time; stored via ``str()``.
    save_path : Path
        Target path without suffix; its parent directory must already exist.
    """
    results_df = pd.DataFrame({
        "k": list(scores_balanced_accuracy.keys()),
        "balanced_accuracy": list(scores_balanced_accuracy.values()),
    })

    results_df.to_csv(save_path.with_suffix(".csv"), index=False)

    # Save calculation time
    time_file_path = save_path.parent / f"{save_path.name}_calculation_time.txt"
    time_file_path.write_text(str(duration_time), encoding="utf-8")

    # Report the directory actually written to, not a module-level global.
    print(f"results saved at: {save_path.parent} / ...")

# Apply

### Apply - predict_balance_accuracy_score_over_K

In [None]:
# Step 1 - read the precomputed distance matrix of the validation set
high_id_val_distance_matrix = load_distance_matrix(
    database_dir_path / 'distance_matrix_high_id_val.npy'
)

# Step 2 - score the classifier for every k and measure the total duration
(
    high_id_val_scores_balanced_accuracy,
    high_id_val_duration_time,
) = predict_balance_accuracy_score_over_K(
    knn_classifier,
    high_id_val_distance_matrix,
    high_id_val_df_label,
)

# Step 3 - write the scores (CSV) and the calculation time (TXT)
save_results(
    high_id_val_scores_balanced_accuracy,
    high_id_val_duration_time,
    result_dir_path / "high_id_val_hyperparam_k_scores",  # no suffix needed
)