# Notebook overview
Builds a k-NN classifier using precomputed cosine distance matrices of image embeddings and saves the trained model and metadata.

- Loads training labels and a precomputed cosine distance matrix
- Fits sklearn KNeighborsClassifier(metric='precomputed') on training labels
- Saves the trained classifier (joblib) and build parameters (JSON)
- Records build duration and dataset statistics

In [1]:
# https://www.geeksforgeeks.org/ml-implementation-of-knn-classifier-using-sklearn/

# Preparation

### imports

In [2]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset

from pathlib import Path
from datetime import datetime

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.neighbors import KNeighborsClassifier

import joblib
import json

### Load Paths - df_dir_path, distance_matrix_dir_path, result_dir_path

In [3]:
def _require_dir(raw_path: str) -> Path:
    """Return ``raw_path`` as a ``Path``, failing fast if it is not an existing folder.

    Parameters
    ----------
    raw_path : str
        Folder location as a string.

    Returns
    -------
    Path
        The same location wrapped in a ``Path``.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``raw_path`` or it is not a directory.
    """
    folder = Path(raw_path)
    # is_dir() (rather than exists()) also rejects a regular file at this
    # location; each of these paths is used as a parent folder further down.
    if not folder.is_dir():
        raise FileNotFoundError(f"Folder does not exist: {raw_path}")
    return folder


# Folder holding the dataset CSV files
DF_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created'
df_dir_path = _require_dir(DF_DIR_PATH)

# Folder holding the calculated distance matrix
DISTANCE_MATRIX_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/model'
distance_matrix_dir_path = _require_dir(DISTANCE_MATRIX_DIR_PATH)

# Folder to save results
RESULT_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/knn/resized/model'
result_dir_path = _require_dir(RESULT_DIR_PATH)

### Load df - train_high_df, label_map_id_df

In [4]:
# Training split: only the species key of each sample is read.
train_high_df = pd.read_csv(
    df_dir_path / 'high_id_train.csv',
    usecols=['speciesKey'],
    index_col=False,
)

# Lookup table between species key and label.
# NOTE(review): with usecols set, index_col=0 picks the first of the selected
# columns in file order - presumably speciesKey; verify against the CSV header.
label_map_id_df = pd.read_csv(
    df_dir_path / 'label_map_id.csv',
    usecols=['speciesKey', 'label'],
    index_col=0,
)

### merge df - train_high_df_label

In [5]:
# Attach a label to every training sample via its species key.
train_high_df_label = train_high_df.merge(label_map_id_df, how='left', on='speciesKey')

# The label column is later handed to fit() next to the distance matrix and is
# matched to its rows purely by position, so the merge must neither add rows
# nor leave samples unlabeled. Fail here, before the slow matrix load.
if len(train_high_df_label) != len(train_high_df):
    # A left merge only grows when a key occurs more than once on the right.
    raise ValueError(
        "Merging labels changed the number of rows "
        f"({len(train_high_df)} -> {len(train_high_df_label)}); "
        "label_map_id.csv contains duplicate speciesKey entries"
    )

missing_label_count = int(train_high_df_label['label'].isna().sum())
if missing_label_count:
    # Keys absent from the label map come out of a left merge as NaN.
    raise ValueError(
        f"{missing_label_count} training rows have no label; "
        "their speciesKey is missing from label_map_id.csv"
    )

print("datasets loaded")

datasets loaded


### Load - distance_matrix

In [6]:
# Read the stored distance matrix from disk (load time about 6 minutes).
distance_matrix_file = distance_matrix_dir_path / 'distance_matrix_high_id_train.npy'
distance_matrix = np.load(distance_matrix_file)

# Build Database

### fit knn

In [7]:
# Show the matrix dimensions as a sanity check before fitting; with
# metric='precomputed' the classifier expects a square (n_samples, n_samples) input.
distance_matrix.shape

(85803, 85803)

In [8]:
# API reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier

fit_started_at = datetime.now()

# metric='precomputed': fit() is given the distance matrix itself instead of
# feature vectors; every other hyperparameter keeps its library default.
knn_classifier = KNeighborsClassifier(metric='precomputed')
knn_classifier.fit(distance_matrix, train_high_df_label['label'])

# Wall-clock time for constructing and fitting the classifier.
build_duration = datetime.now() - fit_started_at
print(f'knn database builded in time: {build_duration}' )

knn database builded in time: 0:00:04.765332


### Save - model

In [9]:
# Persist the fitted classifier, gzip-compressed at level 3 (save time about 12m 30s).
# Alternative compressor noted by the author: compress=('lz4', 3).
# Reference: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html
# To restore later: joblib.load(result_dir_path / 'knn_classifier.joblib')
model_file = result_dir_path / 'knn_classifier.joblib'
joblib.dump(knn_classifier, model_file, compress=('gzip', 3))

print("database/model saved")

database/model saved


### Define save - parameter

In [10]:
# Build metadata stored next to the model so the run can be traced later.
params = dict(
    model_name='knn_AMI-fine-grained',
    metric='cosine',
    fit_time=str(build_duration),  # formatted like 0:00:04.655802
    data_path=str(df_dir_path / 'high_id_train.csv'),
    number_of_examples=len(train_high_df),
    embeddings_created_on='vit_small_patch14_dinov2.lvd142m',
    embeddings_normalised='l2_norm',
)

# Serialize first, then write the finished text in a single call.
params_file = result_dir_path / "knn_classifier_build_parameters.json"
with params_file.open("w") as out_file:
    out_file.write(json.dumps(params, indent=2))