# Notebook overview
Runs a trained MLP on precomputed image embeddings to predict species labels for multiple test splits and saves results.

- Loads test metadata (high/low, ID and OOD) and label maps
- Loads resized embeddings directories and MLP model
- Creates DataLoaders that load .pt embeddings and runs model predictions
- Saves prediction CSVs containing true labels, predicted labels, and image paths

The notebook was exported as a Python script and executed in a console inside a tmux session. The same notebook was used for both datasets; only the paths were adapted.

# Preparation

### Imports

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

### Path - df_source_dir

In [None]:
def _require_dir(raw_path: str) -> Path:
    """Return ``raw_path`` as a ``Path``; raise ``FileNotFoundError`` if it does not exist."""
    dir_path = Path(raw_path)
    if not dir_path.exists():
        raise FileNotFoundError(f"Folder does not exist: {raw_path}")
    return dir_path


# df folder
DF_SOURCE_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created'
df_source_dir = _require_dir(DF_SOURCE_PATH)

# Embeddings folder
# high
EMBEDDING_HIGH_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/embeddings/adapted/resized/high'
embedding_high_dir_path = _require_dir(EMBEDDING_HIGH_DIR_PATH)
# low
EMBEDDING_LOW_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/embeddings/adapted/resized/low'
embedding_low_dir_path = _require_dir(EMBEDDING_LOW_DIR_PATH)

# Result folder (must already exist; it is not created here)
RESULT_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/mlp/resized/predicted_test_datasets'
result_dir_path = _require_dir(RESULT_DIR_PATH)

### load dfs

In [4]:
# Row cap applied to every test split for quick runs; set to None to read the complete files.
N_ROWS = 100

# The 'high' splits reference their samples through the 'image_path' column.
high_id_test = pd.read_csv(df_source_dir / 'high_id_test.csv', usecols=['image_path', 'speciesKey'], nrows=N_ROWS)
high_ood_test = pd.read_csv(df_source_dir / 'high_ood_test.csv', usecols=['image_path', 'speciesKey'], nrows=N_ROWS)

# The 'low' splits reference their samples through the 'identifier' column.
low_id_test = pd.read_csv(df_source_dir / 'low_id_test.csv', usecols=['identifier', 'speciesKey'], nrows=N_ROWS)
low_ood_test = pd.read_csv(df_source_dir / 'low_ood_test.csv', usecols=['identifier', 'speciesKey'], nrows=N_ROWS)

# Both label maps are read the same way so that 'label' and 'speciesKey' are always regular
# columns. (An integer index_col combined with usecols would turn whichever of the two columns
# comes first in the file into the index, making the result depend on the CSV column order.)
label_map_id = pd.read_csv(df_source_dir / 'label_map_id.csv', usecols=['label', 'speciesKey'])
label_map_ood = pd.read_csv(df_source_dir / 'label_map_ood.csv', usecols=['label', 'speciesKey'])

### merge dfs

In [5]:
# Attach the integer 'label' of each row via its 'speciesKey'.
# validate='many_to_one' makes pandas raise a MergeError when a speciesKey occurs more than once
# in a label map; without it such a duplicate would silently duplicate test rows.
# how='left' keeps every test row; a speciesKey missing from the label map yields a NaN label.
high_id_test_label = high_id_test.merge(label_map_id, how='left', on='speciesKey', validate='many_to_one')
high_ood_test_label = high_ood_test.merge(label_map_ood, how='left', on='speciesKey', validate='many_to_one')

low_id_test_label = low_id_test.merge(label_map_id, how='left', on='speciesKey', validate='many_to_one')
low_ood_test_label = low_ood_test.merge(label_map_ood, how='left', on='speciesKey', validate='many_to_one')

### Variable - device

In [6]:
# Use the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Training on device {device}.")

Training on device cuda.


### Load model

In [7]:
# Layer sizes of the MLP; they have to match the stored weights, otherwise load_state_dict fails.
EMBEDDING_DIM = 384  # input size of the first linear layer
HIDDEN_DIM = 192     # width of the hidden layer
NUM_CLASSES = 91     # size of the output layer (one logit per label)

model = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, HIDDEN_DIM),
    nn.Tanh(),
    nn.Linear(HIDDEN_DIM, NUM_CLASSES),
).to(device=device)

MLP_WEIGHTS_PATH = r'/home/jleick/masterArbeitProjekt/final_release/models/mlp/resized/model/model_weights.pt'

# The following warning can be ignored:
# /home/jleick/miniconda3/envs/masterArbeit/lib/python3.12/site-packages/torch/cuda/__init__.py:789: UserWarning: Can't initialize NVML warnings.warn("Can't initialize NVML")
#
# map_location remaps the stored tensors onto the selected device, so weights that were saved
# from a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(MLP_WEIGHTS_PATH, map_location=device, weights_only=True))



<All keys matched successfully>

# Functions

### Function - CustomDataset

In [8]:
# Define Dataset Class

class CustomDataset(Dataset):
    """Map-style dataset that serves embeddings stored as ``.pt`` files together with their labels.

    Samples are addressed by row position. For every row the value of the path column is turned
    into a relative ``.pt`` file path, which is resolved against ``embedding_dir_path``.
    """

    def __init__(self, df: pd.DataFrame, embedding_dir_path: Path, column_name_tensor: str):
        """Store the lookup columns of ``df``.

        Args:
            df: Frame containing at least the columns ``column_name_tensor`` and ``'label'``.
            embedding_dir_path: Directory against which the relative ``.pt`` paths are resolved.
            column_name_tensor: Name of the column holding the (relative) file path of each
                sample; its suffix is replaced by ``.pt`` to locate the embedding file.

        Raises:
            KeyError: If one of the two required columns is missing from ``df``.
        """
        self.embedding_dir_path = embedding_dir_path
        self.df_reduced = df[[column_name_tensor, 'label']].copy() # create dataFrame with relevant columns
        # Cache both columns as arrays once: __getitem__ runs for every single sample, and
        # materialising a row Series through ``iloc`` each time is much slower than plain
        # positional indexing. Columns are taken by position (0 = path, 1 = label).
        self._image_paths = self.df_reduced.iloc[:, 0].to_numpy()
        self._labels = self.df_reduced.iloc[:, 1].to_numpy()

    def __len__(self):
        """Return the number of samples (rows)."""
        return len(self.df_reduced)

    def __getitem__(self, item: int):
        """Load the sample at row position ``item``.

        Returns:
            A tuple ``(embedding, label, image_file_path)`` where ``embedding`` is the tensor
            loaded from disk with all size-1 dimensions removed, and the other two entries are
            the row's values of the label and path columns.
        """
        image_file_path = self._image_paths[item]
        label = self._labels[item]
        tensor_file_path = Path(image_file_path).with_suffix('.pt')
        absolute_path = self.embedding_dir_path / tensor_file_path
        tensor = torch.load(absolute_path, weights_only=True)
        # squeeze() drops every size-1 dimension; presumably the embeddings were saved with a
        # leading batch axis of size 1 -- TODO confirm against the embedding export.
        return tensor.squeeze(), label, image_file_path

### Function - predict

In [9]:
def predict(dataloader: DataLoader, model: torch.nn.Module, device: "torch.device | str"):
    """Run ``model`` over every batch of ``dataloader`` and collect the predicted class indices.

    Args:
        dataloader: Yields batches ``(embeddings, labels, image_paths)``; ``labels`` must be
            collated into a tensor and ``image_paths`` into a sequence.
        model: Network returning one score per class along dimension 1. It is moved to
            ``device`` and switched to evaluation mode (it is left in that mode afterwards).
        device: Device used for the forward pass (a ``torch.device`` or a device string).

    Returns:
        A tuple ``(labels, predictions, paths)``: two NumPy arrays with the true labels and the
        predicted class indices, and a list with the image paths, all in dataloader order.
    """
    model = model.to(device)
    model.eval()

    labels = []
    predictions = []
    paths = []

    # Gradients are not needed for inference; disabling them saves memory and time.
    with torch.no_grad():
        for batch_number, (batch_embeddings, batch_labels, batch_image_paths) in enumerate(dataloader, start=1):
            batch_output = model(batch_embeddings.to(device))
            # The predicted class is the index of the highest score in each row.
            batch_prediction = batch_output.argmax(dim=1)

            predictions.extend(batch_prediction.cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
            paths.extend(batch_image_paths)
            print(f'>>> {batch_number} batch predicted')

    return np.array(labels), np.array(predictions), list(paths)

### Function - save_df

In [10]:
def save_df(df: pd.DataFrame, save_path: Path):
    """Write ``df`` to ``save_path`` as CSV (without the index column) and print the destination."""
    df.to_csv(path_or_buf=save_path, index=False)
    print("df save to: ", save_path)

# Apply

### Preparation

In [11]:
# Number of embeddings per DataLoader batch, shared by all four test splits.
batch_size = 128

### Apply - predict high

In [12]:
# 'high' ID split: embeddings are located through the 'image_path' column.
high_id_test_dataset = CustomDataset(
    df=high_id_test_label,
    embedding_dir_path=embedding_high_dir_path,
    column_name_tensor='image_path',
)
high_id_test_dataLoader = DataLoader(
    dataset=high_id_test_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep the row order of the dataframe
)

# 'high' OOD split, read from the same embedding directory.
high_ood_test_dataset = CustomDataset(
    df=high_ood_test_label,
    embedding_dir_path=embedding_high_dir_path,
    column_name_tensor='image_path',
)
high_ood_test_dataLoader = DataLoader(
    dataset=high_ood_test_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep the row order of the dataframe
)

In [13]:
# Predict the 'high' ID split.
(
    high_id_test_labels,
    high_id_test_prediction,
    high_id_test_paths,
) = predict(dataloader=high_id_test_dataLoader, model=model, device=device)

# Predict the 'high' OOD split.
(
    high_ood_test_labels,
    high_ood_test_prediction,
    high_ood_test_paths,
) = predict(dataloader=high_ood_test_dataLoader, model=model, device=device)

>>> 1 batch predicted
>>> 1 batch predicted


### Save - prediction high

In [14]:
# One row per sample of the 'high' ID split: true label, predicted label, source path.
high_id_test_prediction_df = pd.DataFrame(
    data={
        "label": high_id_test_labels,
        "prediction": high_id_test_prediction,
        "image_path": high_id_test_paths,
    }
)

save_df(df=high_id_test_prediction_df, save_path=result_dir_path / "high_id_test_prediction.csv")

df save to:  /home/jleick/masterArbeitProjekt/final_release/models/mlp/resized/predicted_test_datasets/high_id_test_prediction.csv


In [15]:
# One row per sample of the 'high' OOD split: true label, predicted label, source path.
high_ood_test_prediction_df = pd.DataFrame(
    data={
        "label": high_ood_test_labels,
        "prediction": high_ood_test_prediction,
        "image_path": high_ood_test_paths,
    }
)

save_df(df=high_ood_test_prediction_df, save_path=result_dir_path / "high_ood_test_prediction.csv")

df save to:  /home/jleick/masterArbeitProjekt/final_release/models/mlp/resized/predicted_test_datasets/high_ood_test_prediction.csv


### Apply - predict low

In [16]:
# 'low' ID split: embeddings are located through the 'identifier' column.
low_id_test_dataset = CustomDataset(
    df=low_id_test_label,
    embedding_dir_path=embedding_low_dir_path,
    column_name_tensor='identifier',
)
low_id_test_dataLoader = DataLoader(
    dataset=low_id_test_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep the row order of the dataframe
)

# 'low' OOD split, read from the same embedding directory.
low_ood_test_dataset = CustomDataset(
    df=low_ood_test_label,
    embedding_dir_path=embedding_low_dir_path,
    column_name_tensor='identifier',
)
low_ood_test_dataLoader = DataLoader(
    dataset=low_ood_test_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep the row order of the dataframe
)

In [17]:
# Predict the 'low' ID split.
(
    low_id_test_labels,
    low_id_test_prediction,
    low_id_test_paths,
) = predict(dataloader=low_id_test_dataLoader, model=model, device=device)

# Predict the 'low' OOD split.
(
    low_ood_test_labels,
    low_ood_test_prediction,
    low_ood_test_paths,
) = predict(dataloader=low_ood_test_dataLoader, model=model, device=device)

>>> 1 batch predicted
>>> 1 batch predicted


### Save - prediction low

In [18]:
# One row per sample of the 'low' ID split: true label, predicted label, source path.
# The "image_path" column holds the path values returned by predict for this split.
low_id_test_prediction_df = pd.DataFrame(
    data={
        "label": low_id_test_labels,
        "prediction": low_id_test_prediction,
        "image_path": low_id_test_paths,
    }
)

save_df(df=low_id_test_prediction_df, save_path=result_dir_path / "low_id_test_prediction.csv")

df save to:  /home/jleick/masterArbeitProjekt/final_release/models/mlp/resized/predicted_test_datasets/low_id_test_prediction.csv


In [19]:
# One row per sample of the 'low' OOD split: true label, predicted label, source path.
# The "image_path" column holds the path values returned by predict for this split.
low_ood_test_prediction_df = pd.DataFrame(
    data={
        "label": low_ood_test_labels,
        "prediction": low_ood_test_prediction,
        "image_path": low_ood_test_paths,
    }
)

save_df(df=low_ood_test_prediction_df, save_path=result_dir_path / "low_ood_test_prediction.csv")

df save to:  /home/jleick/masterArbeitProjekt/final_release/models/mlp/resized/predicted_test_datasets/low_ood_test_prediction.csv
