# Notebook overview
Creates label maps for the datasets by assigning numeric labels to species for ID (train/val/test FG, test FG+DG) and OOD (test ND, test ND+DG) splits.

- Loads species lists and taxonomy mapping
- Merges speciesKey with species names and assigns consecutive integer class labels
- Saves label_map_id.csv and label_map_ood.csv to the result directory

# Preparations

### Imports

In [26]:
import pandas as pd
import numpy as np
from pathlib import Path

### Load df - high_id_train

In [27]:
# Absolute location of the high_id_train CSV. Only its speciesKey column is
# loaded, because that is all the label map needs from this file.
HIGH_ID_TRAIN_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created/high_id_train.csv'
high_id_train = Path(HIGH_ID_TRAIN_PATH)

# Read the file when it is present; otherwise stop with an error that names
# the configured path.
if high_id_train.exists():
    high_id_train_df = pd.read_csv(high_id_train, usecols=['speciesKey'])
else:
    raise FileNotFoundError(f"File does not exist: {HIGH_ID_TRAIN_PATH}")

### Load df - high_ood_test

In [None]:
# Absolute location of the high_ood_test CSV. Only its speciesKey column is
# loaded, because that is all the label map needs from this file.
HIGH_OOD_TEST_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created/high_ood_test.csv'
high_ood_test = Path(HIGH_OOD_TEST_PATH)

# Read the file when it is present; otherwise stop with an error that names
# the configured path.
if high_ood_test.exists():
    high_ood_test_df = pd.read_csv(high_ood_test, usecols=['speciesKey'])
else:
    raise FileNotFoundError(f"File does not exist: {HIGH_OOD_TEST_PATH}")

### Load df - taxonomy_map

In [29]:
# Absolute location of the taxonomy map CSV. The speciesKey and species
# columns are loaded so that species names can be joined onto the keys.
TAXONOMY_MAP_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/fine-grain/taxonomy_map.csv'
taxonomy_map = Path(TAXONOMY_MAP_PATH)

# Read the file when it is present; otherwise stop with an error that names
# the configured path.
if taxonomy_map.exists():
    taxonomy_map_df = pd.read_csv(taxonomy_map, usecols=['speciesKey', 'species'])
else:
    raise FileNotFoundError(f"File does not exist: {TAXONOMY_MAP_PATH}")

### Path - result_dir

In [30]:
# Directory that receives the generated label-map CSV files.
RESULT_DIR_PATH: str = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created/'

# Path object used to build the output file names with the "/" operator.
result_dir: Path = Path(RESULT_DIR_PATH)

# create label_map_id

In [31]:
# Keep only the first row seen for each speciesKey. Row order and the original
# index values of the surviving rows are left untouched.
high_id_train_df_unique = high_id_train_df[
    ~high_id_train_df.duplicated(subset=["speciesKey"], keep="first")
]

In [32]:
# Attach the species name to every distinct training speciesKey.
#
# The taxonomy map is first reduced to one row per speciesKey (first
# occurrence wins). Without this, a speciesKey that appears more than once in
# the taxonomy map would fan out the left join and the same species would end
# up with several rows - and therefore several class labels - in the label
# map. When the taxonomy map has no repeated keys the result is unchanged.
#
# how='left' keeps every training speciesKey; keys missing from the taxonomy
# map get NaN as species name.
label_map_id_df = high_id_train_df_unique.merge(
    taxonomy_map_df.drop_duplicates(subset=['speciesKey'], keep='first'),
    how='left',
    on='speciesKey',
)

In [33]:
# add label column

In [34]:
# Assign the class labels 0 .. n-1 in row order.
# The labels are generated explicitly instead of being copied from the
# DataFrame index, so they stay contiguous and 0-based even if the index is
# not a fresh range (e.g. after filtering or re-ordering rows). The OOD label
# cell relies on this by starting its labels at len(label_map_id_df).
label_map_id_df['label'] = np.arange(len(label_map_id_df))

# create label_map_ood

In [35]:
# Keep only the first row seen for each speciesKey. Row order and the original
# index values of the surviving rows are left untouched.
high_ood_test_df_unique = high_ood_test_df[
    ~high_ood_test_df.duplicated(subset=["speciesKey"], keep="first")
]

In [36]:
# Attach the species name to every distinct OOD speciesKey.
#
# The taxonomy map is first reduced to one row per speciesKey (first
# occurrence wins). Without this, a speciesKey that appears more than once in
# the taxonomy map would fan out the left join and the same species would end
# up with several rows - and therefore several class labels - in the label
# map. When the taxonomy map has no repeated keys the result is unchanged.
#
# how='left' keeps every OOD speciesKey; keys missing from the taxonomy map
# get NaN as species name.
label_map_ood_df = high_ood_test_df_unique.merge(
    taxonomy_map_df.drop_duplicates(subset=['speciesKey'], keep='first'),
    how='left',
    on='speciesKey',
)

In [37]:
# Sanity check: show how many rows (OOD classes) the OOD label map contains.
label_map_ood_df.shape[0]

37

In [38]:
# OOD labels start at len(label_map_id_df) and increase by one per row, i.e.
# they continue after the ID labels (assuming those occupy
# 0 .. len(label_map_id_df) - 1).
label_map_ood_df['label'] = len(label_map_id_df) + np.arange(len(label_map_ood_df))

# Save label maps

In [39]:
# Write both label maps into the result directory. to_csv is called with its
# defaults, so the DataFrame index is written as the first (unnamed) column.
label_map_id_df.to_csv(result_dir.joinpath('label_map_id.csv'))
label_map_ood_df.to_csv(result_dir.joinpath('label_map_ood.csv'))