# Notebook overview
Creates six datasets (training, validation, and four test splits: FG, FG+DG, ND, ND+DG) from GBIF (high) and trap (low) images extracted from the AMI dataset.

- Filters to species-level records and selects taxa with sufficient examples
- Applies per-species min/max example constraints and reproducible shuffling (SEED)
- Builds high and low splits and saves resulting CSVs

# Preparation

### import

In [1]:
import pandas as pd
from pathlib import Path

### Variable - SEED

In [2]:
# Random seed passed as `random_state` to the pandas `sample` calls in this
# notebook, so that shuffling and sub-sampling are reproducible.
SEED = 42

### Load df - low_all

In [3]:
# The trap ("low") image CSV is created by extracting the images from the .tar file.
PATH_LOW_all = r'/home/stud/jleick/masterArbeitProjekt/final_release/data/datasets/origin/trap/traps_fine-grained_embeddings.csv'
# The first CSV column is read as the index and then replaced by a fresh
# 0..n-1 index, i.e. the stored index column is discarded.
low_all_df = pd.read_csv(PATH_LOW_all, index_col=0).reset_index(drop=True)

### Load df - high_all

In [4]:
# Locations of the GBIF ("high") fine-grained download-check CSVs, one per split.
PATH_HIGH_TEST_ALL = r'/home/stud/jleick/masterArbeitProjekt/final_release/data/datasets/origin/fine-grain/download_check/04_ami-gbif_fine-grained_all_test_download_check.csv'
PATH_HIGH_TRAIN_ALL = r'/home/stud/jleick/masterArbeitProjekt/final_release/data/datasets/origin/fine-grain/download_check/04_ami-gbif_fine-grained_all_train_download_check.csv'
PATH_HIGH_VAL_ALL = r'/home/stud/jleick/masterArbeitProjekt/final_release/data/datasets/origin/fine-grain/download_check/04_ami-gbif_fine-grained_all_val_download_check.csv'

# Read the three splits in the same order as before (test, train, val);
# index_col=False keeps the first CSV column as a regular data column.
high_test_all_df, high_train_all_df, high_val_all_df = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (PATH_HIGH_TEST_ALL, PATH_HIGH_TRAIN_ALL, PATH_HIGH_VAL_ALL)
)

### Load df - taxonomy_map

In [5]:
# Location of the taxonomy map CSV shipped with the fine-grained GBIF data.
PATH_TAXONOMY_MAP = r'/home/stud/jleick/masterArbeitProjekt/final_release/data/datasets/origin/fine-grain/taxonomy_map.csv'
# Loaded with pandas defaults.
# NOTE(review): taxonomy_map_df is not referenced by any other cell visible in this notebook.
taxonomy_map_df = pd.read_csv(PATH_TAXONOMY_MAP)

### Load Path - result_dir

In [6]:
# Directory that receives the CSV files written by the "Save df" cells.
RESULT_DIR_PATH = r'/home/stud/jleick/masterArbeitProjekt/final_release/data/datasets/created'
result_dir = Path(RESULT_DIR_PATH)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before anything is written into it.
result_dir.mkdir(parents=True, exist_ok=True)

# Function

### Function - max_spec_exampels

In [7]:
def max_spec_exampels( df, max_exampels, seed=None ):
    """Cap the number of rows per species at ``max_exampels``.

    Rows are grouped by the ``speciesKey`` column. A species with more than
    ``max_exampels`` rows is shuffled reproducibly and only the first
    ``max_exampels`` shuffled rows are kept; its key and original row count
    are printed. Species at or below the limit are passed through unchanged.
    Rows whose ``speciesKey`` is NaN belong to no group and are absent from
    the result.

    Args:
        df: DataFrame with a ``speciesKey`` column.
        max_exampels: Maximum number of rows to keep per species.
        seed: Random state for the shuffle. ``None`` (default) falls back to
            the notebook-level ``SEED``.

    Returns:
        DataFrame with the per-species pieces concatenated in order of first
        appearance of each species. Down-sampled species carry a fresh
        ``0..max_exampels-1`` index while untouched species keep their
        original index labels, so the resulting index is generally not
        unique. If ``df`` contains no species at all, an empty frame with
        the columns of ``df`` is returned.
    """
    species_dfs = []

    # One grouping pass instead of one boolean mask per species;
    # sort=False keeps the species in order of first appearance.
    for species, species_df in df.groupby('speciesKey', sort=False):
        species_count = len(species_df)

        if species_count > max_exampels:
            random_state = SEED if seed is None else seed
            dataset_shuffled = species_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
            print(f"{species}: {species_count}")
            species_dfs.append(dataset_shuffled.iloc[:max_exampels])
        else:
            species_dfs.append(species_df)

    if not species_dfs:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame that still has the input's columns instead.
        return df.iloc[0:0]

    return pd.concat(species_dfs)

### Function - min_spec_exampels

In [8]:
def min_spec_exampels( df, min_exampels ):
    """Drop every species that has fewer than ``min_exampels`` rows.

    Args:
        df: DataFrame with a ``speciesKey`` column.
        min_exampels: Minimum number of rows a species needs to be kept.

    Returns:
        The rows of ``df`` (original order and index) whose ``speciesKey``
        occurs at least ``min_exampels`` times. Rows with a NaN
        ``speciesKey`` are never kept, because ``value_counts`` does not
        count NaN.
    """
    rows_per_species = df['speciesKey'].value_counts()
    has_enough = rows_per_species.ge(min_exampels)
    kept_species = has_enough[has_enough].index

    return df.loc[df['speciesKey'].isin(kept_species)]

# create - species split

### Reduce - low_all_spec

In [9]:
# Keep only the trap records whose taxon_rank equals 'SPECIES'.
low_all_spec_df = low_all_df[low_all_df['taxon_rank'].eq('SPECIES')]

### NaN - examples with no speciesKey

In [10]:
# Author's note: examples without a speciesKey are not treated as species-level
# records and are therefore not relevant for the splits built below.

# Rows of the SPECIES-rank subset that carry no speciesKey, collected for inspection.
low_all_spec_na_df = low_all_spec_df.loc[low_all_spec_df['speciesKey'].isna()]
# Optional check (NOTE(review): the name below is not defined in the visible cells;
# presumably low_all_spec_na_df is meant):
# low_all_spec_na['acceptedTaxonKey'].value_counts()

### Sum - low_all_spec_sum

In [11]:
# Row count per speciesKey in the species-level trap data (NaN keys are not counted).
low_all_spec_sum = low_all_spec_df.loc[:, 'speciesKey'].value_counts()
low_all_spec_sum.size

487

### Sum min 6 - low_all_spec_selected

In [12]:
# Keep only the species that occur at least 6 times in the trap data.
low_all_spec_selected = low_all_spec_sum[low_all_spec_sum.ge(6)]
low_all_spec_selected.size

140

### Shuffle

In [13]:
# Shuffle the selected species (frac=1 returns every entry in random order).
# random_state=SEED makes the order, and with it the species split taken
# from this order in the following cells, reproducible.
low_all_spec_selected_shuffled = low_all_spec_selected.sample(frac=1, axis=0, random_state=SEED)
len(low_all_spec_selected_shuffled)

140

### Create df - id_spec

In [14]:
# The first 100 species of the shuffled list form the `id` species group.
id_spec = low_all_spec_selected_shuffled.head(100)
id_spec.size

100

### Create set - id_spec_set

In [15]:
# speciesKeys of the `id` group as a set (the keys are the index of id_spec).
id_spec_set = set(id_spec.keys())
len(id_spec_set)

100

### Create df - ood_spec

In [16]:
# Every species after the first 100 entries of the shuffled list forms the `ood` group.
# NOTE: the offset 100 must stay equal to the cut used for id_spec, otherwise
# the two groups overlap or leave species unassigned.
ood_spec = low_all_spec_selected_shuffled.iloc[100:]
len(ood_spec)

40

### Create set - ood_spec_set

In [17]:
# speciesKeys of the `ood` group as a set (the keys are the index of ood_spec).
ood_spec_set = set(ood_spec.keys())
len(ood_spec_set)

40

# create df - high

## id test

#### Filter - high_test_all_df

In [18]:
# Keep only the GBIF test records whose image_downloaded flag equals True.
high_test_all_df_downloaded = high_test_all_df.loc[high_test_all_df['image_downloaded'].eq(True)]

#### create df - high_id_test_df

In [19]:
# Downloaded GBIF test records that belong to one of the `id` species.
high_id_test_df = high_test_all_df_downloaded.loc[
    high_test_all_df_downloaded['speciesKey'].isin(id_spec.index)
]
# Number of distinct species left.
high_id_test_df['speciesKey'].nunique()

97

#### Apply - min_spec_exampels

In [20]:
# Drop species with fewer than 100 GBIF test records.
high_id_test_df = min_spec_exampels(df=high_id_test_df, min_exampels=100)
high_id_test_df['speciesKey'].nunique()

94

#### Reduce - id_spec

In [21]:
# Species that survived the test-split filter ...
high_id_test_set = set(high_id_test_df['speciesKey'])

# ... restrict the `id` species to those.
id_spec_reduc_set = id_spec_set.intersection(high_id_test_set)

## id train

#### Filter df - high_train_all_df

In [22]:
# Keep only the GBIF train records whose image_downloaded flag equals True.
high_train_all_df_downloaded = high_train_all_df.loc[high_train_all_df['image_downloaded'].eq(True)]

#### create df - high_id_train_df

In [23]:
# Author's note: 1000 examples per species; these can be reduced to 500
# examples per species to lower the hardware requirements.
high_id_train_df = high_train_all_df_downloaded.loc[
    high_train_all_df_downloaded['speciesKey'].isin(id_spec_reduc_set)
]
high_id_train_df['speciesKey'].nunique()

94

#### Apply - max_spec_exampels

In [None]:
# high_id_train_df = max_spec_exampels(high_id_train_df, 500)
# len(high_id_train_df['speciesKey'].value_counts())

#### Apply - min_spec_exampels

In [25]:
# Drop species with fewer than 500 GBIF train records.
high_id_train_df = min_spec_exampels(df=high_id_train_df, min_exampels=500)
high_id_train_df['speciesKey'].nunique()

91

#### Reduced - id_spec_reduc_set

In [26]:
# Species that survived the train-split filter ...
high_id_train_set = set(high_id_train_df['speciesKey'])

# ... narrow the `id` species set down to them.
id_spec_reduc_set = id_spec_reduc_set.intersection(high_id_train_set)

### Reduce - high_id_test_df

In [27]:
# Re-align the test split with the species set reduced by the train filter.
high_id_test_df = high_id_test_df.loc[high_id_test_df['speciesKey'].isin(id_spec_reduc_set)]
high_id_test_df['speciesKey'].nunique()

91

## id val

#### Filter - high_val_all_df

In [28]:
# Keep only the GBIF validation records whose image_downloaded flag equals True.
high_val_all_df_downloaded = high_val_all_df.loc[high_val_all_df['image_downloaded'].eq(True)]

#### create df - high_id_val_df

In [29]:
# Downloaded GBIF validation records of the remaining `id` species.
high_id_val_df = high_val_all_df_downloaded.loc[
    high_val_all_df_downloaded['speciesKey'].isin(id_spec_reduc_set)
]
high_id_val_df['speciesKey'].nunique()

91

#### Apply - min_spec_exampels

In [31]:
# Drop species with fewer than 50 GBIF validation records.
high_id_val_df = min_spec_exampels(df=high_id_val_df, min_exampels=50)
high_id_val_df['speciesKey'].nunique()

91

#### Reduced - id_spec_reduc_set

In [32]:
# Species that survived the validation-split filter ...
high_id_val_set = set(high_id_val_df['speciesKey'])

# ... narrow the `id` species set down to them.
id_spec_reduc_set = id_spec_reduc_set.intersection(high_id_val_set)

### reduce - high_id_test_df

In [33]:
# Re-align the test split with the species set reduced by the validation filter.
high_id_test_df = high_id_test_df.loc[high_id_test_df['speciesKey'].isin(id_spec_reduc_set)]
high_id_test_df['speciesKey'].nunique()

91

### reduce - high_id_train_df

In [34]:
# Re-align the train split with the species set reduced by the validation filter.
high_id_train_df = high_id_train_df.loc[high_id_train_df['speciesKey'].isin(id_spec_reduc_set)]
high_id_train_df['speciesKey'].nunique()

91

## ood test

#### create df - high_ood_test_df

In [35]:
# Downloaded GBIF test records that belong to one of the `ood` species.
high_ood_test_df = high_test_all_df_downloaded.loc[
    high_test_all_df_downloaded['speciesKey'].isin(ood_spec.index)
]
high_ood_test_df['speciesKey'].nunique()

39

#### Apply - min_spec_exampels

In [36]:
# Drop `ood` species with fewer than 100 GBIF test records.
high_ood_test_df = min_spec_exampels(df=high_ood_test_df, min_exampels=100)
high_ood_test_df['speciesKey'].nunique()

37

#### Reduced - ood_spec_reduc_set

In [37]:
# Species that survived the `ood` test filter ...
high_ood_test_set = set(high_ood_test_df['speciesKey'])

# ... restrict the `ood` species to those.
ood_spec_reduc_set = ood_spec_set.intersection(high_ood_test_set)

# create df - low

### create - low_id

In [38]:
# Trap records of the final `id` species set.
low_id_test_df = low_all_spec_df.loc[low_all_spec_df['speciesKey'].isin(id_spec_reduc_set)]
low_id_test_df['speciesKey'].nunique()

91

#### Apply - max_spec_exampels

In [39]:
# Cap every `id` species at 100 trap records (capped species are printed).
low_id_test_df = max_spec_exampels(df=low_id_test_df, max_exampels=100)
low_id_test_df['speciesKey'].nunique()

1889838.0: 119
1739272.0: 183
1743825.0: 178
1978751.0: 171
5102671.0: 107
1886378.0: 122
1887290.0: 562


91

### create - low_ood

In [40]:
# Trap records of the final `ood` species set.
low_ood_test_df = low_all_spec_df.loc[low_all_spec_df['speciesKey'].isin(ood_spec_reduc_set)]
low_ood_test_df['speciesKey'].nunique()

37

#### Apply - max_spec_exampels

In [41]:
# Cap every `ood` species at 100 trap records (capped species are printed).
low_ood_test_df = max_spec_exampels(df=low_ood_test_df, max_exampels=100)
low_ood_test_df['speciesKey'].nunique()

1806737.0: 255
5102666.0: 334


37

# Save df

### Save - high

In [None]:
# Write the GBIF ("high") splits into result_dir; the DataFrame index is
# written as the first CSV column (to_csv default).
high_id_train_df.to_csv(result_dir.joinpath('high_id_train.csv'))
high_id_test_df.to_csv(result_dir.joinpath('high_id_test.csv'))
high_id_val_df.to_csv(result_dir.joinpath('high_id_val.csv'))

high_ood_test_df.to_csv(result_dir.joinpath('high_ood_test.csv'))

### Save - low

In [None]:
# Write the trap ("low") test splits into result_dir; the DataFrame index is
# written as the first CSV column (to_csv default).
low_id_test_df.to_csv(result_dir.joinpath('low_id_test.csv'))

low_ood_test_df.to_csv(result_dir.joinpath('low_ood_test.csv'))