In [1]:
import os 
import sys

import numpy as np
import pandas as pd

sys.path.append("..")
from utils.BALROG_pipeline import ROOTPATH

In [2]:
# Load the reference database shipped under utils/references.
df = pd.read_pickle(os.path.join(ROOTPATH, "utils", "references", "database_reference.pkl"))

# Keep only female ("F") specimens whose wing is NOT damaged, i.e. drop every row
# with another sex or with a damaged wing.
df = df.loc[df["Sex"] == "F"]
# `== False` (rather than `~`) is kept on purpose: it also excludes rows where the
# flag is missing. The trailing .copy() gives an independent frame, so the column
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
df = df.loc[df["Damaged Wing"] == False].copy()

# Use the 4th taxonomic level as the classification label ...
df["LABEL"] = df["4. Taxonomic Level"]
# ... and restrict the reference set to wings from the ConVector project.
df_ref = df.loc[df["Project"] == "ConVector"].copy()

In [3]:
# Cross-validation settings used for every split created in this notebook.
random_seed = 42  # seed handed to the fold-assignment RNG
n_splits = 5  # number of folds

def create_testing_fold(df, new_column, n_splits, random_seed=42):
    """Assign every row of ``df`` to a testing fold, stratified by label.

    Folds are drawn at the specimen level: rows are collapsed to one entry per
    "Specimen ID", those entries are shuffled within each "LABEL" group and cut
    into ``n_splits`` chunks, and the resulting fold number is broadcast back to
    every row of the specimen. All rows of one specimen therefore share a fold.

    Args:
        df: DataFrame with at least the columns "Specimen ID" and "LABEL".
            It is modified in place.
        new_column: Name of the column that receives the fold number.
        n_splits: Number of folds; fold numbers run from 0 to ``n_splits - 1``.
        random_seed: Seed for the NumPy generator that shuffles the specimens.

    Returns:
        The same ``df`` object, with ``new_column`` filled in.
    """
    # Create (or reset) the target column before anything else touches the frame.
    df[new_column] = np.nan

    # One row per specimen; each column holds the first non-null value found for
    # that specimen, and the fresh 0..n-1 index identifies the specimen below.
    per_specimen = df.groupby("Specimen ID").first().reset_index()

    # Seeded generator so that repeated calls give the same assignment.
    generator = np.random.default_rng(random_seed)

    fold_members = {fold: [] for fold in range(n_splits)}
    for _, members in per_specimen.groupby("LABEL"):
        row_ids = members.index.tolist()
        generator.shuffle(row_ids)
        # array_split tolerates group sizes that are not a multiple of n_splits;
        # the leftover specimens go to the lowest-numbered folds.
        for fold, chunk in enumerate(np.array_split(row_ids, n_splits)):
            fold_members[fold].extend(chunk)

    # Translate row positions in the per-specimen table into specimen IDs.
    fold_by_specimen = {
        per_specimen.loc[row_id, "Specimen ID"]: fold
        for fold, row_ids in fold_members.items()
        for row_id in row_ids
    }

    # Broadcast the specimen-level fold to every row of the original frame.
    df[new_column] = df["Specimen ID"].map(fold_by_specimen)
    return df

# Device names exactly as they appear in the "Device" column.
PHONE_DEVICE = "macrolens + iphone se"
OLYMPUS_DEVICE = "olympus sz61 + olympus dp23"

# All three tables start from the same seeded fold assignment. Rows that must stay
# out of the numbered folds are then marked with TESTING FOLD = -1 (presumably
# treated as test-only / out-of-distribution samples downstream -- confirm).
# Every mask is built from the frame it is applied to, so the selection no longer
# depends on the three frames happening to share index and column values.

# Train on phone images: every row from any other device gets -1.
df_device_trainphone = create_testing_fold(df_ref.copy(), "TESTING FOLD", n_splits, random_seed=random_seed)
df_device_trainphone.loc[df_device_trainphone["Device"] != PHONE_DEVICE, "TESTING FOLD"] = -1

# Train on the other device(s): every phone row gets -1.
df_device_trainolympus = create_testing_fold(df_ref.copy(), "TESTING FOLD", n_splits, random_seed=random_seed)
df_device_trainolympus.loc[df_device_trainolympus["Device"] == PHONE_DEVICE, "TESTING FOLD"] = -1

# Device-bias experiment: for each listed label, rows taken with the mapped device
# get -1, so that label keeps only its other device(s) inside the numbered folds.
df_device_bias = create_testing_fold(df_ref.copy(), "TESTING FOLD", n_splits, random_seed=random_seed)
ood_device_by_label = {
    "aegypti": PHONE_DEVICE,
    "albopictus": OLYMPUS_DEVICE,
    "koreicus": PHONE_DEVICE,
    "japonicus": OLYMPUS_DEVICE,
}
# Labels missing from the mapping become NaN, which never equals a device name,
# so their rows are left untouched.
is_ood = df_device_bias["LABEL"].map(ood_device_by_label) == df_device_bias["Device"]
df_device_bias.loc[is_ood, "TESTING FOLD"] = -1

In [4]:
# Uncomment to write the three ML-ready reference tables to disk.
#df_device_trainphone.to_pickle(os.path.join(ROOTPATH, "utils", "references", "database_reference_MLREADY_TrainPhone.pkl"))
#df_device_trainolympus.to_pickle(os.path.join(ROOTPATH, "utils", "references", "database_reference_MLREADY_TrainOlymp.pkl"))
#df_device_bias.to_pickle(os.path.join(ROOTPATH, "utils", "references", "database_reference_MLREADY_bias.pkl"))