In [None]:
import os
import sys

import numpy as np
import pandas as pd

sys.path.append("..")
from utils.BALROG_pipeline import ROOTPATH

In [2]:
# Load the reference table with one row per image.
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
df = pd.read_pickle(os.path.join(ROOTPATH, "utils", "references", "database_reference.pkl"))

# Keep only images of females ("F") whose wing is NOT damaged.
is_female = df["Sex"] == "F"
is_undamaged = df["Damaged Wing"] == False  # element-wise comparison; missing values are dropped
# .copy() detaches the result from the loaded frame, so columns added in later cells
# are written to an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df.loc[is_female & is_undamaged].copy()

In [3]:
# Define the classification labels:
# use taxonomic level 3, and fall back to taxonomic level 4 where level 3 is "-".
df["LABEL"] = df["3. Taxonomic Level"]
level_3_missing = df["LABEL"] == "-"
df.loc[level_3_missing, "LABEL"] = df.loc[level_3_missing, "4. Taxonomic Level"]

# Classes with at most this many images are pooled into the label "other".
# NOTE(review): the cutoff applied here is inclusive 80 (counts <= 80); confirm this is the intended value.
RARE_CLASS_MAX_IMAGES = 80
counts = df["LABEL"].value_counts()
df.loc[df["LABEL"].isin(counts[counts <= RARE_CLASS_MAX_IMAGES].index), "LABEL"] = "other"

In [4]:
# Number of folds
n_splits = 6
# Fixed seed for reproducibility
seed = 42

# Collapse the table to one row per specimen (first entry of every column), so that
# all images of a specimen are assigned to the same fold.
specimen_grouped = df.groupby("Specimen ID").first().reset_index()

# Stratify by label: the specimens of every label are spread over the folds separately
label_groups = specimen_grouped.groupby("LABEL")

# fold number -> row indices (into specimen_grouped) of the specimens in that fold
folds = {i: [] for i in range(n_splits)}

# Random number generator with a fixed seed
rng = np.random.default_rng(seed)

# For each label group, assign specimens to folds
for label, group in label_groups:
    indices = group.index.tolist()  # Row indices of the specimens with this label
    rng.shuffle(indices)  # Seeded in-place shuffle
    # Roughly equal-sized parts; when the group size is not divisible by n_splits,
    # np.array_split gives the extra specimens to the lowest-numbered folds.
    splits = np.array_split(indices, n_splits)
    for i, split in enumerate(splits):
        folds[i].extend(split)

# Specimen ID -> fold number.
# specimen_grouped has a fresh 0..n-1 index after reset_index(), so its row labels
# can be used directly as positions in the ID array.
specimen_ids = specimen_grouped["Specimen ID"].to_numpy()
specimen_to_fold = {
    specimen_ids[row]: fold_idx
    for fold_idx, rows in folds.items()
    for row in rows
}

# Assign fold numbers to every image via its specimen
df["TESTING FOLD"] = df["Specimen ID"].map(specimen_to_fold)

In [5]:
# Image IDs of the wings which were used in the feasibility study
feasibility_wings = ['353f52c84025', '83e6e6418abf', '97ac6879392b', '4bd749e4ebd7',
       '9935c9ea8bbe', '8c6bd98c8b8b', '21b4542bda0f', '220ac0b60d8e',
       'df3c55c46235', '262a2c604633', '79d27f4ea625', 'd85a28add3f5',
       '5884e9a765c0', '569ef178a8d7', 'bdc9c377f98b', '1425b0581c09',
       'ff1a213aaea7', '75f9411760b2', '68f20aa38179', 'a3b9df5d29e4',
       'dc438aeecc9c', '2438a719beed', '8318585f57d1', 'ead4ce4a6350',
       'f6a602f7aacc', '98b991a4f708', 'dff976bcfa46', '71a566ef1e3f',
       'c1408357130e', '2eb915fbb22d', '719077c0e9a2', '2ec1d44eedcc',
       'ec422efeda41', 'd4d8bb6391bc', '6d8426ae04b6', '9aef0e83de27',
       '7fe3ec219cbd', 'd7e7ad8334e6', '6f51c34925af', 'ec37402d465a',
       '34abe4ae4b3b', 'fed84692b879', '005ed08e818f', 'dafe6e862994',
       '6dcdd0b9158a', '17f2c6d9da9d', '8b7c9e016e87', 'ea3cd239e227',
       'fb9a64ead7b8', 'a2b12296e17f', 'a139178bca77', 'c647fca63ee6',
       '26f4c6b4db03', 'fab98f144073', '67fa3ea97980', 'be13f4704c95',
       'e20799179c6a', '6598e7afca70', 'd8560f8a25de', 'df6778597799',
       'f5a2e4dc9333', '24acd9682a77', '490ef4cdd5e7', '661d083772bd',
       '1af3637c8869', '0bd625e52e5e', '58ed4c8e8db7', 'e3bcb777f7ae',
       '4245e63c104c', 'b9ad9219df22', '08358a71c6a1', '4f77a1d04154',
       'e45c8cb021ed', '11b77d4957ad', '10e380da7a12']

# Leave out the images which are used in the feasibility study by marking them with fold -1.
# NOTE(review): the match is on "Image ID", while folds are assigned per "Specimen ID" -
# other images of the same specimen keep their regular fold; confirm this is intended.
df.loc[df["Image ID"].isin(feasibility_wings), "TESTING FOLD"] = -1 

In [6]:
# Number of images per testing fold (-1 = images set aside for the feasibility study)
df.loc[:, "TESTING FOLD"].value_counts()

TESTING FOLD
 4    2492
 0    2490
 2    2486
 1    2469
 3    2439
 5    2437
-1      75
Name: count, dtype: int64

In [7]:
# Check that no specimen row was assigned to more than one fold.
# Prints True for every pair of folds that is disjoint and stops the notebook
# with an AssertionError as soon as a pair overlaps.
fold_sets = {i: set(folds[i]) for i in range(n_splits)}  # build each set once
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        is_disjoint = fold_sets[i].isdisjoint(fold_sets[j])
        print(is_disjoint)
        assert is_disjoint, f"Folds {i} and {j} share at least one specimen"

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [8]:
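# Saving is disabled on purpose so that re-running the notebook does not overwrite the
# ML-ready reference table; uncomment the line below to write it.
#df.to_pickle(os.path.join(ROOTPATH, "utils", "references", "database_reference_MLREADY.pkl"))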