In [1]:
from dataset_process.dataset_splitter import split_datasets
from dataset_process.transformation import SYKE2024_zoo_transform, SYKE2022_phyto_transform

# Lookup tables keyed by dataset name ('zooplankton' / 'phytoplankton').
# They are handed to split_datasets as keyword arguments of the same name.

# Folder with the class-split definitions of each dataset.
CLASS_SPLIT_PATHS = dict(
    zooplankton="./data/class_splits/SYKE-plankton_ZooScan_2024",
    phytoplankton="./data/class_splits/SYKE-plankton_IFCB_2022",
)

# Folder with the source images of each dataset.
IMG_FOLDER_PATHS = dict(
    zooplankton="./data/images/SYKE-plankton_ZooScan_2024",
    phytoplankton="./data/images/labeled_20201020",
)

# Name of the output folder for each dataset.
OUTPUT_FOLDER_NAMES = dict(
    zooplankton="Zooplankton_224",
    phytoplankton="Phytoplankton_224",
)

# Image size passed to both dataset-specific transforms.
image_size = (224, 224)

# How many images in training set
N_train = 1000

# One transform per dataset; only the zooplankton one is built with crop=True.
transform_zoo = SYKE2024_zoo_transform(image_size, crop=True)
transform_phyto = SYKE2022_phyto_transform(image_size)

# split_datasets(
#     dataset_name='zooplankton',
#     transform=transform_zoo,
#     CLASS_SPLIT_PATHS = CLASS_SPLIT_PATHS,
#     IMG_FOLDER_PATHS = IMG_FOLDER_PATHS,
#     OUTPUT_FOLDER_NAMES = OUTPUT_FOLDER_NAMES,
#     N=N_train  
# )

# split_datasets(
#     dataset_name='phytoplankton',
#     transform=transform_phyto,
#     CLASS_SPLIT_PATHS = CLASS_SPLIT_PATHS,
#     IMG_FOLDER_PATHS = IMG_FOLDER_PATHS,
#     OUTPUT_FOLDER_NAMES = OUTPUT_FOLDER_NAMES,
#     N=N_train
# )


In [10]:
from dataset_process.dataset_creator import get_datasets
from train_utils import train_test_transform

from pathlib import Path

# Inputs handed to get_datasets: the image folder and the class-split folder.
data_dir = "./data/images/Zooplankton_224"
trial_path = Path("./data/class_splits/SYKE-plankton_ZooScan_2024")

# Number of trials to iterate over (trial indices 0 .. NUM_TRIALS - 1).
NUM_TRIALS = 5

# Includes both train and test transforms.
# NOTE(review): two separate train_test_transform objects are created on
# purpose. If .train()/.eval() return the same (mutated) object, sharing one
# instance would leave both transforms in eval mode -- confirm before merging.
train_transform = train_test_transform(num_channels=1).train()
valid_transform = train_test_transform(num_channels=1).eval()
transform = (train_transform, valid_transform)

# Forwarded to get_datasets below, so changing it here actually takes effect.
gallery = False

# Print the validation-set size and class names for every trial.
# After the loop, train_dataset / valid_dataset / test_dataset and `trial`
# hold the values of the LAST trial; later cells that use them see that trial.
for trial in range(NUM_TRIALS):
    train_dataset, valid_dataset, test_dataset = get_datasets(
        data_dir,
        trial_path,
        trial,
        transform,
        unk_in_valid=False,
        gallery=gallery,
    )
    print(len(valid_dataset))
    print(valid_dataset.class_names)


3185
['Bubbles', 'Ceriodaphnia_sp', 'Copepoda_calanoida', 'Copepoda_cyclopoida', 'Copepoda_nauplius', 'Daphnia_sp', 'Evadne_sp', 'Fibers_etc', 'Gastropoda', 'Podon_sp', 'Polychaeta', 'Synchaeta_sp']
2543
['Bivalvia', 'Bivalvia_multiple', 'Bosmina_sp', 'Copepoda_cyclopoida', 'Copepoda_nauplius', 'Daphnia_sp', 'Evadne_sp', 'Fibers_etc', 'Gastropoda', 'Podon_sp', 'Polychaeta', 'Synchaeta_sp']
4283
['Bivalvia', 'Bivalvia_multiple', 'Bosmina_sp', 'Bubbles', 'Ceriodaphnia_sp', 'Copepoda_calanoida', 'Evadne_sp', 'Fibers_etc', 'Gastropoda', 'Podon_sp', 'Polychaeta', 'Synchaeta_sp']
4308
['Bivalvia', 'Bivalvia_multiple', 'Bosmina_sp', 'Bubbles', 'Ceriodaphnia_sp', 'Copepoda_calanoida', 'Copepoda_cyclopoida', 'Copepoda_nauplius', 'Daphnia_sp', 'Podon_sp', 'Polychaeta', 'Synchaeta_sp']
3825
['Bivalvia', 'Bivalvia_multiple', 'Bosmina_sp', 'Bubbles', 'Ceriodaphnia_sp', 'Copepoda_calanoida', 'Copepoda_cyclopoida', 'Copepoda_nauplius', 'Daphnia_sp', 'Evadne_sp', 'Fibers_etc', 'Gastropoda']


In [6]:
def get_class_images(main_folder_path, class_name):
    """Return the image files stored directly inside one class folder.

    Args:
        main_folder_path: Directory that contains one sub-folder per class.
            May be a ``str`` or a ``pathlib.Path``.
        class_name: Name of the class sub-folder to inspect.

    Returns:
        A sorted list of path strings, one per regular file in
        ``main_folder_path / class_name`` whose suffix is ``.jpg``, ``.jpeg``
        or ``.png`` (case-insensitive). Sub-folders are not searched. The
        list is empty when the class folder does not exist.
    """
    image_suffixes = {".jpg", ".png", ".jpeg"}

    # Coerce to Path so a plain string such as ``data_dir`` is accepted too.
    folder_path = Path(main_folder_path) / class_name
    if not folder_path.is_dir():
        return []

    # is_file() skips directories that merely carry an image-like suffix;
    # sorting makes the result independent of filesystem listing order.
    return sorted(
        str(entry)
        for entry in folder_path.glob("*")
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    )

# Count the image files on disk under <data_dir>/valid, restricted to the
# classes listed in valid_dataset.class_names. valid_dataset is whatever the
# most recently executed get_datasets call left behind.
path = Path(data_dir) / "valid"
total = sum(
    len(get_class_images(path, class_name))
    for class_name in valid_dataset.class_names
)
print(total)

59
15
1277
13
116
1864
27
63
163
51
155
22
85
231
395
4536
