In [2]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import shutil
from pathlib import Path
from tqdm import tqdm

1. Function to split metadata into train, validation and test sets [DONE]
2. Fucntion to write files from a split into required directury hierarchy [PENDING]

In [None]:
# 1.
def create_split(df: pd.DataFrame, val_ratio: float, test_ratio: float) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Splits a DataFrame with two columns (file path, target class) into stratified train, validation, and test sets.

    :param df (pd.DataFrame): input DataFrame with [file_path, target_class] columns
    :param val_ratio (float): proportion of data for validation
    :param test_ratio (float): proportion of data for testing 
    :returns: tuple (train_df, val_df, test_df) as stratified splits
    """
    assert df.shape[1] == 2, "Dataframe should have 2 columns"
    assert 0 < val_ratio < 1 and 0 < test_ratio < 1 and val_ratio + test_ratio < 1, "Incorrect split ratio"

    path = df.iloc[:, 0]
    target = df.iloc[:, 1]

    # defining splits
    path_train, path_test, target_train, target_test = train_test_split(path, target, test_size = test_ratio, random_state=42, stratify=target)
    path_train, path_val, target_train, target_val = train_test_split(path_train, target_train, test_size=val_ratio/(1-test_ratio), random_state=42, stratify=target_train)

    # concatenating path and target data
    train_df = pd.concat([path_train, target_train], axis=1)
    val_df = pd.concat([path_val, target_val], axis=1)
    test_df = pd.concat([path_test, target_test], axis=1)

    return train_df, val_df, test_df
    
    

In [6]:
# testing function on metadata file
df = pd.read_csv('../data/rare_species 1/metadata.csv')
df_phylum = df[["file_path", "phylum"]]

len(df_phylum)

11983

In [7]:
tr, v, t = create_split(df_phylum, 0.2, 0.1)
print ((len(tr) + len(v) + len(t)) == len(df))

True


In [8]:
print(f'Training set: {len(tr)} examples\n{tr.iloc[:, 1].value_counts()}\n')
print(f'Validation set: {len(v)} examples\n{v.iloc[:, 1].value_counts()}\n')
print(f'Testing set: {len(t)} examples\n{t.iloc[:, 1].value_counts()}')

Training set: 8387 examples
phylum
chordata         6965
arthropoda        666
cnidaria          567
mollusca          147
echinodermata      42
Name: count, dtype: int64

Validation set: 2397 examples
phylum
chordata         1991
arthropoda        190
cnidaria          162
mollusca           42
echinodermata      12
Name: count, dtype: int64

Testing set: 1199 examples
phylum
chordata         996
arthropoda        95
cnidaria          81
mollusca          21
echinodermata      6
Name: count, dtype: int64


In [9]:
tr.head()

Unnamed: 0,file_path,phylum
9208,chordata_ambystomatidae/14005845_1019571_eol-f...,chordata
7117,chordata_urolophidae/2752484_46561068_eol-full...,chordata
11862,chordata_crocodylidae/22564193_795274_eol-full...,chordata
10662,chordata_fringillidae/21328570_47043290_eol-fu...,chordata
5308,chordata_gekkonidae/29874183_791137_eol-full-s...,chordata


In [10]:
v.head()

Unnamed: 0,file_path,phylum
9173,cnidaria_pocilloporidae/29676316_45275970_eol-...,cnidaria
7514,chordata_trochilidae/21360931_45512248_eol-ful...,chordata
8099,chordata_callitrichidae/20365852_323908_eol-fu...,chordata
4474,chordata_laridae/21923656_45509321_eol-full-si...,chordata
7118,chordata_balaenidae/28510307_46559421_eol-full...,chordata


In [11]:
t.head()

Unnamed: 0,file_path,phylum
11315,mollusca_cardiidae/10836386_46473747_eol-full-...,mollusca
1135,arthropoda_carabidae/10787825_2867406_eol-full...,arthropoda
2549,chordata_hexanchidae/22181459_46560151_eol-ful...,chordata
8061,chordata_ramphastidae/30015533_45512569_eol-fu...,chordata
6649,cnidaria_pocilloporidae/20916144_45275970_eol-...,cnidaria


In [None]:
#2

# sample of dataset to test split and copy
X = df_phylum.iloc[:, 0]
y = df_phylum.iloc[:, 1]

    # defining splits
_x, sample_path, _y, sample_target = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)
sample_phylum_df = pd.concat([sample_path, sample_target], axis=1)
print(len(sample_phylum_df))
sample_phylum_df.head()

2397


Unnamed: 0,file_path,phylum
6616,cnidaria_faviidae/28170637_45276848_eol-full-s...,cnidaria
2351,chordata_balaenicipitidae/14204552_1049385_eol...,chordata
9192,mollusca_cardiidae/2751825_46473744_eol-full-s...,mollusca
4003,chordata_latimeriidae/29935022_46582216_eol-fu...,chordata
2573,chordata_hemiscylliidae/29599796_46559713_eol-...,chordata


In [17]:
train_sample_phylum, val_sample_phylum, test_sample_phylum = create_split(sample_phylum_df, 0.2, 0.2)

In [20]:
test_sample_phylum.head()

Unnamed: 0,file_path,phylum
10010,chordata_salamandridae/21499620_289670_eol-ful...,chordata
6928,chordata_pleuronectidae/10431389_46570095_eol-...,chordata
1159,chordata_burhinidae/29684475_45517788_eol-full...,chordata
2593,chordata_nesospingidae/22028346_45510504_eol-f...,chordata
203,chordata_dasypodidae/5471999_328497_eol-full-s...,chordata


In [None]:
# function to copy the files
def write_split_dir(df: pd.DataFrame, split_folder_name: str, base_data_dir: str = "../data/rare_species 1"):
    """
    Copies files from original data folder into a requiered directory based on provided split DataFrame

    :param df (pd.DataFrame): input (split) DataFrame with [file_path, target_class] columns
    :split_folder_name (str): name of the folder to create under `base_data_dir`, for clarity should 
    contain the name of class type (phylum/family) and name of the split (train/val/test)
    :base_data_dir (str): base directory with original data, defaults to "../data/rare_species1".

    """
    # creating "split_folder_name" directory if it doesn't exist
    target_root = Path(base_data_dir).parent / split_folder_name
    target_root.mkdir(parents=True, exist_ok=True)

    # creating class folders if they don't exist
    class_names = df.iloc[:, 1].unique()
    for cls in class_names:
        (target_root / cls).mkdir(exist_ok=True)

    print(f"Copying {len(df)} files into {split_folder_name}/...")

    # copyin files, showing progress
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Copying files"):
        rel_path = row.iloc[0]
        class_name = row.iloc[1]

        src_path = Path(base_data_dir) / rel_path
        dst_path = target_root / class_name / Path(rel_path).name

        try:
            shutil.copy2(src_path, dst_path)
        except Exception as e:
            print(f"Failed to copy {src_path} → {dst_path}: {e}")

In [28]:
write_split_dir(test_sample_phylum, "test_sample_phylum")

Copying 480 files into test_sample_phylum/...


Copying files:   0%|          | 0/480 [00:00<?, ?it/s]

Copying files: 100%|██████████| 480/480 [00:04<00:00, 99.40it/s] 


In [29]:
write_split_dir(val_sample_phylum, "val_sample_phylum")

Copying 480 files into val_sample_phylum/...


Copying files: 100%|██████████| 480/480 [00:04<00:00, 96.28it/s] 


In [30]:
write_split_dir(train_sample_phylum, "train_sample_phylum")

Copying 1437 files into train_sample_phylum/...


Copying files: 100%|██████████| 1437/1437 [00:14<00:00, 100.47it/s]
