In [None]:
# Example usage: paths and parameters read by the cells below.

# Root folder of the input database and root folder the split is copied to.
input_dir = "b_mosquito_database_25_01_21"
output_dir = "b_mosquito_database_train_val_25_01_21"

# Text file with the blacklist entries, one per line.
blacklist_fn = "wav_blacklist_for_train.txt"

# Target fraction of each class's files that goes to the "train" set.
train_ratio = 0.8

# Per-class probability of keeping a file (1.0 keeps every file).
class_ratios = {
    'mosquito': 0.35,
    'not': 1.0,
}


In [None]:
import os
import shutil
from collections import defaultdict
import random


In [None]:
# Load the blacklist: one entry per line, with surrounding whitespace and all
# spaces removed. Blank lines are dropped, because an empty entry is a
# substring of every string and would match every prefix it is tested against.
with open(blacklist_fn, "r", encoding="utf-8") as file:
    black_list = [
        entry
        for entry in (line.strip().replace(" ", "") for line in file)
        if entry
    ]

# Print result
print(black_list)



In [None]:
"""
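Splits the .wav files of each class into train and validation sets by prefix:
all files sharing a prefix (the file name up to its last underscore) are copied
to the same set, and each class gets its own train and validation folder.

Variables used (defined in earlier cells):
    input_dir (str): The root folder of the input database (with "mosquito" and "not" subfolders).
    output_dir (str): The root folder of the target database (with "train" and "validation" subfolders).
    train_ratio (float): The target fraction of each class's files for the "train" set.
    class_ratios (dict): Per-class probability of keeping each file (random subsampling).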
"""
# Split every class folder of input_dir into train/validation under output_dir.
# Files are grouped by prefix (file name up to its last underscore) and each
# group is copied as a whole, so one prefix never lands in both sets.
# Reads input_dir, output_dir, train_ratio, class_ratios and black_list, which
# are defined in earlier cells.
random.seed(42)

for class_dir in ["mosquito", "not"]:
    class_path = os.path.join(input_dir, class_dir)
    # Probability of keeping each file of this class (random subsampling).
    class_ratio = class_ratios[class_dir]

    # Collect prefixes and files.
    # The listing is sorted because os.listdir returns entries in arbitrary
    # order; without a fixed order the seeded random draws below would pick
    # different files on different machines.
    prefix_to_files = defaultdict(list)
    for file_name in sorted(os.listdir(class_path)):
        if not file_name.endswith(".wav"):
            continue
        prefix = file_name.rsplit("_", 1)[0]

        # There is a prefix blacklist for positive examples: those wavs that
        # contain many errors. The file itself must be skipped here (a
        # `continue` inside a loop over the blacklist would only advance that
        # inner loop). Empty entries are ignored because "" is a substring of
        # every prefix.
        if class_dir == "mosquito" and any(b and b in prefix for b in black_list):
            print(prefix)
            continue

        if random.random() > class_ratio:
            continue

        prefix_to_files[prefix].append(os.path.join(class_path, file_name))

    # Put the prefixes in random order (despite the variable's name).
    sorted_prefixes = list(prefix_to_files)
    random.shuffle(sorted_prefixes)

    # Initialize train and validation folders for the class
    train_dir = os.path.join(output_dir, "train", class_dir)
    validation_dir = os.path.join(output_dir, "validation", class_dir)
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(validation_dir, exist_ok=True)

    # Distribute files within the class
    train_count = 0
    validation_count = 0
    total_files = sum(len(files) for files in prefix_to_files.values())
    train_limit = int(total_files * train_ratio)

    for prefix in sorted_prefixes:
        file_paths = prefix_to_files[prefix]
        # Whole prefix groups go to train until the limit is reached, so the
        # train set may exceed train_limit by part of the last group.
        if train_count < train_limit:
            target_dir = train_dir
            train_count += len(file_paths)
        else:
            target_dir = validation_dir
            validation_count += len(file_paths)

        for file_path in file_paths:
            shutil.copy(file_path, target_dir)

    print(f"Class: {class_dir}, Train: {train_count}, Validation: {validation_count}")

