# Data Preparation for Model Training

This notebook provides tools to prepare audio data for training a Convolutional Neural Network (CNN). You may use a subset of the dataset available on Kaggle (https://www.kaggle.com/datasets/defined/birdclef-2021-mel-spectograms) or, alternatively, the original **BirdCLEF** dataset. If you choose the latter, you must convert each audio file into a **Mel-Spectrogram** before proceeding.

A preprocessed dataset is already provided for training purposes. However, if you decide to use new or alternative data, it is important to consider the timing of bird vocalizations within the recordings: the trimming step in this notebook discards fragments from the start and end of each file and keeps the central ones. Ideally, a statistical analysis should be performed to determine whether bird calls tend to occur near the midpoint of the audio files. If this assumption does not hold, the current data processing pipeline may not behave as intended.



In [None]:
# Mount Google Drive so the dataset folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
import numpy as np

# Project root on Drive. The cells in this notebook build their paths by
# appending to this string (e.g. dir + "train/"), so keep the trailing slash.
# NOTE(review): the name `dir` shadows Python's built-in dir(); renaming it
# would require updating every cell that references it.
dir = '/content/drive/Shared drives/Deep Learning Group G/UPF_Deep_Learning_2025/Final Project/Birdclef2021/' # Your path here.
# Make the project root the current working directory.
os.chdir(dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Check that each bird folder is in the right place.
folder_path = f"{dir}train/"  # Change to the data path

def list_directory_contents(path):
    """Print every entry directly inside `path`, tagged as [DIR] or [FILE].

    Prints a notice and returns without listing anything when `path` does
    not exist. Entries appear in the order reported by os.listdir.
    """
    if os.path.exists(path):
        print(f"Contents of: {path}\n")
        for entry in os.listdir(path):
            is_directory = os.path.isdir(os.path.join(path, entry))
            line = f"[DIR]  {entry}" if is_directory else f"[FILE] {entry}"
            print(line)
    else:
        print("The path does not exist.")

# Print the entries found under the training data path to confirm the layout.
list_directory_contents(folder_path)

Contents of: /content/drive/Shared drives/Deep Learning Group G/UPF_Deep_Learning_2025/Final Project/Birdclef2021/train/

[DIR]  redcro
[DIR]  gbwwre1
[DIR]  norcar
[DIR]  comrav
[DIR]  sonspa
[DIR]  houspa


In [None]:
def get_fragments_per_file(folder_path):
    """Return (filename, fragment_count) for every 3-D .npy file in a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of (filename, n) tuples sorted by filename, where n is the
        length of the array's first axis. Files whose array is not
        3-dimensional are skipped silently; files that cannot be read are
        reported on stdout and skipped.
    """
    files_fragments = []
    # Sorted so the result does not depend on the OS-specific listdir order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.npy'):
            continue
        full_path = os.path.join(folder_path, filename)
        try:
            # Only the shape is needed here, so memory-map the file instead of
            # reading the whole spectrogram array into RAM.
            shape = np.load(full_path, mmap_mode='r').shape
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"Error processing {filename}: {e}")
            continue
        if len(shape) == 3:
            files_fragments.append((filename, shape[0]))
    return files_fragments

def calculate_folder_mean(files_fragments):
    """Return the mean fragment count rounded down to an int.

    `files_fragments` is a sequence of (filename, fragment_count) pairs.
    Returns None when the sequence is empty.
    """
    counts = []
    for _name, count in files_fragments:
        counts.append(count)

    if len(counts) == 0:
        return None

    average = np.mean(counts)
    return int(np.floor(average))

def move_and_trim_files_to_target(src_folder, dst_root, files_fragments, global_mean, target_length,
                                  lower_margin=2, upper_margin=4):
    """Copy in-range .npy files into dst_root, center-cropped to target_length.

    Only files whose fragment count n satisfies
    global_mean - lower_margin <= n <= global_mean + upper_margin are copied.
    Files longer than target_length are cropped along the first axis: the
    excess is removed evenly from both ends, and when the excess is odd the
    extra fragment is dropped from the end. Files are written directly into
    dst_root (flat layout), so an existing file with the same name is
    overwritten. Source files are left in place.

    Args:
        src_folder: Directory containing the source .npy files.
        dst_root: Output directory; created if missing.
        files_fragments: Iterable of (filename, fragment_count) pairs.
        global_mean: Reference fragment count used for the acceptance window.
        target_length: Number of fragments to keep per file. Files that
            already have this many fragments or fewer are copied unchanged.
        lower_margin: How far below global_mean a file may be and still be kept.
        upper_margin: How far above global_mean a file may be and still be kept.

    Returns:
        (moved, trimmed): number of files written, and how many of those were
        actually shortened.
    """
    min_acceptable = global_mean - lower_margin
    max_acceptable = global_mean + upper_margin

    os.makedirs(dst_root, exist_ok=True)
    moved, trimmed = 0, 0

    for filename, n in files_fragments:
        if not (min_acceptable <= n <= max_acceptable):
            continue
        src_path = os.path.join(src_folder, filename)
        dst_path = os.path.join(dst_root, filename)
        try:
            data = np.load(src_path)
            excess = data.shape[0] - target_length
            if excess > 0:
                # Single center-crop slice: drop floor(excess / 2) fragments
                # from the front and the remainder from the back.
                front = excess // 2
                data = data[front:front + target_length]
            np.save(dst_path, data)
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"Error processing {filename}: {e}")
            continue
        moved += 1
        if excess > 0:
            # Count only files that were really shortened.
            trimmed += 1
    return moved, trimmed

def compute_all_folder_means(root_path):
    """Collect per-folder fragment statistics for each subdirectory of root_path.

    Returns a list of (folder_path, files_fragments, folder_mean) tuples, one
    per subdirectory for which a mean could be computed. Non-directory entries
    and subdirectories without a mean are left out.
    """
    candidates = (os.path.join(root_path, entry) for entry in os.listdir(root_path))

    results = []
    for subdir in candidates:
        if not os.path.isdir(subdir):
            continue
        fragments = get_fragments_per_file(subdir)
        mean_count = calculate_folder_mean(fragments)
        if mean_count is None:
            continue
        results.append((subdir, fragments, mean_count))
    return results

def process_all_folders(root_path, destination_root):
    """Filter and trim every species folder under root_path into destination_root.

    Steps:
      1. Compute the floored mean fragment count of each subfolder.
      2. Floor the mean of those folder means to get the global mean; the
         target fragment count is global_mean - 2.
      3. Copy the files of each folder that fall inside the acceptance window
         into destination_root, trimmed to the target fragment count.

    Progress and totals are printed to stdout; nothing is returned.

    Args:
        root_path: Directory whose subfolders contain the source .npy files.
        destination_root: Flat output directory for the processed files.
    """
    # Step 1: compute folder-level means
    folder_data = compute_all_folder_means(root_path)
    if not folder_data:
        print("No valid folders with .npy files found.")
        return

    # Step 2: compute global mean and target length
    individual_means = [mean for _, _, mean in folder_data]
    global_mean = int(np.floor(np.mean(individual_means)))
    # Same offset as the default lower acceptance bound, so every accepted
    # file has at least target_length fragments.
    target_length = global_mean - 2
    print(f"Global mean = {global_mean} | Target fragment count = {target_length}")
    if target_length < 1:
        # A non-positive target would only produce empty output arrays.
        print("Target fragment count must be at least 1; nothing was processed.")
        return

    # normpath strips a trailing slash so basename yields the folder name.
    destination_name = os.path.basename(os.path.normpath(destination_root))

    # Step 3: process all folders
    total_moved, total_trimmed = 0, 0
    for folder_path, files_fragments, _ in folder_data:
        print(f"\nProcessing: {os.path.basename(folder_path)}")
        moved, trimmed = move_and_trim_files_to_target(
            folder_path, destination_root, files_fragments, global_mean, target_length
        )
        total_moved += moved
        total_trimmed += trimmed
        print(f"Moved to {destination_name}: {moved} files")

    print("\nAll Done!")
    print(f"Total files saved in '{destination_root}': {total_moved}")
    print(f"Total files trimmed to {target_length} fragments: {total_trimmed}")


# Source root (one subfolder per species) and flat destination root
source_root_path = f"{dir}train/"
destination_root_path = f"{dir}train_good/"

# Run the filter-and-trim pipeline
process_all_folders(root_path=source_root_path, destination_root=destination_root_path)

Global mean = 12 | Target fragment count = 10

Processing: redcro
Moved to train_good: 96 files

Processing: gbwwre1
Moved to train_good: 80 files

Processing: norcar
Moved to train_good: 123 files

Processing: comrav
Moved to train_good: 86 files

Processing: sonspa
Moved to train_good: 120 files

Processing: houspa
Moved to train_good: 104 files

All Done!
Total files saved in '/content/drive/Shared drives/Deep Learning Group G/UPF_Deep_Learning_2025/Final Project/Birdclef2021/train_good/': 609
Total files trimmed to 10 fragments: 609
Duplicate filenames: []


In [None]:
def check_npy_fragment_counts(folder_path, expected_fragments=10):
    """Report whether every .npy file in a folder holds expected_fragments fragments.

    A file is valid when its array is 3-dimensional and its first axis has
    length expected_fragments. Any other shape, or any error raised while
    loading, marks the file as invalid. A summary is printed to stdout,
    followed by one line per invalid file; nothing is returned.
    """
    if not os.path.exists(folder_path):
        print(f"Path does not exist: {folder_path}")
        return

    npy_names = [name for name in os.listdir(folder_path) if name.endswith('.npy')]
    problems = []

    for name in npy_names:
        try:
            shape = np.load(os.path.join(folder_path, name)).shape
        except Exception as err:
            problems.append((name, f"Error: {err}"))
            continue
        if len(shape) != 3 or shape[0] != expected_fragments:
            problems.append((name, shape))

    # Every file is either valid or recorded once in `problems`.
    valid_count = len(npy_names) - len(problems)

    # Summary
    print(f"\nChecked {len(npy_names)} .npy files in: {folder_path}")
    print(f"Valid files with {expected_fragments} spectrograms: {valid_count}")
    print(f"Invalid files: {len(problems)}")

    if not problems:
        return

    print("\nFiles with incorrect fragment counts or errors:")
    for name, issue in problems:
        print(f" - {name}: {issue}")

# ----------------------------
# Folder to validate (set your folder path here):
folder = f"{dir}train_good/"
check_npy_fragment_counts(folder_path=folder, expected_fragments=10)


Checked 609 .npy files in: /content/drive/Shared drives/Deep Learning Group G/UPF_Deep_Learning_2025/Final Project/Birdclef2021/train_good/
Valid files with 10 spectrograms: 609
Invalid files: 0


#Next step -> Prepare CSV