# CSV Preparation for Model Training
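This notebook prepares the BirdCLEF 2021 metadata CSV for training a Convolutional Neural Network (CNN): it keeps only the recordings of the selected bird species, drops recordings whose secondary labels include one of those species, and assigns each species an integer label ID.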

Use it to generate a structured dataset that matches your spectrogram files and is ready for model development.



In [1]:
# Mount Google Drive so the shared project folder is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import ast

# Root folder of the BirdCLEF 2021 project data on the shared drive.
# NOTE(review): the name `dir` shadows Python's built-in dir(); it is kept as-is
# because the cell below references it when building the CSV path.
dir = '/content/drive/Shared drives/Deep Learning Group G/UPF_Deep_Learning_2025/Final Project/Birdclef2021/' # Edit this if your Drive layout differs.
# Make the project folder the current working directory.
os.chdir(dir)

Mounted at /content/drive


In [2]:
def simplify_and_remap_csv(csv_path, target_labels=None, output_name='update_metadata.csv'):
    """Filter a metadata CSV down to the target species and add integer label IDs.

    Rows are kept when their ``primary_label`` is one of ``target_labels`` and
    none of the target labels appears in their ``secondary_labels``. The result
    (columns ``filename``, ``label_id``, ``primary_label``) is written next to
    the input file and the per-class row counts are printed.

    Args:
        csv_path: Path to the source CSV. It must contain the columns
            ``filename``, ``primary_label`` and ``secondary_labels``.
        target_labels: Species codes to keep, in the order that defines their
            ``label_id`` (first label -> 0). Defaults to the six species used
            in this project.
        output_name: File name of the simplified CSV, created in the same
            directory as ``csv_path``.

    Returns:
        None. The simplified table is saved to disk.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    if target_labels is None:
        # Names of the bird species folders we want to use.
        target_labels = ['sonspa', 'redcro', 'norcar', 'houspa', 'gbwwre1', 'comrav']
    elif isinstance(target_labels, str):
        # A single species passed as a bare string would otherwise be split into characters.
        target_labels = [target_labels]
    # Drop duplicates while preserving order so every species gets exactly one id.
    target_labels = list(dict.fromkeys(target_labels))
    label_mapping = {label: idx for idx, label in enumerate(target_labels)}

    # Read CSV
    df = pd.read_csv(csv_path)

    missing = [col for col in ('filename', 'primary_label', 'secondary_labels')
               if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")

    # Filter rows by primary_label
    df_filtered = df[df['primary_label'].isin(target_labels)].copy()

    def contains_target_label(sec_labels):
        """Return True when the cell lists at least one of the target labels."""
        if not isinstance(sec_labels, str):
            return False  # Missing cell (NaN) or non-text value: nothing to parse
        try:
            # The column stores a Python-literal list as text, e.g. "['amerob']".
            labels = ast.literal_eval(sec_labels)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return False  # Unparseable cell: treat as having no secondary labels
        if isinstance(labels, str):
            # A single quoted label must not be iterated character by character.
            labels = [labels]
        if not isinstance(labels, (list, tuple, set, frozenset)):
            return False
        return any(label in target_labels for label in labels)

    # Remove rows where target_labels appear in secondary_labels.
    # astype(bool) keeps the mask boolean even when no rows are left.
    has_target_secondary = df_filtered['secondary_labels'].map(contains_target_label).astype(bool)
    df_filtered = df_filtered[~has_target_secondary]

    # Build simplified DataFrame
    df_simplified = df_filtered[['filename', 'primary_label']].copy()
    df_simplified['label_id'] = df_simplified['primary_label'].map(label_mapping).astype(int)
    df_simplified = df_simplified[['filename', 'label_id', 'primary_label']]

    # Save result
    output_path = os.path.join(os.path.dirname(csv_path), output_name)
    df_simplified.to_csv(output_path, index=False)

    print(f"Simplified CSV saved at: {output_path}")
    print(df_simplified['label_id'].value_counts().sort_index())

# -------------------------
# Path to the source metadata CSV. os.path.join is used instead of string
# concatenation so the path stays valid even if `dir` has no trailing slash.
original_csv_path = os.path.join(dir, "train_metadata_extended.csv")

# Run it: writes the simplified CSV and prints the per-class row counts.
simplify_and_remap_csv(original_csv_path)

Simplified CSV saved at: /content/drive/Shared drives/Deep Learning Group G/UPF_Deep_Learning_2025/Final Project/Birdclef2021/update_metadata2.csv
label_id
0    464
1    491
2    460
3    488
4    497
5    492
Name: count, dtype: int64
