In this notebook 100 human labeled dataset will be produced.

Labelling procedure:

1.sample 20 random files from BirdNET labelled 5 most common species(species name/# of detections in the dataset).

Corvus monedula (Eurasian Jackdaw)/24,851
Parus major (Great Tit) /14,994
Cyanistes caeruleus (Eurasian Blue Tit) /7,958
Streptopelia decaocto (Eurasian Collared-Dove) /3,819
Columba palumbus (Common Wood-Pigeon) /3,182 

In [4]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from glob import glob


In [2]:
def get_audio_files(data_dir):
    file_names = []
    file_paths = []

    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(data_dir):
        # Use glob to find all .wav files in the current directory
        for fp in sorted(glob(os.path.join(root, '*.wav'))):
            bn = os.path.basename(fp)
            file_names.append(bn)
            file_paths.append(fp)
            

    # Create a DataFrame from the collected file names and paths
    df = pd.DataFrame({
        "filename": file_names,
        "filepath": file_paths,
    })
    return df


In [5]:
# Define the root directory containing all subfolders of interest
folder_path = '/Users/evgenynazarenko/DACS_3_year/Thesis/GardenFiles23'
temp_df = get_audio_files(folder_path)
print(temp_df.shape)


(390738, 2)


In [6]:
# read the file from metadata folder
metedata = pd.read_csv('../MIT_AST/metadata/stats_full_25.04_full.csv')
metedata.shape

(404553, 14)

In [7]:
# get 5 most common labels by "BirdNET" column
top_5_labels = metedata['BirdNET'].value_counts().index[:5]
top_5_labels

Index(['Corvus monedula_Eurasian Jackdaw', 'Parus major_Great Tit',
       'Cyanistes caeruleus_Eurasian Blue Tit',
       'Streptopelia decaocto_Eurasian Collared-Dove',
       'Columba palumbus_Common Wood-Pigeon'],
      dtype='object', name='BirdNET')

In [8]:
# sample 20 files for each label out of top 5
sampled_files = []
for label in top_5_labels:
    temp = metedata[metedata['BirdNET'] == label].sample(20)
    sampled_files.append(temp)

sampled_files = pd.concat(sampled_files)
sampled_files.shape


(100, 14)

In [9]:
sampled_files.head()

Unnamed: 0,datetime,precipRate,pressureMax,dewptAvg,windgustHigh,windspeedAvg,tempAve,humidityAvg,winddirAvg,uvHigh,solarRadiationHigh,BirdNET,MIT_AST_label,sound_class_label
136085,2023-11-17 11:04:07,0.0,1015.14,6.6,1.8,0.0,6.8,99.0,173.0,0.0,52.5,Corvus monedula_Eurasian Jackdaw,Fowl,bird
244693,2023-12-10 11:45:00,0.0,1002.44,7.5,21.9,6.5,10.5,81.3,199.0,1.0,135.7,Corvus monedula_Eurasian Jackdaw,Fowl,bird
378928,2023-09-08 14:08:52,0.0,1011.41,20.7,5.5,1.6,30.9,54.7,128.0,5.0,533.1,Corvus monedula_Eurasian Jackdaw,Turkey,bird
224349,2024-02-01 17:22:05,0.0,1028.04,4.9,5.5,1.0,7.9,81.6,194.0,0.0,4.8,Corvus monedula_Eurasian Jackdaw,Caw,bird
26015,2024-03-12 18:26:55,0.0,1008.94,8.4,7.2,1.6,8.8,97.0,130.0,0.0,2.6,Corvus monedula_Eurasian Jackdaw,"Bird vocalization, bird call, bird song",bird


In [10]:


def generate_file_name(date_time_str, file_prefix='er_file_'):
    """
    Generate a file path for an audio file based on the date and time.

    Parameters:
    - date_time_str (str): The date and time string in the format 'YYYY-MM-DD HH:MM:SS'.
    
    - file_prefix (str): Prefix for the filename.

    Returns:
    - str: The full path to the file.
    """
    # Convert the date_time_str to a datetime object
    date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
    
    # Format the datetime object into the filename format 'YYYY_MM_DD_H_MM_SS'
    file_name = date_time_obj.strftime('%Y_%m_%d_%-H_%M_%S')
    
    # Append file extension
    file_extension = '.wav'
    
    # Construct the full file path
    full_file_name = f"{file_prefix}{file_name}{file_extension}"
    
    return full_file_name

# Example usage




In [11]:
# reconstruct the file path from datetime column and add it to the dataframe
# Applying the function to create file paths using .loc and column names directly
sampled_files.loc[:, 'file_name'] = sampled_files['datetime'].apply(generate_file_name)
sampled_files.head()

Unnamed: 0,datetime,precipRate,pressureMax,dewptAvg,windgustHigh,windspeedAvg,tempAve,humidityAvg,winddirAvg,uvHigh,solarRadiationHigh,BirdNET,MIT_AST_label,sound_class_label,file_name
136085,2023-11-17 11:04:07,0.0,1015.14,6.6,1.8,0.0,6.8,99.0,173.0,0.0,52.5,Corvus monedula_Eurasian Jackdaw,Fowl,bird,er_file_2023_11_17_11_04_07.wav
244693,2023-12-10 11:45:00,0.0,1002.44,7.5,21.9,6.5,10.5,81.3,199.0,1.0,135.7,Corvus monedula_Eurasian Jackdaw,Fowl,bird,er_file_2023_12_10_11_45_00.wav
378928,2023-09-08 14:08:52,0.0,1011.41,20.7,5.5,1.6,30.9,54.7,128.0,5.0,533.1,Corvus monedula_Eurasian Jackdaw,Turkey,bird,er_file_2023_09_08_14_08_52.wav
224349,2024-02-01 17:22:05,0.0,1028.04,4.9,5.5,1.0,7.9,81.6,194.0,0.0,4.8,Corvus monedula_Eurasian Jackdaw,Caw,bird,er_file_2024_02_01_17_22_05.wav
26015,2024-03-12 18:26:55,0.0,1008.94,8.4,7.2,1.6,8.8,97.0,130.0,0.0,2.6,Corvus monedula_Eurasian Jackdaw,"Bird vocalization, bird call, bird song",bird,er_file_2024_03_12_18_26_55.wav


In [14]:
temp_df.head()

Unnamed: 0,filename,filepath
0,er_file_2024_01_10_10_00_00.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
1,er_file_2024_01_10_10_00_44.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
2,er_file_2024_01_10_10_00_51.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
3,er_file_2024_01_10_10_01_11.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
4,er_file_2024_01_10_10_01_58.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...


In [15]:
# add the file path to the sampled_files
sampled_files = sampled_files.merge(temp_df, left_on='file_name', right_on='filename', how='left')
sampled_files.head()


Unnamed: 0,datetime,precipRate,pressureMax,dewptAvg,windgustHigh,windspeedAvg,tempAve,humidityAvg,winddirAvg,uvHigh,solarRadiationHigh,BirdNET,MIT_AST_label,sound_class_label,file_name,filename,filepath
0,2023-11-17 11:04:07,0.0,1015.14,6.6,1.8,0.0,6.8,99.0,173.0,0.0,52.5,Corvus monedula_Eurasian Jackdaw,Fowl,bird,er_file_2023_11_17_11_04_07.wav,er_file_2023_11_17_11_04_07.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
1,2023-12-10 11:45:00,0.0,1002.44,7.5,21.9,6.5,10.5,81.3,199.0,1.0,135.7,Corvus monedula_Eurasian Jackdaw,Fowl,bird,er_file_2023_12_10_11_45_00.wav,er_file_2023_12_10_11_45_00.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
2,2023-09-08 14:08:52,0.0,1011.41,20.7,5.5,1.6,30.9,54.7,128.0,5.0,533.1,Corvus monedula_Eurasian Jackdaw,Turkey,bird,er_file_2023_09_08_14_08_52.wav,er_file_2023_09_08_14_08_52.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
3,2024-02-01 17:22:05,0.0,1028.04,4.9,5.5,1.0,7.9,81.6,194.0,0.0,4.8,Corvus monedula_Eurasian Jackdaw,Caw,bird,er_file_2024_02_01_17_22_05.wav,er_file_2024_02_01_17_22_05.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...
4,2024-03-12 18:26:55,0.0,1008.94,8.4,7.2,1.6,8.8,97.0,130.0,0.0,2.6,Corvus monedula_Eurasian Jackdaw,"Bird vocalization, bird call, bird song",bird,er_file_2024_03_12_18_26_55.wav,er_file_2024_03_12_18_26_55.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...


In [16]:
sampled_files.iloc[0]['filepath']

'/Users/evgenynazarenko/DACS_3_year/Thesis/GardenFiles23/garden_12112023/135/er_file_2023_11_17_11_04_07.wav'

In [None]:
# listen to all the files in the sampled_files
import IPython.display as ipd
for i in range(sampled_files.shape[0]):
    ipd.display(ipd.Audio(sampled_files.iloc[i]['filepath']))

    



Manually labeled 100 files are saved to 100_labels.csv

Add them to the dataframe.

In [23]:
# add column with labels to the sampled_files
labels = pd.read_csv('100_labels.csv')
labels.head()

Unnamed: 0,;;;;;;
0,Jackdaw;;;;;;
1,Jackdaw;;;;;;
2,Jackdaw;;;;;;
3,Jackdaw;;;;;;
4,Jackdaw/tit;;;;;;


In [25]:
# rename the column
labels = labels.rename(columns={';;;;;;': 'human_label'})
# get rid of ';;;;;;' in the column
labels['human_label'] = labels['human_label'].str.replace(';;;;;;', '')
labels.head()

Unnamed: 0,human_label
0,Jackdaw
1,Jackdaw
2,Jackdaw
3,Jackdaw
4,Jackdaw/tit


In [26]:
# concat the labels to the sampled_files
sampled_files = pd.concat([sampled_files, labels], axis=1)
sampled_files.head()


Unnamed: 0,datetime,precipRate,pressureMax,dewptAvg,windgustHigh,windspeedAvg,tempAve,humidityAvg,winddirAvg,uvHigh,solarRadiationHigh,BirdNET,MIT_AST_label,sound_class_label,file_name,filename,filepath,human_label
0,2023-11-17 11:04:07,0.0,1015.14,6.6,1.8,0.0,6.8,99.0,173.0,0.0,52.5,Corvus monedula_Eurasian Jackdaw,Fowl,bird,er_file_2023_11_17_11_04_07.wav,er_file_2023_11_17_11_04_07.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Jackdaw
1,2023-12-10 11:45:00,0.0,1002.44,7.5,21.9,6.5,10.5,81.3,199.0,1.0,135.7,Corvus monedula_Eurasian Jackdaw,Fowl,bird,er_file_2023_12_10_11_45_00.wav,er_file_2023_12_10_11_45_00.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Jackdaw
2,2023-09-08 14:08:52,0.0,1011.41,20.7,5.5,1.6,30.9,54.7,128.0,5.0,533.1,Corvus monedula_Eurasian Jackdaw,Turkey,bird,er_file_2023_09_08_14_08_52.wav,er_file_2023_09_08_14_08_52.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Jackdaw
3,2024-02-01 17:22:05,0.0,1028.04,4.9,5.5,1.0,7.9,81.6,194.0,0.0,4.8,Corvus monedula_Eurasian Jackdaw,Caw,bird,er_file_2024_02_01_17_22_05.wav,er_file_2024_02_01_17_22_05.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Jackdaw
4,2024-03-12 18:26:55,0.0,1008.94,8.4,7.2,1.6,8.8,97.0,130.0,0.0,2.6,Corvus monedula_Eurasian Jackdaw,"Bird vocalization, bird call, bird song",bird,er_file_2024_03_12_18_26_55.wav,er_file_2024_03_12_18_26_55.wav,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Jackdaw/tit


In [27]:
# save the sampled_files
sampled_files.to_csv('100_human_labeled_files.csv', index=False)