In [1]:
import os
import re
from functools import lru_cache
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import torchaudio
from datasets import Dataset
from tqdm import tqdm

### Schulze

In [2]:
# Root directory of the Schulze audio files.
data_dir = Path("/data/vogelstimmen_cds/schulze/")

# The metadata CSV is read here and later written back to the same path.
metadata_loc = "/data/meta/schulze_en.csv"
metadata = pd.read_csv(metadata_loc)
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# usually means the CSV was once written with its index - confirm whether
# that column should be dropped.
metadata

Unnamed: 0,file,time_ranges,comment,name_de
0,/data/vogelstimmen_cds/schulze/CD01/0101 Stern...,0:00 - 0:16 - 0:40 - 0:50 - 0:58,Calls. Recording 1: courtship display of a pa...,Sterntaucher
1,/data/vogelstimmen_cds/schulze/CD01/0102 Prach...,0:00 - 0:09 - 0:20 - 0:38 - 0:47 - 0:57,"Calls. In recording 2 also noise of water, in ...",Prachttaucher
2,/data/vogelstimmen_cds/schulze/CD01/0103 Eista...,0:00 - 0:39 - 1:02,Calls of courting pairs,Eistaucher
3,/data/vogelstimmen_cds/schulze/CD01/0104 Gelbs...,0:00 - 0:29 - 0:52,Calls of a courting pair,Gelbschnabeltaucher
4,/data/vogelstimmen_cds/schulze/CD01/0105 Binde...,0:00 - 0:30,Calls,Bindentaucher
...,...,...,...,...
1306,/data/vogelstimmen_cds/schulze/CD17/1781 Rosen...,0:00 - 0:28,Song,Rosenbrust-Kernknacker
1307,/data/vogelstimmen_cds/schulze/CD17/1782 Rosen...,0:00 - 0:12,Calls,Rosenbrust-Kernknacker
1308,/data/vogelstimmen_cds/schulze/CD17/1783 Azurb...,0:00 - 0:36,Song,Azurbischof
1309,/data/vogelstimmen_cds/schulze/CD17/1784 Indig...,0:00 - 0:25,Song,Indigofink


In [6]:
# Try to load every audio file listed in the metadata (presumably a check
# that all referenced files are readable). The loaded result is not kept:
# `out` is overwritten on every iteration.
for file in tqdm(metadata.file):
    out = torchaudio.load(file)

100%|██████████| 1311/1311 [01:54<00:00, 11.49it/s]


In [110]:
# Number of recordings per German species name.
print(metadata.name_de.value_counts())

Grauspecht                4
Mittelspecht              4
Schwarzspecht             4
Kleinspecht               4
Eisvogel                  3
                         ..
Einsamer Wasserlaeufer    1
Drosseluferlaeufer        1
Schlammtreter             1
Wilsonwassertreter        1
Papstfink                 1
Name: name_de, Length: 819, dtype: int64


In [6]:
def get_file(file_ref):
    """Resolve a ``"<cd>/<track>"`` reference to the single matching audio file.

    Both parts are left-padded with zeros to two digits. The file is looked
    up below ``data_dir`` in the folder ``CD<cd>`` as any name starting with
    ``<cd><track>``.

    Raises an AssertionError unless exactly one file matches.
    """
    cd_part, track_part = file_ref.split("/")
    # Both parts are strings; zfill adds a leading 0 where needed.
    cd_id = cd_part.zfill(2)
    track_id = track_part.zfill(2)
    search = os.path.join(data_dir, "CD" + cd_id, cd_id + track_id + "*")
    found = glob(search)
    assert len(found) == 1, f"the pattern '{search}' should only match one file but matches are: {found}"
    (only_match,) = found
    return only_match

# Replace each "<cd>/<track>" reference with the resolved file path.
metadata["file"] = metadata.file.apply(get_file)

In [17]:
def get_name(file):
    """Extract the name part from a file path.

    The base name is split on whitespace; the first and the last token are
    dropped and the remaining tokens are joined with single spaces.
    """
    tokens = os.path.basename(file).split()
    middle_tokens = tokens[1:-1]
    return " ".join(middle_tokens)

# Derive the German name column from the file names.
metadata["name_de"] = metadata.file.apply(get_name)

In [92]:
# Write the updated metadata back to the CSV it was read from (overwrites
# that file); the DataFrame index is not written.
metadata.to_csv(metadata_loc, index=False)

### Bergmann

In [2]:
# Root directory of the Bergmann audio files.
data_dir = Path("/data/vogelstimmen_cds/bergmann")

# The metadata CSV is read here and later written back to the same path.
# NOTE(review): the file name is spelled "bergman" while the audio directory
# is "bergmann" - confirm this is the intended file.
metadata_loc = "/data/meta/bergman_en.csv"
metadata = pd.read_csv(metadata_loc)
metadata

# Note: the sampling rate is 44100 for all files in the Bergmann collection.

Unnamed: 0,file,duration,comment,name_de,name_sci,label
0,/data/vogelstimmen_cds/bergmann/0010-0170_Anat...,22.0,Wing sounds during flight including snorting ...,Hoeckerschwan,Cygnus olor,A Cygnus olor. Wing sounds during flight inclu...
1,/data/vogelstimmen_cds/bergmann/0010-0170_Anat...,39.0,Two individual birds flying past: singing and...,Hoeckerschwan,Cygnus olor,A Cygnus olor. Two individual birds flying pas...
2,/data/vogelstimmen_cds/bergmann/0010-0170_Anat...,19.0,A single bird taking-off from the water surfa...,Hoeckerschwan,Cygnus olor,A Cygnus olor. A single bird taking-off from t...
3,/data/vogelstimmen_cds/bergmann/0010-0170_Anat...,16.0,"Two birds, short calls and landing sounds on ...",Hoeckerschwan,Cygnus olor,"A Cygnus olor. Two birds, short calls and land..."
4,/data/vogelstimmen_cds/bergmann/0010-0170_Anat...,54.0,Flight sounds and starting noise from water p...,Hoeckerschwan,Cygnus olor,A Cygnus olor. Flight sounds and starting nois...
...,...,...,...,...,...,...
845,/data/vogelstimmen_cds/bergmann/3190-3500_Estr...,24.0,Calls and flight song.,Ortolan,Emberiza hortulana,A Emberiza hortulana. Calls and flight song.
846,/data/vogelstimmen_cds/bergmann/3190-3500_Estr...,10.0,Flight calls („pt“) of staging birds during t...,Ortolan,Emberiza hortulana,A Emberiza hortulana. Flight calls („pt“) of s...
847,/data/vogelstimmen_cds/bergmann/3190-3500_Estr...,9.0,The sharp „tchitt“ and quiet „plet“ calls are...,Grauortolan,Emberiza caesia,A Emberiza caesia. The sharp „tchitt“ and quie...
848,/data/vogelstimmen_cds/bergmann/3190-3500_Estr...,8.0,Calls of a bird in flight,Rohrammer,Emberiza schoeniclus,A Emberiza schoeniclus. Calls of a bird in flight


In [8]:
# Load the pretrained "t5-base" tokenizer; it is used below to measure how
# many tokens each label encodes to.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [11]:
# Summary statistics of the number of tokens per encoded label.
metadata["label"].map(lambda x: len(tokenizer.encode(x))).describe()

count    850.000000
mean      27.049412
std       10.218457
min       11.000000
25%       20.000000
50%       25.000000
75%       32.000000
max       92.000000
Name: label, dtype: float64

In [5]:
def to_sec(desc: str):
    """Convert a ``"<minutes>:<seconds>"`` duration string to seconds.

    Parameters
    ----------
    desc : str
        Duration such as ``"1:02"``. Missing values (``np.nan``, any other
        float NaN, ``None``, ``pd.NA``) and values that are already numeric
        are accepted as well.

    Returns
    -------
    int or float
        ``minutes * 60 + seconds`` for a string, ``np.nan`` for a missing
        value, and a numeric input unchanged.

    Raises
    ------
    ValueError
        If the string does not consist of exactly two integer fields
        separated by ``":"``.
    """
    # pd.isna recognises every missing-value flavour. The previous identity
    # test (`desc is np.nan`) only caught the np.nan singleton, so any other
    # NaN object or None fell through and crashed on `.split`.
    if pd.isna(desc):
        return np.nan
    # Already a number of seconds (e.g. the column was converted and saved
    # earlier): pass it through so re-running the conversion is harmless.
    if isinstance(desc, (int, float, np.number)):
        return desc
    minutes, seconds = desc.split(":")
    return int(minutes) * 60 + int(seconds)
# Convert the duration column to seconds.
metadata["duration"] = metadata.duration.apply(to_sec)

In [11]:
# Number of recordings per German species name.
print(metadata.name_de.value_counts())

Kolkrabe              9
Bruchwasserlaeufer    8
Rotschenkel           8
Girlitz               7
Rabenkraehe           7
                     ..
Tannenmeise           1
Gerfalke              1
Beutelmeise           1
Nebelkraehe           1
Kappenammer           1
Name: name_de, Length: 351, dtype: int64


In [6]:
def get_file(file_ref):
    """Find the single file whose name starts with ``file_ref``.

    The search covers every direct sub-folder of ``data_dir``.

    Raises an AssertionError unless exactly one file matches.
    """
    search = os.path.join(data_dir, "*", file_ref + "*")
    found = glob(search)
    assert len(found) == 1, f"the pattern '{search}' should only match one file but matches are: {found}"
    (only_match,) = found
    return only_match

# Replace each file reference with the resolved file path.
metadata["file"] = metadata.file.apply(get_file)

In [9]:
def get_name_components(file):
    """Return the whitespace-separated name tokens contained in a file name.

    The base name is split on ``"_"``; the first and last fields are
    dropped, and the remaining fields are broken into whitespace-separated
    tokens, returned as one flat list.
    """
    fields = os.path.basename(file).split("_")
    tokens = []
    for field in fields[1:-1]:
        tokens.extend(field.split())
    return tokens
    
# Tokenise every file name into its name components.
name_components = metadata.file.apply(get_name_components)
# Rows that do not yield exactly 3 tokens cannot be split by the simple rule
# in get_names. `split` stores, for each such row (in index order), a
# hand-written split position: the first `split` tokens form the German name
# and the rest the scientific name. The list below has 42 entries, and
# pandas raises if that does not match the number of selected rows.
split = pd.Series(index=name_components[name_components.apply(lambda x: len(x) != 3)].index, data=([1]*4 + [2]*8 + [1] + [2]*13 + [1]*15 + [2]))

In [10]:
def get_names(name_components, idx):
    """Split name tokens into a German name and a scientific name.

    With exactly three tokens the first one is the German name and the other
    two form the scientific name. Otherwise the split position is looked up
    in the module-level ``split`` series under ``idx``.

    Returns a ``(name_de, name_sci)`` tuple of space-joined strings.
    """
    boundary = 1 if len(name_components) == 3 else split.loc[idx]
    german = " ".join(name_components[:boundary])
    scientific = " ".join(name_components[boundary:])
    return german, scientific
    
    
# Fill name_de / name_sci row by row from the tokenised file names.
for idx, comp in name_components.items():
    metadata.loc[idx, ["name_de", "name_sci"]] = get_names(comp, idx)

In [3]:
# Build a lookup from English species names (as they appear in the comments)
# to scientific names, using the IOC master list XML.
# Requires the third-party `xmltodict` package.
import xmltodict
from pprint import pprint


def _as_list(node):
    """Return `node` if it is already a list, else wrap it in a one-element list.

    xmltodict yields a dict for a single child element and a list of dicts
    for repeated child elements; this normalises both cases.
    """
    return node if isinstance(node, list) else [node]


# Open the file and read the contents.
with open('../meta/master_ioc-names_xml.xml', 'r', encoding='utf-8') as file:
    my_xml = file.read()

# Parse the XML document into nested dicts/lists.
my_dict = xmltodict.parse(my_xml)

# Uncomment to inspect the parsed structure:
# pprint(my_dict, indent=2)

# English name -> "<genus> <species>" scientific name.
mapping = dict()

# Every scientific name, in document order.
species = []

# Every level (order, family, genus, species) is normalised with _as_list,
# so a level with a single child is handled the same as one with many.
for order in _as_list(my_dict['ioclist']['list']['order']):
    for fam in _as_list(order['family']):
        for g in _as_list(fam['genus']):
            g_name = g['latin_name']
            # Normalised in place, as before, so `my_dict` always holds a
            # list of species here.
            g['species'] = _as_list(g['species'])

            for item in g['species']:
                sci_name = f'{g_name} {item["latin_name"]}'
                mapping[item['english_name']] = sci_name
                species.append(sci_name)

In [32]:
@lru_cache(maxsize=8)
def _compile_name_pattern(names):
    """Compile one regex that matches any of `names` (a tuple of str), longest first."""
    # Regex alternation takes the first alternative that matches, so the
    # longest names must come first to win over names contained in them.
    ordered = sorted(names, key=len, reverse=True)
    return re.compile("|".join(re.escape(name) for name in ordered))


def comment_mapping(comment, name_map=None):
    """Replace English species names in `comment` with their scientific names.

    All names are substituted in a single pass, preferring the longest name
    at each position. This avoids two problems of replacing name by name:
    a short name rewriting part of a longer one (e.g. "Emu" inside
    "Southern Emu-wren"), and a later name matching text that an earlier
    replacement just inserted (e.g. "Anhinga" inside "Anhinga melanogaster").

    Matching is case-sensitive plain substring matching, as before.

    Parameters
    ----------
    comment : str
        Text to rewrite.
    name_map : dict, optional
        English name -> scientific name. Defaults to the module-level
        ``mapping``.

    Returns
    -------
    str
        The comment with every known English name replaced.
    """
    if name_map is None:
        name_map = mapping
    # Empty names are skipped: they would match at every position.
    names = tuple(name for name in name_map if name)
    if not names:
        return comment
    pattern = _compile_name_pattern(names)
    return pattern.sub(lambda match: name_map[match.group(0)], comment)

# Rewrite the English species names in every comment to scientific names.
metadata["comment"] = metadata.comment.apply(comment_mapping)

In [8]:
def get_label(record):
    """Build the text label for one metadata record.

    Reads ``record["name_sci"]`` and ``record["comment"]`` and returns
    ``"A <name_sci>. <comment>"``, with surrounding whitespace removed from
    the comment.
    """
    scientific = record["name_sci"]
    description = record["comment"].strip()
    return "A {}. {}".format(scientific, description)

# Build the label column row by row from name_sci and comment.
metadata["label"] = metadata.apply(get_label, axis=1)

In [9]:
# Write the updated metadata back to the CSV it was read from (overwrites
# that file); the DataFrame index is not written.
metadata.to_csv(metadata_loc, index=False)