### Import dependencies


In [None]:
import os
import shutil
import opendatasets as od
import json

import requests
from time import sleep

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import librosa as li
import soundfile as sf
from scipy import signal

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

### Function definitions


In [None]:
# Design a digital Butterworth high-pass filter
def butter_highpass(cutoff, fs, order=5):
    """Return the ``(b, a)`` coefficients of a digital Butterworth high-pass filter.

    The cutoff is normalised by the Nyquist frequency (``fs / 2``) before it is
    passed to ``scipy.signal.butter``, so ``cutoff`` and ``fs`` must use the
    same unit.
    """
    nyquist = 0.5 * fs
    coefficients = signal.butter(order, cutoff / nyquist, btype='high', analog=False)
    numerator, denominator = coefficients
    return numerator, denominator


# High-pass filter a signal with a Butterworth design
def apply_butter_highpass(data, cutoff, fs, order=5):
    """Filter ``data`` with the coefficients produced by ``butter_highpass``.

    The coefficients are applied with ``scipy.signal.filtfilt`` and the
    filtered signal is returned.
    """
    coefficients = butter_highpass(cutoff, fs, order=order)
    return signal.filtfilt(*coefficients, data)

In [None]:
# Remove sections of silence or low intensity signal
def remove_silence(signal, thresh=18, hop=2048, plot=False):
    """Keep only the intervals that ``librosa.effects.split`` reports.

    Args:
        signal: Audio samples to strip.
        thresh: Passed to ``librosa.effects.split`` as ``top_db``.
        hop: Passed as ``hop_length``; the frame length is ``2 * hop``.
        plot: When true, draw the waveform with a red vertical line at every
            interval boundary.

    Returns:
        A numpy array with the retained intervals joined back to back, or an
        empty array when no interval was found.
    """
    splits = li.effects.split(y=signal, top_db=thresh, frame_length=(hop * 2), hop_length=hop)
    if plot:
        # Use the absolute peak so the markers span the waveform even when
        # its largest excursion is negative
        peak = np.max(np.abs(signal))
        plt.subplots(figsize=(12, 4))
        plt.plot(signal)
        plt.vlines(splits, ymin=-peak, ymax=peak, color='red')
        plt.show()

    # np.concatenate rejects an empty sequence, so handle "nothing kept" first
    if len(splits) == 0:
        return np.asarray([])

    # Join the slices in one call instead of extending a Python list
    # sample by sample
    return np.concatenate([signal[start:end] for start, end in splits])

In [None]:
# Split audio into segments of desired length
def split_audio(signal, target_length, samplerate, plot=False):
    """Cut a 1-D signal into consecutive segments of equal length.

    Args:
        signal: 1-D sequence of audio samples.
        target_length: Duration of each segment in seconds (int or float).
        samplerate: Number of samples per second of ``signal``.
        plot: When true, overlay every segment in one matplotlib figure.

    Returns:
        A list of segments, each ``round(target_length * samplerate)`` samples
        long. The last segment is zero-padded on the right when the signal
        does not divide evenly. An empty signal yields an empty list.

    Raises:
        ValueError: If the segment length works out to zero samples or less.
    """
    # Work in whole samples so a fractional target_length still gives valid
    # slice indices
    segment_len = int(round(target_length * samplerate))
    if segment_len <= 0:
        raise ValueError('target_length * samplerate must be at least one sample')

    # Integer ceiling division: avoids float rounding adding an extra,
    # all-padding segment
    n_segments = -(-len(signal) // segment_len)
    audio_segments = []

    for n in range(n_segments):
        s = signal[n * segment_len : (n + 1) * segment_len]

        # Only the final segment can be short; pad it with trailing zeros
        if len(s) < segment_len:
            s = np.pad(s, (0, segment_len - len(s)), 'constant')

        audio_segments.append(s)

        if plot:
            plt.plot(s, alpha=1 / n_segments)

    if plot:
        plt.show()

    return audio_segments

In [None]:
# Run the whole preprocessing chain on one audio file and return its segments
def generate_preprocessed_samples(path, sr, length, hp=700):
    """Load an audio file, clean it up and cut it into segments.

    The file at ``path`` is loaded as mono audio with sample rate ``sr``,
    high-pass filtered with cutoff ``hp``, passed through ``remove_silence``
    and finally split by ``split_audio`` into segments of ``length`` seconds.
    """
    # Load audio file
    waveform, sr = li.load(path, sr=sr, mono=True)

    # High-pass filter first, then delete the silent sections
    filtered = apply_butter_highpass(data=waveform, cutoff=hp, fs=sr, order=5)
    trimmed = remove_silence(filtered, thresh=18, hop=2048, plot=False)

    # Split into segments of desired length
    return split_audio(trimmed, target_length=length, samplerate=sr)

# Audio dataset download and preparation


In [None]:
# Total recording time (in seconds) a species must exceed to be kept in the dataset
MIN_RECORDED_TIME = 100
# Upper limit (in seconds) on the recording time downloaded for each species
MAX_RECORDED_TIME = 120

## Database query and metadata generation


In [None]:
# Folder that receives the metadata file and the audio subfolder
# (relative path written with Windows separators)
dataset_location = '..\\datasets\\xeno-canto_argentina\\'

# Query filters; each value is interpolated into the API query string
country = 'argentina'
since = '1980-01-01'
group = 'birds'
length = '10-60'  # NOTE(review): presumably a recording-length range in seconds — confirm against the API docs

In [None]:
api_url = 'https://xeno-canto.org/api/2/recordings?query='
params = f"cnt:{country}+grp:{group}+len:{length}+since:{since}"

# A timeout keeps the cell from hanging forever on an unresponsive server
response = requests.get(api_url + params, timeout=30)

print(f"• Query result: status-code {response.status_code}")

# Stop here on failure: the cells below need `pages`, which would otherwise be
# left undefined (or stale from an earlier run)
if response.status_code != 200:
    raise RuntimeError(f"Xeno-canto query failed with status-code {response.status_code}")

data = response.json()

n_rec = data["numRecordings"]
pages = data["numPages"]
print(f"• Found {n_rec} recordings in {pages} pages.")

In [None]:
# Create dataset folder, together with any missing parent folder
try:
    os.makedirs(dataset_location)
except FileExistsError:
    # Only an existing folder is expected here; any other OS error
    # (e.g. missing permissions) is left to propagate
    print(dataset_location + ' already exists\n')
else:
    print('Created new folder: ' + dataset_location)

# Store each page of the request into a Pandas dataframe
df_list = []
for page in range(1, pages + 1):
    print(f'Working on page {page}...', end="   ")
    response = requests.get(api_url + params + f"&page={page}", timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON
    response.raise_for_status()
    data = response.json()
    df = pd.json_normalize(data['recordings'])
    df_list.append(df)
    print('Done!')

# Concatenate all dataframes into one and save to disk
recordings_dataframe = pd.concat(df_list, ignore_index=True)
recordings_dataframe.to_csv(dataset_location + 'metadata.csv', index=False)

### Filter metadata entries


In [None]:
# Subset of the metadata columns to load from the CSV file
metadata_columns = [
    'id',
    'group',
    'gen',
    'sp',
    'ssp',
    'en',
    'loc',
    'type',
    'file',
    'q',
    'length',
    'method',
    'file-name',
]

# Read the saved metadata, keeping only the columns listed above
df = pd.read_csv(dataset_location + 'metadata.csv', usecols=metadata_columns)

# Filter metadata
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the unfiltered dataframe
df = df.loc[
    (df['group'] == 'birds')
    & (df['gen'] != 'Mystery')
    & (df['sp'] != 'mystery')
    & (df['q'].isin(['A']))
    & (df['method'] == 'field recording')
    & (df['type'].isin(['call', 'song']))
].copy()

# Convert length column from minutes to seconds for aggregation
# NOTE(review): `minutes_to_seconds` is not defined in the visible part of this
# notebook — confirm it is defined before this cell runs
df['length'] = df['length'].map(minutes_to_seconds)

# Get total audio length for each species and filter by total length.
# The column is selected explicitly: sum('length') handed the string to sum()'s
# first positional parameter instead of choosing a column to aggregate
df_group = df.groupby(['gen', 'sp'], as_index=False)['length'].sum()
df_group = df_group.loc[df_group['length'] > MIN_RECORDED_TIME]

# Get filtered dataframe
species_filter = (df_group['gen'] + ' ' + df_group['sp']).values.tolist()
df_filter = df.loc[(df['gen'] + ' ' + df['sp']).isin(species_filter)]

print(
    f'{len(species_filter)} distinct species with at least {MIN_RECORDED_TIME} seconds '
    + f'of recording were selected, with a total of {len(df_filter)} audio files.'
)

In [None]:
# Show the 25 species with the largest total 'length'
df_group.sort_values(by='length', ascending=False).head(25)

# Download audio data from Xeno-Canto database


In [None]:
# Create subfolders for each species
audio_location = dataset_location + 'audio\\'

# exist_ok lets the cell be re-run without failing on folders created earlier
os.makedirs(audio_location, exist_ok=True)
for sp in species_filter:
    os.makedirs(os.path.join(audio_location, sp), exist_ok=True)

In [None]:
# Store downloaded time in a dictionary
downloaded_time_per_species = dict.fromkeys(species_filter, 0)

for _, row in df_filter.iterrows():
    # Get data from df row. The names avoid shadowing the builtin `id` and
    # overwriting the `length` query filter defined earlier in the notebook
    species = row['gen'] + ' ' + row['sp']
    rec_id = row['id']
    rec_length = row['length']
    url = row['file']
    ext = row['file-name'].split('.')[-1]

    # Download files for each species until max_recorded_time is reached
    if (downloaded_time_per_species[species] + rec_length) > MAX_RECORDED_TIME:
        continue

    try:
        # Fetch before opening the output file so a failed request does not
        # leave an empty or bogus audio file behind
        reply = requests.get(url, timeout=60)
        reply.raise_for_status()
    except requests.RequestException as err:
        print(f'Skipping recording {rec_id}: {err}')
    else:
        filename = f'{species}_{rec_id}.{ext}'

        with open(os.path.join(audio_location, species, filename), 'wb') as out_file:
            out_file.write(reply.content)

        # Only recordings that were actually saved count towards the limit
        downloaded_time_per_species[species] += rec_length

    # Wait required time between recordings
    sleep(1.2)

# Audio preprocessing and feature extraction


In [None]:
# Target sample rate for resampling audio files
# (passed as `sr` to generate_preprocessed_samples)
SAMPLE_RATE = 16000

# Target length for audio segments (in seconds)
SAMPLE_LENGTH = 3

# Folder that receives the preprocessed JSON file
dataset_path = '..\\datasets\\xeno-canto_argentina\\'
# Root folder walked by preprocess_audio_dataset; each subfolder name is used as a label
audio_path = '..\\datasets\\xeno-canto_argentina\\audio\\'

# TODO: fix function so it does not load all MFCCs into memory

def preprocess_audio_dataset(data_path, json_path=None, mfcc_count=13, hop=512, fft_len=2048):
    """Compute MFCCs for every preprocessed segment of every file under ``data_path``.

    ``data_path`` is walked recursively. Every folder other than the root is
    treated as a label folder named after its last path component, and each
    file in it is turned into fixed-length segments by
    ``generate_preprocessed_samples`` (using ``SAMPLE_RATE`` and
    ``SAMPLE_LENGTH``).

    Args:
        data_path: Root folder containing one subfolder per label.
        json_path: Optional path of a JSON file to write the result to.
        mfcc_count: Passed to ``librosa.feature.mfcc`` as ``n_mfcc``.
        hop: Passed to ``librosa.feature.mfcc`` as ``hop_length``.
        fft_len: Passed to ``librosa.feature.mfcc`` as ``n_fft``.

    Returns:
        A dict with four keys. ``'label_map'`` lists the label names. The
        other three are parallel lists with one entry per segment:
        ``'encoded_labels'`` (index into ``'label_map'``), ``'mfccs'`` (the
        transposed MFCC matrix as nested lists) and ``'files'`` (path of the
        file the segment came from).
    """
    data_dict = {
        'label_map': [],
        'encoded_labels': [],
        'mfccs': [],
        'files': [],
    }

    root = os.path.normpath(data_path)

    for path, dirs, files in os.walk(data_path):
        # Sorting in place fixes the traversal order, so label indices are
        # reproducible between runs
        dirs.sort()

        if os.path.normpath(path) == root:  # Ignore parent folder
            continue

        # Add unique labels to label_map list
        label = os.path.basename(os.path.normpath(path))
        if label not in data_dict['label_map']:
            data_dict['label_map'].append(label)
        label_index = data_dict['label_map'].index(label)

        for f in sorted(files):
            file_path = os.path.join(path, f)
            segments = generate_preprocessed_samples(file_path, sr=SAMPLE_RATE, length=SAMPLE_LENGTH)
            for segment in segments:
                # Add encoded label to encoded_labels list
                data_dict['encoded_labels'].append(label_index)

                # Add original file path to files list
                data_dict['files'].append(file_path)

                # Compute MFCCs of this segment. The segment was already
                # resampled to SAMPLE_RATE; the file must not be re-loaded
                # here, or every segment would get the whole file's MFCCs
                mfccs = li.feature.mfcc(
                    y=segment, sr=SAMPLE_RATE, n_mfcc=mfcc_count, hop_length=hop, n_fft=fft_len
                )

                # Append MFCCs to list. Casting np.array to list for saving as JSON file.
                data_dict['mfccs'].append(mfccs.transpose().tolist())

    # Store data dictionary in JSON file
    if json_path:
        with open(json_path, 'w') as jf:
            json.dump(data_dict, jf, indent=4)
        print(f'Successfully saved preprocessed data to {json_path}!')
        file_count = len(data_dict['files'])
        print(f'{file_count} audio samples were processed!')

    return data_dict


In [None]:
json_path = dataset_path + 'preprocessed_data.json'
# Bind the result to a name: a bare call as the last expression of a cell
# makes the notebook echo the whole returned dictionary
preprocessed_data = preprocess_audio_dataset(audio_path, json_path=json_path, mfcc_count=13)