### Import dependencies


In [4]:
import os
import json

import requests
from time import sleep

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import librosa as li
import soundfile as sf
from scipy import signal

### Function definitions


In [3]:
# Design a digital Butterworth high-pass filter
def butter_highpass(cutoff, fs, order=5):
    """Return the ``(b, a)`` coefficients of a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, expressed in the same units as ``fs``.
        fs: Sampling rate of the signal that will be filtered.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        Tuple ``(b, a)`` with the numerator and denominator coefficients.
    """
    # signal.butter is given the cutoff as a fraction of the Nyquist frequency (fs / 2).
    nyquist = 0.5 * fs
    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)


# High-pass filter a signal with a Butterworth design
def apply_butter_highpass(data, cutoff, fs, order=5):
    """Filter ``data`` with the high-pass coefficients from ``butter_highpass``.

    Args:
        data: Samples to filter.
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling rate of ``data``.
        order: Order of the Butterworth design.

    Returns:
        The output of ``scipy.signal.filtfilt`` applied to ``data``.
    """
    coefficients = butter_highpass(cutoff, fs, order=order)
    # Unpacks to filtfilt(b, a, data).
    return signal.filtfilt(*coefficients, data)

In [5]:
# Remove sections of silence or low intensity signal
def remove_silence(signal, thresh=18, hop=2048, plot=False):
    """Drop the low-intensity parts of a signal and join what remains.

    The non-silent intervals are found with ``librosa.effects.split`` and the
    corresponding slices of ``signal`` are concatenated in order.

    Note: inside this function the ``signal`` parameter shadows the
    ``scipy.signal`` module imported at the top of the notebook.

    Args:
        signal: Array of audio samples.
        thresh: Passed to ``librosa.effects.split`` as ``top_db``.
        hop: Hop length in samples; the analysis frame length is ``2 * hop``.
        plot: If True, plot the waveform with a red vertical line at every
            interval boundary.

    Returns:
        numpy array with the retained samples (empty if no interval is found).
    """
    splits = li.effects.split(y=signal, top_db=thresh, frame_length=(hop * 2), hop_length=hop)
    if plot:
        # Use the absolute peak so the markers span the whole waveform even
        # when the largest excursion is negative.
        peak = np.max(np.abs(signal))
        plt.subplots(figsize=(12, 4))
        plt.plot(signal)
        plt.vlines(splits, ymin=-peak, ymax=peak, color='red')
        plt.show()

    # np.concatenate cannot take an empty list, so handle "nothing kept" here.
    if len(splits) == 0:
        return np.asarray([])

    # Join the slices in one vectorised call instead of extending a Python
    # list sample by sample, which boxed every sample as a separate object.
    return np.concatenate([signal[start:end] for start, end in splits])

In [6]:
# Split audio into segments of desired length
def split_audio(signal, target_length, samplerate, plot=False):
    """Cut a 1-D signal into consecutive, equally sized segments.

    The last segment is right-padded with zeros when the signal length is not
    a multiple of the segment size.

    Args:
        signal: 1-D sequence of audio samples.
        target_length: Segment duration in seconds (int or float).
        samplerate: Number of samples per second.
        plot: If True, overlay all segments in one matplotlib plot.

    Returns:
        List of segments, each ``round(target_length * samplerate)`` samples
        long. An empty signal yields an empty list.

    Raises:
        ValueError: If the requested segment is shorter than one sample.
    """
    # Work in whole samples so a fractional target_length (e.g. 2.5 s) cannot
    # produce float slice indices.
    segment_len = int(round(target_length * samplerate))
    if segment_len < 1:
        raise ValueError('target_length * samplerate must be at least one sample')

    # Integer ceil-division: number of segments needed to cover every sample.
    n_segments = -(-len(signal) // segment_len)
    audio_segments = []

    for n in range(n_segments):
        s = signal[n * segment_len : (n + 1) * segment_len]

        # Only the final segment can be short; zero-pad it to full length.
        if len(s) < segment_len:
            s = np.pad(s, (0, segment_len - len(s)), 'constant')

        audio_segments.append(s)

        if plot:
            plt.plot(s, alpha=1 / n_segments)

    if plot:
        plt.show()

    return audio_segments

In [19]:
# Full preprocessing chain: load -> high-pass -> strip silence -> segment
def generate_preprocessed_samples(path, sr, length, hp=700):
    """Load one audio file and return its preprocessed fixed-length segments.

    Args:
        path: Location of the audio file to load.
        sr: Sample rate passed to ``librosa.load``.
        length: Segment length in seconds, forwarded to ``split_audio``.
        hp: Cutoff frequency of the high-pass filter.

    Returns:
        The list of segments produced by ``split_audio``.
    """
    # Load as a mono waveform; librosa also returns the sample rate in use.
    waveform, sr = li.load(path, sr=sr, mono=True)

    # 5th-order Butterworth high-pass
    filtered = apply_butter_highpass(data=waveform, cutoff=hp, fs=sr, order=5)

    # Keep only the non-silent parts of the recording
    active = remove_silence(filtered, thresh=20, hop=2048, plot=False)

    # Cut into segments of the requested duration
    return split_audio(active, target_length=length, samplerate=sr)

# Audio dataset download and preparation


In [48]:
# Per-species recording-time budget: the download loop keeps fetching files for
# a species while its accumulated 'length' (in seconds) is <= this value.
MIN_RECORDED_TIME = 120
# Number of species kept after ranking them by total recording time.
SPECIES_COUNT = 60

## Database query and metadata generation


In [45]:
# Root folder of the dataset (Windows-style separators). The trailing separator
# matters: file names are appended to this string by plain concatenation.
dataset_location = '..\\datasets\\xeno-canto_argentina\\'

# Query filters, interpolated into the cnt:/grp:/len:/since: tags of the API query
country = 'argentina'
since = '2000-01-01'
group = 'birds'
length = '10-60'  # presumably a duration range in seconds — TODO confirm against the API docs

In [46]:
# Build the query and fetch the first page to learn how many pages exist.
api_url = 'https://xeno-canto.org/api/2/recordings?query='
params = f"cnt:{country}+grp:{group}+len:{length}+since:{since}"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(api_url + params, timeout=30)

print(f"• Query result: status-code {response.status_code}")

# Stop here on failure: the following cells need `pages`, and continuing
# silently would only surface later as a NameError.
if response.status_code != 200:
    raise RuntimeError(f"xeno-canto query failed with status-code {response.status_code}")

data = response.json()

n_rec = data["numRecordings"]
pages = data["numPages"]
print(f"• Found {n_rec} recordings in {pages} pages.")

• Query result: status-code 200
• Found 8694 recordings in 18 pages.


In [47]:
# Create dataset folder (and any missing parent folder)
if os.path.isdir(dataset_location):
    print(dataset_location + ' already exists\n')
else:
    # makedirs also creates the intermediate 'datasets' folder; real errors
    # (permissions, invalid path) now propagate instead of being reported
    # as "already exists".
    os.makedirs(dataset_location)
    print('Created new folder: ' + dataset_location)

# Store each page of the request into a Pandas dataframe
df_list = []
for page in range(1, pages + 1):
    print(f'Working on page {page}...', end="   ")
    response = requests.get(api_url + params + f"&page={page}", timeout=30)
    # Abort on an HTTP error rather than saving an incomplete metadata file.
    response.raise_for_status()
    data = response.json()
    df_list.append(pd.json_normalize(data['recordings']))
    print('Done!')

# Concatenate all dataframes into one and save to disk
recordings_dataframe = pd.concat(df_list, ignore_index=True)
recordings_dataframe.to_csv(dataset_location + 'metadata.csv', index=False)

Created new folder: ..\datasets\xeno-canto_argentina\
Working on page 1...   Done!
Working on page 2...   Done!
Working on page 3...   Done!
Working on page 4...   Done!
Working on page 5...   Done!
Working on page 6...   Done!
Working on page 7...   Done!
Working on page 8...   Done!
Working on page 9...   Done!
Working on page 10...   Done!
Working on page 11...   Done!
Working on page 12...   Done!
Working on page 13...   Done!
Working on page 14...   Done!
Working on page 15...   Done!
Working on page 16...   Done!
Working on page 17...   Done!
Working on page 18...   Done!


### Filter metadata entries


In [49]:
def minutes_to_seconds(mmss_time):
    """Convert a colon-separated duration string to a number of seconds.

    Accepts ``'m:ss'`` as well as ``'h:mm:ss'`` (or a bare ``'ss'``): each
    field to the left is worth 60 times the field to its right.

    Args:
        mmss_time: Duration string such as ``'1:30'`` or ``'1:02:03'``.

    Returns:
        The duration in whole seconds, as an int.

    Raises:
        ValueError: If any field is not an integer.
    """
    total_seconds = 0
    # Base-60 accumulation handles any number of fields uniformly.
    for field in mmss_time.split(':'):
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds

In [50]:
# Load only the metadata columns used in the rest of the notebook
df = pd.read_csv(
    dataset_location + 'metadata.csv',
    usecols=[
        'id',
        'group',
        'gen',
        'sp',
        'ssp',
        'en',
        'loc',
        'type',
        'file',
        'q',
        'length',
        'method',
        'file-name',
    ],
)

# Clean metadata: identified birds, quality 'A', field recordings, call/song only
is_clean = (
    (df['group'] == 'birds')
    & (df['gen'] != 'Mystery')
    & (df['sp'] != 'mystery')
    & (df['q'].isin(['A']))
    & (df['method'] == 'field recording')
    & (df['type'].isin(['call', 'song']))
)
# .copy() gives an independent frame, so the column assignment below cannot
# trigger SettingWithCopyWarning.
df = df.loc[is_clean].copy()

# Convert length column from minutes to seconds for aggregation
df['length'] = df['length'].map(minutes_to_seconds)

# Group df by species and aggregate file length.
# Select the column explicitly: the first positional argument of
# GroupBy.sum is `numeric_only`, not a column name.
df_group = df.groupby(['gen', 'sp'], as_index=False)['length'].sum()

# Get top n species with most recording time
df_group = df_group.sort_values('length', ascending=False).iloc[:SPECIES_COUNT]
selected_species = (df_group['gen'] + ' ' + df_group['sp']).values.tolist()

# Filter recordings based on selected species
df_filter = df.loc[(df['gen'] + ' ' + df['sp']).isin(selected_species)]

# Download audio data from Xeno-Canto database


In [52]:
# Create subfolders for each species
audio_location = dataset_location + 'audio\\'

# exist_ok=True makes the cell safe to re-run: existing folders are kept
# instead of raising FileExistsError.
os.makedirs(audio_location, exist_ok=True)
for sp in selected_species:
    os.makedirs(os.path.join(audio_location, sp), exist_ok=True)

In [60]:
# Store currently downloaded audio time in a dictionary
downloaded_time_per_species = dict.fromkeys(selected_species, 0)

# Iterate over the filtered recordings column-wise. Loop names are chosen so
# they neither shadow the builtin `id` nor overwrite the global `length`
# query filter used to build the API request.
recording_fields = zip(
    df_filter['gen'],
    df_filter['sp'],
    df_filter['id'],
    df_filter['length'],
    df_filter['file'],
    df_filter['file-name'],
)

for genus, epithet, rec_id, rec_seconds, url, original_name in recording_fields:
    species = genus + ' ' + epithet

    # Keep downloading for a species only while its accumulated time is
    # still within MIN_RECORDED_TIME
    if downloaded_time_per_species[species] > MIN_RECORDED_TIME:
        continue

    ext = original_name.split('.')[-1]
    filename = f'{species}_{rec_id}.{ext}'

    # Download first and check the status, so a failed request never leaves
    # an empty file or an HTML error page saved as audio.
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f'Skipping {filename}: {error}')
        sleep(1.2)
        continue

    with open(os.path.join(audio_location, species, filename), 'wb') as out_file:
        out_file.write(response.content)

    # Count the recording only once it is actually stored on disk
    downloaded_time_per_species[species] += rec_seconds

    # Wait required time between recordings
    sleep(1.2)

### Convert all files to FLAC (16-bit, 16 kHz)

In [89]:
def convert_to_flac(audio_path, samplerate):
    """Re-encode every file in the subfolders of ``audio_path`` as 16-bit FLAC.

    Each file is loaded as mono at ``samplerate``, written next to the
    original with a ``.flac`` extension, and the original file is removed.
    Files directly inside ``audio_path`` are ignored.

    Args:
        audio_path: Root folder whose subfolders contain the audio files.
        samplerate: Sample rate used to load (and therefore write) the audio.
    """
    for path, _, files in os.walk(audio_path):
        if path == audio_path:  # Ignore parent folder
            continue

        for f in files:
            filename = os.path.join(path, f)
            # Swap only the real extension; str.replace could also rewrite a
            # matching substring elsewhere in the path.
            flac_filename = os.path.splitext(filename)[0] + '.flac'
            y, sr = li.load(filename, sr=samplerate, mono=True)
            sf.write(flac_filename, y, sr, format='flac', subtype='PCM_16')
            # If the source was already a .flac file, the write above replaced
            # it in place; removing it would delete the converted audio.
            if os.path.normcase(flac_filename) != os.path.normcase(filename):
                os.remove(filename)

In [90]:
# Sample rate handed to convert_to_flac, which loads every file at this rate
SAMPLE_RATE = 16000
# Folder whose subfolders hold the downloaded recordings
audio_path = '..\\datasets\\xeno-canto_argentina\\audio\\'

# Re-encode the recordings as FLAC; convert_to_flac removes each source file after writing
convert_to_flac(audio_path, SAMPLE_RATE)

# Audio preprocessing and feature extraction


In [12]:
# Target sample rate for resampling audio files
SAMPLE_RATE = 16000

# Target length for audio segments (in seconds)
SAMPLE_LENGTH = 3

# Dataset root (the preprocessed JSON is written here) and the audio folder
# that is walked during preprocessing; both keep a trailing separator.
dataset_path = '..\\datasets\\xeno-canto_argentina\\'
audio_path = '..\\datasets\\xeno-canto_argentina\\audio\\'

In [30]:
# TODO: fix function so it does not load all MFCCs into memory

def preprocess_audio_dataset(data_path, json_path=None, mfcc_count=13, hop=512, fft_len=2048,
                             samplerate=None, segment_length=None):
    """Extract MFCCs for every audio segment in a folder-per-label dataset.

    Each subfolder of ``data_path`` is treated as one label. Every file in it
    is run through ``generate_preprocessed_samples`` and MFCCs are computed
    for each resulting segment. All results are held in memory and, if
    ``json_path`` is given, written to a single JSON file with the keys
    ``label_map``, ``encoded_labels``, ``mfccs`` and ``files``.

    Args:
        data_path: Root folder; files directly inside it are ignored.
        json_path: Destination JSON file. If falsy, nothing is saved.
        mfcc_count: Number of MFCC coefficients per frame.
        hop: Hop length, in samples, for the MFCC computation.
        fft_len: FFT window length, in samples, for the MFCC computation.
        samplerate: Sample rate used for loading and for the MFCCs. Defaults
            to the notebook-level ``SAMPLE_RATE``.
        segment_length: Segment duration in seconds. Defaults to the
            notebook-level ``SAMPLE_LENGTH``.
    """
    # Resolve the defaults at call time so the notebook constants still apply.
    if samplerate is None:
        samplerate = SAMPLE_RATE
    if segment_length is None:
        segment_length = SAMPLE_LENGTH

    data_dict = {
        'label_map': [],
        'encoded_labels': [],
        'mfccs': [],
        'files': [],
    }

    for path, dirs, files in os.walk(data_path):
        # Sorting in place fixes the traversal order, so the label encoding
        # does not depend on the filesystem's listing order.
        dirs.sort()

        if path == data_path:  # Ignore parent folder
            continue

        # Add unique labels to label_map list. basename works with the
        # platform's separator instead of assuming backslashes.
        label = os.path.basename(os.path.normpath(path))
        if label not in data_dict['label_map']:
            data_dict['label_map'].append(label)

        # Every segment in this folder shares the same encoded label.
        label_index = data_dict['label_map'].index(label)

        for f in sorted(files):
            file_path = os.path.join(path, f)
            segments = generate_preprocessed_samples(file_path, sr=samplerate, length=segment_length)
            for segment in segments:
                # Add encoded label to encoded_labels list
                data_dict['encoded_labels'].append(label_index)

                # Add original file path to files list
                data_dict['files'].append(file_path)

                # Compute MFCCs
                mfccs = li.feature.mfcc(y=segment, sr=samplerate, n_mfcc=mfcc_count, hop_length=hop, n_fft=fft_len)

                # Append MFCCs to list. Casting np.array to list for saving as JSON file.
                data_dict['mfccs'].append(mfccs.transpose().tolist())

    # Store data dictionary in JSON file
    if json_path:
        with open(json_path, 'w') as jf:
            json.dump(data_dict, jf, indent=4)
        # Report only after the file has been closed, i.e. fully written.
        print(f'Successfully saved preprocessed data to {json_path}!')
        file_count = len(data_dict['files'])
        print(f'{file_count} audio samples were processed!')


In [31]:
# Output file for the preprocessed data (dataset_path already ends with a separator)
json_path = dataset_path + 'preprocessed_data.json'
# Walk the audio folder, compute 13 MFCCs per frame for every segment and save them as JSON
preprocess_audio_dataset(audio_path, json_path=json_path, mfcc_count=13)

Successfully saved preprocessed data to ..\datasets\xeno-canto_argentina\preprocessed_data.json!
1389 audio samples were processed!
