In [66]:
import numpy as np

# File management
import os
import requests
import json
import time
import random

# Audio processing
import librosa as li
from scipy import signal
import soundfile as sf

# Plotting
from matplotlib import pyplot as plt

### Download Argentinian birds dataset from Xeno-Canto

#### Query Xeno-Canto API and save response as JSON file.

In [2]:
# Root folder of the dataset. Built with os.path.join so the path is valid on
# any operating system (on Windows the resulting string is unchanged). The
# trailing separator is kept on purpose: other cells append file and folder
# names to this string directly.
dataset_location = os.path.join('..', 'datasets', 'xeno-canto_argentina', '')

# Query variables (each one fills a tag of the Xeno-Canto search query)
country = 'argentina'  # value of the `cnt:` tag
group = 'birds'        # value of the `grp:` tag
length = '12-60'       # value of the `len:` tag
since = '2014-01-01'   # value of the `since:` tag

In [3]:
url = "https://xeno-canto.org/api/2/recordings?query="
params = f"cnt:{country}+grp:{group}+len:{length}+since:{since}"

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url + params, timeout=30)

print(f'• Query result: status-code {response.status_code}')

# Stop here with a clear HTTP error on a 4xx/5xx answer, instead of leaving
# `pages` undefined (or stale from an earlier run) for the cell that
# downloads every result page.
response.raise_for_status()

data = response.json()

n_rec = data['numRecordings']
pages = data['numPages']
print(f'• Found {n_rec} recordings in {pages} pages.')

• Query result: status-code 200
• Found 3949 recordings in 8 pages.


In [4]:
# Write json files for all pages
# Make sure the destination folder exists before writing into it.
os.makedirs(dataset_location, exist_ok=True)

# Write one JSON file per result page
for p in range(1, pages + 1):
  response = requests.get(url + params + f'&page={p}', timeout=30)
  # Never store an error body as if it were a valid result page.
  response.raise_for_status()
  data = response.json()

  filename = f"query_{p}.json"
  with open(os.path.join(dataset_location, filename), "w") as file:
    json.dump(data, file, sort_keys=True, indent=4)
  # Report only after the file has been closed (fully written).
  print(f'• Saved page {p} as {filename}')

• Saved page 1 as query_1.json
• Saved page 2 as query_2.json
• Saved page 3 as query_3.json
• Saved page 4 as query_4.json
• Saved page 5 as query_5.json
• Saved page 6 as query_6.json
• Saved page 7 as query_7.json
• Saved page 8 as query_8.json


#### Download files to dataset audio folder.
Each category will be downloaded to its corresponding subfolder.

In [7]:
# Create audio folder inside dataset.
# Folder for the raw downloads. The trailing separator is kept because other
# cells append subfolder and file names to this string directly.
audio_location = os.path.join(dataset_location, 'unprocessed_audio_files', '')
try:
  os.makedirs(audio_location)
  print(f'Created {audio_location}')
except FileExistsError:
  # Only "already there" is expected. Any other OSError (missing permissions,
  # invalid path, ...) propagates instead of being reported as an existing folder.
  print('Folder already existed.')

Created ..\datasets\xeno-canto_argentina\unprocessed_audio_files\


In [None]:
for file in os.listdir(dataset_location):
  # Only the saved query pages describe recordings; skip every other entry
  # (and do not wait between pages for entries that trigger no download).
  if not file.endswith('.json'):
    continue

  # Read the page and close it right away instead of keeping the file open
  # for the whole download.
  with open(os.path.join(dataset_location, file)) as f:
    data = json.load(f)
  recordings = data['recordings']
  print(f"Downloading files from {file}...")

  for r in recordings:
    # Get metadata from json
    rec_id = r['id']  # not named `id`, which would shadow the builtin
    bird = r['en']
    download = r['file']
    ext = os.path.splitext(r['file-name'])[1]

    # Create subfolder if not exists
    subfolder = os.path.join(audio_location, bird)
    os.makedirs(subfolder, exist_ok=True)

    target = os.path.join(subfolder, bird + '_' + rec_id + ext)

    # Recordings saved by a previous run are kept, so the cell can be resumed
    # without downloading (and waiting for) everything again.
    if os.path.isfile(target) and os.path.getsize(target) > 0:
      continue

    # Download file. The request is made before the output file is opened, so
    # a failed request leaves no empty file behind, and an HTTP error body is
    # never saved as audio.
    try:
      reply = requests.get(download, timeout=60)
      reply.raise_for_status()
    except requests.RequestException as error:
      print(f"  Skipped {bird}_{rec_id}: {error}")
    else:
      with open(target, 'wb') as out_file:
        out_file.write(reply.content)

    # Wait required time between recordings (randomized)
    time.sleep(random.uniform(1.01, 1.2))

  print("Done!")

  # Wait some time between json pages (randomized)
  time.sleep(random.uniform(1, 5))

### Preprocess audio dataset

User-defined constants.

In [49]:
# Target sample rate for resampling audio files
# Sampling rate that every audio file is resampled to when it is loaded
SAMPLE_RATE = 16_000

# Duration, in seconds, of each fixed-length audio segment
SAMPLE_LENGTH = 3

#### Function definitions

In [31]:
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of the signal that will be filtered.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        Tuple ``(b, a)``: numerator and denominator coefficients of the filter.
    """
    # The design routine takes the cutoff as a fraction of the Nyquist
    # frequency, which is half the sampling frequency.
    nyquist = 0.5 * fs
    numerator, denominator = signal.butter(
        order, cutoff / nyquist, btype='high', analog=False
    )
    return numerator, denominator

def apply_butter_highpass(data, cutoff, fs, order=5):
    """Apply a zero-phase Butterworth high-pass filter to ``data``.

    The filter is designed and applied as second-order sections (SOS), which
    SciPy recommends over the ``(b, a)`` transfer-function form because the
    latter can run into numerical problems as the order grows. The signal is
    filtered forward and backward, so the output has no phase shift.

    Args:
        data: Array with the samples to filter.
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Order of the Butterworth design.

    Returns:
        Filtered array with the same shape as ``data``.
    """
    # Cutoff expressed as a fraction of the Nyquist frequency (fs / 2).
    normal_cutoff = cutoff / (0.5 * fs)
    sos = signal.butter(order, normal_cutoff, btype='high', analog=False, output='sos')
    return signal.sosfiltfilt(sos, data)

In [44]:
def remove_silence(signal, thresh=20, hop=1024, plot=False):
  """Return ``signal`` with the sections librosa marks as silent removed.

  ``librosa.effects.split`` provides ``(start, end)`` sample index pairs; the
  slices they delimit are joined back to back, in order.

  Note: inside this function the ``signal`` parameter shadows the
  ``scipy.signal`` module imported by the notebook.

  Args:
    signal: 1-D numpy array with the audio samples.
    thresh: Passed to ``librosa.effects.split`` as ``top_db``.
    hop: Hop length used for the analysis; the frame length is ``4 * hop``.
    plot: If True, draw the waveform with a red line at every split boundary
      (useful to tune ``thresh`` and ``hop``).

  Returns:
    Numpy array with the kept samples; empty when no interval is returned.
  """
  splits = li.effects.split(y=signal, top_db=thresh, frame_length=(hop * 4), hop_length=hop)

  # For fine-tuning purposes
  if plot:
    # Absolute peak, so the boundary markers span the waveform even when its
    # largest excursion is negative.
    peak = np.max(np.abs(signal))
    plt.subplots(figsize=(12,4))
    plt.plot(signal)
    plt.vlines(splits, ymin=-peak, ymax=peak, color='red')
    plt.show()

  # np.concatenate rejects an empty list, so handle "nothing kept" explicitly.
  if len(splits) == 0:
    return np.asarray(signal)[:0]

  # Join whole slices at once instead of extending a Python list sample by
  # sample, which is very slow for audio-length arrays.
  return np.concatenate([signal[start:end] for start, end in splits])

In [45]:
def split_audio(signal, target_length, samplerate, plot=False): # Target length must be in seconds
  """Cut ``signal`` into consecutive segments of ``target_length`` seconds.

  The segment size is converted to a whole number of samples once, so
  non-integer values of ``target_length`` or ``samplerate`` no longer produce
  float slice indices. The last segment is zero-padded at the end when the
  signal does not fill it completely.

  Args:
    signal: 1-D numpy array with the audio samples.
    target_length: Length of every segment, in seconds.
    samplerate: Sampling rate of ``signal`` (samples per second).
    plot: If True, overlay all segments in one matplotlib figure.

  Returns:
    List of numpy arrays, each ``round(target_length * samplerate)`` samples
    long. Empty list for an empty signal.

  Raises:
    ValueError: If the segment size is smaller than one sample.
  """
  segment_size = int(round(target_length * samplerate))
  if segment_size <= 0:
    raise ValueError('target_length * samplerate must be at least one sample')

  # Integer ceiling division: number of segments needed to hold every sample.
  n_segments = -(-len(signal) // segment_size)

  audio_segments = []

  for n in range(n_segments):
    s = signal[n * segment_size : (n + 1) * segment_size]

    # Only the final segment can come out short; pad it with trailing zeros.
    if len(s) < segment_size:
      s = np.pad(s, (0, segment_size - len(s)), 'constant')

    audio_segments.append(s)

    if plot:
      plt.plot(s, alpha=1/n_segments)

  if plot:
    plt.show()

  return audio_segments

#### Audio preprocessing

- Load each file from all unprocessed audio subfolders. 
- Make an empty copy of the folder in the processed directory.
- Apply a high-pass filter, strip silent sections and split into constant-length segments.
- Save shorter segments as new WAV files.

In [18]:
# Destination for the processed segments. The trailing separator is kept
# because the processing loop appends subfolder names to this string directly.
processed_location = os.path.join(dataset_location, 'processed_audio_files', '')

# Keep directories only: a stray file in the audio folder would make
# os.listdir() fail when the processing loop tries to open it as a subfolder.
subfolders = [
  entry for entry in os.listdir(audio_location)
  if os.path.isdir(os.path.join(audio_location, entry))
]

# Make processed audio folder. An existing folder is fine; any other error
# (permissions, invalid path, ...) is raised instead of silently ignored.
os.makedirs(processed_location, exist_ok=True)

In [67]:
for sub in subfolders:
  files = os.listdir(os.path.join(audio_location, sub))

  # Create same subfolder in processed folder. An existing folder is fine; any
  # other error is raised instead of being silently ignored.
  p_subfolder = os.path.join(processed_location, sub)
  os.makedirs(p_subfolder, exist_ok=True)

  for f in files:
    # Load audio file from unprocessed folder
    file_path = os.path.join(audio_location, sub, f)
    y, sr = li.load(file_path, sr=SAMPLE_RATE, mono=True, res_type='soxr_lq')

    # Apply high-pass filter
    y = apply_butter_highpass(data=y, cutoff=800, fs=SAMPLE_RATE, order=6)

    # Delete silent sections
    y = remove_silence(y, plot=False)

    # Split into segments of desired length
    audio_segments = split_audio(y, target_length=SAMPLE_LENGTH, samplerate=SAMPLE_RATE)

    # Strip only the extension. Cutting at the first dot would truncate names
    # that contain a dot and make different recordings overwrite each other.
    base_name = os.path.splitext(f)[0]

    # Iterate through the audio segments and save each one as a separate .wav file
    for i, segment in enumerate(audio_segments):
      new_filename = base_name + '_' + str(i) + '.wav'
      sf.write(os.path.join(p_subfolder, new_filename), segment, SAMPLE_RATE, subtype='PCM_16')
    