In [1]:
import os
import requests
import json
import time
import random

### Download Argentinian Passeriformes dataset from Xeno-Canto

#### Query Xeno-Canto API and save response as JSON file.

In [2]:
# Dataset root, relative to the notebook. Built with os.path.join so the
# separator matches the host OS; the trailing '' keeps a trailing separator
# because later cells append names to this string with `+`.
dataset_location = os.path.join('..', 'datasets', 'xeno-canto_argentina', '')

# Query variables, each used as the value of one Xeno-Canto search tag below
country = 'argentina'  # cnt: tag
group = 'birds'        # grp: tag
length = '12-60'       # len: tag
since = '2014-01-01'   # since: tag

In [3]:
# Build the Xeno-Canto API v2 search URL. `url` and `params` stay plain
# strings because the paging cell appends '&page=N' to them.
url = "https://xeno-canto.org/api/2/recordings?query="
params = f"cnt:{country}+grp:{group}+len:{length}+since:{since}"

# A timeout keeps the cell from hanging forever if the server stalls.
response = requests.get(url + params, timeout=30)

print(f'• Query result: status-code {response.status_code}')

if response.status_code == 200:
  data = response.json()

  n_rec = data['numRecordings']
  pages = data['numPages']
  print(f'• Found {n_rec} recordings in {pages} pages.')
else:
  # Fail here instead of leaving `pages` undefined (or stale from an earlier
  # run) for the cells below.
  raise RuntimeError(f'Xeno-Canto query failed with status-code {response.status_code}')

• Query result: status-code 200
• Found 3949 recordings in 8 pages.


In [4]:
# Write json files for all pages (query_<page>.json inside dataset_location).
# open() does not create folders, so make sure the dataset folder exists first.
os.makedirs(dataset_location, exist_ok=True)

for p in range(1, pages + 1):
  response = requests.get(url + params + f'&page={p}', timeout=30)
  # Stop on an HTTP error rather than saving an error payload as a result page.
  response.raise_for_status()
  data = response.json()

  filename = f"query_{p}.json"
  with open(dataset_location + filename, "w", encoding="utf-8") as file:
    json.dump(data, file, sort_keys=True, indent=4)
  print(f'• Saved page {p} as {filename}')

• Saved page 1 as query_1.json
• Saved page 2 as query_2.json
• Saved page 3 as query_3.json
• Saved page 4 as query_4.json
• Saved page 5 as query_5.json
• Saved page 6 as query_6.json
• Saved page 7 as query_7.json
• Saved page 8 as query_8.json


#### Download files to dataset audio folder.
Each category (the recording's English name) is downloaded into its own subfolder.

In [5]:
# Create audio folder inside dataset.
# The trailing separator is kept because the download cell appends to this
# string with `+`.
audio_location = dataset_location + 'audio' + os.sep
try:
  # makedirs also creates missing parent folders; without exist_ok it still
  # raises FileExistsError when the target is already there.
  os.makedirs(audio_location)
  print(f'Created {audio_location}')
except FileExistsError:
  # Only "already exists" is expected here; other OS errors (permissions,
  # invalid path) now surface instead of being reported as an existing folder.
  print('Folder already existed.')

Created ..\datasets\xeno-canto_argentina\audio\


In [None]:
# Download every recording listed in the saved .json result pages into
# audio_location/<English name>/<English name>_<id><ext>.
for file in sorted(os.listdir(dataset_location)):
  if not file.endswith('.json'):
    # Skip non-JSON entries entirely, without any waiting.
    continue

  # Read the page and close it right away; the downloads below take a while.
  with open(dataset_location + file, encoding='utf-8') as f:
    data = json.load(f)
  recordings = data['recordings']
  print(f"Downloading files from {file}...")

  for r in recordings:
    # Get metadata from json
    rec_id = r['id']  # not named `id`, to avoid shadowing the builtin
    bird = r['en']
    download = r['file']
    ext = '.' + r['file-name'].split('.')[-1]

    if not download:
      # NOTE(review): presumably entries whose audio is not publicly
      # downloadable carry an empty URL - confirm against the API docs.
      print(f'• Skipped recording {rec_id}: no download URL.')
      continue

    # Create subfolder if not exists
    subfolder = bird + os.sep
    os.makedirs(audio_location + subfolder, exist_ok=True)

    # Download file. The request is made before the output file is opened so a
    # failed download does not leave an empty file behind, and one bad
    # recording does not abort the whole run.
    try:
      audio = requests.get(download, timeout=60)
      audio.raise_for_status()
    except requests.RequestException as err:
      print(f'• Failed to download recording {rec_id}: {err}')
    else:
      with open(audio_location + subfolder + bird + '_' + rec_id + ext, 'wb') as out_file:
        out_file.write(audio.content)

    # Wait required time between recordings (randomized)
    time.sleep(random.uniform(1.01, 1.2))

  print("Done!")

  # Wait some time between json pages (randomized)
  time.sleep(random.uniform(1, 5))