# Data collection

1. Collect all docs from an API call to XENO-CANTO with the query 'cnt:"=Singapore" grp:"birds"' to get all Singapore recordings.
2. Compile list of all unique species that are returned.
3. Perform API call on each of these names
4. Extract either the oscillogram or the audio file
5. Convert data to valid format to pass through the net.

### 1

In [1]:
import requests
import json
from tqdm import tqdm
import os

In [2]:
# Recordings endpoint of the xeno-canto API; a search query is appended
# directly after the trailing "query=".
XENO_URL_ROOT = "https://www.xeno-canto.org/api/2/recordings?query="
# Search string used to fetch the Singapore bird recordings.
SINGAPORE_QUERY = 'cnt:"=Singapore" grp:"birds"'
# Complete request URL: endpoint followed by the (unescaped) search string.
SINGAPORE_URL = f"{XENO_URL_ROOT}{SINGAPORE_QUERY}"

In [3]:
def collect_all_pages(url, timeout=30):
    """Fetch every result page for a query URL and merge the recordings.

    The first response supplies ``numPages``; each further page is requested
    by appending ``&page=<n>`` to ``url``.

    Args:
        url: Full query URL without a ``page`` parameter.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        list: The ``"recordings"`` entries of all pages, in page order.

    Raises:
        requests.HTTPError: If any page responds with an HTTP error status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
    """
    def fetch_page(page_url):
        response = requests.get(page_url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error body.
        response.raise_for_status()
        return json.loads(response.text)

    first_page = fetch_page(url)
    recording_data_list = first_page["recordings"]
    # range() is empty when there is only one page, so no extra guard is needed.
    for page in range(2, int(first_page["numPages"]) + 1):
        page_data = fetch_page(url + "&page=" + str(page))
        recording_data_list.extend(page_data["recordings"])
    return recording_data_list

In [4]:
def extract_unique_species(data_list, key="en"):
    """Return the distinct values stored under ``key`` across ``data_list``.

    Args:
        data_list: Iterable of dict-like records; each must contain ``key``.
        key: Field to read from every record (defaults to ``"en"``).

    Returns:
        list: Each distinct value once, in no particular order.
    """
    return list({record[key] for record in data_list})

In [5]:
# Download all result pages for the Singapore query (performs HTTP requests).
singapore_data = collect_all_pages(SINGAPORE_URL)

### 2

In [6]:
# Distinct values of the default "en" field over all Singapore recordings
# (presumably the English species name -- confirm against the API docs).
unique_singapore_species = extract_unique_species(singapore_data)

### 3

In [7]:
# Three index-aligned lists, one entry per species name:
#   queries   - name with every space turned into "+"
#   filenames - name with every space turned into "_"
#   urls      - API root followed by the "+"-joined query
unique_singapore_queries = ["+".join(name.split(" ")) for name in unique_singapore_species]
filenames = ["_".join(name.split(" ")) for name in unique_singapore_species]
unique_singapore_urls = [f"{XENO_URL_ROOT}{query}" for query in unique_singapore_queries]

#### Write API data to files

In [9]:
# for i in tqdm(range(0, len(unique_singapore_urls))):
#     url = unique_singapore_urls[i]
#     species_i_data = collect_all_pages(url)
#     with open(f"./data/{filenames[i]}.json", "w") as outfile:
#         json.dump(species_i_data, outfile)

#### Compile all JSON data files into a single file for MySQL

In [26]:
# Merge the contents of every *.json file directly under ./data into one list.
compiled_data = []
for filename in os.listdir("./data"):
    # Skip anything that is not a JSON file (e.g. sub-directories).
    if not filename.endswith(".json"):
        continue
    with open("./data/" + filename, 'r') as species_file:
        compiled_data.extend(json.load(species_file))
            
def _length_to_seconds(length):
    """Convert a "MM:SS" or "HH:MM:SS" duration string to whole seconds.

    Args:
        length: Duration string such as "00:27" or "1:02:03".

    Returns:
        int: The duration in seconds.

    Raises:
        ValueError: If ``length`` does not consist of two or three
            ":"-separated integer fields.
    """
    parts = length.split(":")
    if len(parts) not in (2, 3):
        raise ValueError(f"unrecognised recording length: {length!r}")
    seconds = 0
    # Each further field is worth 1/60 of the previous one (h -> min -> s).
    for part in parts:
        seconds = seconds * 60 + int(part)
    return seconds


# Convert "00:27" string length format to integer representing length in seconds
for d in compiled_data:
    d["length"] = _length_to_seconds(d["length"])

# Write data to compiled_data file
# with open("./data/compiled/compiled_data.json", "w") as outfile:
#     json.dump(compiled_data, outfile)


In [29]:

def _chunk_bounds(total, n_chunks=10):
    """Yield (start, stop) slice bounds covering ``range(total)`` in order.

    The range is split into at most ``n_chunks`` contiguous pieces. Ceiling
    division sizes the pieces so that no trailing items are dropped and fewer
    than ``n_chunks`` items still work (a zero step would make ``range`` fail).
    """
    if total <= 0:
        return
    size = -(-total // n_chunks)  # ceil(total / n_chunks)
    for start in range(0, total, size):
        yield start, min(start + size, total)


# Split the compiled data into at most ten smaller files comp_0.json, comp_1.json, ...
with open("./data/compiled/compiled_data.json", "r") as largeFile:
    data = json.load(largeFile)

for name_index, (prev_index, i) in enumerate(_chunk_bounds(len(data))):
    with open(f"./data/compiled/comp_{name_index}.json", 'w') as outfile:
        # Opening in "w" mode truncates the file, so the chunk must actually
        # be written; otherwise re-running this cell empties every comp_N.json.
        json.dump(data[prev_index:i], outfile)
    

In [5]:
# Script used to collect Costa Rica test birds.

# costa_birds = ['Great Tinamou', 'Green Ibis', 'Broad-billed Motmot', 'Yellow-throated Toucan', 'White-breasted Wood Wren']

# for bird in costa_birds:
#     query = XENO_URL_ROOT + bird.replace(" ", "+")
#     bird_data = collect_all_pages(query)
#     filename = bird.replace(" ", "_")
#     with open(f"./data/costa_rica/{filename}.json", "w") as outfile:
#          json.dump(bird_data, outfile)