In [1]:
import pandas as pd
from urllib.request import urlretrieve
from os import mkdir, path as os_path
import aiohttp
import asyncio

In [2]:
# Upper bound on simultaneous requests; passed to asyncio.Semaphore by get_csv and build_dataset.
MAX_CONCURRENT_REQUESTS = 7
# Directory prefix (note the trailing slash) that download_audio prepends to each file name.
audio_dir = 'audio_files/'
# Ids of recordings that could not be downloaded; download_audio appends to it,
# build_dataset reads it to write not_downloaded.txt.
nots = []

In [3]:
async def fetch_and_process_page(session, n_page, country, semaphore):
    """Fetch one page of xeno-canto recordings and return it as a DataFrame.

    Args:
        session: HTTP client session exposing ``get(url)`` as an async context
            manager (an ``aiohttp.ClientSession`` in this notebook).
        n_page: 1-based page number to request.
        country: Country name inserted into the ``cnt:`` query filter.
        semaphore: ``asyncio.Semaphore`` limiting concurrent requests.

    Returns:
        A DataFrame with one row per recording and the columns ``en``,
        ``file``, ``file-name``, ``gen``, ``id``, ``length``, ``sp``; or
        ``None`` when the page could not be fetched or parsed (a message is
        printed in that case so the caller can simply skip the page).
    """
    # Alphabetical order keeps the CSV layout identical to what the previous
    # dict-based DataFrame.from_records call produced (it sorts dict keys).
    columns = ("en", "file", "file-name", "gen", "id", "length", "sp")

    url = f'https://www.xeno-canto.org/api/2/recordings?query=cnt:{country}&page={n_page}'
    try:
        async with semaphore:
            async with session.get(url) as response:
                # Same status handling as the initial request in get_csv: an
                # error page must not be parsed as if it were data.
                if response.status != 200:
                    print(f"Failed to fetch page {n_page}. Status code: {response.status}")
                    return None
                page_json = await response.json()
    except Exception as e:
        print(f"An error occurred while fetching page {n_page}: {str(e)}")
        return None

    # A malformed payload (missing "recordings" or a missing field) is treated
    # like a failed fetch instead of raising out of asyncio.gather and
    # aborting every other page.
    try:
        records = [
            {column: recording[column] for column in columns}
            for recording in page_json["recordings"]
        ]
    except (KeyError, TypeError) as e:
        print(f"Unexpected response format on page {n_page}: {e!r}")
        return None

    return pd.DataFrame(records, columns=list(columns))


In [4]:
async def get_csv(country='portugal', out_path="birds_raw_pt.csv"):
    """Download the recording metadata for a country and save it as CSV.

    The first request is only used to learn how many result pages exist; every
    page is then fetched concurrently through fetch_and_process_page, limited
    to MAX_CONCURRENT_REQUESTS simultaneous requests.

    Args:
        country: Country name used in the ``cnt:`` query filter.
        out_path: Destination CSV path.

    Returns:
        None. Nothing is written when the initial request fails or when no
        page could be fetched.
    """
    url = f'https://www.xeno-canto.org/api/2/recordings?query=cnt:{country}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Failed to fetch initial page. Status code: {response.status}")
                return
            js = await response.json()

        semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
        n_pages = js.get("numPages", 0)
        tasks = [
            fetch_and_process_page(session, n_page, country, semaphore)
            for n_page in range(1, n_pages + 1)
        ]
        results = await asyncio.gather(*tasks)

    # fetch_and_process_page returns None for pages it could not fetch.
    frames = [r for r in results if r is not None]
    if not frames:
        # pd.concat raises ValueError on an empty list, so bail out explicitly.
        print("No pages could be fetched; nothing was written.")
        return

    n_failed = len(results) - len(frames)
    if n_failed:
        print(f"Warning: {n_failed} of {len(results)} pages failed and are missing from {out_path}")

    df = pd.concat(frames, ignore_index=True)
    df.to_csv(out_path, index=False)

# Top-level await: relies on the notebook kernel running the cell inside an event loop.
# Fetches the metadata and writes birds_raw_pt.csv, which build_dataset reads later.
await get_csv()

In [5]:
async def download_with_retry(url, file_path, max_retries=3, retry_delay=20):
    """Download ``url`` to ``file_path``, retrying after failures.

    The blocking ``urlretrieve`` call is run in the event loop's default
    executor so that other coroutines (other downloads) keep making progress
    while this one waits on the network.

    Args:
        url: Address of the file to download.
        file_path: Local path the file is written to.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between two attempts.

    Raises:
        Exception: The error of the last attempt when every attempt failed,
            so callers can tell a failed download from a successful one.
    """
    loop = asyncio.get_running_loop()
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            await loop.run_in_executor(None, urlretrieve, url, file_path)
            return
        except Exception as e:
            last_error = e
            print(e)
            # No point in waiting once there is no attempt left.
            if attempt < max_retries:
                await asyncio.sleep(retry_delay)

    # Previously the function fell through and returned None here, which made
    # a permanently failing download indistinguishable from a success.
    if last_error is not None:
        raise last_error

In [6]:
async def download_audio(sem, row):
    """Download the audio file of one recording into ``audio_dir``.

    The file is stored as ``<id><extension>``, with the lower-cased extension
    taken from the recording's original file name. A failed download is
    retried once after a pause; if it still fails, the recording id is
    appended to the module-level ``nots`` list.

    Args:
        sem: ``asyncio.Semaphore`` limiting the number of concurrent downloads.
        row: Mapping-like record (a DataFrame row) providing ``"file"`` (the
            download URL), ``"id"`` and ``"file-name"``.
    """
    async with sem:
        url = row["file"]
        # splitext copes with extensions of any length (".mp3", ".flac", ...);
        # slicing the last four characters dropped the dot of longer ones.
        extension = os_path.splitext(str(row["file-name"]))[1].lower()
        f_name = str(row["id"]) + extension
        file_path = audio_dir + f_name
        try:
            await download_with_retry(url, file_path)
        except Exception as e:
            print("\nRetrying:", url)
            print(e)
            await asyncio.sleep(20)
            try:
                await download_with_retry(url, file_path)
            except Exception as ee:
                print("Not downloaded|", f_name)
                print(ee)
                nots.append(row["id"])
                return

        # Safety net: if the helper returned without raising but no file was
        # produced, still record the recording as missing.
        if not os_path.isfile(file_path):
            print("Not downloaded|", f_name)
            nots.append(row["id"])

In [7]:
async def build_dataset(min_recordings=30):
    """Filter the raw metadata and download the audio of the kept recordings.

    Reads ``birds_raw_pt.csv``, drops the 'Identity unknown' and 'Soundscape'
    entries, keeps only species with at least ``min_recordings`` recordings,
    writes the result to ``birds_clean_pt.csv`` and downloads every remaining
    recording through download_audio. Ids that could not be downloaded are
    written to ``not_downloaded.txt``.

    Args:
        min_recordings: Minimum number of recordings a species needs in order
            to be kept.
    """
    # Use the module-level audio_dir (no local copy) so the directory created
    # here is guaranteed to be the one download_audio writes into.
    if not os_path.isdir(audio_dir):
        mkdir(audio_dir)

    # nots is a module-level list that survives between cell executions;
    # empty it so a re-run does not report failures from a previous run.
    nots.clear()

    bird_df = pd.read_csv("birds_raw_pt.csv")
    bird_df = bird_df[~bird_df['en'].isin(['Identity unknown', 'Soundscape'])]
    counts = bird_df['en'].value_counts()
    chosen = counts[counts >= min_recordings].index
    bird_df = bird_df[bird_df["en"].isin(chosen)]
    bird_df.to_csv("birds_clean_pt.csv", index=False)

    print(bird_df.shape)
    print(len(bird_df["en"].unique()), "different species")

    sem = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    tasks = [download_audio(sem, row) for _, row in bird_df.iterrows()]
    await asyncio.gather(*tasks)

    if len(nots) > 0:
        with open('not_downloaded.txt', 'w') as f:
            for item in nots:
                f.write(str(item) + '\n')
        print(str(nots))
    else:
        print('All files were successfully downloaded!')

# Top-level await: relies on the notebook kernel running the cell inside an event loop.
# Expects birds_raw_pt.csv to exist already (it is written by get_csv).
await build_dataset()

(9807, 7)
107 different species
HTTP Error 500: Internal Server Error
HTTP Error 500: Internal Server Error
HTTP Error 500: Internal Server Error
All files were successfully downloaded!
