In [2]:
import json
from operator import truediv

from project_info import ROOT
import os
import pandas as pd

# Load the 2024 eBird taxonomy table and the 2025 taxonomy table from the
# project data directory (paths are resolved against ROOT).
taxonomy_2024 = pd.read_csv(os.path.join(ROOT,"data/2024/eBird_Taxonomy_v2021.csv"))
taxonomy_2025 = pd.read_csv(os.path.join(ROOT,"data/2025/taxonomy.csv"))
print("Unique species in taxonomy.csv:", taxonomy_2025["primary_label"].nunique())

# The first count below comes from the 2024 table (SCI_NAME column), so it is
# labelled as 2024 data; the second one is the 2025 table's scientific_name.
print("Unique species in 2024 data:", taxonomy_2024["SCI_NAME"].nunique())
print("Unique species in 2025 data:", taxonomy_2025["scientific_name"].nunique())

Unique species in taxonomy.csv: 206
Unique species in 2025 data: 16753
Unique species in 2025 data: 206


In [3]:
# Left-join the 2025 taxonomy onto the 2024 taxonomy by scientific name.
# Every 2025 row is kept; the 2024 columns are empty where no name matches.
merged_df = taxonomy_2025.merge(
    taxonomy_2024,
    how="left",
    left_on="scientific_name",
    right_on="SCI_NAME",
)
species_total = merged_df["scientific_name"].nunique()
print("Unique species in 2024-2025 data:", species_total)
merged_df

Unique species in 2024-2025 data: 206


Unnamed: 0,primary_label,inat_taxon_id,scientific_name,common_name,class_name,TAXON_ORDER,CATEGORY,SPECIES_CODE,PRIMARY_COM_NAME,SCI_NAME,ORDER1,FAMILY,SPECIES_GROUP,REPORT_AS
0,1139490,1139490,Ragoniella pulchella,Ragoniella pulchella,Insecta,,,,,,,,,
1,1192948,1192948,Oxyprora surinamensis,Oxyprora surinamensis,Insecta,,,,,,,,,
2,1194042,1194042,Copiphora colombiae,Copiphora colombiae,Insecta,,,,,,,,,
3,126247,126247,Leptodactylus insularum,Spotted Foam-nest Frog,Amphibia,,,,,,,,,
4,1346504,1346504,Neoconocephalus brachypterus,Neoconocephalus brachypterus,Insecta,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,yehcar1,1432779,Milvago chimachima,Yellow-headed Caracara,Aves,11437.0,species,yehcar1,Yellow-headed Caracara,Milvago chimachima,Falconiformes,Falconidae (Falcons and Caracaras),,
202,yelori1,9352,Icterus nigrogularis,Yellow Oriole,Aves,32643.0,species,yelori1,Yellow Oriole,Icterus nigrogularis,Passeriformes,Icteridae (Troupials and Allies),,
203,yeofly1,16567,Tolmomyias sulphurescens,Yellow-olive Flycatcher,Aves,15813.0,species,yeofly1,Yellow-olive Flycatcher,Tolmomyias sulphurescens,Passeriformes,Tyrannidae (Tyrant Flycatchers),,
204,yercac1,10359,Cacicus cela,Yellow-rumped Cacique,Aves,32548.0,species,yercac1,Yellow-rumped Cacique,Cacicus cela,Passeriformes,Icteridae (Troupials and Allies),,


In [4]:
# Inner-join the two taxonomies by scientific name: only species whose
# scientific name occurs in both tables survive.
matches = taxonomy_2025.merge(
    taxonomy_2024,
    how="inner",
    left_on="scientific_name",
    right_on="SCI_NAME",
)

print("Number of matches:", len(matches))
matches


Number of matches: 146


Unnamed: 0,primary_label,inat_taxon_id,scientific_name,common_name,class_name,TAXON_ORDER,CATEGORY,SPECIES_CODE,PRIMARY_COM_NAME,SCI_NAME,ORDER1,FAMILY,SPECIES_GROUP,REPORT_AS
0,amakin1,2679,Chloroceryle amazona,Amazon Kingfisher,Aves,9788,species,amakin1,Amazon Kingfisher,Chloroceryle amazona,Coraciiformes,Alcedinidae (Kingfishers),,
1,amekes,4665,Falco sparverius,American Kestrel,Aves,11494,species,amekes,American Kestrel,Falco sparverius,Falconiformes,Falconidae (Falcons and Caracaras),,
2,ampkin1,2676,Chloroceryle aenea,American Pygmy Kingfisher,Aves,9789,species,ampkin1,American Pygmy Kingfisher,Chloroceryle aenea,Coraciiformes,Alcedinidae (Kingfishers),,
3,anhing,5063,Anhinga anhinga,Anhinga,Aves,7015,species,anhing,Anhinga,Anhinga anhinga,Suliformes,Anhingidae (Anhingas),Cormorants and Anhingas,
4,babwar,145236,Setophaga castanea,Bay-breasted Warbler,Aves,32973,species,babwar,Bay-breasted Warbler,Setophaga castanea,Passeriformes,Parulidae (New World Warblers),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,yehcar1,1432779,Milvago chimachima,Yellow-headed Caracara,Aves,11437,species,yehcar1,Yellow-headed Caracara,Milvago chimachima,Falconiformes,Falconidae (Falcons and Caracaras),,
142,yelori1,9352,Icterus nigrogularis,Yellow Oriole,Aves,32643,species,yelori1,Yellow Oriole,Icterus nigrogularis,Passeriformes,Icteridae (Troupials and Allies),,
143,yeofly1,16567,Tolmomyias sulphurescens,Yellow-olive Flycatcher,Aves,15813,species,yeofly1,Yellow-olive Flycatcher,Tolmomyias sulphurescens,Passeriformes,Tyrannidae (Tyrant Flycatchers),,
144,yercac1,10359,Cacicus cela,Yellow-rumped Cacique,Aves,32548,species,yercac1,Yellow-rumped Cacique,Cacicus cela,Passeriformes,Icteridae (Troupials and Allies),,


It appears that most species (146 of 206) are present in both datasets.

In [5]:
# Lookup tables built from the matched rows only:
#   species_2024: SPECIES_CODE  -> PRIMARY_COM_NAME (columns from the 2024 table)
#   species_2025: inat_taxon_id -> common_name      (columns from the 2025 table)
species_2024 = {
    code: name
    for code, name in zip(matches["SPECIES_CODE"], matches["PRIMARY_COM_NAME"])
}
species_2025 = {
    taxon_id: name
    for taxon_id, name in zip(matches["inat_taxon_id"], matches["common_name"])
}
print("Species in 2024:", len(species_2024))
print("Species in 2025:", len(species_2025))

Species in 2024: 146
Species in 2025: 146


In [6]:
# Read the per-recording training metadata for both years.
# Paths are anchored on ROOT (like the taxonomy loads and the audio lookup
# elsewhere in this notebook) instead of the kernel's working directory, so the
# cell no longer depends on where the notebook was started from.
# NOTE(review): assumes ROOT/data is the same directory the former "../data"
# relative path pointed at — confirm.
df_2024 = pd.read_csv(os.path.join(ROOT, "data/2024/train_metadata.csv"))
df_2025 = pd.read_csv(os.path.join(ROOT, "data/2025/train.csv"))
df_2024.shape, df_2025.shape

((24459, 12), (28564, 13))

In [7]:
def _prefix_once(name, prefix):
    """Return ``name`` with ``prefix`` prepended, unless it is already there.

    The guard makes this cell safe to re-run: without it, every execution
    would stack another copy of the prefix onto the stored path.
    """
    text = f"{name}"
    return text if text.startswith(prefix) else f"{prefix}{text}"


# Make filenames relative to the shared data directory by adding the
# year-specific audio folder in front of each one.
df_2024['filename'] = df_2024['filename'].apply(lambda x: _prefix_once(x, "2024/train_audio/"))
df_2025['filename'] = df_2025['filename'].apply(lambda x: _prefix_once(x, "2025/train_audio/"))

In [8]:
# Stack the 2024 and 2025 recording metadata into one frame; the columns used
# downstream are common_name and filename.
# ignore_index=True renumbers the rows 0..n-1. Without it both source frames
# contribute their own 0-based labels, so index values repeat and cannot be
# used as unique row identifiers.
merged_df = pd.concat([df_2024, df_2025], ignore_index=True)
merged_df.shape

(53023, 13)

In [9]:
# Shape after dropping rows that are identical in every column (first
# occurrence kept); equal to merged_df.shape when there are no such rows.
merged_df.drop_duplicates(keep="first").shape

(53023, 13)

In [10]:
# Distinct source URLs vs. distinct audio filenames, one count per line.
# A URL count below the filename count means some files share a URL.
print(
    merged_df['url'].nunique(),
    merged_df['filename'].nunique(),
    sep="\n",
)

52970
53023


In [11]:
# Preview the first five rows of the combined metadata.
merged_df.head(n=5)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,collection
0,asbfly,[],['call'],39.2297,118.1987,Muscicapa dauurica,Asian Brown Flycatcher,Matt Slaymaker,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/134896,2024/train_audio/asbfly/XC134896.ogg,
1,asbfly,[],['song'],51.403,104.6401,Muscicapa dauurica,Asian Brown Flycatcher,Magnus Hellström,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/164848,2024/train_audio/asbfly/XC164848.ogg,
2,asbfly,[],['song'],36.3319,127.3555,Muscicapa dauurica,Asian Brown Flycatcher,Stuart Fisher,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/175797,2024/train_audio/asbfly/XC175797.ogg,
3,asbfly,[],['call'],21.1697,70.6005,Muscicapa dauurica,Asian Brown Flycatcher,vir joshi,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/207738,2024/train_audio/asbfly/XC207738.ogg,
4,asbfly,[],['call'],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/209218,2024/train_audio/asbfly/XC209218.ogg,


In [12]:
import random

def generate_random_color():
    """Return a random colour as a ``#rrggbb`` string.

    A single integer is drawn uniformly from 0..0xFFFFFF (inclusive) with the
    module-level ``random`` generator and rendered as six lowercase,
    zero-padded hexadecimal digits.
    """
    rgb = random.randint(0, 0xFFFFFF)
    return f"#{rgb:06x}"

In [13]:
from xml.sax.saxutils import escape

# Build the labeling-interface config: one <Label> per distinct common name,
# each with a random background colour, wrapped in <View>/<Labels> and followed
# by an <Audio> element. (Looks like a Label Studio config — confirm.)
# Names are escaped before being placed inside the double-quoted attribute so
# that '&', '<', '>' or '"' in a name cannot produce malformed markup;
# apostrophes are left untouched because they are legal inside double quotes.
_ATTRIBUTE_ENTITIES = {'"': "&quot;"}

label_tags = []
for common_name in merged_df['common_name'].unique():
    label_value = escape(str(common_name), _ATTRIBUTE_ENTITIES)
    label_tags.append(
        f'<Label value="{label_value}" background="{generate_random_color()}"/>\n'
    )

# Header and footer are kept exactly as before; the tags are joined once
# instead of growing the string with repeated concatenation.
s = """<View>
  <Labels name="label" toName="audio" zoom="true" hotkey="ctrl+enter">\n """ + "".join(label_tags) + """
    </Labels>
  <Audio name="audio" value="$audio"/>
</View>"""

In [14]:
# Write the generated config text to ROOT/ls_taxonomy.txt.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding if a label contains non-ASCII characters.
with open(os.path.join(ROOT,"ls_taxonomy.txt"), "w", encoding="utf-8") as f:
    f.write(s)

In [15]:
import os

# Debug aid: print the entries directly under the filesystem root.
# NOTE(review): looks like a leftover environment check that nothing else in
# the notebook depends on — consider deleting this cell before sharing.
print(os.listdir(path="/"))

['var', 'srv', 'dev', 'mnt', 'lib32', 'etc', 'root', 'home', 'tmp', 'sys', 'libx32', 'lib64', 'swapfile', 'bin', 'snap', 'run', 'opt', 'lost+found', 'cdrom', 'lib', 'media', 'boot', 'usr', 'sbin', 'proc']


In [16]:
import json
# Load the task template (a JSON list; its first element is what the export
# loop copies for every recording).
# Only difference_i.json is read: a value loaded from difference.json would be
# overwritten immediately and never used. The file is opened in a `with` block
# so the handle is closed deterministically.
# NOTE(review): difference_i.json appears to be a re-indented (indent=6) dump
# of difference.json — if it is missing, regenerate it from that file; confirm.
with open(os.path.join(ROOT,"difference_i.json"), encoding="utf-8") as template_file:
    template = json.load(template_file)


In [17]:
# Distinct common names in the combined metadata, in order of first appearance.
merged_df["common_name"].unique()

array(['Asian Brown Flycatcher', 'Ashy Drongo', 'Ashy Prinia',
       'Ashy Woodswallow', 'Asian Koel', 'Asian Openbill',
       'Indian Paradise-Flycatcher', 'Asian Palm-Swift',
       'Black-and-orange Flycatcher', 'Barn Swallow',
       'Black-crowned Night-Heron', 'Flame-throated Bulbul',
       'Black-rumped Flameback', 'Black-winged Kite',
       'Black-winged Stilt', 'Black Drongo', 'Black Eagle', 'Black Kite',
       'Black-hooded Oriole', 'Black-naped Monarch',
       "Blyth's Reed Warbler", 'Brown-capped Pygmy Woodpecker',
       'Brahminy Kite', 'Brahminy Starling', 'Brown-cheeked Fulvetta',
       'Brown Fish-Owl', 'Brown Boobook', 'Brown Shrike',
       'Bronzed Drongo', 'Bronze-winged Jacana', 'Brown Wood-Owl',
       'Blue-tailed Bee-eater', 'Bar-winged Flycatcher-shrike',
       'Cattle Egret', 'Chestnut-headed Bee-eater', 'Common Hawk-Cuckoo',
       'Common Flameback', 'Common Greenshank', 'Common Iora',
       'Common Kingfisher', 'Eurasian Moorhen', 'Common Myna',
 

In [18]:
import copy
from mutagen.oggvorbis import OggVorbis

def _build_task(task_id, filename, common_name, duration_seconds):
    """Return a copy of ``template[0]`` filled in for one recording.

    The first result of the first annotation is turned into a single region
    spanning the whole file (0 .. duration_seconds) labelled with
    ``common_name``; the annotation is flagged as ground truth and linked to
    ``task_id``; the audio reference is built from ``filename``.
    """
    task = copy.deepcopy(template[0])  # deep copy: the template is nested and reused
    annotation = task["annotations"][0]
    region = annotation["result"][0]

    task["id"] = task_id
    region["original_length"] = duration_seconds
    region["value"]["start"] = 0
    region["value"]["end"] = duration_seconds
    region["value"]["labels"] = [common_name]
    annotation["ground_truth"] = True
    annotation["task"] = task_id
    task["data"]["audio"] = "/data/local-files/?d=data/datasets/data_" + filename
    return task


# One task per recording. Ids come from enumerate() rather than from the frame
# index: merged_df is a concatenation of two frames, so its index labels can
# repeat, which would give different recordings the same task id.
output = []
recordings = zip(merged_df["filename"], merged_df["common_name"])
for task_id, (filename, common_name) in enumerate(recordings):
    # Read the clip duration (seconds) from the Ogg Vorbis stream info.
    duration_seconds = OggVorbis(os.path.join(ROOT, "data", filename)).info.length
    output.append(_build_task(task_id, filename, common_name, duration_seconds))

# Write through a context manager so the file is flushed and closed even if
# serialisation fails part-way.
with open(os.path.join(ROOT,"ls_complete_local_clean.json"), "w") as tasks_file:
    json.dump(output, tasks_file, indent=6)

'import copy\nfrom mutagen.oggvorbis import OggVorbis\n\noutput = []\nfor idx, row in merged_df.iterrows():\n    audio = OggVorbis(os.path.join(ROOT,"data",row.filename))\n    duration_seconds = audio.info.length\n    tmp = copy.deepcopy(template[0])\n    tmp["id"] = idx\n    tmp["annotations"][0]["result"][0]["original_length"] = duration_seconds\n    tmp["annotations"][0]["result"][0]["value"]["start"] = 0\n    tmp["annotations"][0]["result"][0]["value"]["end"] = duration_seconds\n    tmp["annotations"][0]["result"][0]["value"]["labels"] = [row.common_name]\n    tmp["annotations"][0]["ground_truth"] = True\n    tmp["annotations"][0]["task"] = idx\n    tmp["data"]["audio"] = "/data/local-files/?d=data/datasets/data_"+row.filename\n    output.append(tmp)\n    #if idx == 10: break\njson.dump(output, open(os.path.join(ROOT,"ls_complete_local_clean.json"), "w"), indent=6)'