# Dataset Intersection Analysis

This notebook:
1. Loads the pre-built Xeno-Canto audio metadata index (A-M v11 and N-Z v11)
2. Loads the pre-built CUB-200-2011 image metadata index
3. Normalizes species names across datasets
4. Computes species intersection between Xeno-Canto and CUB-200
5. Filters samples to intersection species
6. Saves filtered indices for downstream tasks

## Setup and Imports

In [16]:
import sys
from pathlib import Path
import json

# Resolve the project root and make it importable: when the kernel was started
# inside `notebooks/` the root is one level up, otherwise the current working
# directory is used as-is.
ROOT = Path.cwd()
if ROOT.name == 'notebooks':
    ROOT = ROOT.parent
sys.path.insert(0, str(ROOT))

# Import pyarrow first to avoid extension type registration issues
import pyarrow as pa
import pandas as pd

# Project-local helpers (resolved through the sys.path entry added above)
from src.data.xeno_canto import index_xeno_canto
from src.data.cub import index_cub
from src.utils.species import normalize_species_name

# Input data lives under data/, generated files under artifacts/
DATA_DIR = ROOT.joinpath('data')
ARTIFACTS = ROOT.joinpath('artifacts')
ARTIFACTS.mkdir(exist_ok=True)  # no error when the directory already exists

# Locations of the raw datasets below DATA_DIR
XENO_CANTO_AM = DATA_DIR.joinpath('datasets/rohanrao/xeno-canto-bird-recordings-extended-a-m/versions/11')
XENO_CANTO_NZ = DATA_DIR.joinpath('datasets/rohanrao/xeno-canto-bird-recordings-extended-n-z/versions/11')
CUB_ROOT = DATA_DIR.joinpath('datasets/visipedia/cub-200-2011/versions/1/CUB_200_2011')

# Echo the resolved environment so the run is self-describing
print(f"Root: {ROOT}")
print(f"Artifacts: {ARTIFACTS}")
print(f"PyArrow version: {pa.__version__}")

Root: /home/giovanni/ufmg/speckitdlbird
Artifacts: /home/giovanni/ufmg/speckitdlbird/artifacts
PyArrow version: 22.0.0


## Index Xeno-Canto Metadata

## Load Existing Indexed Data

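The datasets have already been indexed by the project's indexing scripts, so this notebook loads the existing parquet index files from the artifacts directory instead of re-indexing the raw data.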

In [17]:
# Read the existing Xeno-Canto index from the artifacts directory, then report
# its row count, number of distinct species and column names.
print("Loading Xeno-Canto index...")
xc_df = pd.read_parquet(ARTIFACTS.joinpath('xeno_canto_index.parquet'))
print(f"Loaded {xc_df.shape[0]} Xeno-Canto recordings")
print(f"Unique species: {xc_df['species'].nunique()}")
print(f"Columns: {xc_df.columns.tolist()}")

Loading Xeno-Canto index...
Loaded 23784 Xeno-Canto recordings
Unique species: 259
Columns: ['record_id', 'species', 'file_path', 'duration', 'sampling_rate', 'quality']


In [18]:
# Preview the first five Xeno-Canto index rows
xc_df.head(n=5)

Unnamed: 0,record_id,species,file_path,duration,sampling_rate,quality
0,554809,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,49,44100 (Hz),0.0
1,552408,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,21,44100 (Hz),0.0
2,544552,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,35,44100 (Hz),0.0
3,544551,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,12,44100 (Hz),0.0
4,544550,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,38,44100 (Hz),0.0


## Index CUB-200-2011

In [19]:
# Read the existing CUB-200-2011 index from the artifacts directory, then
# report its row count, number of distinct species and column names.
print("Loading CUB-200-2011 index...")
cub_df = pd.read_parquet(ARTIFACTS.joinpath('cub_index.parquet'))
print(f"Loaded {cub_df.shape[0]} images")
print(f"Unique species: {cub_df['species'].nunique()}")
print(f"Columns: {cub_df.columns.tolist()}")

Loading CUB-200-2011 index...
Loaded 11788 images
Unique species: 200
Columns: ['image_id', 'class_id', 'species', 'file_path']


In [20]:
# Preview the first five CUB index rows
cub_df.head(n=5)

Unnamed: 0,image_id,class_id,species,file_path
0,1,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
1,2,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
2,3,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
3,4,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
4,5,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...


## Normalize Species Names

In [21]:
print("Normalizing species names...")

# Add a `species_normalized` column to both frames (Xeno-Canto first, then
# CUB) by running every raw species name through the shared normalizer.
for frame in (xc_df, cub_df):
    frame['species_normalized'] = frame['species'].apply(normalize_species_name)

print(f"Xeno-Canto: {xc_df['species_normalized'].nunique()} normalized species")
print(f"CUB: {cub_df['species_normalized'].nunique()} normalized species")

# Persist both frames, including the new column, next to the other artifacts
for frame, filename in (
    (xc_df, 'xeno_canto_normalized.parquet'),
    (cub_df, 'cub_normalized.parquet'),
):
    frame.to_parquet(ARTIFACTS / filename, index=False, engine='pyarrow')
print("Saved normalized dataframes")

Normalizing species names...
Xeno-Canto: 259 normalized species
CUB: 200 normalized species
Saved normalized dataframes


In [22]:
# CUB: Original vs Normalized Species
#
# Builds `cub_species_mapping` (one row per distinct original/normalized name
# pair, sorted by original name), prints a short preview and displays the
# full mapping as the cell result.
print("=" * 80)
print("CUB-200-2011 SPECIES NORMALIZATION")
print("=" * 80)

cub_species_mapping = (
    cub_df[['species', 'species_normalized']]
    .drop_duplicates()
    .sort_values('species')
)

# The preview size is taken from the slice itself, so the header and the
# "... more" line stay truthful even when fewer than 30 mappings exist.
cub_preview = cub_species_mapping.head(30)
cub_remaining = len(cub_species_mapping) - len(cub_preview)

print(f"\nTotal unique original species: {len(cub_species_mapping)}")
print(f"Total unique normalized species: {cub_species_mapping['species_normalized'].nunique()}")
print(f"\nShowing first {len(cub_preview)} mappings:\n")

# Display as a clean table
for idx, (orig, norm) in enumerate(cub_preview.itertuples(index=False, name=None), 1):
    print(f"{idx:3d}. {orig:40s} → {norm}")

# Only mention leftover rows when there actually are some
if cub_remaining > 0:
    print(f"\n... ({cub_remaining} more)")
cub_species_mapping

CUB-200-2011 SPECIES NORMALIZATION

Total unique original species: 200
Total unique normalized species: 200

Showing first 30 mappings:

  1. Acadian Flycatcher                       → acadian flycatcher
  2. American Crow                            → american crow
  3. American Goldfinch                       → american goldfinch
  4. American Pipit                           → american pipit
  5. American Redstart                        → american redstart
  6. American Three toed Woodpecker           → american three toed woodpecker
  7. Anna Hummingbird                         → anna hummingbird
  8. Artic Tern                               → artic tern
  9. Baird Sparrow                            → baird sparrow
 10. Baltimore Oriole                         → baltimore oriole
 11. Bank Swallow                             → bank swallow
 12. Barn Swallow                             → barn swallow
 13. Bay breasted Warbler                     → bay breasted warbler
 14. Belted Kingf

Unnamed: 0,species,species_normalized
2052,Acadian Flycatcher,acadian flycatcher
1581,American Crow,american crow
2649,American Goldfinch,american goldfinch
6034,American Pipit,american pipit
6322,American Redstart,american redstart
...,...,...
2410,Yellow bellied Flycatcher,yellow bellied flycatcher
1814,Yellow billed Cuckoo,yellow billed cuckoo
1056,Yellow breasted Chat,yellow breasted chat
603,Yellow headed Blackbird,yellow headed blackbird


In [23]:
# Xeno-Canto: Original vs Normalized Species
#
# Builds `xc_species_mapping` (one row per distinct original/normalized name
# pair, sorted by original name), prints a short preview and displays the
# full mapping as the cell result.
print("=" * 80)
print("XENO-CANTO SPECIES NORMALIZATION")
print("=" * 80)

xc_species_mapping = (
    xc_df[['species', 'species_normalized']]
    .drop_duplicates()
    .sort_values('species')
)

# The preview size is taken from the slice itself, so the header and the
# "... more" line stay truthful even when fewer than 30 mappings exist.
xc_preview = xc_species_mapping.head(30)
xc_remaining = len(xc_species_mapping) - len(xc_preview)

print(f"\nTotal unique original species: {len(xc_species_mapping)}")
print(f"Total unique normalized species: {xc_species_mapping['species_normalized'].nunique()}")
print(f"\nShowing first {len(xc_preview)} mappings:\n")

# Display as a clean table
for idx, (orig, norm) in enumerate(xc_preview.itertuples(index=False, name=None), 1):
    print(f"{idx:3d}. {orig:40s} → {norm}")

# Only mention leftover rows when there actually are some
if xc_remaining > 0:
    print(f"\n... ({xc_remaining} more)")
xc_species_mapping

XENO-CANTO SPECIES NORMALIZATION

Total unique original species: 259
Total unique normalized species: 259

Showing first 30 mappings:

  1. Alder Flycatcher                         → alder flycatcher
  2. American Avocet                          → american avocet
  3. American Bittern                         → american bittern
  4. American Bushtit                         → american bushtit
  5. American Cliff Swallow                   → american cliff swallow
  6. American Coot                            → american coot
  7. American Crow                            → american crow
  8. American Dusky Flycatcher                → american dusky flycatcher
  9. American Goldfinch                       → american goldfinch
 10. American Grey Flycatcher                 → american grey flycatcher
 11. American Kestrel                         → american kestrel
 12. American Redstart                        → american redstart
 13. American Robin                           → american robin
 14

Unnamed: 0,species,species_normalized
0,Alder Flycatcher,alder flycatcher
131,American Avocet,american avocet
140,American Bittern,american bittern
3924,American Bushtit,american bushtit
5083,American Cliff Swallow,american cliff swallow
...,...,...
23411,Woodhouse's Scrub Jay,woodhouse s scrub jay
23604,Yellow-bellied Flycatcher,yellow bellied flycatcher
23618,Yellow-bellied Sapsucker,yellow bellied sapsucker
23622,Yellow-headed Blackbird,yellow headed blackbird


In [24]:
# Check for many-to-one mappings (multiple original names → same normalized name)
#
# For each dataset the original names are grouped by normalized name; groups
# with more than one original name are collisions. Up to ten are printed.
print("=" * 80)
print("CHECKING FOR COLLISIONS (many-to-one mappings)")
print("=" * 80)

# Xeno-Canto collisions
xc_collisions = (
    xc_species_mapping
    .groupby('species_normalized')['species']
    .apply(list)
    .loc[lambda names: names.map(len) > 1]
)

print(f"\nXeno-Canto: {len(xc_collisions)} normalized species have multiple original names")
if not xc_collisions.empty:
    print("\nExamples:")
    for norm_name, orig_names in xc_collisions.head(10).items():
        print(f"\n  {norm_name}:")
        for orig in orig_names:
            print(f"    - {orig}")

# CUB collisions
cub_collisions = (
    cub_species_mapping
    .groupby('species_normalized')['species']
    .apply(list)
    .loc[lambda names: names.map(len) > 1]
)

print(f"\nCUB: {len(cub_collisions)} normalized species have multiple original names")
if not cub_collisions.empty:
    print("\nExamples:")
    for norm_name, orig_names in cub_collisions.head(10).items():
        print(f"\n  {norm_name}:")
        for orig in orig_names:
            print(f"    - {orig}")

CHECKING FOR COLLISIONS (many-to-one mappings)

Xeno-Canto: 0 normalized species have multiple original names

CUB: 0 normalized species have multiple original names


## Manual Validation: Species Normalization

The cells above list each original species name next to its normalized form and check for collisions, so the normalization logic can be verified by inspection.

## Compute Dataset Intersection

In [25]:
print("Computing species intersection...")

# Distinct normalized species names of each dataset, as sets
xc_species, cub_species = (
    set(frame['species_normalized'].unique()) for frame in (xc_df, cub_df)
)

# Species present in both datasets
intersection_species = xc_species.intersection(cub_species)

print(f"\nIntersection contains {len(intersection_species)} species")
print(f"Sample species: {sorted(intersection_species)[:10]}")

# Counts plus the alphabetically sorted species list, stored as JSON
intersection_meta = {
    'intersection_count': len(intersection_species),
    'xeno_canto_species': len(xc_species),
    'cub_species': len(cub_species),
    'intersection_species': sorted(intersection_species)
}

(ARTIFACTS / 'intersection_metadata.json').write_text(
    json.dumps(intersection_meta, indent=2)
)

print(f"Saved to {ARTIFACTS / 'intersection_metadata.json'}")

Computing species intersection...

Intersection contains 90 species
Sample species: ['american crow', 'american goldfinch', 'american redstart', 'baltimore oriole', 'barn swallow', 'belted kingfisher', 'black and white warbler', 'black billed cuckoo', 'black throated blue warbler', 'black throated sparrow']
Saved to /home/giovanni/ufmg/speckitdlbird/artifacts/intersection_metadata.json


In [26]:
# Show intersection species with their original names from both datasets
print("=" * 80)
print("INTERSECTION SPECIES - ORIGINAL NAMES COMPARISON")
print("=" * 80)
print(f"\nShowing all {len(intersection_species)} species in the intersection:\n")

# Lookup tables: normalized name -> list of original names in that dataset
xc_norm_to_orig = {
    norm: list(names)
    for norm, names in xc_species_mapping.groupby('species_normalized')['species']
}
cub_norm_to_orig = {
    norm: list(names)
    for norm, names in cub_species_mapping.groupby('species_normalized')['species']
}

# One three-line entry per intersection species, in alphabetical order
intersection_sorted = sorted(intersection_species)
for idx, norm_species in enumerate(intersection_sorted, start=1):
    xc_originals = xc_norm_to_orig.get(norm_species, [])
    cub_originals = cub_norm_to_orig.get(norm_species, [])

    print(
        f"\n{idx:2d}. {norm_species}",
        f"    Xeno-Canto: {', '.join(xc_originals)}",
        f"    CUB:        {', '.join(cub_originals)}",
        sep="\n",
    )

INTERSECTION SPECIES - ORIGINAL NAMES COMPARISON

Showing all 90 species in the intersection:


 1. american crow
    Xeno-Canto: American Crow
    CUB:        American Crow

 2. american goldfinch
    Xeno-Canto: American Goldfinch
    CUB:        American Goldfinch

 3. american redstart
    Xeno-Canto: American Redstart
    CUB:        American Redstart

 4. baltimore oriole
    Xeno-Canto: Baltimore Oriole
    CUB:        Baltimore Oriole

 5. barn swallow
    Xeno-Canto: Barn Swallow
    CUB:        Barn Swallow

 6. belted kingfisher
    Xeno-Canto: Belted Kingfisher
    CUB:        Belted Kingfisher

 7. black and white warbler
    Xeno-Canto: Black-and-white Warbler
    CUB:        Black and white Warbler

 8. black billed cuckoo
    Xeno-Canto: Black-billed Cuckoo
    CUB:        Black billed Cuckoo

 9. black throated blue warbler
    Xeno-Canto: Black-throated Blue Warbler
    CUB:        Black throated Blue Warbler

10. black throated sparrow
    Xeno-Canto: Black-throated 

In [27]:
# Create a detailed DataFrame for easier inspection
#
# One row per intersection species: its original name(s) in each dataset, the
# number of samples in each dataset, and how many distinct original names
# normalize to it (more than one would indicate a normalization collision).

# Need to get the sorted list and mappings
intersection_sorted = sorted(intersection_species)
xc_norm_to_orig = xc_species_mapping.groupby('species_normalized')['species'].apply(list).to_dict()
cub_norm_to_orig = cub_species_mapping.groupby('species_normalized')['species'].apply(list).to_dict()

# Count samples per normalized species once per dataset, instead of
# re-filtering the full frames for every species inside the loop.
xc_sample_counts = xc_df['species_normalized'].value_counts()
cub_sample_counts = cub_df['species_normalized'].value_counts()

intersection_validation = []
for norm_species in intersection_sorted:
    xc_originals = xc_norm_to_orig.get(norm_species, [])
    cub_originals = cub_norm_to_orig.get(norm_species, [])

    intersection_validation.append({
        'normalized_species': norm_species,
        'xeno_canto_original': ', '.join(xc_originals),
        'xeno_canto_samples': int(xc_sample_counts.get(norm_species, 0)),
        'cub_original': ', '.join(cub_originals),
        'cub_samples': int(cub_sample_counts.get(norm_species, 0)),
        'xc_variants': len(xc_originals),
        'cub_variants': len(cub_originals)
    })

# Explicit columns keep the table well-formed (and the summary below working)
# even when the intersection is empty.
intersection_df = pd.DataFrame(
    intersection_validation,
    columns=[
        'normalized_species',
        'xeno_canto_original',
        'xeno_canto_samples',
        'cub_original',
        'cub_samples',
        'xc_variants',
        'cub_variants',
    ],
)
print("\n" + "=" * 80)
print("INTERSECTION VALIDATION TABLE")
print("=" * 80)
print(f"\nTotal intersection species: {len(intersection_df)}")
print(f"Total Xeno-Canto samples: {intersection_df['xeno_canto_samples'].sum():,}")
print(f"Total CUB samples: {intersection_df['cub_samples'].sum():,}")
print(f"\nSpecies with multiple name variants in Xeno-Canto: {(intersection_df['xc_variants'] > 1).sum()}")
print(f"Species with multiple name variants in CUB: {(intersection_df['cub_variants'] > 1).sum()}")

intersection_df


INTERSECTION VALIDATION TABLE

Total intersection species: 90
Total Xeno-Canto samples: 11,076
Total CUB samples: 5,385

Species with multiple name variants in Xeno-Canto: 0
Species with multiple name variants in CUB: 0


Unnamed: 0,normalized_species,xeno_canto_original,xeno_canto_samples,cub_original,cub_samples,xc_variants,cub_variants
0,american crow,American Crow,147,American Crow,60,1,1
1,american goldfinch,American Goldfinch,88,American Goldfinch,60,1,1
2,american redstart,American Redstart,198,American Redstart,60,1,1
3,baltimore oriole,Baltimore Oriole,69,Baltimore Oriole,60,1,1
4,barn swallow,Barn Swallow,608,Barn Swallow,60,1,1
...,...,...,...,...,...,...,...
85,white throated sparrow,White-throated Sparrow,170,White throated Sparrow,60,1,1
86,winter wren,Winter Wren,109,Winter Wren,60,1,1
87,yellow bellied flycatcher,Yellow-bellied Flycatcher,14,Yellow bellied Flycatcher,59,1,1
88,yellow headed blackbird,Yellow-headed Blackbird,25,Yellow headed Blackbird,56,1,1


In [28]:
# Write the validation table to CSV (without the row index) for reference
intersection_df.to_csv(ARTIFACTS / 'intersection_validation.csv', index=False)
print(f"✓ Saved validation table to {ARTIFACTS / 'intersection_validation.csv'}")

# Rows where either dataset has more than one original name for the species
potential_issues = intersection_df.loc[
    intersection_df[['xc_variants', 'cub_variants']].gt(1).any(axis=1)
]
if not potential_issues.empty:
    print(f"\n⚠️  {len(potential_issues)} species have multiple name variants:")
    print(potential_issues[['normalized_species', 'xeno_canto_original', 'cub_original']])

✓ Saved validation table to /home/giovanni/ufmg/speckitdlbird/artifacts/intersection_validation.csv


## Manual Validation: Intersection Species

The cells above show, for each species in the intersection, which original names from Xeno-Canto and CUB map to the same normalized name.

## Filter Datasets to Intersection

In [29]:
# Keep only Xeno-Canto recordings whose normalized species is in the intersection
xc_filtered = xc_df.loc[xc_df['species_normalized'].isin(intersection_species)].copy()
print(f"Xeno-Canto filtered: {xc_filtered.shape[0]} recordings ({xc_filtered['species_normalized'].nunique()} species)")

# Keep only CUB images whose normalized species is in the intersection
cub_filtered = cub_df.loc[cub_df['species_normalized'].isin(intersection_species)].copy()
print(f"CUB filtered: {cub_filtered.shape[0]} images ({cub_filtered['species_normalized'].nunique()} species)")

# Persist both filtered frames as parquet (row index dropped)
for frame, filename in (
    (xc_filtered, 'xeno_canto_filtered.parquet'),
    (cub_filtered, 'cub_filtered.parquet'),
):
    frame.to_parquet(ARTIFACTS / filename, index=False, engine='pyarrow')

print("\n✓ Filtered datasets saved!")

Xeno-Canto filtered: 11076 recordings (90 species)
CUB filtered: 5385 images (90 species)

✓ Filtered datasets saved!


## Summary Statistics

In [30]:
# Headline numbers for both datasets before and after the intersection filter
summary = {
    'xeno_canto_total': xc_df.shape[0],
    'xeno_canto_species': xc_df['species'].nunique(),
    'xeno_canto_normalized_species': xc_df['species_normalized'].nunique(),
    'cub_total': cub_df.shape[0],
    'cub_species': cub_df['species'].nunique(),
    'cub_normalized_species': cub_df['species_normalized'].nunique(),
    'intersection_count': len(intersection_species),
    'xeno_canto_filtered_count': xc_filtered.shape[0],
    'cub_filtered_count': cub_filtered.shape[0],
}

# Banner, then one dot-padded "key....  value" line per entry
print()
print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)
print("\n".join(f"  {key:.<40} {value:>6,}" for key, value in summary.items()))

# Store the same numbers as JSON alongside the other artifacts
with open(ARTIFACTS / 'dataset_summary.json', 'w') as handle:
    handle.write(json.dumps(summary, indent=2))

print(f"\n✓ Summary saved to {ARTIFACTS / 'dataset_summary.json'}")


DATASET SUMMARY
  xeno_canto_total........................ 23,784
  xeno_canto_species......................    259
  xeno_canto_normalized_species...........    259
  cub_total............................... 11,788
  cub_species.............................    200
  cub_normalized_species..................    200
  intersection_count......................     90
  xeno_canto_filtered_count............... 11,076
  cub_filtered_count......................  5,385

✓ Summary saved to /home/giovanni/ufmg/speckitdlbird/artifacts/dataset_summary.json
