# Dataset Intersection Analysis

This notebook:
1. Loads the pre-built Xeno-Canto audio metadata index (A-M v11 and N-Z v11)
2. Loads the pre-built CUB-200-2011 image metadata index
3. Extracts and indexes the SSW60 dataset (optional; skipped when the archive is missing)
4. Normalizes species names across datasets
5. Computes species intersection between Xeno-Canto and CUB-200
6. Filters samples to intersection species
7. Saves filtered indices for downstream tasks

## Setup and Imports

In [1]:
import sys
from pathlib import Path
import json

# Resolve the project root: when the kernel runs from a directory named
# `notebooks`, the root is its parent; otherwise the working directory is used.
ROOT = Path.cwd()
if ROOT.name == 'notebooks':
    ROOT = ROOT.parent
# Put the root first on the import path so `src.*` modules can be imported.
sys.path.insert(0, str(ROOT))

# Import pyarrow first to avoid extension type registration issues
import pyarrow as pa
import pandas as pd

from src.data.xeno_canto import index_xeno_canto
from src.data.cub import index_cub
from src.utils.species import normalize_species_name

# Input data lives under <ROOT>/data
DATA_DIR = ROOT.joinpath('data')

# Raw dataset locations, all relative to DATA_DIR
XENO_CANTO_AM = DATA_DIR.joinpath('datasets/rohanrao/xeno-canto-bird-recordings-extended-a-m/versions/11')
XENO_CANTO_NZ = DATA_DIR.joinpath('datasets/rohanrao/xeno-canto-bird-recordings-extended-n-z/versions/11')
CUB_ROOT = DATA_DIR.joinpath('datasets/visipedia/cub-200-2011/versions/1/CUB_200_2011')

# Output directory for the parquet/JSON files this notebook reads and writes;
# created here if it does not exist yet.
ARTIFACTS = ROOT.joinpath('artifacts')
ARTIFACTS.mkdir(exist_ok=True)

# Echo the resolved configuration
print(f"Root: {ROOT}")
print(f"Artifacts: {ARTIFACTS}")
print(f"PyArrow version: {pa.__version__}")

Root: /home/giovanni/ufmg/speckitdlbird
Artifacts: /home/giovanni/ufmg/speckitdlbird/artifacts
PyArrow version: 22.0.0


## Index Xeno-Canto Metadata

### Load Existing Indexed Data

The data has already been indexed by the scripts, so this notebook loads the existing Parquet files from `artifacts/` instead of re-indexing.

In [2]:
# Read the previously written Xeno-Canto index from ARTIFACTS and report its size
print("Loading Xeno-Canto index...")
xc_df = pd.read_parquet(ARTIFACTS.joinpath('xeno_canto_index.parquet'))
print(f"Loaded {xc_df.shape[0]} Xeno-Canto recordings")
print(f"Unique species: {xc_df['species'].nunique()}")
print(f"Columns: {xc_df.columns.tolist()}")

Loading Xeno-Canto index...
Loaded 23784 Xeno-Canto recordings
Unique species: 259
Columns: ['record_id', 'species', 'file_path', 'duration', 'sampling_rate', 'quality']


In [3]:
# Preview the first five rows of the Xeno-Canto index
xc_df.head(n=5)

Unnamed: 0,record_id,species,file_path,duration,sampling_rate,quality
0,554809,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,49,44100 (Hz),0.0
1,552408,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,21,44100 (Hz),0.0
2,544552,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,35,44100 (Hz),0.0
3,544551,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,12,44100 (Hz),0.0
4,544550,Alder Flycatcher,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,38,44100 (Hz),0.0


## Load CUB-200-2011 Index

In [4]:
# Read the previously written CUB-200-2011 index from ARTIFACTS and report its size
print("Loading CUB-200-2011 index...")
cub_df = pd.read_parquet(ARTIFACTS.joinpath('cub_index.parquet'))
print(f"Loaded {cub_df.shape[0]} images")
print(f"Unique species: {cub_df['species'].nunique()}")
print(f"Columns: {cub_df.columns.tolist()}")

Loading CUB-200-2011 index...
Loaded 11788 images
Unique species: 200
Columns: ['image_id', 'class_id', 'species', 'file_path']


In [5]:
# Preview the first five rows of the CUB-200-2011 index
cub_df.head(n=5)

Unnamed: 0,image_id,class_id,species,file_path
0,1,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
1,2,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
2,3,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
3,4,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...
4,5,1,Black footed Albatross,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...


## Extract and Index SSW60

In [6]:
# SSW60 is optional: it is extracted and indexed only when the archive exists
SSW60_TAR = DATA_DIR.joinpath('mixed/ssw60.tar.gz')
print(SSW60_TAR)
if not SSW60_TAR.exists():
    print("SSW60 not found - skipping")
    # Empty placeholder with a `species` column so later cells can still
    # check its length without special-casing a missing variable.
    ssw60_df = pd.DataFrame(columns=['species'])
else:
    print("Extracting and indexing SSW60...")
    # Imported here so the module is only required when the archive is present
    from src.data.ssw60 import extract_and_index_ssw60
    ssw60_df = extract_and_index_ssw60(SSW60_TAR, DATA_DIR.joinpath('ssw60'))
    print(f"Found {ssw60_df.shape[0]} samples")
    print(f"Unique species: {ssw60_df['species'].nunique()}")
    ssw60_df.to_parquet(ARTIFACTS.joinpath('ssw60_index.parquet'), index=False, engine='pyarrow')

/home/giovanni/ufmg/speckitdlbird/data/mixed/ssw60.tar.gz
SSW60 not found - skipping


## Normalize Species Names

In [7]:
print("Normalizing species names...")

# Add a `species_normalized` column to both mandatory datasets
for frame in (xc_df, cub_df):
    frame['species_normalized'] = frame['species'].apply(normalize_species_name)

# SSW60 is only normalized when it actually has rows
if not ssw60_df.empty:
    ssw60_df['species_normalized'] = ssw60_df['species'].apply(normalize_species_name)

print(f"Xeno-Canto: {xc_df['species_normalized'].nunique()} normalized species")
print(f"CUB: {cub_df['species_normalized'].nunique()} normalized species")

# Persist the Xeno-Canto and CUB frames with the new column
xc_df.to_parquet(ARTIFACTS.joinpath('xeno_canto_normalized.parquet'), index=False, engine='pyarrow')
cub_df.to_parquet(ARTIFACTS.joinpath('cub_normalized.parquet'), index=False, engine='pyarrow')
print("Saved normalized dataframes")

Normalizing species names...
Xeno-Canto: 259 normalized species
CUB: 200 normalized species
Saved normalized dataframes


## Compute Dataset Intersection

In [8]:
print("Computing species intersection...")

# Unique normalized species per dataset. Missing values are dropped first:
# NaN/None is not a species, it would inflate the counts stored below, and it
# makes sorted() raise TypeError when mixed with string names.
xc_species = set(xc_df['species_normalized'].dropna().unique())
cub_species = set(cub_df['species_normalized'].dropna().unique())

# Species present in both datasets
intersection_species = xc_species & cub_species

# Sort once; the same ordered list feeds both the preview and the JSON file
intersection_sorted = sorted(intersection_species)

print(f"\nIntersection contains {len(intersection_species)} species")
print(f"Sample species: {intersection_sorted[:10]}")

# Metadata written next to the other artifacts: per-dataset species counts
# plus the full, alphabetically ordered intersection list.
intersection_meta = {
    'intersection_count': len(intersection_species),
    'xeno_canto_species': len(xc_species),
    'cub_species': len(cub_species),
    'intersection_species': intersection_sorted
}

intersection_path = ARTIFACTS / 'intersection_metadata.json'
# Explicit encoding so the file does not depend on the platform default
with open(intersection_path, 'w', encoding='utf-8') as f:
    json.dump(intersection_meta, f, indent=2)

print(f"Saved to {intersection_path}")

Computing species intersection...

Intersection contains 90 species
Sample species: ['american crow', 'american goldfinch', 'american redstart', 'baltimore oriole', 'barn swallow', 'belted kingfisher', 'black and white warbler', 'black billed cuckoo', 'black throated blue warbler', 'black throated sparrow']
Saved to /home/giovanni/ufmg/speckitdlbird/artifacts/intersection_metadata.json


## Filter Datasets to Intersection

In [9]:
# Keep only the rows whose normalized species belongs to the intersection.
# `.copy()` detaches each result from its source frame.
xc_filtered = xc_df.loc[xc_df['species_normalized'].isin(intersection_species)].copy()
cub_filtered = cub_df.loc[cub_df['species_normalized'].isin(intersection_species)].copy()

print(f"Xeno-Canto filtered: {xc_filtered.shape[0]} recordings ({xc_filtered['species_normalized'].nunique()} species)")
print(f"CUB filtered: {cub_filtered.shape[0]} images ({cub_filtered['species_normalized'].nunique()} species)")

# Write both filtered frames to ARTIFACTS
xc_filtered.to_parquet(ARTIFACTS.joinpath('xeno_canto_filtered.parquet'), index=False, engine='pyarrow')
cub_filtered.to_parquet(ARTIFACTS.joinpath('cub_filtered.parquet'), index=False, engine='pyarrow')

print("\n✓ Filtered datasets saved!")

Xeno-Canto filtered: 11076 recordings (90 species)
CUB filtered: 5385 images (90 species)

✓ Filtered datasets saved!


## Summary Statistics

In [19]:
# Headline counts: raw totals, species before/after normalization, the
# intersection size, and the row counts that survived filtering.
summary = dict(
    xeno_canto_total=len(xc_df),
    xeno_canto_species=xc_df['species'].nunique(),
    xeno_canto_normalized_species=xc_df['species_normalized'].nunique(),
    cub_total=len(cub_df),
    cub_species=cub_df['species'].nunique(),
    cub_normalized_species=cub_df['species_normalized'].nunique(),
    intersection_count=len(intersection_species),
    xeno_canto_filtered_count=len(xc_filtered),
    cub_filtered_count=len(cub_filtered),
)

# Banner: blank line, rule, title, rule
print("\n" + "=" * 60, "DATASET SUMMARY", "=" * 60, sep="\n")
# One dot-padded row per metric, values right-aligned with thousands separators
for metric, count in summary.items():
    print(f"  {metric:.<40} {count:>6,}")

summary_path = ARTIFACTS / 'dataset_summary.json'
with summary_path.open('w') as fh:
    json.dump(summary, fh, indent=2)

print(f"\n✓ Summary saved to {summary_path}")


DATASET SUMMARY
  xeno_canto_total........................ 23,784
  xeno_canto_species......................    259
  xeno_canto_normalized_species...........    259
  cub_total............................... 11,788
  cub_species.............................    200
  cub_normalized_species..................    200
  intersection_count......................     90
  xeno_canto_filtered_count............... 11,076
  cub_filtered_count......................  5,385

✓ Summary saved to /home/giovanni/ufmg/speckitdlbird/artifacts/dataset_summary.json
