# Audio Feature Extraction (MFCC)

This notebook:
1. Loads filtered Xeno-Canto and SSW60 audio indices
2. Extracts MFCC static, delta, and delta-delta features
3. Normalizes audio length to a fixed window (3 seconds)
4. Stacks features into (H, W, 3) tensors
5. Caches tensors for efficient training
6. Creates stratified train/val/test splits

## Setup

In [None]:
import sys
from pathlib import Path
import json
import numpy as np
import pandas as pd

# Resolve the project root: when the kernel is started from inside a
# `notebooks` directory the root is its parent; otherwise the current
# working directory itself is used as the root.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Put the root at the front of sys.path so the `src` package below resolves.
sys.path.insert(0, str(ROOT))

from src.features.audio import extract_mfcc_features, cache_audio_features
from src.utils.splits import create_stratified_splits

# Artifact locations shared by the rest of the notebook
ARTIFACTS = ROOT.joinpath('artifacts')
CACHE_DIR = ARTIFACTS.joinpath('audio_mfcc_cache', 'xeno_canto')

# Echo the resolved locations so a misdetected root is visible immediately
print(f"Root: {ROOT}")
print(f"Artifacts: {ARTIFACTS}")
print(f"Cache directory: {CACHE_DIR}")

Root: /home/giovanni/ufmg/speckitdlbird
Artifacts: /home/giovanni/ufmg/speckitdlbird/artifacts
Cache directory: /home/giovanni/ufmg/speckitdlbird/artifacts/audio_mfcc_cache/xeno_canto


## Load Audio Indices

In [2]:
# Read the filtered Xeno-Canto recording index from the artifacts directory
xc_index_path = ARTIFACTS / 'xeno_canto_filtered.parquet'
xc_df = pd.read_parquet(xc_index_path)

# Quick overview: row count, number of distinct species labels, column names
n_recordings = len(xc_df)
n_unique_species = xc_df['species_normalized'].nunique()
column_names = list(xc_df.columns)

print(f"Loaded {n_recordings} audio recordings")
print(f"Unique species: {n_unique_species}")
print(f"\nColumns: {column_names}")
print(f"\nFirst few rows:")

# Last expression of the cell, so the preview is rendered as the cell output
xc_df.head()

Loaded 11076 audio recordings
Unique species: 90

Columns: ['record_id', 'species', 'file_path', 'duration', 'sampling_rate', 'quality', 'species_normalized']

First few rows:


Unnamed: 0,record_id,species,file_path,duration,sampling_rate,quality,species_normalized
0,543339,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,11,44100 (Hz),0.0,american crow
1,543338,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,19,44100 (Hz),0.0,american crow
2,543337,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,66,44100 (Hz),0.0,american crow
3,543336,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,39,44100 (Hz),0.0,american crow
4,543335,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,8,44100 (Hz),0.0,american crow


In [8]:
# Number of recordings per normalized species label.
# value_counts() returns the counts ordered from most to least frequent,
# so head() gives the best-represented species and tail() the rarest.
species_counts = xc_df['species_normalized'].value_counts()
n_species = len(species_counts)


def _print_species_rows(counts):
    """Print one table row per entry: name left-aligned (40), count right-aligned (8)."""
    for name, n_audios in counts.items():
        print(f"{name:<40} {n_audios:>8}")


# Summary statistics over the per-species counts
print(f"Audio distribution across {n_species} species:")
print("=" * 60)
print(f"\nTotal audios: {len(xc_df)}")
print(f"Average audios per species: {species_counts.mean():.1f}")
print(f"Median audios per species: {species_counts.median():.0f}")
print(f"Min audios: {species_counts.min()} | Max audios: {species_counts.max()}")

# Table of the 20 most frequent species
print(f"\n{'Species':<40} {'Count':>8}")
print("-" * 60)
_print_species_rows(species_counts.head(20))

# Only when there are more than 20 species: report how many were omitted
# and list the 10 least frequent ones.
if n_species > 20:
    print(f"\n... and {n_species - 20} more species")
    print(f"\nBottom 10 species (least audios):")
    print("-" * 60)
    _print_species_rows(species_counts.tail(10))

Audio distribution across 90 species:

Total audios: 11076
Average audios per species: 123.1
Median audios per species: 54
Min audios: 1 | Max audios: 1216

Species                                     Count
------------------------------------------------------------
house sparrow                                1216
house wren                                    984
song sparrow                                  621
barn swallow                                  608
red winged blackbird                          466
mallard                                       407
carolina wren                                 371
common yellowthroat                           307
warbling vireo                                306
dark eyed junco                               257
marsh wren                                    240
blue jay                                      239
white breasted nuthatch                       228
common tern                                   224
western meadowlark              

## Audio Distribution per Species

## Extract and Cache MFCC Features - Xeno-Canto

In [3]:
# Load existing splits (created by scripts) from artifacts/splits.
# The file is parsed as JSON and indexed below by 'train', 'val' and 'test'.
splits_file = ARTIFACTS / 'splits' / 'xeno_canto_audio_splits.json'
splits = json.loads(splits_file.read_text())

# Report the size of each partition
print("Loaded existing splits:")
print(f"  Train: {len(splits['train'])} samples")
print(f"  Val: {len(splits['val'])} samples")
print(f"  Test: {len(splits['test'])} samples")

Loaded existing splits:
  Train: 7751 samples
  Val: 1662 samples
  Test: 1662 samples


## Create Stratified Splits - Xeno-Canto

In [None]:
# Filter out species with too few samples (need at least 2 for stratification).
# NOTE(review): a threshold of 2 does not guarantee that every species shows up
# in all three partitions -- the recorded run reports 89 species in train but
# only 87 in val/test. Raise the threshold if per-species evaluation needs
# full coverage.
print("\nFiltering species with insufficient samples...")
xc_counts = xc_df["species_normalized"].value_counts()
species_to_keep = xc_counts[xc_counts >= 2].index
# .copy() so the filtered frame is independent of xc_df
xc_df_filtered = xc_df[xc_df["species_normalized"].isin(species_to_keep)].copy()
print(f"After filtering: {len(xc_df_filtered)} recordings, {xc_df_filtered['species_normalized'].nunique()} species")

# Create audio splits (70/15/15).
# create_stratified_splits is already imported in the Setup cell, so it is not
# re-imported here (all imports live in one place).
print("\nCreating stratified splits (70% train, 15% val, 15% test)...")
xc_splits = create_stratified_splits(
    xc_df_filtered,
    "species_normalized",
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
    random_state=42,  # fixed seed passed to the splitter for repeatable runs
)

print(f"Train: {len(xc_splits['train'])} samples")
print(f"Val: {len(xc_splits['val'])} samples")
print(f"Test: {len(xc_splits['test'])} samples")

# Save splits. parents=True creates any missing parent directory as well, so
# the cell does not fail with FileNotFoundError when the artifacts directory
# has not been created yet.
SPLITS_DIR = ARTIFACTS / "splits"
SPLITS_DIR.mkdir(parents=True, exist_ok=True)

splits_file = SPLITS_DIR / "xeno_canto_audio_splits.json"
with open(splits_file, "w") as f:
    # Cast every entry to a built-in int so the values are JSON-serializable
    json.dump({k: [int(x) for x in v] for k, v in xc_splits.items()}, f)

print(f"\n✓ Saved splits to {splits_file}")


Filtering species with insufficient samples...
After filtering: 11075 recordings, 89 species

Creating stratified splits (70% train, 15% val, 15% test)...
Split sizes - Train: 7751, Val: 1662, Test: 1662
  train: 89 unique species
  val: 87 unique species
  test: 87 unique species
Train: 7751 samples
Val: 1662 samples
Test: 1662 samples

✓ Saved splits to /home/giovanni/ufmg/speckitdlbird/artifacts/splits/xeno_canto_audio_splits.json


## Feature Statistics

In [7]:
# Load and inspect a cached feature to verify format.
#
# CACHE_DIR (defined in the Setup cell) already ends in 'xeno_canto', so it is
# searched directly. Appending "xeno_canto" a second time pointed the search at
# .../audio_mfcc_cache/xeno_canto/xeno_canto. rglob is recursive, so any files
# stored in such a nested folder are still found.
# sorted() makes the choice of the inspected sample deterministic across runs.
sample_features = sorted(CACHE_DIR.rglob("*.npy"))
if sample_features:
    sample_file = sample_features[0]
    features = np.load(sample_file)
    print(f"Sample feature file: {sample_file.name}")
    print(f"Feature shape: {features.shape}")
    print(f"Data type: {features.dtype}")
    print(f"Value range: [{features.min():.2f}, {features.max():.2f}]")
    # Only claim the format is correct after actually checking it: a rank-3
    # array whose last axis holds the 3 stacked channels
    # (static, delta, delta-delta).
    if features.ndim == 3 and features.shape[-1] == 3:
        print(f"\n✓ Features are in correct format: (n_mfcc, time_frames, 3)")
    else:
        print(f"\n✗ Unexpected feature shape {features.shape}; expected (n_mfcc, time_frames, 3)")
else:
    print("No cached features found yet.")

No cached features found yet.
