# Audio Feature Extraction (MFCC)

This notebook:
1. Loads filtered Xeno-Canto and SSW60 audio indices
2. Extracts MFCC static, delta, and delta-delta features
3. Normalizes audio length to fixed window (3 seconds)
4. Stacks features into (H, W, 3) tensors
5. Caches tensors for efficient training
6. Creates stratified train/val/test splits

## Setup

In [2]:
import sys
from pathlib import Path
import json
import numpy as np
import pandas as pd

# Add the project root to sys.path so that the `src` package is importable.
# When the kernel's working directory is `notebooks/`, the root is its parent;
# otherwise the working directory itself is used.
ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
# Remove any earlier copy before inserting: re-running this cell then leaves
# exactly one entry for ROOT, and it stays at the front of the search path.
_root_entry = str(ROOT)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

from src.features.audio import extract_mfcc_features
from src.utils.splits import create_stratified_splits

# Artifact locations, all derived from the project root
ARTIFACTS = ROOT.joinpath('artifacts')
CACHE_DIR = ARTIFACTS.joinpath('audio_mfcc_cache', 'xeno_canto')

# Echo the resolved locations so a wrong working directory is easy to spot
print(
    f"Root: {ROOT}",
    f"Artifacts: {ARTIFACTS}",
    f"Cache directory: {CACHE_DIR}",
    sep="\n",
)

Root: /home/giovanni/ufmg/speckitdlbird
Artifacts: /home/giovanni/ufmg/speckitdlbird/artifacts
Cache directory: /home/giovanni/ufmg/speckitdlbird/artifacts/audio_mfcc_cache/xeno_canto


## Load Audio Indices

In [3]:
# Read the filtered Xeno-Canto recording index from the artifacts directory
xc_index_path = ARTIFACTS / 'xeno_canto_filtered.parquet'
xc_df = pd.read_parquet(xc_index_path)

# Report row count, number of distinct species labels, and the column names
print(
    f"Loaded {len(xc_df)} audio recordings",
    f"Unique species: {xc_df['species_normalized'].nunique()}",
    f"\nColumns: {list(xc_df.columns)}",
    f"\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell: rendered as a rich table preview
xc_df.head()

Loaded 11076 audio recordings
Unique species: 90

Columns: ['record_id', 'species', 'file_path', 'duration', 'sampling_rate', 'quality', 'species_normalized']

First few rows:


Unnamed: 0,record_id,species,file_path,duration,sampling_rate,quality,species_normalized
0,543339,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,11,44100 (Hz),0.0,american crow
1,543338,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,19,44100 (Hz),0.0,american crow
2,543337,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,66,44100 (Hz),0.0,american crow
3,543336,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,39,44100 (Hz),0.0,american crow
4,543335,American Crow,/media/giovanni/TOSHIBA EXT/dlbird/datasets/da...,8,44100 (Hz),0.0,american crow


## Load Existing Splits - Xeno-Canto

In [7]:
# Read the existing split assignment (created by scripts) from the artifacts directory
splits_file = ARTIFACTS / 'splits' / 'xeno_canto_audio_splits.json'
splits = json.loads(splits_file.read_text())

# Summarise how many entries each partition holds
print(
    "Loaded existing splits:",
    f"  Train: {len(splits['train'])} samples",
    f"  Val: {len(splits['val'])} samples",
    f"  Test: {len(splits['test'])} samples",
    sep="\n",
)

Loaded existing splits:
  Train: 7751 samples
  Val: 1662 samples
  Test: 1662 samples


## Verify Cached MFCC Features - Xeno-Canto

In [5]:
import random

# Check cached features.
# The listing is sorted so that it (and the sample drawn from it below) does not
# depend on the order in which the filesystem happens to enumerate files.
cached_files = sorted(CACHE_DIR.glob('**/*.npy'))
print(f"Total cached features: {len(cached_files)}")

if cached_files:
    # Spot-check a handful of files. A dedicated, seeded generator makes the
    # selection reproducible across kernel restarts without touching the
    # global `random` state.
    sample_files = random.Random(42).sample(cached_files, min(5, len(cached_files)))

    print("\nSample features:")
    for feature_path in sample_files:
        features = np.load(feature_path)
        # Files are reported as <parent directory>/<file name>
        print(f"  {feature_path.parent.name}/{feature_path.name}:")
        print(f"    Shape: {features.shape}, Dtype: {features.dtype}")
        print(f"    Range: [{features.min():.2f}, {features.max():.2f}]")
else:
    print("\n❌ No cached features found. Run feature extraction first.")

Total cached features: 11075

Sample features:
  northern_flicker/347999.npy:
    Shape: (40, 130, 3), Dtype: float32
    Range: [-667.39, 46.24]
  louisiana_waterthrush/538189.npy:
    Shape: (40, 130, 3), Dtype: float32
    Range: [-921.71, 77.93]
  house_wren/506163.npy:
    Shape: (40, 130, 3), Dtype: float32
    Range: [-417.85, 85.34]
  house_wren/552796.npy:
    Shape: (40, 130, 3), Dtype: float32
    Range: [-553.68, 86.77]
  red_eyed_vireo/550358.npy:
    Shape: (40, 130, 3), Dtype: float32
    Range: [-653.51, 103.29]


## Create Stratified Splits - Xeno-Canto

In [None]:
# Filter out species with too few samples (need at least 2 for stratification)
print("\nFiltering species with insufficient samples...")
xc_counts = xc_df["species_normalized"].value_counts()
species_to_keep = xc_counts[xc_counts >= 2].index
xc_df_filtered = xc_df[xc_df["species_normalized"].isin(species_to_keep)].copy()
print(f"After filtering: {len(xc_df_filtered)} recordings, {xc_df_filtered['species_normalized'].nunique()} species")

# Create audio splits (70/15/15).
# `create_stratified_splits` is already imported in the setup cell, so it is
# not re-imported here.
print("\nCreating stratified splits (70% train, 15% val, 15% test)...")
xc_splits = create_stratified_splits(
    xc_df_filtered,
    "species_normalized",
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
    random_seed=42
)

print(f"Train: {len(xc_splits['train'])} samples")
print(f"Val: {len(xc_splits['val'])} samples")
print(f"Test: {len(xc_splits['test'])} samples")

# Save splits.
# parents=True so the directory is created even if ARTIFACTS itself is missing.
SPLITS_DIR = ARTIFACTS / "splits"
SPLITS_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): this is the same file that the "load existing splits" cell
# reads earlier in the notebook, so running this cell overwrites it.
splits_file = SPLITS_DIR / "xeno_canto_audio_splits.json"
with open(splits_file, "w") as f:
    # Cast every entry to a plain int so it is JSON-serialisable
    json.dump({k: [int(x) for x in v] for k, v in xc_splits.items()}, f)

print(f"\n✓ Saved splits to {splits_file}")

## Extract and Cache MFCC Features - Xeno-Canto

In [None]:
import time

# Extract MFCC features for all audio files
#
# NOTE(review): `cache_audio_features` is neither imported nor defined in the
# cells visible in this notebook (the setup cell imports only
# `extract_mfcc_features`), so on a fresh kernel the call below raises
# NameError. Presumably it lives alongside `extract_mfcc_features` — confirm
# and import it in the setup cell.
print("=" * 80)
print("EXTRACTING MFCC FEATURES")
print("=" * 80)
print("\n⚠️  This will take 1-4 hours depending on CPU speed")
print("Features are cached, so subsequent runs will be much faster.\n")

# Short pause before the long-running job starts
time.sleep(2)

print("Extracting MFCC features for Xeno-Canto audio...")
print("Parameters:")
print("  - 40 MFCC coefficients")
print("  - 3s duration")
print("  - 22.05kHz sampling rate")
print("  - Output: (40, W, 3) [static + delta + delta-delta]\n")

# CACHE_DIR already ends in 'xeno_canto' (see the setup cell), so it is passed
# as-is. Appending "xeno_canto" again would write to a nested
# .../xeno_canto/xeno_canto directory instead of the cache root that the
# verification cell globs.
success_count = cache_audio_features(
    df=xc_df_filtered,
    cache_dir=CACHE_DIR,
    dataset_name="Xeno-Canto",
    n_mfcc=40,
    hop_length=512,
    n_fft=2048,
    target_sr=22050,
    duration=3.0
)

print(f"\n✓ Successfully cached {success_count}/{len(xc_df_filtered)} audio features")
print(f"✓ Cache location: {CACHE_DIR}")

## Feature Statistics

In [None]:
# Load and inspect one cached feature file to verify its format.
# CACHE_DIR already points at the Xeno-Canto cache, so it is searched directly
# (no extra "xeno_canto" component); sorting makes the inspected file the same
# on every run.
sample_features = sorted(CACHE_DIR.rglob("*.npy"))
if sample_features:
    sample_file = sample_features[0]
    features = np.load(sample_file)
    print(f"Sample feature file: {sample_file.name}")
    print(f"Feature shape: {features.shape}")
    print(f"Data type: {features.dtype}")
    print(f"Value range: [{features.min():.2f}, {features.max():.2f}]")
    # Only report success when the array really is 3-D with three channels
    # (static, delta, delta-delta) on the last axis.
    if features.ndim == 3 and features.shape[-1] == 3:
        print(f"\n✓ Features are in correct format: (n_mfcc, time_frames, 3)")
    else:
        print(f"\n❌ Unexpected feature shape {features.shape}; expected (n_mfcc, time_frames, 3)")
else:
    print("No cached features found yet.")