In [1]:

import pandas as pd
from utils.audio_tools import extract_positive_clip_contextual, extract_positive_clip, extract_negative_clip, extract_negatives_iterative
import os

## Read the labels from the annotations file

Includes wav_filename, start_time_s, duration_s, location, date, etc.

In [2]:
annotations = pd.read_csv("annotations.tsv", sep="\t")
annotations

Unnamed: 0,dataset,wav_filename,start_time_s,duration_s,location,date,pst_or_master_tape_identifier
0,podcast_round1,60012.wav,34.126,2.918,"Dabob Bay, Seattle, Washington",1960-10-28,60012
1,podcast_round1,60012.wav,36.816,2.588,"Dabob Bay, Seattle, Washington",1960-10-28,60012
2,podcast_round1,60012.wav,42.550,2.055,"Dabob Bay, Seattle, Washington",1960-10-28,60012
3,podcast_round1,60012.wav,44.606,2.410,"Dabob Bay, Seattle, Washington",1960-10-28,60012
4,podcast_round1,60012.wav,46.636,3.425,"Dabob Bay, Seattle, Washington",1960-10-28,60012
...,...,...,...,...,...,...,...
5532,podcast_round12,rpi-port-townsend_2020_09_08_14_55_00.wav,0.000,0.000,port_townsend,2020-09-08,14:55:00
5533,podcast_round12,rpi-port-townsend_2020_09_08_14_28_00.wav,0.000,0.000,port_townsend,2020-09-08,14:28:00
5534,podcast_round12,rpi-port-townsend_2020_10_08_02_25_00.wav,0.000,0.000,port_townsend,2020-10-08,02:25:00
5535,podcast_round12,rpi-port-townsend_2020_09_08_15_30_00.wav,0.000,0.000,port_townsend,2020-09-08,15:30:00


## Set Up Output Folder for Clips



In [3]:
import os
import shutil

# Remove and recreate positive clips folder
pos_clip_dir = "data/clips/positive"
shutil.rmtree(pos_clip_dir, ignore_errors=True)
os.makedirs(pos_clip_dir, exist_ok=True)

# Remove and recreate negative clips folder
neg_clip_dir = "data/clips/negative"
shutil.rmtree(neg_clip_dir, ignore_errors=True)
os.makedirs(neg_clip_dir, exist_ok=True)

## Extract 200 Positive and 200 Negative Clips with 26 Features Each

This section builds a clean, balanced dataset by extracting 2-second clips from labeled orca calls and safe, non-overlapping segments from the same WAVs. Each record includes MFCC + delta features, labels, and metadata.


In [4]:
import random

# === POSITIVE CLIPS ===
records = []
pos_count = 0
seen_indices = set()
total_needed = 1000
output_dir = "data/clips/positive"
random.seed(42)  # For reproducibility

while pos_count < total_needed:
    if len(seen_indices) == len(annotations):
        print("Used all annotations, only collected", pos_count, "clips.")
        break

    rand_index = random.randint(0, len(annotations) - 1)
    if rand_index in seen_indices:
        continue
    seen_indices.add(rand_index)

    row = annotations.iloc[rand_index]
    wav_path = os.path.join("data", "wav", row["wav_filename"])
    wav_base = os.path.splitext(row["wav_filename"])[0]

    pos_output_path = os.path.join(
        output_dir,
        f"clip_{pos_count+1:05}_{wav_base}_{row['start_time_s']:.2f}".replace(".", "_") + ".wav"
    )

    mfcc, delta = extract_positive_clip_contextual(
        wav_path=wav_path,
        start_time=row["start_time_s"],
        duration=row["duration_s"],
        annotations_df=annotations,
        output_path=pos_output_path
    )

    if mfcc is not None:
        record = {
            "clip_name": os.path.basename(pos_output_path),
            "label": "orca_call",
            "source_wav": row["wav_filename"],
            "start_time": row["start_time_s"]
        }
        for i, val in enumerate(mfcc):
            record[f"mfcc_{i+1}"] = val
        for i, val in enumerate(delta):
            record[f"delta_mfcc_{i+1}"] = val
        records.append(record)
        print(f"Extracted {len(records)} positive clips", end="\r")
        pos_count += 1




# === NEGATIVE ===
def extract_negatives_random_multiwav(
    wav_dir,
    annotations_df,
    output_dir,
    clip_start_index=1,
    max_clips=1000,
    target_duration=2.0,
    buffer=1.0,
    sr_target=22050,
    n_mfcc=13
):
    import os, random
    import soundfile as sf
    import librosa
    import numpy as np

    os.makedirs(output_dir, exist_ok=True)
    records = []
    seen = set()
    attempts = 0
    max_attempts = max_clips * 20

    # Build exclusion zones: {wav_filename: [(start, end), ...]}
    exclusion = {}
    for _, row in annotations_df.iterrows():
        fname = row["wav_filename"]
        start = max(0, row["start_time_s"] - buffer)
        end = row["start_time_s"] + row["duration_s"] + buffer
        exclusion.setdefault(fname, []).append((start, end))

    wav_files = [f for f in os.listdir(wav_dir) if f.endswith(".wav")]

    while len(records) < max_clips and attempts < max_attempts:
        wav_file = random.choice(wav_files)
        wav_path = os.path.join(wav_dir, wav_file)
        try:
            data, sr = sf.read(wav_path)
        except:
            attempts += 1
            continue

        if data.ndim > 1:
            data = librosa.to_mono(data)

        if sr != sr_target:
            data = librosa.resample(data.astype(np.float32), orig_sr=sr, target_sr=sr_target)
            sr = sr_target

        duration = len(data) / sr
        max_start = duration - target_duration
        if max_start <= 0:
            attempts += 1
            continue

        start_time = round(random.uniform(0, max_start), 2)
        end_time = start_time + target_duration

        # Check for overlap with known orca call windows
        overlaps = any(
            max(start_time, excl_start) < min(end_time, excl_end)
            for excl_start, excl_end in exclusion.get(wav_file, [])
        )
        if overlaps:
            attempts += 1
            continue

        key = (wav_file, start_time)
        if key in seen:
            attempts += 1
            continue
        seen.add(key)

        # Extract audio
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)
        clip = data[start_sample:end_sample]

        if len(clip) < sr * 0.3:
            attempts += 1
            continue

        # Save clip
        clip_idx = clip_start_index + len(records)
        filename = f"neg_{clip_idx:05}_{os.path.splitext(wav_file)[0]}_{start_time:.2f}".replace(".", "_") + ".wav"
        clip_path = os.path.join(output_dir, filename)
        sf.write(clip_path, clip, sr)

        # Extract features
        mfcc = librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=n_mfcc)
        delta = librosa.feature.delta(mfcc)
        record = {
            "clip_name": filename,
            "label": "no_call",
            "source_wav": wav_file,
            "start_time": start_time
        }
        for i, val in enumerate(np.mean(mfcc, axis=1)):
            record[f"mfcc_{i+1}"] = val
        for i, val in enumerate(np.mean(delta, axis=1)):
            record[f"delta_mfcc_{i+1}"] = val
        records.append(record)

    print(f"Generated {len(records)} negative clips after {attempts} attempts.")
    return records

neg_records = extract_negatives_random_multiwav(
    wav_dir="data/wav",
    annotations_df=annotations,
    output_dir="data/clips/negative",
    clip_start_index=1,
    max_clips=1000,
    buffer=1.0
)

records.extend(neg_records)

# === Save final CSV ===

df = pd.DataFrame(records)
df.to_csv("mfcc_balanced_100x100.csv", index=False)


Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Extracted 75 positive clips



Error: when mode='interp', width=9 cannot exceed data.shape[axis]=1
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: when mode='interp', width=9 cannot exceed data.shape[axis]=1
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: when mode='interp', width=9 cannot exceed data.shape[axis]=1
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Error: when mode='interp', widt

In [5]:
import pandas as pd
import os

# Get used files
df = pd.read_csv("mfcc_balanced_100x100.csv")
used_files = df["source_wav"].unique()

# List all WAVs
all_wavs = sorted(os.listdir("data/wav"))

# Filter unseen
unseen = [f for f in all_wavs if f not in used_files]

# Sort unseen by file size (descending)
unseen.sort(key=lambda f: os.path.getsize(os.path.join("data/wav", f)), reverse=True)

print("Unseen test files (largest first):")
for f in unseen:
    size_mb = os.path.getsize(os.path.join("data/wav", f)) / (1024 * 1024)
    print(f"{f} - {size_mb:.2f} MB")



Unseen test files (largest first):
live_feed_sim.wav - 509.90 MB
rpi-orcasound-lab_2020_09_06_13_32_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_17_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_40_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_42_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_43_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_45_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_46_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_48_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_51_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_53_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_54_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_19_59_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_20_01_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_20_04_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_20_07_00.wav - 10.99 MB
rpi-orcasound-lab_2020_07_25_20_10_00.wav - 10.99 MB
rpi-orcasound-lab_2020_09_01_14_46_00.wav - 10.99 MB
rpi-orcasound-lab_2020_09_01_14_49