In [5]:
import pandas as pd
from utils.audio_tools import extract_positive_clip, extract_negative_clip, extract_negatives_iterative
import os

## Read the labels from the annotations file

Each annotation row includes `wav_filename`, `start_time_s`, `duration_s`, `location`, and `date`, plus the `dataset` name and a `pst_or_master_tape_identifier`.

In [2]:
# Parse the tab-separated annotation table (read_table splits on tabs by
# default). The file carries an unnamed leading column, which pandas labels
# "Unnamed: 0" in the rendered output.
annotations = pd.read_table("annotations.tsv")
# A bare name as the last expression renders the DataFrame as the cell output.
annotations

Unnamed: 0,dataset,wav_filename,start_time_s,duration_s,location,date,pst_or_master_tape_identifier
0,podcast_round1,60012.wav,34.126,2.918,"Dabob Bay, Seattle, Washington",1960-10-28,60012
1,podcast_round1,60012.wav,36.816,2.588,"Dabob Bay, Seattle, Washington",1960-10-28,60012
2,podcast_round1,60012.wav,42.550,2.055,"Dabob Bay, Seattle, Washington",1960-10-28,60012
3,podcast_round1,60012.wav,44.606,2.410,"Dabob Bay, Seattle, Washington",1960-10-28,60012
4,podcast_round1,60012.wav,46.636,3.425,"Dabob Bay, Seattle, Washington",1960-10-28,60012
...,...,...,...,...,...,...,...
5532,podcast_round12,rpi-port-townsend_2020_09_08_14_55_00.wav,0.000,0.000,port_townsend,2020-09-08,14:55:00
5533,podcast_round12,rpi-port-townsend_2020_09_08_14_28_00.wav,0.000,0.000,port_townsend,2020-09-08,14:28:00
5534,podcast_round12,rpi-port-townsend_2020_10_08_02_25_00.wav,0.000,0.000,port_townsend,2020-10-08,02:25:00
5535,podcast_round12,rpi-port-townsend_2020_09_08_15_30_00.wav,0.000,0.000,port_townsend,2020-09-08,15:30:00


## Set Up Output Folder for Clips



In [3]:
import os
import shutil

# Destination folders for the extracted clips, one per class.
pos_clip_dir = "data/clips/positive"
neg_clip_dir = "data/clips/negative"

# Wipe each folder and create it again so every run starts from an empty
# directory. ignore_errors covers the first run, when the folder does not
# exist yet; exist_ok keeps makedirs quiet if the removal left it in place.
for clip_dir in (pos_clip_dir, neg_clip_dir):
    shutil.rmtree(clip_dir, ignore_errors=True)
    os.makedirs(clip_dir, exist_ok=True)

## Extract 100 Positive and 100 Negative Clips with 26 Features Each

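This section builds a balanced dataset of 2-second clips. Positive clips are cut at the start times of labeled orca calls, taking annotation rows in file order until 100 clips have been extracted. Negative clips are taken from safe, non-overlapping segments of a single recording, `61062.wav` — not from every WAV that supplied positives. Each record includes MFCC + delta features, a label, and metadata (clip name, source WAV, start time).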


In [6]:
# === Configuration ===
# Tunable values gathered in one place instead of being repeated as literals.
N_CLIPS_PER_CLASS = 100            # target number of positive and of negative clips
CLIP_DURATION_S = 2.0              # length of every positive clip, in seconds
NEGATIVE_SOURCE_WAV = "61062.wav"  # the only recording negatives are drawn from

# === Initialize ===
records = []    # one dict per clip: metadata plus MFCC / delta-MFCC values
pos_count = 0   # positives extracted so far; also drives the clip numbering
row_index = 0   # position of the next annotation row to try

# === POSITIVE ===
# Walk the annotation rows in file order until enough positive clips have been
# extracted or the table is exhausted.
while pos_count < N_CLIPS_PER_CLASS:
    if row_index >= len(annotations):
        print("Reached end of annotations")
        break

    # Grab the row and advance immediately so `continue` below cannot stall.
    row = annotations.iloc[row_index]
    row_index += 1

    # A row without a positive duration cannot describe a call, so it must not
    # be labeled "orca_call". The annotation table contains such rows
    # (start_time_s = 0, duration_s = 0).
    # NOTE(review): presumably these mark call-free recordings - confirm.
    if not row["duration_s"] > 0:
        continue

    wav_path = os.path.join("data", "wav", row["wav_filename"])
    wav_base = os.path.splitext(row["wav_filename"])[0]

    # Dots in the stem (WAV name and formatted start time) become underscores,
    # so the only dot left in the file name is the ".wav" extension.
    clip_stem = f"clip_{pos_count+1:05}_{wav_base}_{row['start_time_s']:.2f}".replace(".", "_")
    # Write into the folder prepared by the setup cell rather than a second
    # hard-coded copy of the path.
    pos_output_path = os.path.join(pos_clip_dir, clip_stem + ".wav")

    mfcc, delta = extract_positive_clip(
        wav_path=wav_path,
        start_time=row["start_time_s"],
        duration=CLIP_DURATION_S,
        output_path=pos_output_path
    )

    # Rows whose extraction yields no MFCCs are skipped without counting, so
    # the loop moves on to the next annotation.
    # NOTE(review): presumably the helper returns None for a missing or
    # too-short WAV - confirm in utils.audio_tools.
    if mfcc is not None:
        record = {
            "clip_name": os.path.basename(pos_output_path),
            "label": "orca_call",
            "source_wav": row["wav_filename"],
            "start_time": row["start_time_s"]
        }
        # Flatten both feature vectors into 1-based, individually named columns.
        for i, val in enumerate(mfcc):
            record[f"mfcc_{i+1}"] = val
        for i, val in enumerate(delta):
            record[f"delta_mfcc_{i+1}"] = val
        records.append(record)
        pos_count += 1

# === NEGATIVE ===
# Negative numbering continues right after the last positive actually
# extracted (101 when all 100 positives succeeded), so clip numbers stay
# contiguous even if the loop above ran out of annotations early.
neg_records_61062 = extract_negatives_iterative(
    wav_filename=NEGATIVE_SOURCE_WAV,
    annotations_df=annotations,
    output_dir=neg_clip_dir,
    clip_start_index=pos_count + 1,
    max_clips=N_CLIPS_PER_CLASS
)

records.extend(neg_records_61062)

# === Save final CSV ===
# The file name reflects the default 100 + 100 target; rename it if
# N_CLIPS_PER_CLASS is changed.
df = pd.DataFrame(records)
df.to_csv("mfcc_balanced_100x100.csv", index=False)


✅ Extracted 100 negative clips from 61062.wav
