In [84]:
import pandas as pd
from pathlib import Path
import librosa
import soundfile as sf
import numpy as np

In [85]:
def get_noise_intervals(df: pd.DataFrame, file_col: str, start_col: str, end_col: str, total_dur_lookup: dict, min_len=2.0):
    """Find the un-annotated ("noise") gaps in each recording.

    For every file in ``df`` the annotated segments are walked in order of
    start time, and each stretch with no annotation that is strictly longer
    than ``min_len`` is reported. The stretch before the first annotation
    (starting at 0.0) and the stretch after the last one (ending at the
    file's total duration) are included.

    Parameters
    ----------
    df : pd.DataFrame
        Annotation table with one row per annotated segment.
    file_col, start_col, end_col : str
        Names of the columns holding the file name, segment start and
        segment end.
    total_dur_lookup : dict
        Maps file name -> total duration of that file. Files that are absent
        from this mapping are skipped, because the end of their last gap
        cannot be determined.
    min_len : float, default 2.0
        A gap is kept only when it is strictly longer than this value.

    Returns
    -------
    list of tuple
        ``(filename, start, end)`` for every retained gap, grouped by file in
        ``groupby`` order and ascending in time within a file.
    """
    noise_intervals = []

    for fname, group in df.groupby(file_col):
        total_dur = total_dur_lookup.get(fname)
        if total_dur is None:
            # No duration known for this file, so skip it rather than abort
            # the whole run with a KeyError.
            continue

        bird_segments = sorted(group[[start_col, end_col]].values.tolist())

        prev_end = 0.0
        for start, end in bird_segments:
            if start - prev_end > min_len:
                noise_intervals.append((fname, prev_end, start))
            # Never move the cursor backwards: a segment nested inside (or
            # overlapping) an earlier, longer one must not re-open a gap that
            # the longer segment still covers.
            prev_end = max(prev_end, end)

        # Tail gap between the last annotated segment and the end of the file.
        if total_dur - prev_end > min_len:
            noise_intervals.append((fname, prev_end, total_dur))
    return noise_intervals

In [86]:
def extract_noise_segments(noise_intervals, in_dir, out_dir, sr=22050, max_dur=3.0):
    """Cut one clip from the start of each noise interval and save it as WAV.

    Parameters
    ----------
    noise_intervals : iterable of (filename, start, end)
        Intervals to extract; ``start``/``end`` are used as the ``offset`` and
        (capped) ``duration`` passed to ``librosa.load``.
    in_dir : str or Path
        Directory containing the source recordings named in the intervals.
    out_dir : str or Path
        Directory the clips are written to; created if it does not exist.
    sr : int, default 22050
        Sample rate requested from ``librosa.load`` and used when writing.
    max_dur : float, default 3.0
        Upper bound on the clip length. Only the first ``max_dur`` of an
        interval is extracted; the remainder of a longer interval is not used.

    Notes
    -----
    Output files are named ``noise_<i>.wav`` where ``i`` is the position of
    the interval in ``noise_intervals``. Intervals that load as empty audio
    are skipped, so the numbering can contain gaps.
    """
    # Accept plain strings as well as Path objects, mirroring how out_dir is
    # handled below.
    in_dir = Path(in_dir)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    for i, (fname, start, end) in enumerate(noise_intervals):
        clip_dur = min(end - start, max_dur)
        y, _ = librosa.load(in_dir / fname, sr=sr, offset=start, duration=clip_dur)
        if len(y) > 0:
            sf.write(f"{out_dir}/noise_{i:04d}.wav", y, sr)

In [87]:
def get_total_durations(recordings_path: Path, file_list):
    """
    Compute total duration (in seconds) for each audio file in file_list.

    Parameters
    ----------
    recordings_path : str or Path
        Directory containing the audio files.
    file_list : iterable
        File names, relative to ``recordings_path``.

    Returns
    -------
    dict
        Maps each file name to the value returned by ``librosa.get_duration``.
        Files that raise while being read are reported on stdout and left out
        of the mapping (best effort), so the result may have fewer entries
        than ``file_list``.
    """
    # tqdm is imported here so the function does not depend on an import made
    # in some other cell; without tqdm the loop simply runs without a bar.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    recordings_path = Path(recordings_path)
    total_dur_lookup = {}
    for f in progress(file_list, desc="Computing durations"):
        fpath = recordings_path / f
        try:
            total_dur_lookup[f] = librosa.get_duration(path=fpath)
        except Exception as e:
            # Deliberately broad: one unreadable file should not stop the scan.
            print(f"Could not read {f}: {e}")
    return total_dur_lookup

In [88]:
# Input locations: the annotation CSV and the directory of recordings.
# "~" is expanded to the current user's home directory.
annotations_file = Path("~/data/kenya_birds/annotations.csv").expanduser()
recordings_path = Path("~/data/kenya_birds/soundscape_data/").expanduser()

# Sanity check: both lines should print True before continuing.
print("annotations: ", annotations_file.exists())
print("recordings: ", recordings_path.exists())

annotations:  True
recordings:  True


In [89]:
# Load the annotation table; later cells read its "Filename",
# "Start Time (s)" and "End Time (s)" columns.
df = pd.read_csv(annotations_file)

# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,Filename,Start Time (s),End Time (s),Species eBird Code
0,KEN_001_20211207_153852.flac,67.8,67.8,slcbou1
1,KEN_001_20211207_153852.flac,106.8,106.8,slcbou1
2,KEN_001_20211207_153852.flac,107.5,107.5,hamerk1
3,KEN_001_20211207_153852.flac,118.4,118.4,slcbou1
4,KEN_001_20211207_153852.flac,209.1,209.1,hamerk1


In [90]:
# Every recording that has at least one annotation row.
file_list = df["Filename"].unique()

total_dur_lookup = get_total_durations(recordings_path, file_list)

# Example check. The default passed to next() keeps this from raising
# StopIteration when no file could be read and the lookup is empty.
print(f"Duration of first file: {next(iter(total_dur_lookup.items()), None)}")


Computing durations: 100%|██████████| 35/35 [00:00<00:00, 13793.16it/s]

Duration of first file: ('KEN_001_20211207_153852.flac', 3600.0)





In [91]:
# Collect the un-annotated gaps of every recording as (filename, start, end).
# min_len=3.0 equals the max_dur=3.0 used for extraction further down; gaps
# must be strictly longer than min_len, so each one spans more than 3 s.
noise_intervals = get_noise_intervals(
    df=df,
    file_col="Filename",
    start_col="Start Time (s)",
    end_col="End Time (s)",
    total_dur_lookup=total_dur_lookup,
    min_len=3.0
)

In [92]:
# Output directory for the extracted clips; it lives inside the recordings
# directory.
noise_dir = recordings_path / "noise_clips"

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of testing .exists() first.
noise_dir.mkdir(parents=True, exist_ok=True)

In [94]:
# Write one clip of at most 3 s per noise interval into noise_dir.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# presumably it is slow because every interval triggers its own
# librosa.load call — confirm before re-running on the full set.
extract_noise_segments(
  noise_intervals, 
  in_dir=recordings_path,
  out_dir=noise_dir, 
  sr=22050,
  max_dur=3.0)

KeyboardInterrupt: 