# Beehive Audio → NumPy Feature Builder
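**Works with a flat folder of WAV files** (all classes mixed together; any subfolders are searched as well) and, **optionally**, a CSV file that provides labels.
Default features: **log-Mel**. To use MFCCs instead, set `FEATURE_TYPE = "mfcc"` in the configuration cell.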

In [None]:
# !pip install librosa soundfile scipy scikit-learn numpy pandas
import os
from pathlib import Path
import json
import random
import pandas as pd
import numpy as np
import librosa
import soundfile as sf
from scipy.signal import butter, sosfiltfilt
from sklearn.model_selection import train_test_split

## Configuration

In [None]:
# --- Paths (Windows friendly) ---
# Folder that holds the input WAV files:
DATA_DIR = Path(
    r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\BeesAnna\sound_files"
)

# Folder that receives the exported arrays and metadata (change as needed):
OUT_DIR = Path(
    r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\BeesAnna\features_bees"
)

# Optional labels CSV with columns: filename,label
# Only the base name of 'filename' is used to match a WAV file, so a relative
# path in that column is accepted too.
# Labels are strings (e.g. queen, no_queen); the notebook maps them to integers.
LABELS_CSV = None  # e.g., Path(r"C:\path\to\labels.csv")

# When LABELS_CSV is None the export is unlabeled: a single X_all.npy plus
# files.json (the list of source files).

# --- Audio & preprocessing ---
SR = 16000            # sample rate every file is loaded at (Hz)
TRIM_DB = 30          # `top_db` threshold passed to librosa.effects.trim
USE_BANDPASS = True   # apply the Butterworth band-pass after normalization
BP_LOW = 100          # band-pass lower edge (Hz)
BP_HIGH = 1000        # band-pass upper edge (Hz)

# --- Segmentation ---
SEG_SEC = 2.0         # segment length (seconds)
HOP_SEC = 1.0         # distance between consecutive segment starts (seconds)

# --- Feature Type ---
FEATURE_TYPE = "logmel"  # "logmel" or "mfcc"

# Log-Mel parameters (N_FFT, HOP_LEN, FMIN and FMAX are shared with MFCC)
N_MELS = 64
N_FFT = int(0.025 * SR)    # 25 ms analysis window, in samples
HOP_LEN = int(0.010 * SR)  # 10 ms frame hop, in samples
FMIN = 20
FMAX = SR // 2             # Nyquist frequency

# MFCC parameters
N_MFCC = 32
ADD_DELTAS = True  # stack Δ and ΔΔ as extra channels (for CNN input)

# --- Splits & randomness (used only if labels CSV is provided) ---
RANDOM_SEED = 123
TEST_SIZE = 0.15
VAL_SIZE = 0.15  # of remaining after test

## Utilities

In [None]:
def peak_normalize(x, eps=1e-9):
    """Scale ``x`` by its largest absolute value.

    Args:
        x: Array of samples.
        eps: Small constant added to the peak so an all-zero input does not
            divide by zero.

    Returns:
        ``x`` divided by ``max(|x|) + eps``.
    """
    scale = np.abs(x).max() + eps
    return x / scale

def bandpass_sos(sr, low_hz, high_hz, order=4):
    """Design a Butterworth band-pass filter as second-order sections.

    Args:
        sr: Sampling rate in Hz (passed to SciPy as ``fs``).
        low_hz: Lower band edge in Hz.
        high_hz: Upper band edge in Hz.
        order: Order handed to ``scipy.signal.butter``.

    Returns:
        The ``sos`` array produced by ``scipy.signal.butter``.
    """
    band_edges = [low_hz, high_hz]
    return butter(N=order, Wn=band_edges, btype='bandpass', output='sos', fs=sr)

def apply_bandpass(x, sr, low, high):
    """Band-pass filter ``x`` between ``low`` and ``high`` Hz.

    The filter comes from ``bandpass_sos`` and is applied with
    ``sosfiltfilt`` (forward-backward filtering).

    Args:
        x: Signal to filter.
        sr: Sampling rate in Hz.
        low: Lower band edge in Hz.
        high: Upper band edge in Hz.

    Returns:
        The filtered signal.
    """
    return sosfiltfilt(bandpass_sos(sr, low, high), x)

def load_and_clean(path):
    """Load one audio file and run the preprocessing chain.

    Steps, in order: load as mono at ``SR``, trim with
    ``librosa.effects.trim(top_db=TRIM_DB)``, peak-normalize, and, when
    ``USE_BANDPASS`` is set, band-pass filter between ``BP_LOW`` and
    ``BP_HIGH``. Normalization happens before the filter, so the filtered
    output is not re-normalized afterwards.

    Args:
        path: Path of the audio file to load.

    Returns:
        The preprocessed signal.
    """
    signal, _ = librosa.load(path, sr=SR, mono=True)
    trimmed = librosa.effects.trim(signal, top_db=TRIM_DB)[0]
    cleaned = peak_normalize(trimmed)
    if not USE_BANDPASS:
        return cleaned
    return apply_bandpass(cleaned, SR, BP_LOW, BP_HIGH)

def segment_signal(x, sr, seg_sec, hop_sec):
    """Cut a signal into fixed-length, possibly overlapping segments.

    Args:
        x: 1-D array of samples.
        sr: Sampling rate in Hz.
        seg_sec: Segment length in seconds.
        hop_sec: Distance between consecutive segment starts in seconds.

    Returns:
        A list of arrays, each exactly ``int(seg_sec * sr)`` samples long.
        A signal shorter than one segment is reflect-padded at the end and
        yields a single segment. An empty signal yields an empty list.
        Trailing samples that do not fill a whole segment are not emitted.

    Raises:
        ValueError: If the segment or hop length rounds down to zero samples
            or is negative.
    """
    seg_len = int(seg_sec * sr)
    hop_len = int(hop_sec * sr)
    if seg_len <= 0 or hop_len <= 0:
        raise ValueError(
            f"Segment and hop must be at least one sample long "
            f"(got seg_len={seg_len}, hop_len={hop_len})."
        )

    x = np.asarray(x)
    if len(x) == 0:
        # Reflect padding cannot extend an empty array; there is nothing to cut.
        return []
    if len(x) < seg_len:
        x = np.pad(x, (0, seg_len - len(x)), mode='reflect')

    # len(x) >= seg_len from here on, so every slice below is full length.
    last_start = len(x) - seg_len
    return [x[start:start + seg_len] for start in range(0, last_start + 1, hop_len)]

def compute_logmel(seg):
    """Compute a log-Mel spectrogram for one segment.

    The power Mel spectrogram is converted to dB relative to its own maximum
    (``ref=np.max``), so the result is normalized per segment.

    Args:
        seg: 1-D array of samples at ``SR``.

    Returns:
        A float32 array with a leading axis of size 1 added in front of the
        spectrogram, i.e. a single-channel feature map.
    """
    mel_power = librosa.feature.melspectrogram(
        y=seg,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LEN,
        n_mels=N_MELS,
        fmin=FMIN,
        fmax=FMAX,
        power=2.0,
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    single_channel = mel_db[np.newaxis, :, :]
    return single_channel.astype(np.float32)

def compute_mfcc(seg):
    """Compute MFCC features for one segment.

    Args:
        seg: 1-D array of samples at ``SR``.

    Returns:
        A float32 array whose first axis is the channel axis: three channels
        (MFCC, delta, delta-delta) when ``ADD_DELTAS`` is true, otherwise a
        single MFCC channel.
    """
    coeffs = librosa.feature.mfcc(
        y=seg,
        sr=SR,
        n_mfcc=N_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LEN,
        fmin=FMIN,
        fmax=FMAX,
    )
    if not ADD_DELTAS:
        return coeffs[np.newaxis, :, :].astype(np.float32)

    first_order = librosa.feature.delta(coeffs)
    second_order = librosa.feature.delta(coeffs, order=2)
    stacked = np.stack([coeffs, first_order, second_order], axis=0)
    return stacked.astype(np.float32)

def featurize(seg):
    """Compute the features selected by ``FEATURE_TYPE`` for one segment.

    Args:
        seg: 1-D array of samples.

    Returns:
        The output of ``compute_logmel`` when ``FEATURE_TYPE`` is "logmel",
        or of ``compute_mfcc`` when it is "mfcc".

    Raises:
        ValueError: If ``FEATURE_TYPE`` is neither "logmel" nor "mfcc".
            Without this check a misspelled value would silently produce
            MFCCs.
    """
    if FEATURE_TYPE == "logmel":
        return compute_logmel(seg)
    if FEATURE_TYPE == "mfcc":
        return compute_mfcc(seg)
    raise ValueError(
        f"Unknown FEATURE_TYPE {FEATURE_TYPE!r}; expected 'logmel' or 'mfcc'."
    )

def standardize_features(train_X, val_X, test_X, eps=1e-6):
    """Standardize each channel using statistics from the training set only.

    Args:
        train_X: Training features; axis 1 is the channel axis.
        val_X: Validation features with the same channel layout.
        test_X: Test features with the same channel layout.
        eps: Added to each channel's standard deviation to avoid dividing by
            zero.

    Returns:
        ``(train_std, val_std, test_std, stats)``. The three arrays are
        standardized copies (the inputs are left untouched) and ``stats`` is
        ``{"channel_means": [...], "channel_stds": [...]}`` with one float
        per channel; the stored std values already include ``eps``.
    """
    sources = (train_X, val_X, test_X)
    scaled = [arr.copy() for arr in sources]
    channel_means = []
    channel_stds = []

    for ch in range(train_X.shape[1]):
        train_channel = train_X[:, ch]
        mu = train_channel.mean()
        sigma = train_channel.std() + eps
        channel_means.append(float(mu))
        channel_stds.append(float(sigma))
        # Apply the training statistics to every split.
        for src, dst in zip(sources, scaled):
            dst[:, ch] = (src[:, ch] - mu) / sigma

    train_out, val_out, test_out = scaled
    stats = {"channel_means": channel_means, "channel_stds": channel_stds}
    return train_out, val_out, test_out, stats

## Discover WAV files in a flat folder

In [None]:
# Collect every WAV file under DATA_DIR (subfolders included) exactly once.
# The extension is compared case-insensitively in Python instead of globbing
# "*.wav" and "*.WAV" separately: on case-insensitive filesystems (e.g.
# Windows) both patterns match the same files, which would list every
# recording twice, and on case-sensitive ones "*.Wav" would be missed.
wav_paths = sorted(
    p for p in DATA_DIR.rglob("*")
    if p.is_file() and p.suffix.lower() == ".wav"
)
print(f"Found {len(wav_paths)} wav files under {DATA_DIR}")
if not wav_paths:
    raise SystemExit(f"No WAV files found under {DATA_DIR}. Check the path and file extensions.")

## Optional: load labels from CSV

In [None]:
# Load the optional labels CSV and align it with the discovered WAV files.
# After this cell, in labeled mode:
#   wav_paths : only the files that have a label
#   y_labels  : string label per file (same order as wav_paths)
#   lab2idx   : mapping from string label to integer class index
#   y_all     : int64 class index per file
labels_df = None
if LABELS_CSV is not None:
    labels_df = pd.read_csv(LABELS_CSV)
    # Explicit check rather than `assert`, which is skipped under `python -O`.
    if not {"filename", "label"}.issubset(labels_df.columns):
        raise ValueError("CSV must have columns: filename,label")
    labels_df["filename"] = labels_df["filename"].astype(str)

    # Files are matched on their base name only, so any directory part in the
    # CSV's 'filename' column is ignored.
    fn2lab = dict(zip(
        labels_df["filename"].map(lambda s: Path(s).name),
        labels_df["label"].astype(str),
    ))

    missing = [p.name for p in wav_paths if p.name not in fn2lab]
    if missing:
        print(f"Warning: {len(missing)} files missing labels in CSV; they will be dropped.")
        wav_paths = [p for p in wav_paths if p.name in fn2lab]
    if not wav_paths:
        raise ValueError(
            "None of the discovered WAV files has a label in the CSV; "
            "check that the 'filename' column matches the WAV file names."
        )

    y_labels = [fn2lab[p.name] for p in wav_paths]
    classes = sorted(set(y_labels))
    lab2idx = {lab: i for i, lab in enumerate(classes)}
    y_all = np.array([lab2idx[lab] for lab in y_labels], dtype=np.int64)
    print("Classes -> indices:", lab2idx)
else:
    print("No LABELS_CSV provided: proceeding with unlabeled export.")

## Build features

In [None]:
# Seed both Python's and NumPy's global random generators, then make sure the
# output folder exists before anything is written to it.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
OUT_DIR.mkdir(parents=True, exist_ok=True)

def process_many(paths):
    """Turn a collection of audio files into one stacked feature array.

    Each file is loaded and cleaned, cut into segments, and every segment is
    featurized.

    Args:
        paths: Iterable of audio file paths.

    Returns:
        All segment features stacked along a new first axis, (N,C,H,W).
        Rows follow the order of ``paths``, with a file's segments kept
        together.
    """
    collected = []
    for wav_path in paths:
        signal = load_and_clean(str(wav_path))
        collected.extend(
            featurize(piece)
            for piece in segment_signal(signal, SR, SEG_SEC, HOP_SEC)
        )
    return np.stack(collected, axis=0)

# Metadata shared by both export modes; the labeled branch adds the class map
# and the standardization statistics.
_feature_meta = {
    "sr": SR,
    "segment_seconds": SEG_SEC,
    "hop_seconds": HOP_SEC,
    "feature_type": FEATURE_TYPE,
    "n_mels": int(N_MELS) if FEATURE_TYPE == "logmel" else None,
    "n_mfcc": int(N_MFCC) if FEATURE_TYPE == "mfcc" else None,
    "add_deltas": bool(ADD_DELTAS) if FEATURE_TYPE == "mfcc" else None,
}
_bandpass_meta = {"enabled": bool(USE_BANDPASS), "low": int(BP_LOW), "high": int(BP_HIGH)}

if 'labels_df' in globals() and labels_df is not None:
    # Labeled mode. The split is done per FILE, before segmentation, so all
    # segments of one recording stay in the same split.
    wavs = np.array(wav_paths); y_all = np.array(y_all)

    # 1) Hold out TEST_SIZE of all files for testing.
    wavs_trainval, wavs_test, y_trainval, y_test = train_test_split(
        wavs, y_all, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_all
    )
    # 2) Split what remains into train and validation. VAL_SIZE is defined in
    #    the configuration as a fraction of the data remaining after the test
    #    split, so it is used directly here. (Previously the second split was
    #    applied to the held-out part, leaving ~85% train and only a sliver
    #    for validation.)
    wavs_train, wavs_val, y_train, y_val = train_test_split(
        wavs_trainval, y_trainval, test_size=VAL_SIZE,
        random_state=RANDOM_SEED, stratify=y_trainval
    )

    def process_many_labeled(paths, labels):
        """Featurize files and repeat each file's label for all its segments.

        Args:
            paths: Iterable of audio file paths.
            labels: Integer class index per file, aligned with ``paths``.

        Returns:
            ``(X, y)``: stacked segment features and an int64 label per segment.
        """
        feats = []
        labs = []
        for path, lab in zip(paths, labels):
            x = load_and_clean(str(path))
            for seg in segment_signal(x, SR, SEG_SEC, HOP_SEC):
                feats.append(featurize(seg))
                labs.append(lab)
        return np.stack(feats, axis=0), np.array(labs, dtype=np.int64)

    # From here on y_* hold one label per SEGMENT rather than per file.
    print("Processing TRAIN..."); X_train, y_train = process_many_labeled(wavs_train, y_train)
    print("Processing VAL...");   X_val,   y_val   = process_many_labeled(wavs_val,   y_val)
    print("Processing TEST...");  X_test,  y_test  = process_many_labeled(wavs_test,  y_test)

    print("Standardizing (train stats only)...")
    X_train, X_val, X_test, stats = standardize_features(X_train, X_val, X_test)

    np.save(OUT_DIR / "X_train.npy", X_train); np.save(OUT_DIR / "y_train.npy", y_train)
    np.save(OUT_DIR / "X_val.npy",   X_val);   np.save(OUT_DIR / "y_val.npy",   y_val)
    np.save(OUT_DIR / "X_test.npy",  X_test);  np.save(OUT_DIR / "y_test.npy",  y_test)

    with open(OUT_DIR / "meta.json", "w", encoding="utf-8") as f:
        json.dump({
            **_feature_meta,
            "classes": {int(i): lab for lab, i in lab2idx.items()},  # inverse map
            "standardization": stats,
            "bandpass": _bandpass_meta,
        }, f, indent=2)
    print("Saved labeled splits to:", OUT_DIR.resolve())
else:
    # Unlabeled mode. NOTE: rows of X_all are segments, while files.json lists
    # the source files (in processing order), not one entry per row.
    X_all = process_many(wav_paths)
    np.save(OUT_DIR / "X_all.npy", X_all)
    with open(OUT_DIR / "files.json", "w", encoding="utf-8") as f:
        json.dump([str(p) for p in wav_paths], f, indent=2)
    with open(OUT_DIR / "meta.json", "w", encoding="utf-8") as f:
        json.dump({**_feature_meta, "bandpass": _bandpass_meta}, f, indent=2)
    print("Saved unlabeled features to:", OUT_DIR.resolve())

## Sanity check

In [None]:
# Report the shape of every export found in OUT_DIR. Both modes are checked
# independently so a leftover X_all.npy from an earlier unlabeled run cannot
# hide the labeled splits, and a missing file gives a message rather than a
# FileNotFoundError.
_x_all_path = OUT_DIR / "X_all.npy"
_x_train_path = OUT_DIR / "X_train.npy"
_y_train_path = OUT_DIR / "y_train.npy"
_found_export = False

if _x_all_path.exists():
    Xa = np.load(_x_all_path)
    print("X_all shape:", Xa.shape, "(N, C, H, W)")
    _found_export = True

if _x_train_path.exists() and _y_train_path.exists():
    Xt = np.load(_x_train_path); yt = np.load(_y_train_path)
    print("X_train shape:", Xt.shape, " y_train:", yt.shape)
    _found_export = True

if not _found_export:
    print(f"No exported feature arrays found in {OUT_DIR}; run the feature-building cell first.")