In [None]:
import tensorflow as tf
import numpy as np
import math
from typing import Sequence
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import os

# ------------------------------
# Config (edit to your needs)
# ------------------------------
# Sample rate; default `sr` for audio loading and for the mel filterbank.
SR = 22050
# STFT frame length; also fixes the number of spectrogram bins (NFFT // 2 + 1).
NFFT = 1024
# STFT frame step (hop) between consecutive frames.
HOP = 512
# Number of mel bins produced by the mel projection.
NMELS = 128
# Lower / upper edge of the mel filterbank, in Hz.
FMIN = 200.0
FMAX = 8000.0

# Aliases of NFFT / HOP; not referenced by the code visible in this cell.
FRAME_LENGTH = NFFT
FRAME_STEP = HOP

# Added to the mel power before taking the log so that log(0) cannot occur.
LOG_EPS = 1e-6

# Different functions for each dataset
def load_birdclef(audio_root, path, target, folds=5):
    """Read the BirdCLEF metadata CSV and build stratified CV splits.

    A ``path`` column (``<audio_root>/<primary_label>/<filename>``) is added
    to the metadata frame, and a binary label (1 where ``primary_label``
    equals ``target``) is used to stratify the folds.  The number of
    duplicated paths is printed as a sanity check.

    Args:
        audio_root: Directory prefix prepended to every audio path.
        path: Location of the metadata CSV.
        target: Value of ``primary_label`` treated as the positive class.
        folds: Number of stratified folds.

    Returns:
        ``(df, splits)``: the metadata frame (with the ``path`` column) and
        a list with one dict per fold holding the train/val index arrays,
        their sizes and the positive ratio of each part.
    """
    metadata = pd.read_csv(path)
    metadata["path"] = (
        audio_root + "/" + metadata["primary_label"] + "/" + metadata["filename"]
    )

    # Binary target used only for stratification and the ratio statistics.
    is_target = (metadata["primary_label"] == target).astype(int).values

    # Sanity check on repeated audio paths.
    n_duplicates = metadata["path"].duplicated().sum()
    print("Duplicate file rows:", n_duplicates)

    splitter = StratifiedKFold(folds, shuffle=True, random_state=42)
    splits = [
        {
            "train": train_idx,
            "val": val_idx,
            "train_size": len(train_idx),
            "val_size": len(val_idx),
            "train_pos_ratio": is_target[train_idx].mean(),
            "val_pos_ratio": is_target[val_idx].mean(),
        }
        for train_idx, val_idx in splitter.split(metadata, is_target)
    ]

    return metadata, splits

################################
# Data Augmentations #
################################
def aug_rms_dbfs(x):
    """Return the RMS level of ``x`` in dB, i.e. ``20 * log10(rms)``.

    A small epsilon is added under the square root and inside the log so
    that an all-zero input yields a finite value.
    """
    mean_power = tf.reduce_mean(tf.square(x))
    rms = tf.sqrt(mean_power + 1e-12)
    ln_rms = tf.math.log(rms + 1e-12)
    # Natural log -> log10 via division by ln(10).
    return 20.0 * ln_rms / tf.math.log(10.0)

def aug_rms_normalize(x, target_db=-20.0):
    """Scale ``x`` so its RMS level equals ``target_db`` dB, then clip to [-1, 1]."""
    gain_db = target_db - aug_rms_dbfs(x)
    # dB -> linear amplitude factor.
    scaled = x * tf.pow(10.0, gain_db / 20.0)
    return tf.clip_by_value(scaled, -1.0, 1.0)

def aug_volume_jitter(x, min_db=-6.0, max_db=6.0):
    """Apply a random gain drawn uniformly from [min_db, max_db) dB, then clip to [-1, 1]."""
    random_gain_db = tf.random.uniform([], min_db, max_db)
    linear_gain = tf.pow(10.0, random_gain_db / 20.0)
    return tf.clip_by_value(x * linear_gain, -1.0, 1.0)

def aug_gaussian_noise_snr(x, snr_db=5.0):
    """Add Gaussian noise scaled to the requested signal-to-noise ratio.

    The noise is rescaled so that ``rms(signal) / rms(noise)`` equals
    ``10 ** (snr_db / 20)``; the sum is clipped to [-1, 1].
    """
    signal_rms = tf.sqrt(tf.reduce_mean(tf.square(x)) + 1e-12)
    # Amplitude ratio corresponding to the SNR given in dB.
    target_noise_rms = signal_rms / tf.pow(10.0, snr_db / 20.0)

    raw_noise = tf.random.normal(tf.shape(x))
    raw_noise_rms = tf.sqrt(tf.reduce_mean(tf.square(raw_noise)) + 1e-12)
    scaled_noise = raw_noise * (target_noise_rms / (raw_noise_rms + 1e-12))

    return tf.clip_by_value(x + scaled_noise, -1.0, 1.0)


# ------------------------------
# Helpers: audio decode + loudness + augment
# ------------------------------

def load_audio(path, sr=SR):
    """Decode an audio file into a mono float32 waveform at sample rate ``sr``.

    The decoding runs in Python (soundfile + librosa) through
    ``tf.py_function``.  Multi-channel audio is averaged to mono, the signal
    is resampled when the file's rate differs from ``sr``, and a waveform
    whose peak exceeds 1.0 is divided by that peak.

    Returns:
        A 1-D float32 tensor of unknown length.
    """
    def _decode(path_tensor):
        import soundfile as sf
        import librosa

        filename = path_tensor.numpy().decode("utf-8")
        samples, native_sr = sf.read(filename, always_2d=False)  # [T] or [T, C]

        # Average the channels of multi-channel audio.
        if samples.ndim == 2:
            samples = samples.mean(axis=1)

        # Bring the signal to the requested sample rate.
        if native_sr != sr:
            samples = librosa.resample(
                samples.astype(np.float32), orig_sr=native_sr, target_sr=sr
            )
        samples = samples.astype(np.float32)

        # Rescale by the peak only when the signal leaves [-1, 1].
        peak = np.max(np.abs(samples)) + 1e-9
        if peak > 1.0:
            samples = (samples / peak).astype(np.float32)
        return samples

    waveform = tf.py_function(_decode, [path], Tout=tf.float32)
    waveform.set_shape([None])  # py_function loses the static shape: 1-D waveform
    return waveform

def build_file_lists(df, split, target):
    """Collect positive/negative file paths for one cross-validation split.

    Args:
        df: Metadata frame with ``primary_label`` and ``path`` columns.
        split: Dict with positional row indices under ``"train"`` and ``"val"``.
        target: Label value that marks a row as positive.

    Returns:
        ``((pos_train, neg_train), (pos_val, neg_val))`` as lists of paths.
    """
    def _pos_neg_paths(rows):
        # Rows whose label equals the target are positives, all others negatives.
        positives = rows[rows.primary_label == target]["path"].tolist()
        negatives = rows[rows.primary_label != target]["path"].tolist()
        return positives, negatives

    train_lists = _pos_neg_paths(df.iloc[split["train"]])
    val_lists = _pos_neg_paths(df.iloc[split["val"]])
    return train_lists, val_lists

def plan_epoch(n_pos_train, pos_exposures=1.0, neg_ratio=1.0, batch_size=32, pos_ratio=0.5):
    """Plan how many positive and negative examples make up one training epoch.

    This caps the number of negatives per epoch and lets the number of
    positive exposures be tuned.

    Args:
        n_pos_train: Number of unique positive files in the train split.
        pos_exposures: Average number of times each positive is seen per epoch.
        neg_ratio: Negatives per positive overall (1.0 = 1:1, 2.0 = 2:1).
        batch_size: Examples per batch.
        pos_ratio: Within-batch sampling weight for positives; it does not
            enter this computation.

    Returns:
        ``(P_pos, P_neg, steps_per_epoch)``.
    """
    n_positive = math.ceil(n_pos_train * pos_exposures)
    # The global neg:pos ratio is independent of the within-batch pos_ratio.
    n_negative = math.ceil(n_positive * neg_ratio)
    n_steps = math.ceil((n_positive + n_negative) / batch_size)
    return n_positive, n_negative, n_steps

# ------------------------------
# Feature extraction: power log-mel
# ------------------------------
def power_logmel(x, sr=SR, nfft=NFFT, hop=HOP, n_mels=NMELS, fmin=FMIN, fmax=FMAX):
    """Compute a log power mel spectrogram of waveform ``x``.

    Pipeline: Hann-windowed STFT (end-padded) -> squared magnitude ->
    mel projection -> transpose -> ``log(. + LOG_EPS)``.

    Returns:
        Tensor laid out as [n_mels, time].
    """
    spectrum = tf.signal.stft(
        x,
        frame_length=nfft,
        frame_step=hop,
        window_fn=tf.signal.hann_window,
        pad_end=True,
    )
    power = tf.abs(spectrum) ** 2  # [frames, nfft/2+1]

    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=n_mels,
        num_spectrogram_bins=nfft // 2 + 1,
        sample_rate=sr,
        lower_edge_hertz=fmin,
        upper_edge_hertz=fmax,
    )
    mel_power = tf.matmul(power, mel_filterbank)  # [frames, n_mels]
    mel_power = tf.transpose(mel_power, [1, 0])   # [n_mels, frames]

    # The epsilon keeps the log finite for empty mel bins.
    return tf.math.log(mel_power + LOG_EPS)

# ------------------------------
# SpecAugment (optional) on mel features [n_mels, time]
# ------------------------------
def spec_augment(mel, num_time_masks=2, time_mask_width=16, num_freq_masks=2, freq_mask_width=8):
    """Apply SpecAugment-style frequency and time masking to ``mel``.

    Args:
        mel: 2-D feature tensor laid out as [n_mels, time].
        num_time_masks: Number of time masks applied.
        time_mask_width: Maximum width (inclusive) of one time mask.
        num_freq_masks: Number of frequency masks applied.
        freq_mask_width: Maximum width (inclusive) of one frequency mask.

    Returns:
        Tensor of the same shape with the masked bands multiplied by zero.

    Each mask width is drawn uniformly from ``[0, max_width]`` and then
    clamped to the length of the masked axis.  Without the clamp, an input
    shorter than the drawn width produced a negative dimension for the
    trailing block of ones and the op failed.
    """
    n_mels = tf.shape(mel)[0]
    T = tf.shape(mel)[1]
    m = tf.identity(mel)

    def mask_freq(m):
        w = tf.random.uniform([], 0, freq_mask_width + 1, dtype=tf.int32)
        w = tf.minimum(w, n_mels)  # never mask more rows than exist
        # After the clamp n_mels - w + 1 >= 1, so the sampling range is valid.
        f0 = tf.random.uniform([], 0, n_mels - w + 1, dtype=tf.int32)
        mask = tf.concat([
            tf.ones([f0, T], m.dtype),
            tf.zeros([w, T], m.dtype),
            tf.ones([n_mels - f0 - w, T], m.dtype),
        ], axis=0)
        return m * mask

    def mask_time(m):
        w = tf.random.uniform([], 0, time_mask_width + 1, dtype=tf.int32)
        w = tf.minimum(w, T)  # never mask more frames than exist
        t0 = tf.random.uniform([], 0, T - w + 1, dtype=tf.int32)
        mask = tf.concat([
            tf.ones([n_mels, t0], m.dtype),
            tf.zeros([n_mels, w], m.dtype),
            tf.ones([n_mels, T - t0 - w], m.dtype),
        ], axis=1)
        return m * mask

    for _ in range(num_freq_masks):
        m = mask_freq(m)
    for _ in range(num_time_masks):
        m = mask_time(m)
    return m

# ------------------------------
# Dataset builders
# ------------------------------
def make_waveform_ds(file_paths: Sequence[str], label: int, shuffle=True):
    """Build a dataset of ``(waveform, label)`` pairs from audio file paths.

    Every path is paired with the same ``label``.  When ``shuffle`` is true
    the paths are shuffled with a buffer covering the whole list and
    reshuffled on each iteration.  Audio is decoded lazily via ``load_audio``
    and the label is cast to float32.
    """
    n_files = len(file_paths)
    labels = tf.fill([n_files], label)
    dataset = tf.data.Dataset.from_tensor_slices((list(file_paths), labels))

    if shuffle:
        # Shuffling happens on the path strings, before any audio is decoded.
        dataset = dataset.shuffle(buffer_size=n_files, reshuffle_each_iteration=True)

    def _decode_example(path, y):
        return load_audio(path), tf.cast(y, tf.float32)

    return dataset.map(_decode_example, num_parallel_calls=tf.data.AUTOTUNE)

def augment_waveform(wav, y, p_vol=0.9, p_noise=0.5, snr_low=-2.0, snr_high=8.0):
    """Randomly augment a waveform; the label ``y`` is passed through unchanged.

    Steps: RMS-normalize to -20 dB, then with probability ``p_vol`` apply a
    volume jitter of +/-6 dB, then with probability ``p_noise`` add Gaussian
    noise at an SNR drawn uniformly from ``[snr_low, snr_high)`` dB.
    """
    # Normalizing first gives the SNR computation a stable reference level.
    normalized = aug_rms_normalize(wav, target_db=-20.0)

    apply_jitter = tf.random.uniform([]) < p_vol
    jittered = tf.cond(
        apply_jitter,
        lambda: aug_volume_jitter(normalized, -6.0, 6.0),
        lambda: normalized,
    )

    apply_noise = tf.random.uniform([]) < p_noise
    snr_db = tf.random.uniform([], snr_low, snr_high)
    noisy = tf.cond(
        apply_noise,
        lambda: aug_gaussian_noise_snr(jittered, snr_db),
        lambda: jittered,
    )
    return noisy, y

def to_logmel(wav, y):
    """Map a ``(waveform, label)`` pair to ``(log-mel [n_mels, time], label)``."""
    return power_logmel(wav), y

def maybe_specaugment(mel, y, p=0.5):
    """Apply ``spec_augment`` to ``mel`` with probability ``p``; ``y`` is unchanged."""
    should_mask = tf.random.uniform([]) < p

    def _masked():
        return spec_augment(mel), y

    def _untouched():
        return mel, y

    return tf.cond(should_mask, _masked, _untouched)

# ------------------------------
# Balanced pipeline with sample_from_datasets + MixUp
# ------------------------------
def build_balanced_mel_pipeline_epoch(pos_files, neg_files, batch_size=32,
                                      pos_ratio=0.5, apply_specaugment=True, mixup_alpha=0.3):
    """Build an infinite, class-balanced dataset of batched log-mel features.

    Args:
        pos_files: Paths of positive (label 1) audio files.
        neg_files: Paths of negative (label 0) audio files.
        batch_size: Number of examples per batch.
        pos_ratio: Sampling weight of the positive stream; negatives get
            ``1 - pos_ratio``.
        apply_specaugment: Whether to randomly apply SpecAugment per example.
        mixup_alpha: Beta(alpha, alpha) parameter for batch-level MixUp;
            ``0``/``None`` disables MixUp.

    Returns:
        A repeating ``tf.data.Dataset`` yielding ``(mel, y)`` with ``mel`` of
        shape [batch, NMELS, time] (padded along time to the longest example
        in the batch) and ``y`` of shape [batch].  Because both class streams
        repeat forever, callers must bound iteration themselves
        (e.g. ``steps_per_epoch``).
    """
    # make_waveform_ds already shuffles the file paths with a full-size buffer
    # and reshuffles on every repeat.  A second shuffle placed after decoding
    # would have to hold thousands of decoded waveforms in memory and delay
    # the first element until that buffer is filled, without improving the
    # randomness of the order, so none is applied here.
    pos_ds = make_waveform_ds(pos_files, label=1).repeat()
    neg_ds = make_waveform_ds(neg_files, label=0).repeat()

    # Positives are augmented more aggressively than negatives.
    pos_ds = pos_ds.map(lambda x, y: augment_waveform(x, y, p_vol=0.9, p_noise=0.7),
                        num_parallel_calls=tf.data.AUTOTUNE)
    neg_ds = neg_ds.map(lambda x, y: augment_waveform(x, y, p_vol=0.5, p_noise=0.4),
                        num_parallel_calls=tf.data.AUTOTUNE)

    pos_ds = pos_ds.map(to_logmel, num_parallel_calls=tf.data.AUTOTUNE)
    neg_ds = neg_ds.map(to_logmel, num_parallel_calls=tf.data.AUTOTUNE)

    if apply_specaugment:
        pos_ds = pos_ds.map(maybe_specaugment, num_parallel_calls=tf.data.AUTOTUNE)
        neg_ds = neg_ds.map(maybe_specaugment, num_parallel_calls=tf.data.AUTOTUNE)

    # Draw from the two class streams with the requested class weights.
    ds = tf.data.Dataset.sample_from_datasets([pos_ds, neg_ds], weights=[pos_ratio, 1.0 - pos_ratio])

    # Clips have different lengths, so pad the time axis within each batch
    # (padded_batch uses its default padding value).
    ds = ds.shuffle(2048).padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([NMELS, None]), tf.TensorShape([]))
    ).map(lambda m, y: (tf.ensure_shape(m, [None, NMELS, None]), tf.ensure_shape(y, [None])),
          num_parallel_calls=tf.data.AUTOTUNE)

    # Batch-level MixUp (feature-space)
    if mixup_alpha and mixup_alpha > 0:
        def sample_beta(a, b, shape):
            # Beta sampler without a tfp dependency:
            # X~Gamma(a,1), Y~Gamma(b,1)  =>  X/(X+Y) ~ Beta(a,b)
            x = tf.random.gamma(shape, a, beta=1.0)
            y = tf.random.gamma(shape, b, beta=1.0)
            return x / (x + y + 1e-8)

        def mixup_batch(mel_batch, y_batch):
            # Pair every example with a randomly chosen partner from the batch.
            indices = tf.random.shuffle(tf.range(tf.shape(mel_batch)[0]))
            mel_shuf = tf.gather(mel_batch, indices)
            y_shuf = tf.gather(y_batch, indices)
            # One mixing coefficient per example, broadcast over [n_mels, time].
            beta = sample_beta(mixup_alpha, mixup_alpha, shape=[tf.shape(mel_batch)[0], 1, 1])
            mel_mix = beta * mel_batch + (1. - beta) * mel_shuf
            # Labels are mixed with the same coefficient, giving soft labels.
            y_mix = tf.expand_dims(y_batch, -1) * beta[..., 0] + tf.expand_dims(y_shuf, -1) * (1. - beta[..., 0])
            y_mix = tf.squeeze(y_mix, -1)
            return mel_mix, y_mix

        ds = ds.map(mixup_batch, num_parallel_calls=tf.data.AUTOTUNE)

    return ds.prefetch(tf.data.AUTOTUNE)

# ------------------------------
# Simple model + focal loss (optional)
# ------------------------------
def make_cnn(input_time=None):
    """Build a small 1-D CNN binary classifier over mel features.

    The input named ``"mel"`` has shape (NMELS, input_time); ``input_time``
    may be ``None`` for variable-length clips.  The output is one sigmoid unit.
    """
    mel_in = tf.keras.Input(shape=(NMELS, input_time), name="mel")

    # Layers are listed in the order they are applied.
    stack = [
        tf.keras.layers.Permute((2, 1)),  # -> [B, T, n_mels]
        tf.keras.layers.Conv1D(64, 5, padding="same", activation="relu"),
        tf.keras.layers.Conv1D(64, 5, padding="same", activation="relu"),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]

    features = mel_in
    for layer in stack:
        features = layer(features)
    return tf.keras.Model(mel_in, features)

def binary_focal_loss(gamma=2.0, alpha=0.25):
    """Create a binary focal loss that also handles soft labels.

    Args:
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        alpha: Weight of the positive class; negatives get ``1 - alpha``.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning the mean focal loss,
        where ``y_pred`` holds probabilities (not logits).

    The loss is written in its soft-label form.  An exact comparison
    ``y_true == 1`` would classify every fractional label (e.g. the output of
    MixUp) as a pure negative.  For hard 0/1 labels this form reduces to the
    usual ``-alpha_t * (1 - p_t)**gamma * log(p_t)``.
    """
    def loss(y_true, y_pred):
        y_true = tf.cast(y_true, y_pred.dtype)
        eps = tf.keras.backend.epsilon()
        # Keep probabilities away from 0 and 1 so the logs stay finite.
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)

        # Probability assigned to the (possibly soft) target.
        pt = y_true * y_pred + (1. - y_true) * (1. - y_pred)
        # Class weight interpolated between alpha and 1 - alpha.
        w = y_true * alpha + (1. - y_true) * (1. - alpha)
        # Binary cross-entropy against the soft target.
        ce = -(y_true * tf.math.log(y_pred) + (1. - y_true) * tf.math.log(1. - y_pred))

        return tf.reduce_mean(w * tf.pow(1. - pt, gamma) * ce)
    return loss

# ------------------------------
# Usage
# ------------------------------
# pos_files = [...]  # paths of target species files
# neg_files = [...]  # paths of other/background files

os.chdir("/home/joris/Thesis/new_attempt")
target = "rucwar"
birdclef_df, splits = load_birdclef(
    "datasets/birdclef_2021/train_short_audio",
    "datasets/birdclef_2021/train_metadata.csv",
    target=target,
    folds=5,
)

# Use the first fold for this run.
(pos_tr, neg_tr), (pos_va, neg_va) = build_file_lists(birdclef_df, splits[0], target)
P_pos, P_neg, steps = plan_epoch(n_pos_train=len(pos_tr),
                                 pos_exposures=1.0,   # see each positive ~once per epoch
                                 neg_ratio=1.0,       # 1:1 overall
                                 batch_size=32, pos_ratio=0.5)

train_ds = build_balanced_mel_pipeline_epoch(pos_tr, neg_tr, batch_size=32, pos_ratio=0.5)
# Validation must not use SpecAugment or MixUp: random masking and soft labels
# would make validation metrics noisy and not comparable between epochs.
# NOTE(review): the builder still applies waveform augmentation and balanced
# resampling to this stream; it exposes no flag to turn those off.
val_ds = build_balanced_mel_pipeline_epoch(pos_va, neg_va, batch_size=32, pos_ratio=0.5,
                                           apply_specaugment=False, mixup_alpha=0.0)

# Smoke test: pull a single batch and report its shapes.
for mel, label in train_ds.take(1):
    print("mel shape:", mel.shape)   # (batch_size, n_mels, T)
    print("label shape:", label.shape)
# model = make_cnn(input_time=None)
# model.compile(optimizer="adam", loss=binary_focal_loss(gamma=2.0, alpha=0.25), metrics=["AUC", "Precision", "Recall"])

# model.fit(train_ds,
#           steps_per_epoch=steps,
#           validation_data=val_ds,
#           validation_steps=max(1, len(pos_va+neg_va)//32),
#           epochs=200)
# model.fit(ds_train, epochs=20, steps_per_epoch=1000)  # steps_per_epoch because ds is infinite

# ds_train = build_balanced_mel_pipeline(pos_files, neg_files, batch_size=32, apply_specaugment=True, mixup_alpha=0.3)
# model = make_cnn(input_time=None)
# model.compile(optimizer="adam", loss=binary_focal_loss(gamma=2.0, alpha=0.25), metrics=["AUC", "Precision", "Recall"])
# model.fit(ds_train, epochs=20, steps_per_epoch=1000)  # steps_per_epoch because ds is infinite


Duplicate file rows: 0


2025-08-26 14:05:16.333073: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:453] fused(ShuffleDatasetV3:58,RepeatDataset:59): Filling up shuffle buffer (this may take a while): 757 of 4096
