In [2]:
# 00 - imports
import os, glob, math, json
import numpy as np
import pandas as pd
import librosa

# util: create the folder if it does not exist yet
def ensure_dir(path):
    """Create directory `path`, including missing parents; do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)

In [3]:
# 01 - configuration
# NOTE: audio_dir is a machine-specific absolute path; edit it before running elsewhere.
audio_dir = r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\BeesAnna\sound_files"  # Folder with .wav files
out_dir = "mfcc_files_03-1"             # output folder (relative to the working directory)

# audio
sr = 22050                      # sampling rate passed to librosa.load
win_ms = 0.025                  # 25 ms window -- NOTE: value is in SECONDS despite the "_ms" suffix
hop_ms = 0.0125                 # 12.5 ms hop (50 % of the window), also in seconds
win_length = int(round(win_ms * sr))       # ~551 samples
hop_length = int(round(hop_ms * sr))       # ~276 samples
n_fft = 1024                               # FFT size; larger than win_length
n_mels = 40
n_mfcc = 13

# features
drop_c0 = True        # True drops C0 (energy) from the MFCCs
apply_cmvn = True     # normalize MFCCs per coefficient (CMVN)
fmin = 20
fmax = sr / 2         # Nyquist frequency

# build one aggregated (stat-pooled) vector per file
make_statpool = True

# outputs
logmel_dir = os.path.join(out_dir, "logmel")
# Name the MFCC folder after the options that are actually enabled, so that a
# run with only one of (apply_cmvn, drop_c0) is not mislabelled as "mfcc_raw".
# Both on -> "mfcc_cmvn_noc0" and both off -> "mfcc_raw", exactly as before.
_mfcc_tags = [tag for tag, enabled in (("cmvn", apply_cmvn), ("noc0", drop_c0)) if enabled]
mfcc_dir = os.path.join(out_dir, "_".join(["mfcc"] + _mfcc_tags) if _mfcc_tags else "mfcc_raw")
statpool_path_npy = os.path.join(out_dir, "features_statpool.npy")
statpool_path_csv = os.path.join(out_dir, "features_statpool.csv")
metadata_csv = os.path.join(out_dir, "metadata.csv")

# create the output folders
ensure_dir(out_dir)
ensure_dir(logmel_dir)
ensure_dir(mfcc_dir)

print(f"win_length={win_length}, hop_length={hop_length}, n_fft={n_fft}")


win_length=551, hop_length=276, n_fft=1024


In [4]:
# 02 - helpers de features

def cmvn(x, axis=1, eps=1e-8):
    """
    Mean and variance normalization along `axis`.

    x: np.array, documented by the caller as [n_feats x frames]; with the
       default axis=1 each row (coefficient) is normalized independently.
    eps: added to the standard deviation so constant rows do not divide by zero.
    return: array with the same shape as x.
    """
    centered = x - x.mean(axis=axis, keepdims=True)
    scale = x.std(axis=axis, keepdims=True) + eps
    return centered / scale

def stat_pool(mat, percentiles=(10, 90)):
    """
    Statistical pooling over axis 1, one value per row (feature).

    mat: np.array [n_feats x frames]
    percentiles: only the first two entries are used (low, high).
    return: 1-D array [4*n_feats], laid out as
            [mean..., std..., low percentile..., high percentile...]
    """
    low_q, high_q = percentiles[0], percentiles[1]
    pooled = (
        np.mean(mat, axis=1),
        np.std(mat, axis=1),
        np.percentile(mat, low_q, axis=1),
        np.percentile(mat, high_q, axis=1),
    )
    return np.concatenate(pooled, axis=0)

In [5]:
# 03 - extractor principal por archivo

def extract_one(audio_path):
    """
    Extract log-mel and MFCC features from a single audio file.

    Reads the module-level configuration (sr, n_fft, hop_length, win_length,
    n_mels, n_mfcc, fmin, fmax, drop_c0, apply_cmvn, make_statpool).

    Returns a 4-tuple:
      logmel_db: np.array [frames x n_mels], mel power spectrogram in dB
                 relative to the file's own maximum (ref=np.max)
      mfcc_mat:  np.array [frames x n_coeffs]; n_coeffs is n_mfcc, or
                 n_mfcc - 1 when drop_c0 is True; CMVN-normalized when
                 apply_cmvn is True
      info:      dict with per-file metadata and the extraction parameters
      stat_vec:  1-D np.array of length 4*n_mels + 4*n_coeffs when
                 make_statpool is True, otherwise None

    Note: the stat-pool is computed on the MFCCs *before* CMVN. After
    per-file CMVN every coefficient has mean 0 and std ~1 by construction,
    so pooling the normalized matrix would make the mean/std entries the
    same constants for every file.
    """
    y, _ = librosa.load(audio_path, sr=sr)

    # mel spectrogram (power) -> dB
    s_mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length,
        win_length=win_length, n_mels=n_mels, power=2.0,
        fmin=fmin, fmax=fmax
    )  # [n_mels x frames]
    s_db = librosa.power_to_db(s_mel, ref=np.max)

    # MFCCs computed from the log-mel (dB) spectrogram
    mfcc = librosa.feature.mfcc(S=s_db, n_mfcc=n_mfcc)  # [n_mfcc x frames]

    # drop C0 if requested
    if drop_c0:
        mfcc = mfcc[1:, :]  # leaves (n_mfcc - 1) x frames

    # keep the un-normalized coefficients for stat pooling (see docstring)
    mfcc_unnormalized = mfcc

    # per-coefficient CMVN if requested
    if apply_cmvn:
        mfcc = cmvn(mfcc, axis=1)

    # transpose to [frames x feat] for saving
    logmel_db = s_db.T
    mfcc_mat = mfcc.T

    # metadata
    duration_s = len(y) / sr
    frames = logmel_db.shape[0]
    info = {
        "file": os.path.basename(audio_path),
        "sr": sr,
        "duration_s": float(duration_s),
        "frames": int(frames),
        "win_length": int(win_length),
        "hop_length": int(hop_length),
        "n_fft": int(n_fft),
        "n_mels": int(n_mels),
        "n_mfcc_effective": int(mfcc_mat.shape[1]),
        "fmin": float(fmin),
        "fmax": float(fmax),
        "drop_c0": bool(drop_c0),
        "apply_cmvn": bool(apply_cmvn),
    }

    # optional stat-pool; stat_pool expects [n_feats x frames], which is the
    # layout s_db and mfcc_unnormalized already have (no transpose needed)
    stat_vec = None
    if make_statpool:
        blocks = [
            stat_pool(s_db),               # [4 * n_mels]
            stat_pool(mfcc_unnormalized),  # [4 * n_coeffs]
        ]
        stat_vec = np.concatenate(blocks, axis=0)

    return logmel_db, mfcc_mat, info, stat_vec

In [6]:
# 04 - batch processing and saving

wav_files = sorted(glob.glob(os.path.join(audio_dir, "*.wav")))
print("archivos .wav encontrados:", len(wav_files))

metas = []          # one metadata dict per successfully processed file
stat_list = []      # one stat-pool vector per successfully processed file
ids_for_stat = []   # file stems, row-aligned with stat_list
failed = []         # (path, error message) for every file that raised

for i, w in enumerate(wav_files, 1):
    try:
        logmel_db, mfcc_mat, info, stat_vec = extract_one(w)
        stem = os.path.splitext(os.path.basename(w))[0]

        # output paths
        logmel_path = os.path.join(logmel_dir, f"{stem}_logmel.npy")
        mfcc_path   = os.path.join(mfcc_dir,   f"{stem}_mfcc.npy")

        # save the [frames x feat] matrices
        np.save(logmel_path, logmel_db)
        np.save(mfcc_path,   mfcc_mat)

        # metadata
        metas.append(info)

        # optional stat-pool
        if make_statpool and stat_vec is not None:
            stat_list.append(stat_vec)
            ids_for_stat.append(stem)

        # progress line every 25 files and on the last one
        if i % 25 == 0 or i == len(wav_files):
            print(f"[{i}/{len(wav_files)}] {stem} -> "
                  f"logmel {logmel_db.shape}, mfcc {mfcc_mat.shape}")

    except Exception as e:
        # best effort: keep going, but remember the failure so it can be
        # summarized after the loop instead of being lost in the progress log
        failed.append((w, str(e)))
        print("error con:", w, "->", e)

# summarize failures so missing rows in the outputs are not a surprise
if failed:
    print(f"archivos con error: {len(failed)} de {len(wav_files)}")

# save metadata
pd.DataFrame(metas).to_csv(metadata_csv, index=False)
print("metadata guardada en:", metadata_csv)

# save the stat-pool matrix if enabled
# NOTE: the CSV holds only the "id" column; row k of the .npy matrix
# corresponds to row k of the CSV.
if make_statpool and len(stat_list) > 0:
    x = np.vstack(stat_list)
    df_ids = pd.DataFrame({"id": ids_for_stat})
    np.save(statpool_path_npy, x)
    df_ids.to_csv(statpool_path_csv, index=False)
    print("stat-pool shape:", x.shape)
    print("stat-pool guardado en:", statpool_path_npy, "y", statpool_path_csv)

archivos .wav encontrados: 7100


  from pkg_resources import resource_filename


[25/7100] 2022-06-05--21-37-36_2__segment0 -> logmel (4794, 40), mfcc (4794, 12)
[50/7100] 2022-06-06--01-34-26_2__segment1 -> logmel (4794, 40), mfcc (4794, 12)
[75/7100] 2022-06-06--05-31-11_2__segment2 -> logmel (4794, 40), mfcc (4794, 12)
[100/7100] 2022-06-06--09-28-32_2__segment3 -> logmel (4794, 40), mfcc (4794, 12)
[125/7100] 2022-06-06--13-25-30_2__segment4 -> logmel (4794, 40), mfcc (4794, 12)
[150/7100] 2022-06-06--17-22-09_2__segment5 -> logmel (4794, 40), mfcc (4794, 12)
[175/7100] 2022-06-06--22-17-46_2__segment0 -> logmel (4794, 40), mfcc (4794, 12)
[200/7100] 2022-06-07--02-14-33_2__segment1 -> logmel (4794, 40), mfcc (4794, 12)
[225/7100] 2022-06-07--06-11-25_2__segment2 -> logmel (4794, 40), mfcc (4794, 12)
[250/7100] 2022-06-07--10-09-14_2__segment3 -> logmel (4794, 40), mfcc (4794, 12)
[275/7100] 2022-06-07--14-06-09_2__segment4 -> logmel (4794, 40), mfcc (4794, 12)
[300/7100] 2022-06-07--18-02-41_2__segment5 -> logmel (4794, 40), mfcc (4794, 12)
[325/7100] 2022-06-

KeyboardInterrupt: 

In [None]:
# 05 - example: load one saved MFCC .npy and check its shape
mfcc_npy_files = sorted(glob.glob(os.path.join(mfcc_dir, "*_mfcc.npy")))
if not mfcc_npy_files:
    # fail with an explanatory message instead of a bare IndexError on [0]
    raise FileNotFoundError(
        f"no *_mfcc.npy files found in {mfcc_dir!r}; run the batch-processing cell first"
    )
test_npy = mfcc_npy_files[0]
arr = np.load(test_npy)
print(os.path.basename(test_npy), "->", arr.shape, "[frames x feats]")
