## Celda 1 — import

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
import json
import random

## Celda 2 — Configuración (rutas + columnas), Cargar CSV, filtrar etiquetados, mapear labels

In [2]:
# Locations of the audio folder and the annotation CSV.
# NOTE(review): these are absolute, machine-specific paths; edit them to run elsewhere.
DATA_DIR = Path(r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\BeesAnna\sound_files")
CSV_PATH = Path(r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\BeesAnna\all_data_updated.csv")

ID_COL = "file name"            # CSV column holding the audio file names
TARGET_COL = "queen status"     # CSV column used as the label
VALID_CLASSES = {0, 1, 2, 3}    # label values that are kept; all other rows are dropped

# 1) Build the lookup: file_name.wav -> full path.
# The extension is matched case-insensitively in a single pass. Globbing "*.wav"
# and "*.WAV" separately lists every file twice on case-insensitive filesystems
# (e.g. Windows) and misses mixed-case extensions such as ".Wav" elsewhere.
wav_paths = sorted(
    p for p in DATA_DIR.rglob("*")
    if p.is_file() and p.suffix.lower() == ".wav"
)
# str() keeps the values JSON-serializable. If the same basename exists in
# several subfolders, the last path in sorted order wins.
name_to_path = {p.name: str(p) for p in wav_paths}

# Basenames that occur more than once under DATA_DIR (ambiguous lookups).
wav_names = pd.Series([p.name for p in wav_paths], dtype="object")
ambiguous_names = set(wav_names[wav_names.duplicated(keep=False)])

df = pd.read_csv(CSV_PATH)

assert ID_COL in df.columns, f"Falta ID_COL={ID_COL} en el CSV"
assert TARGET_COL in df.columns, f"Falta TARGET_COL={TARGET_COL} en el CSV"

# 2) Normalize the names: keep only the basename of whatever the CSV stores.
df["_basename"] = df[ID_COL].astype(str).map(lambda s: Path(s).name)

# 3) Convert the target to numeric (non-numeric values become NaN) and keep
#    only the rows whose label is one of VALID_CLASSES.
df["_y"] = pd.to_numeric(df[TARGET_COL], errors="coerce")
labeled = df[df["_y"].isin(sorted(VALID_CLASSES))].copy()
labeled["y"] = labeled["_y"].astype(np.int64)

# 4) Add the "path" column (full path to the wav file; NaN when not found).
labeled["path"] = labeled["_basename"].map(name_to_path)

# Warn instead of silently picking one file when a labeled name is ambiguous.
n_ambiguous = int(labeled["_basename"].isin(ambiguous_names).sum())
if n_ambiguous:
    print(
        f"ADVERTENCIA: {n_ambiguous} archivos etiquetados tienen un nombre que aparece "
        "en más de una subcarpeta; se usa la última ruta en orden alfabético."
    )

# 5) Check that every labeled row was matched to a file on disk.
missing = labeled["path"].isna().sum()
print("Etiquetados:", len(labeled), "| faltantes sin path:", int(missing))
assert missing == 0, "Hay archivos etiquetados que no se encontraron en la carpeta."
print("Conteo por clase:\n", labeled["y"].value_counts().sort_index())

Etiquetados: 1275 | faltantes sin path: 0
Conteo por clase:
 y
0    179
1    158
2    259
3    679
Name: count, dtype: int64


In [3]:
# Audio / MFCC params (igual que la prueba "TestMFCC")
# --- Audio loading / segmentation (same values as the "TestMFCC" experiment) ---
SR = 16000          # target sampling rate in Hz
TRIM_DB = 30        # threshold (dB) used when trimming silence
SEG_SEC = 2.0       # segment length in seconds
HOP_SEC = 1.0       # step between segment starts in seconds (50 % overlap)

# --- MFCC extraction ---
N_MFCC = 32                 # number of cepstral coefficients
N_FFT = int(0.025 * SR)     # 25 ms analysis window, expressed in samples
HOP_LEN = int(0.010 * SR)   # 10 ms frame hop, expressed in samples
FMIN = 20                   # lowest mel-filter frequency in Hz
FMAX = SR // 2              # highest mel-filter frequency: the Nyquist frequency
ADD_DELTAS = True           # also stack first- and second-order deltas

# --- Dataset split ---
RANDOM_SEED = 123
TEST_SIZE = 0.15    # fraction of files for the test split
VAL_SIZE = 0.15     # fraction of files for the validation split

In [4]:
# Output folder for the extracted feature arrays and metadata files.
# It is created, together with any missing parent folders, if it does not exist;
# an already existing folder is left untouched.
OUT_DIR = Path(r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\features_mfcc_labeled")
OUT_DIR.mkdir(parents=True, exist_ok=True)

## Celda 3 — Funciones: limpiar, segmentar, MFCC (+Δ +ΔΔ)

In [5]:
def peak_normalize(x, eps=1e-9):
    """Scale a signal so that its largest absolute sample is (almost) 1.

    Args:
        x: NumPy array holding the signal.
        eps: Small constant added to the peak so an all-zero signal does not
            divide by zero.

    Returns:
        A new array equal to ``x / (max(|x|) + eps)``. An empty input yields
        an empty array instead of raising.
    """
    # np.max raises on a zero-size array, so treat an empty signal as peak 0.
    peak = np.max(np.abs(x)) if np.size(x) else 0.0
    return x / (peak + eps)

def load_and_clean(path):
    """Load one audio file and return its cleaned waveform.

    The file is read as mono at ``SR`` Hz, passed through
    ``librosa.effects.trim`` with ``top_db=TRIM_DB``, and peak-normalized.

    Args:
        path: Location of the audio file (anything ``str()`` turns into a path).

    Returns:
        The trimmed, peak-normalized signal.
    """
    waveform, _sr = librosa.load(str(path), sr=SR, mono=True)
    trimmed, _interval = librosa.effects.trim(waveform, top_db=TRIM_DB)
    return peak_normalize(trimmed)

def segment_signal(x, sr, seg_sec, hop_sec):
    """Split a 1-D signal into fixed-length, possibly overlapping segments.

    Args:
        x: 1-D NumPy array with the signal samples.
        sr: Sampling rate in Hz, used to convert seconds to samples.
        seg_sec: Segment length in seconds.
        hop_sec: Step between consecutive segment starts in seconds.

    Returns:
        A list of arrays, each exactly ``int(seg_sec * sr)`` samples long.
        A signal shorter than one segment is reflect-padded at the end and
        returned as a single segment. Trailing samples that do not fill a
        complete segment are dropped. An empty signal yields an empty list.

    Raises:
        ValueError: If the hop is shorter than one sample.
    """
    seg_len = int(seg_sec * sr)
    hop_len = int(hop_sec * sr)
    if hop_len < 1:
        raise ValueError(
            f"hop_sec * sr debe ser al menos 1 muestra (hop_sec={hop_sec}, sr={sr})"
        )

    n_samples = len(x)
    if n_samples == 0:
        # Reflect padding cannot extend an empty axis; there is nothing to segment.
        return []
    if n_samples < seg_len:
        x = np.pad(x, (0, seg_len - n_samples), mode="reflect")

    # After the padding above len(x) >= seg_len, so every slice below is full
    # length and no per-segment padding is needed.
    last_start = len(x) - seg_len
    return [x[start:start + seg_len] for start in range(0, last_start + 1, hop_len)]

def mfcc_features(seg):
    """Compute the MFCC feature tensor for one audio segment.

    Args:
        seg: Audio samples of a single segment.

    Returns:
        A float32 array of shape ``(C, n_mfcc, T)``. ``C`` is 3 (MFCC, delta,
        delta-delta) when ``ADD_DELTAS`` is true and 1 (MFCC only) otherwise.
    """
    base = librosa.feature.mfcc(
        y=seg,
        sr=SR,
        n_mfcc=N_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LEN,
        fmin=FMIN,
        fmax=FMAX,
    )
    if not ADD_DELTAS:
        # Single channel: add a leading axis so the layout matches the stacked case.
        return base[np.newaxis, :, :].astype(np.float32)

    first_order = librosa.feature.delta(base)
    second_order = librosa.feature.delta(base, order=2)
    channels = [base, first_order, second_order]
    return np.stack(channels, axis=0).astype(np.float32)

## Celda 4 — Split estratificado

In [None]:
# File-level inputs: one path and one label per recording. The split is done
# per file (not per segment), so all segments of a file land in the same split.
paths = labeled["path"].values
ys    = labeled["y"].values

# Stage 1: hold out the combined validation + test share, stratified by label.
holdout_fraction = TEST_SIZE + VAL_SIZE
paths_train, paths_tmp, y_train, y_tmp = train_test_split(
    paths,
    ys,
    test_size=holdout_fraction,
    random_state=RANDOM_SEED,
    stratify=ys,
)

# Stage 2: divide the hold-out into validation and test. rel_test is the test
# share of the hold-out, i.e. one half whenever VAL_SIZE == TEST_SIZE.
rel_test = TEST_SIZE / holdout_fraction
paths_val, paths_test, y_val, y_test = train_test_split(
    paths_tmp,
    y_tmp,
    test_size=rel_test,
    random_state=RANDOM_SEED,
    stratify=y_tmp,
)

# NOTE(review): the output stored under this cell (1083 / 33 / 159) is not what
# TEST_SIZE = VAL_SIZE = 0.15 yields for the 1275 labeled files; it presumably
# comes from an earlier configuration. Re-run the notebook to refresh it.
print("Train/Val/Test:", len(paths_train), "/", len(paths_val), "/", len(paths_test))

Train/Val/Test: 1083 / 33 / 159


## Celda 5 — Extraer MFCC por segmento y “expandir” etiquetas a segmentos

In [None]:
def process_labeled_file_list(file_list, label_list):
    """Extract per-segment MFCC features for a list of labeled audio files.

    Each file is loaded and cleaned, cut into segments, and every segment is
    turned into a feature tensor that inherits the label of its file.

    Args:
        file_list: Iterable of audio file paths.
        label_list: Iterable of integer labels, one per entry of ``file_list``.

    Returns:
        A tuple ``(X, y, file_index)``:
        X: array of stacked segment features, shape ``(Nseg, C, n_mfcc, T)``.
        y: int64 array of length ``Nseg`` with the label of each segment.
        file_index: list of ``{"file": str, "segment": int}`` dicts, one per
            segment, recording which file and segment number it came from.

    Raises:
        ValueError: If the two inputs differ in length, or if no segment was
            produced at all.
    """
    file_list = list(file_list)
    label_list = list(label_list)
    # zip() would silently drop the unmatched tail and hide a bookkeeping error.
    if len(file_list) != len(label_list):
        raise ValueError(
            f"file_list y label_list deben tener la misma longitud "
            f"({len(file_list)} != {len(label_list)})"
        )

    feats = []
    labs = []
    file_index = []  # traceability: (file, segment)
    for path, label in zip(file_list, label_list):
        signal = load_and_clean(path)
        for k, seg in enumerate(segment_signal(signal, SR, SEG_SEC, HOP_SEC)):
            feats.append(mfcc_features(seg))
            labs.append(label)
            file_index.append({"file": str(path), "segment": int(k)})

    if not feats:
        raise ValueError("No se generó ningún segmento: la lista de archivos está vacía.")

    X = np.stack(feats, axis=0)  # (Nseg, C, n_mfcc, T)
    y = np.array(labs, dtype=np.int64)
    return X, y, file_index

# Run the extraction once per split; every call returns the segment features,
# the per-segment labels, and the (file, segment) traceability index.
print("Extrayendo TRAIN...")
X_train, y_train_seg, idx_train = process_labeled_file_list(paths_train, y_train)

print("Extrayendo VAL...")
X_val, y_val_seg, idx_val = process_labeled_file_list(paths_val, y_val)

print("Extrayendo TEST...")
X_test, y_test_seg, idx_test = process_labeled_file_list(paths_test, y_test)

# Report the resulting array shapes for each split.
print("Shapes:")
for shape_tag, split_X, split_y in (
    ("X_train:", X_train, y_train_seg),
    ("X_val:", X_val, y_val_seg),
    ("X_test:", X_test, y_test_seg),
):
    print(shape_tag, split_X.shape, "y:", split_y.shape)

Extrayendo TRAIN...


  from pkg_resources import resource_filename


Extrayendo VAL...
Extrayendo TEST...
Shapes:
X_train: (63803, 3, 32, 201) y: (63803,)
X_val: (1944, 3, 32, 201) y: (1944,)
X_test: (9365, 3, 32, 201) y: (9365,)


## Celda 6 — Guardar `.npy` + metadatos

In [None]:
# Para reducir memoria en disco:
X_train = X_train.astype(np.float16)
X_val   = X_val.astype(np.float16)
X_test  = X_test.astype(np.float16)

np.save(OUT_DIR / "X_train.npy", X_train)
np.save(OUT_DIR / "y_train.npy", y_train_seg)
np.save(OUT_DIR / "X_val.npy",   X_val)
np.save(OUT_DIR / "y_val.npy",   y_val_seg)
np.save(OUT_DIR / "X_test.npy",  X_test)
np.save(OUT_DIR / "y_test.npy",  y_test_seg)

with open(OUT_DIR / "files_train.json", "w") as f:
    json.dump(idx_train, f, indent=2)
with open(OUT_DIR / "files_val.json", "w") as f:
    json.dump(idx_val, f, indent=2)
with open(OUT_DIR / "files_test.json", "w") as f:
    json.dump(idx_test, f, indent=2)

class_meaning = {
    0: "original / con reina funcional",
    1: "no presente",
    2: "presente y rechazada",
    3: "presente y recién aceptada"
}

with open(OUT_DIR / "meta.json", "w") as f:
    json.dump({
        "data_dir": str(DATA_DIR),
        "csv_path": str(CSV_PATH),
        "id_col": ID_COL,
        "target_col": TARGET_COL,
        "classes": [0, 1, 2, 3],
        "class_meaning": class_meaning,
        "sr": SR,
        "trim_db": TRIM_DB,
        "segment_seconds": SEG_SEC,
        "hop_seconds": HOP_SEC,
        "n_mfcc": N_MFCC,
        "add_deltas": ADD_DELTAS,
        "n_fft": N_FFT,
        "hop_len": HOP_LEN,
        "fmin": FMIN,
        "fmax": FMAX,
        "random_seed": RANDOM_SEED
    }, f, indent=2, ensure_ascii=False)

print("Meta guardada en:", OUT_DIR / "meta.json")

Meta guardada en: C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\features_mfcc_labeled\meta.json
