# Cell 1 — Config (paths + MFCC params)

In [4]:
import numpy as np
import pandas as pd
import librosa
from pathlib import Path
import json

# --- Paths ---
DATA_DIR = Path(r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\BeesAnna\sound_files")
CSV_PATH = Path(r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\BeesAnna\all_data_updated.csv")

UNLABELED_DIR = Path(r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\features_mfcc_unlabeled")
UNLABELED_DIR.mkdir(parents=True, exist_ok=True)

OUT_X = UNLABELED_DIR / "X_unlabeled.npy"
OUT_INDEX = UNLABELED_DIR / "unlabeled_index.json"
OUT_META = UNLABELED_DIR / "meta_unlabeled.json"

# --- CSV columns ---
ID_COL = "file name"
TARGET_COL = "queen status"
VALID_CLASSES = {0, 1, 2, 3}

# --- Audio / MFCC parameters (match your labeled pipeline) ---
SR = 16000
TRIM_DB = 30

SEG_SEC = 2.0
HOP_SEC = 1.0

N_MFCC = 32
N_FFT  = int(0.025 * SR)   # 25 ms
HOP_LEN= int(0.010 * SR)   # 10 ms
FMIN, FMAX = 20, SR // 2

ADD_DELTAS = True

# --- Output dtype ---
# float16 saves ~50% disk; safe for features; you can cast back to float32 during training.
OUT_DTYPE = np.float16

# Cell 2 — Define helper functions

In [5]:
def peak_normalize(x, eps=1e-9):
    return x / (np.max(np.abs(x)) + eps)

def load_and_clean(path):
    x, _ = librosa.load(str(path), sr=SR, mono=True)
    x, _ = librosa.effects.trim(x, top_db=TRIM_DB)
    x = peak_normalize(x)
    return x

def segment_signal(x, sr, seg_sec, hop_sec):
    seg_len = int(seg_sec * sr)
    hop_len = int(hop_sec * sr)

    if len(x) < seg_len:
        x = np.pad(x, (0, seg_len - len(x)), mode="reflect")

    segments = []
    for start in range(0, max(1, len(x)-seg_len+1), hop_len):
        seg = x[start:start+seg_len]
        if len(seg) < seg_len:
            seg = np.pad(seg, (0, seg_len - len(seg)), mode="reflect")
        segments.append(seg)
    return segments

def mfcc_features(seg):
    mfcc = librosa.feature.mfcc(
        y=seg, sr=SR, n_mfcc=N_MFCC, n_fft=N_FFT, hop_length=HOP_LEN,
        fmin=FMIN, fmax=FMAX
    )
    if ADD_DELTAS:
        d1 = librosa.feature.delta(mfcc)
        d2 = librosa.feature.delta(mfcc, order=2)
        feat = np.stack([mfcc, d1, d2], axis=0)   # (3, n_mfcc, T)
    else:
        feat = mfcc[np.newaxis, :, :]             # (1, n_mfcc, T)
    return feat.astype(np.float32)  # compute in float32; cast to OUT_DTYPE when writing

# Cell 3 — Build the unlabeled file list (exclude the 1275 labeled)

In [6]:
# All WAVs
wav_paths = sorted(list(DATA_DIR.rglob("*.wav"))) + sorted(list(DATA_DIR.rglob("*.WAV")))
wav_by_name = {p.name: p for p in wav_paths}

print("Total WAVs in folder:", len(wav_paths))

# Read CSV and extract labeled basenames
df = pd.read_csv(CSV_PATH)
assert ID_COL in df.columns, f"Missing column: {ID_COL}"
assert TARGET_COL in df.columns, f"Missing column: {TARGET_COL}"

df["_basename"] = df[ID_COL].astype(str).map(lambda s: Path(s).name)
df["_y"] = pd.to_numeric(df[TARGET_COL], errors="coerce")

labeled_names = set(df.loc[df["_y"].isin(list(VALID_CLASSES)), "_basename"])
print("Labeled WAVs in CSV (valid 0-3):", len(labeled_names))

# Unlabeled = WAVs not in labeled set
unlabeled_paths = [p for p in wav_paths if p.name not in labeled_names]
print("Unlabeled WAVs (to process):", len(unlabeled_paths))

# Sanity
if len(unlabeled_paths) == 0:
    raise SystemExit("No unlabeled WAVs found. Check CSV/paths.")

Total WAVs in folder: 14200
Labeled WAVs in CSV (valid 0-3): 1275
Unlabeled WAVs (to process): 11650


# Cell 4 — PASS 1: count segments and keep only “good” files

We do a first pass so we can preallocate the ```.npy``` memmap with the correct shape.

In [7]:
good_files = []
seg_counts = []
failed = []

total = len(unlabeled_paths)

for i, p in enumerate(unlabeled_paths, 1):
    try:
        x = load_and_clean(p)
        segs = segment_signal(x, SR, SEG_SEC, HOP_SEC)
        good_files.append(p)
        seg_counts.append(len(segs))
    except Exception as e:
        failed.append({"file": str(p), "error": str(e)})

    if i % 200 == 0 or i == total:
        print(f"Pass1: {i}/{total} | good={len(good_files)} | failed={len(failed)}")

total_segments = int(np.sum(seg_counts))
print("\nGood files:", len(good_files))
print("Failed files:", len(failed))
print("Total segments to write:", total_segments)

if total_segments == 0:
    raise SystemExit("Total segments is 0. Something is wrong with segmentation/loading.")

  from pkg_resources import resource_filename


Pass1: 200/11650 | good=200 | failed=0
Pass1: 400/11650 | good=400 | failed=0
Pass1: 600/11650 | good=600 | failed=0
Pass1: 800/11650 | good=800 | failed=0
Pass1: 1000/11650 | good=1000 | failed=0
Pass1: 1200/11650 | good=1200 | failed=0
Pass1: 1400/11650 | good=1400 | failed=0
Pass1: 1600/11650 | good=1600 | failed=0
Pass1: 1800/11650 | good=1800 | failed=0
Pass1: 2000/11650 | good=2000 | failed=0
Pass1: 2200/11650 | good=2200 | failed=0
Pass1: 2400/11650 | good=2400 | failed=0
Pass1: 2600/11650 | good=2600 | failed=0
Pass1: 2800/11650 | good=2800 | failed=0
Pass1: 3000/11650 | good=3000 | failed=0
Pass1: 3200/11650 | good=3200 | failed=0
Pass1: 3400/11650 | good=3400 | failed=0
Pass1: 3600/11650 | good=3600 | failed=0
Pass1: 3800/11650 | good=3800 | failed=0
Pass1: 4000/11650 | good=4000 | failed=0
Pass1: 4200/11650 | good=4200 | failed=0
Pass1: 4400/11650 | good=4400 | failed=0
Pass1: 4600/11650 | good=4600 | failed=0
Pass1: 4800/11650 | good=4800 | failed=0
Pass1: 5000/11650 | good

# Cell 5 — Preallocate ```X_unlabeled.npy``` as a memmapped ```.npy``` file

We also compute the fixed feature shape ```(C, n_mfcc, T)``` once.

In [8]:
# Determine feature shape using a dummy segment
seg_len = int(SEG_SEC * SR)
dummy = np.zeros(seg_len, dtype=np.float32)
dummy_feat = mfcc_features(dummy)
C, H, W = dummy_feat.shape

print("Feature per segment shape:", (C, H, W))

# Create memmap .npy with known shape
X_mm = np.lib.format.open_memmap(
    OUT_X, mode="w+",
    dtype=OUT_DTYPE,
    shape=(total_segments, C, H, W)
)

print("Allocated:", OUT_X)
print("Memmap shape:", X_mm.shape, "dtype:", X_mm.dtype)

Feature per segment shape: (3, 32, 201)
Allocated: C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\features_mfcc_unlabeled\X_unlabeled.npy
Memmap shape: (687350, 3, 32, 201) dtype: float16


# Cell 6 — PASS 2: extract MFCC and write directly into the memmap

Also writes a compact index: per file start offset and number of segments.

In [9]:
write_pos = 0
index = []  # compact mapping per file

for i, (p, nsegs) in enumerate(zip(good_files, seg_counts), 1):
    x = load_and_clean(p)
    segs = segment_signal(x, SR, SEG_SEC, HOP_SEC)

    start = write_pos
    for k, seg in enumerate(segs):
        feat = mfcc_features(seg)  # float32
        X_mm[write_pos] = feat.astype(OUT_DTYPE, copy=False)
        write_pos += 1

    index.append({
        "file": str(p),
        "start": int(start),
        "n_segments": int(len(segs))
    })

    if i % 100 == 0 or i == len(good_files):
        print(f"Pass2: {i}/{len(good_files)} | written segments={write_pos}/{total_segments}")

# Flush to disk
del X_mm

print("\nDone writing:", OUT_X)
print("Total written:", write_pos, "Expected:", total_segments)
if write_pos != total_segments:
    print("WARNING: written != expected. Something changed between passes.")

Pass2: 100/11650 | written segments=5900/687350
Pass2: 200/11650 | written segments=11800/687350
Pass2: 300/11650 | written segments=17700/687350
Pass2: 400/11650 | written segments=23600/687350
Pass2: 500/11650 | written segments=29500/687350
Pass2: 600/11650 | written segments=35400/687350
Pass2: 700/11650 | written segments=41300/687350
Pass2: 800/11650 | written segments=47200/687350
Pass2: 900/11650 | written segments=53100/687350
Pass2: 1000/11650 | written segments=59000/687350
Pass2: 1100/11650 | written segments=64900/687350
Pass2: 1200/11650 | written segments=70800/687350
Pass2: 1300/11650 | written segments=76700/687350
Pass2: 1400/11650 | written segments=82600/687350
Pass2: 1500/11650 | written segments=88500/687350
Pass2: 1600/11650 | written segments=94400/687350
Pass2: 1700/11650 | written segments=100300/687350
Pass2: 1800/11650 | written segments=106200/687350
Pass2: 1900/11650 | written segments=112100/687350
Pass2: 2000/11650 | written segments=118000/687350
Pass2:

# Cell 7 — Save index + metadata

In [10]:
with open(OUT_INDEX, "w", encoding="utf-8") as f:
    json.dump(index, f, indent=2, ensure_ascii=False)

meta = {
    "data_dir": str(DATA_DIR),
    "csv_path": str(CSV_PATH),
    "excluded_labeled_count": int(len(labeled_names)),
    "unlabeled_requested": int(len(unlabeled_paths)),
    "unlabeled_processed_good": int(len(good_files)),
    "unlabeled_failed": int(len(failed)),
    "total_segments": int(total_segments),
    "feature_shape_per_segment": [int(C), int(H), int(W)],
    "dtype": str(OUT_DTYPE),
    "sr": SR,
    "trim_db": TRIM_DB,
    "seg_sec": SEG_SEC,
    "hop_sec": HOP_SEC,
    "n_mfcc": N_MFCC,
    "add_deltas": ADD_DELTAS,
    "n_fft": int(N_FFT),
    "hop_len": int(HOP_LEN),
    "fmin": int(FMIN),
    "fmax": int(FMAX),
}

with open(OUT_META, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

print("Saved index:", OUT_INDEX)
print("Saved meta:", OUT_META)

if failed:
    print("\nSome files failed. First 3:")
    for item in failed[:3]:
        print(item["file"])
        print("  ", item["error"][:200])

Saved index: C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\features_mfcc_unlabeled\unlabeled_index.json
Saved meta: C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\features_mfcc_unlabeled\meta_unlabeled.json


# Cell 8 — Quick sanity load (memmap) + shape check

In [11]:
X_unl = np.load(OUT_X, mmap_mode="r")
print("Loaded X_unlabeled:", X_unl.shape, X_unl.dtype)

Loaded X_unlabeled: (687350, 3, 32, 201) float16
