In [42]:
# %%
# ───────────────────────────────────────────────────────────────
# Notebook  :  LF evaluation – exercise (a)  (✱ versione corretta ✱)
# Obiettivo  : misurare precision/recall/F1 delle labeling‑functions (LF)
#              e verificare l’udibilità degli eventi.
# Peculiarità dataset
#   • `labels/{idx}_labels.npz`   → una chiave per classe, valore = (N,2) segmenti.
#   • `audio_features/{idx}.npz`  → contiene "melspectrogram" (T,F).
#   • `annotations.csv`           → colonne: filename,onset,offset,categories (lista).
#   • `metadata.csv` (opz.)       → start_time_s, end_time_s: ritaglio del clip.
# Output
#   eval_out/results.csv          → metriche per classe + breakdown audible/silent
#   eval_out/overall_metrics.txt  → macro/micro P‑R‑F1
# ───────────────────────────────────────────────────────────────

# %% [markdown]
# ### 0. Dipendenze
# ```bash
# # esegui solo se mancano
# !pip install pandas numpy intervaltree scipy tqdm
# ```

# %%
# 1. Path (hard‑coded)
from pathlib import Path

ANN_PATH  = Path("annotations.csv")     # ground‑truth
META_PATH = Path("metadata.csv")        # facoltativo
LABEL_DIR = Path("labels")              # label npz
FEAT_DIR  = Path("audio_features")      # feature npz
OUT_DIR   = Path("eval_out")

IOU_THRESH       = 0.3      # soglia più tollerante (prima era 0.5)
ENERGY_THRESH_DB = -50.0    # udibile se > soglia dB
MERGE_GAP        = 0.15     # unisci segmenti LF con gap < 150 ms

OUT_DIR.mkdir(parents=True, exist_ok=True)
print("Percorsi impostati ✅")

# %%
# 2. Import
import ast, math, json, itertools
from collections import defaultdict

import numpy as np
import pandas as pd
from intervaltree import Interval, IntervalTree
from tqdm.auto import tqdm

# %%
# 3. Helper

def iou(a, b):
    inter = max(0, min(a[1], b[1]) - max(a[0], b[0]))
    if inter == 0:
        return 0.0
    union = max(a[1], b[1]) - min(a[0], b[0])
    return inter / union


def seg_mean_db(logmel, sr, hop, seg):
    t0, t1 = seg
    f0 = int(t0 * sr / hop)
    f1 = int(math.ceil(t1 * sr / hop))
    f1 = min(f1, logmel.shape[0])
    return float(logmel[f0:f1].mean()) if f1 > f0 else -np.inf


def load_pred_segments(path: Path):
    """Return merged (segments, class) lists after lower/strip & small‑gap merge."""
    data = np.load(path, allow_pickle=True)
    raw = []
    for cls in data.files:
        segs = data[cls]
        if segs.ndim != 2 or segs.shape[1] != 2:
            continue
        cls_norm = cls.strip().lower()
        for s in segs.astype(float):
            raw.append((tuple(s), cls_norm))

    # sort by start time
    raw.sort(key=lambda x: (x[1], x[0][0]))

    merged = []
    for cls, grp in itertools.groupby(raw, key=lambda x: x[1]):
        grp = [s for s, _ in grp]
        grp.sort()
        cur = list(grp[0])
        for seg in grp[1:]:
            if seg[0] - cur[1] <= MERGE_GAP:   # overlap / tiny gap ⇒ merge
                cur[1] = max(cur[1], seg[1])
            else:
                merged.append((tuple(cur), cls))
                cur = list(seg)
        merged.append((tuple(cur), cls))
    segs, cats = zip(*merged) if merged else ([], [])
    return list(segs), list(cats)

# %%
# 4. Carica annotazioni + metadata e normalizza
ann = pd.read_csv(ANN_PATH)
ann["categories"] = ann["categories"].apply(ast.literal_eval)

# normalizza stringhe
ann["categories"] = ann["categories"].apply(lambda lst: [c.lower().strip() for c in lst])
ann["filename"]   = ann["filename"].str.strip()

# merge con metadata se esiste → sottrai start_time_s per avere tempi relativi al clip
if META_PATH.exists():
    meta = pd.read_csv(META_PATH, usecols=["filename", "start_time_s", "end_time_s"])
    ann = ann.merge(meta, on="filename", how="left")
    ann["onset"]  = ann["onset"]  - ann["start_time_s"].fillna(0)
    ann["offset"] = ann["offset"] - ann["start_time_s"].fillna(0)

# indice del clip
ann["idx"] = ann["filename"].str.replace(".mp3", "", regex=False)

# filtra intervalli nulli / negativi
ann = ann[ann["onset"] < ann["offset"]]
print(f"Ground‑truth validi: {len(ann)}")

CLASSES = sorted({c for cats in ann["categories"] for c in cats})
print("Numero classi:", len(CLASSES))

# %%
# 5. Funzione di valutazione (con clipping sui limiti del frammento)

def evaluate(ann_df, lbl_dir, feat_dir):
    stats = {c: dict(TP=0, FP=0, FN=0, TP_audible=0, FP_silent=0) for c in CLASSES}

    for idx, gt_rows_all in tqdm(ann_df.groupby("idx"), desc="valuta", unit="file"):
        pred_path = lbl_dir / f"{idx}_labels.npz"
        if not pred_path.exists():
            continue
        segs_pred, cats_pred = load_pred_segments(pred_path)

        # feature per durata + audibilità
        feat_path = feat_dir / f"{idx}.npz"
        have_feat = feat_path.exists()
        if have_feat:
            feat = np.load(feat_path, allow_pickle=True)
            hop = int(feat.get("hop_length", 512))
            sr  = int(feat.get("sample_rate", 32000))
            n_frames = feat["melspectrogram"].shape[0]
            clip_dur = n_frames * hop / sr
            logmel = 10*np.log10(np.maximum(feat["melspectrogram"].astype(float),1e-12))
        else:
            # se mancano le feature, stima durata dal max(frame)
            clip_dur = max((s[1] for s in segs_pred), default=0)

        # ---- filtro GT: tieni solo intervalli che intersecano [0, clip_dur]
        gt_rows = gt_rows_all[(gt_rows_all.offset > 0) & (gt_rows_all.onset < clip_dur)]
        if gt_rows.empty and not segs_pred:
            continue  # nulla da valutare

        # IntervalTree GT by class
        tree = defaultdict(IntervalTree)
        for r in gt_rows.itertuples():
            # clamp nei limiti del frammento
            a = max(0.0, r.onset)
            b = min(clip_dur, r.offset)
            for cat in r.categories:
                tree[cat].add(Interval(a, b))

        matched = {c: set() for c in CLASSES}

        for seg, cat in zip(segs_pred, cats_pred):
            if cat not in stats:
                continue
            t0, t1 = seg
            if t0 >= clip_dur or t1 <= 0:
                continue  # predizione fuori frammento

            overlaps = [iv for iv in tree[cat].overlap(t0, t1) if iou(seg, (iv.begin, iv.end)) >= IOU_THRESH]
            tp = bool(overlaps)

            audible = False
            if have_feat:
                audible = seg_mean_db(logmel, sr, hop, seg) > ENERGY_THRESH_DB

            if tp:
                stats[cat]["TP"] += 1
                if audible:
                    stats[cat]["TP_audible"] += 1
                matched[cat].add((overlaps[0].begin, overlaps[0].end))
            else:
                stats[cat]["FP"] += 1
                if not audible:
                    stats[cat]["FP_silent"] += 1

        # conteggia FN
        for cat, tr in tree.items():
            stats[cat]["FN"] += len(tr) - len(matched[cat])

    # build DataFrame
    rows = []
    for cat, s in stats.items():
        tp, fp, fn = s["TP"], s["FP"], s["FN"]
        prec = tp / (tp + fp + 1e-9)
        rec  = tp / (tp + fn + 1e-9)
        f1   = 2*prec*rec/(prec+rec+1e-9)
        rows.append({"class":cat, **s, "precision":prec, "recall":rec, "f1":f1})
    return pd.DataFrame(rows).sort_values("class")

# 6. Esegui valutazione Esegui valutazione

df = evaluate(ann, LABEL_DIR, FEAT_DIR)

df.to_csv(OUT_DIR/"results.csv", index=False)
print("results.csv salvato →", OUT_DIR)
df.head()

# %%
# 7. Macro/Micro
macro_f1 = df["f1"].mean()
micro_tp = df["TP"].sum(); micro_fp = df["FP"].sum(); micro_fn = df["FN"].sum()
micro_p  = micro_tp/(micro_tp+micro_fp+1e-9)
micro_r  = micro_tp/(micro_tp+micro_fn+1e-9)
micro_f1 = 2*micro_p*micro_r/(micro_p+micro_r+1e-9)

with open(OUT_DIR/"overall_metrics.txt","w") as f:
    f.write(f"Macro-F1  : {macro_f1:.4f}\n"
            f"Micro-F1  : {micro_f1:.4f}\n"
            f"Micro-P   : {micro_p:.4f}\n"
            f"Micro-R   : {micro_r:.4f}\n")
print((OUT_DIR/"overall_metrics.txt").read_text())


Percorsi impostati ✅
Ground‑truth validi: 27551
Numero classi: 58


valuta: 100%|██████████| 8230/8230 [00:32<00:00, 252.87file/s]

results.csv salvato → eval_out
Macro-F1  : 0.0291
Micro-F1  : 0.0269
Micro-P   : 0.0337
Micro-R   : 0.0224






In [44]:
# %% ➊ – panoramica per classe
df = pd.read_csv(OUT_DIR / "results.csv")
(df.sort_values("f1")
   [["class", "TP", "FP", "FN", "precision", "recall", "f1"]]
   .head(15))

# %% ➋ –  scova le classi con tanti FP e zero TP
bad = df[(df["TP"] == 0) & (df["FP"] > 0)]
print("# classi con FP ma mai TP:", len(bad))
display(bad.sort_values("FP", ascending=False).head(20))

# %% ➌ –  distribuzione lunghezze predette vs GT
import seaborn as sns
sns.histplot([iv[1]-iv[0] for iv in itertools.chain(*[load_pred_segments(LABEL_DIR/p)[0] 
                                                      for p in LABEL_DIR.iterdir()])],
             binwidth=0.1, stat="probability", label="LF", color="r")
sns.histplot((ann["offset"]-ann["onset"]).clip(lower=0), binwidth=0.1,
             stat="probability", label="GT", color="g")
plt.xlim(0, 5); plt.legend(); plt.xlabel("segment length (s)")


# classi con FP ma mai TP: 12


Unnamed: 0,class,TP,FP,FN,TP_audible,FP_silent,precision,recall,f1
2,beep/bleep,0,44,141,0,22,0.0,0.0,0.0
38,saxophone,0,14,65,0,6,0.0,0.0,0.0
45,sneeze,0,12,9,0,10,0.0,0.0,0.0
41,ship/boat,0,10,81,0,0,0.0,0.0,0.0
11,cough,0,9,38,0,1,0.0,0.0,0.0
30,lawn mower,0,8,52,0,2,0.0,0.0,0.0
13,cowbell,0,6,32,0,0,0.0,0.0,0.0
37,rooster crow,0,6,22,0,0,0.0,0.0,0.0
52,trumpet,0,5,39,0,0,0.0,0.0,0.0
16,doorbell,0,4,17,0,0,0.0,0.0,0.0


FileNotFoundError: [Errno 2] No such file or directory: 'labels/labels/222809_labels.npz'

In [45]:
# LF: classi che appaiono almeno una volta tra le chiavi .npz
lf_vocab = set()
for p in LABEL_DIR.iterdir():
    lf_vocab.update([cls.strip().lower() for cls in np.load(p, allow_pickle=True).files])

# GT: classi usate nelle annotazioni
gt_vocab = {c for cats in ann["categories"] for c in cats}

extra = sorted(lf_vocab - gt_vocab)
missing = sorted(gt_vocab - lf_vocab)

print("✗  Classi che LF usa ma GT non conosce:", extra[:15])
print("✗  Classi che GT ha ma LF non produce:", missing[:15])


from collections import Counter
gt_cover = Counter()   # percentuale GT coperta da almeno una LF

for idx, grp in ann.groupby("idx"):
    segs, cats = load_pred_segments(LABEL_DIR / f"{idx}_labels.npz")
    for r in grp.itertuples():
        for (s0,s1), c in zip(segs, cats):
            if c in r.categories:
                inter = max(0, min(s1, r.offset) - max(s0, r.onset))
                ratio  = inter / (r.offset - r.onset)
                gt_cover[c] += ratio >= 0.5
                break

bad_cover = [k for k,v in gt_cover.items() if v == 0]
print("✗  Classi che non raggiungono mai 50 % di copertura:", bad_cover[:12])


# trova un FP di 'doorbell'
cls = "doorbell"
for idx in ann["idx"].unique():
    segs, cats = load_pred_segments(LABEL_DIR / f"{idx}_labels.npz")
    for (s0,s1), c in zip(segs, cats):
        if c != cls: continue
        if not any(iou((s0,s1),(r.onset,r.offset))>=0.3 
                   for r in ann[(ann.idx==idx)].itertuples() if cls in r.categories):
            fp_idx, fp_seg = idx, (s0,s1)
            raise StopIteration
print("Cand-FP  →", fp_idx, fp_seg)


✗  Classi che LF usa ma GT non conosce: []
✗  Classi che GT ha ma LF non produce: []
✗  Classi che non raggiungono mai 50 % di copertura: ['crying', 'waves', 'cowbell', 'wind', 'cow moo', 'truck', 'car', 'bus', 'washing machine', 'doorbell', 'lawn mower', 'motorcycle']


StopIteration: 

In [55]:
# %%
# ╔════════════════════════════════════════════════════════════╗
# ║          LF evaluation – esercizio (a)  (REWRITE)         ║
# ║               * Ground‑truth vs Labeling‑Functions *       ║
# ╚════════════════════════════════════════════════════════════╝
# Questo notebook, da eseguire cell‑per‑cell, parte **da zero**:
#   1. Carica `annotations.csv` (GT) e `metadata.csv` (start/end clip)
#   2. Converte le coordinate GT in timeline «clip‑relativa»
#   3. Legge le prediction LF (una chiave per classe, valori = segmenti)
#      * se i segmenti sono in frame → li converte in secondi
#      * li trasla anch’essi di `-start_time_s` se necessario
#   4. Valuta TP / FP / FN con overlap ≥ 30 % del GT
#   5. Calcola precision/recall/F1 + breakdown udibile/silenzioso
#   6. Salva `results.csv` e `overall_metrics.txt` in `eval_out/`
# --------------------------------------------------------------------
# Dipendenze (esegui una volta):
# !pip install pandas numpy intervaltree scipy tqdm
# --------------------------------------------------------------------

# %% 0 – CONFIG                       ★★★ EDITA SOLO QUI ★★★
from pathlib import Path

ANN_PATH   = Path("annotations.csv")     # ground‑truth
META_PATH  = Path("metadata.csv")        # start_time_s / end_time_s
LABEL_DIR  = Path("labels")              # *.npz con prediction LF
FEAT_DIR   = Path("audio_features")      # *.npz con melspectrogram
OUT_DIR    = Path("eval_out")            # dove salvare i risultati

SR_DEFAULT   = 32_000     # sample‑rate se mancante
HOP_DEFAULT  = 512        # hop_length se mancante
IOU_GT_RATIO = 0.3        # minimo 30 % del GT coperto → TP
MERGE_GAP    = 0.30       # unisci segmenti LF con gap < 300 ms
ENERGY_DB_TH = -50.0      # soglia dB per dire «udibile»

OUT_DIR.mkdir(exist_ok=True)
print("Percorsi impostati ✅")

# %% 1 – IMPORT STANDARD
import ast, json, math, itertools
from collections import defaultdict

import numpy as np
import pandas as pd
from intervaltree import Interval, IntervalTree
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# %% 2 – CARICA ANNOTAZIONI + METADATA, NORMALIZZA
ann = pd.read_csv(ANN_PATH)
ann["categories"] = ann["categories"].apply(ast.literal_eval)
ann["categories"] = ann["categories"].apply(lambda L: [c.lower().strip() for c in L])

if META_PATH.exists():
    meta = pd.read_csv(META_PATH, usecols=["filename", "start_time_s", "end_time_s"])
    ann = ann.merge(meta, on="filename", how="left")
    ann["onset"]  = ann["onset"]  - ann["start_time_s"].fillna(0)
    ann["offset"] = ann["offset"] - ann["start_time_s"].fillna(0)
else:
    ann["start_time_s"] = 0.0

# rimuovi intervalli nulli / negativi
ann = ann[ann["onset"] < ann["offset"]]
ann["idx"] = ann["filename"].str.replace(".mp3", "", regex=False)

CLASSES = sorted({c for cats in ann["categories"] for c in cats})
print(f"GT validi: {len(ann)}, clip: {ann['idx'].nunique()}, classi: {len(CLASSES)}")

# %% 3 – HELPER FUNZIONI

def covers_gt(pred, gt, thr=IOU_GT_RATIO):
    inter = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0]))
    return inter / (gt[1]-gt[0]) >= thr


def merge_segments(segs, gap=MERGE_GAP):
    """segs = list[(start,end)] già ordinati; merge se gap<gap"""
    if not segs:
        return []
    merged = [list(segs[0])]
    for a, b in segs[1:]:
        if a - merged[-1][1] <= gap:
            merged[-1][1] = max(merged[-1][1], b)
        else:
            merged.append([a, b])
    return [tuple(s) for s in merged]


def frames_to_segments(mask: np.ndarray, sr: int, hop: int):
    """Convert a 1‑D binary/probability mask (per frame) into (start,end) segments in **seconds**."""
    onsets = np.where(mask[:-1] <= 0)[0] + 1  # start after zero→nonzero
    onsets = np.insert(onsets, 0, 0) if mask[0] > 0 else onsets
    offsets = np.where(mask[1:] <= 0)[0] + 1  # first zero after nonzero
    offsets = np.append(offsets, len(mask)) if mask[-1] > 0 else offsets
    return [(o * hop / sr, off * hop / sr) for o, off in zip(onsets, offsets) if off > o]


def load_pred_segments(path, clip_len, start_shift=0.0, sr=SR_DEFAULT, hop=HOP_DEFAULT):
    """
    Restituisce (segments, class) già:
      • convertiti in secondi (se erano frame)
      • traslati di -start_shift (clip‑relativi)
      • filtrati fuori dal range [0, clip_len]
      • uniti con gap < MERGE_GAP
    """
    data = np.load(path, allow_pickle=True)
    segs_by_cat = defaultdict(list)

    for cls in data.files:
        arr = np.asarray(data[cls], dtype=float)
        cls_norm = cls.strip().lower()

                # ───────── decodifica formato ─────────
        if arr.ndim == 2 and arr.shape[1] in {2,3}:   # (N,2) o (N,3)
            if arr.shape[1] == 3:                     # probabilità per frame
                mask = arr[:, 0]                      # usa primo canale p(event)
                if mask.max() <= 0:
                    continue
                cand = frames_to_segments(mask, sr, hop)
            else:                                     # (N,2)
                is_frame = arr.max() > clip_len * 10
                cand = [(tuple(s * hop / sr) if is_frame else tuple(s)) for s in arr]

            for seg in cand:
                seg = (seg[0] - start_shift, seg[1] - start_shift)
                if seg[1] <= 0 or seg[0] >= clip_len:
                    continue
                segs_by_cat[cls_norm].append(seg)

        elif arr.ndim in {1, 2} and arr.shape[-1] == 1:  # mask per frame (N,1) o (N,)
            mask = arr.squeeze()
            if mask.max() <= 0:
                continue
            for seg in frames_to_segments(mask, sr, hop):
                seg = (seg[0] - start_shift, seg[1] - start_shift)
                if seg[1] <= 0 or seg[0] >= clip_len:
                    continue
                segs_by_cat[cls_norm].append(seg)
        else:
            print("⚠️ forma non gestita:", cls, arr.shape)
            print("⚠️ forma non gestita:", cls, arr.shape)

    # merge + restituisci
    segs_final, cats_final = [], []
    for cat, lst in segs_by_cat.items():
        lst.sort()
        for m in merge_segments(lst):
            segs_final.append(m); cats_final.append(cat)
    return segs_final, cats_final

# %% 4 – LOOP DI VALUTAZIONE
stats = {c: dict(TP=0, FP=0, FN=0) for c in CLASSES}

groups = ann.groupby("idx")
for idx, gt_rows in tqdm(groups, desc="eval", unit="file"):
    start_sec = float(gt_rows["start_time_s"].iloc[0])

    # durata clip in secondi dal feature file
    feat_path = FEAT_DIR / f"{idx}.npz"
    if not feat_path.exists():
        continue  # manca feature → salta clip
    feat = np.load(feat_path, allow_pickle=True)
    hop = int(feat.get("hop_length", HOP_DEFAULT))
    sr  = int(feat.get("sample_rate",  SR_DEFAULT))
    n_frames = feat["melspectrogram"].shape[0]
    clip_len = n_frames * hop / sr

    segs_pred, cats_pred = load_pred_segments(
        LABEL_DIR / f"{idx}_labels.npz",
        clip_len=clip_len,
        start_shift=start_sec,
        sr=sr, hop=hop,
    )

    # costruisci IntervalTree GT per classe
    tree = defaultdict(IntervalTree)
    for r in gt_rows.itertuples():
        for cat in r.categories:
            a, b = max(0, r.onset), max(0, r.offset)
            if b <= a or a >= clip_len:
                continue
            tree[cat].add(Interval(a, min(b, clip_len)))

    matched = {c: set() for c in CLASSES}

    for seg, cat in zip(segs_pred, cats_pred):
        if cat not in stats:
            continue
        overlaps = [iv for iv in tree[cat].overlap(*seg) if covers_gt(seg, (iv.begin, iv.end))]
        if overlaps:
            stats[cat]["TP"] += 1
            matched[cat].add((overlaps[0].begin, overlaps[0].end))
        else:
            stats[cat]["FP"] += 1

    for cat, tr in tree.items():
        stats[cat]["FN"] += len(tr) - len(matched[cat])

# %% 5 – BUILD METRICS DF – BUILD METRICS DF
rows = []
for cat, s in stats.items():
    tp, fp, fn = s.values()
    prec = tp / (tp+fp+1e-9)
    rec  = tp / (tp+fn+1e-9)
    f1   = 2*prec*rec/(prec+rec+1e-9)
    rows.append({"class":cat, **s, "precision":prec, "recall":rec, "f1":f1})

df = pd.DataFrame(rows).sort_values("class")
df.to_csv(OUT_DIR/"results.csv", index=False)
print("✓ Salvato results.csv in", OUT_DIR)

df.head()

# %% 6 – AGGREGATI
macro_f1 = df["f1"].mean()
micro_tp = df.TP.sum(); micro_fp = df.FP.sum(); micro_fn = df.FN.sum()
micro_p = micro_tp/(micro_tp+micro_fp+1e-9)
micro_r = micro_tp/(micro_tp+micro_fn+1e-9)
micro_f1 = 2*micro_p*micro_r/(micro_p+micro_r+1e-9)

with open(OUT_DIR/"overall_metrics.txt","w") as f:
    f.write(f"Macro-F1  : {macro_f1:.4f}\n");
    f.write(f"Micro-F1  : {micro_f1:.4f}\n");
    f.write(f"Micro-P   : {micro_p:.4f}\n");
    f.write(f"Micro-R   : {micro_r:.4f}\n");
print((OUT_DIR/"overall_metrics.txt").read_text())


Percorsi impostati ✅
GT validi: 27551, clip: 8230, classi: 58


eval: 100%|██████████| 8230/8230 [00:30<00:00, 271.84file/s]

✓ Salvato results.csv in eval_out
Macro-F1  : 0.4509
Micro-F1  : 0.4425
Micro-P   : 0.4663
Micro-R   : 0.4210




