In [None]:
# %% [markdown]
# # MLPC 2025 · Frame-level classifier (Task 1 → Task 2 bridge)
#
# Output: for each clip `ID.mp3`, one file `frame_out/ID.npz`
#         containing `probs` ∈ ℝ^{T×10}, with columns in the order of the 10 target classes.

# %% ---------------------------------------------------------------------------
import os, itertools, math, pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

# ---- constants --------------------------------------------------------------
# Target classes. Label matrices are stacked in this order, so the position of
# a name in this list is its column index.
CLASSES = [
    'Speech',
    'Shout',
    'Chainsaw',
    'Jackhammer',
    'Lawn Mower',
    'Power Drill',
    'Dog Bark',
    'Rooster Crow',
    'Horn Honk',
    'Siren',
]

# Dataset layout (adjust DATASET_PATH if necessary).
DATASET_PATH   = Path("../MLPC2025_classification")
AF_DIR         = DATASET_PATH.joinpath("audio_features")
LABEL_DIR      = DATASET_PATH.joinpath("labels")        # only present in the training set
META_CSV       = DATASET_PATH.joinpath("metadata.csv")

# Destination of the per-file probability archives; created on first run.
FRAME_OUT_DIR  = Path("./frame_out")
FRAME_OUT_DIR.mkdir(exist_ok=True)

SEED           = 42

print(f"Audio‑features dir: {AF_DIR}")
print(f"Label dir        : {LABEL_DIR}  (solo in training set)")
print(f"Salvataggio in   : {FRAME_OUT_DIR.resolve()}")

# %% ---------------------------------------------------------------------------
# Utility: load the features and labels of a single file -> (embeddings, T×10 label matrix)
def load_xy(file_id: str):
    """Load the embeddings and the frame-level label matrix of one file.

    Parameters
    ----------
    file_id : str
        File stem. Features are read from ``AF_DIR/<file_id>.npz`` and labels
        from ``LABEL_DIR/<file_id>_labels.npz``.

    Returns
    -------
    tuple or None
        ``(X, Y)`` where ``X`` is the ``"embeddings"`` array of the feature
        archive and ``Y`` has one column per entry of ``CLASSES`` (in that
        order), each column being the flattened label array stored under the
        class name. ``None`` when either file does not exist.

    Raises
    ------
    KeyError
        If an archive lacks ``"embeddings"`` or one of the class names.
    """
    feat_path  = AF_DIR   / f"{file_id}.npz"
    label_path = LABEL_DIR / f"{file_id}_labels.npz"
    if not feat_path.exists() or not label_path.exists():
        return None

    # np.load on an .npz returns a lazily-read archive that holds the file open;
    # the context managers release the handle as soon as the arrays are read.
    with np.load(feat_path) as feat_npz:
        X = feat_npz["embeddings"]           # presumably shape (T, D) — TODO confirm
    with np.load(label_path) as lab_npz:
        # Build the frame-level label matrix in the order given by CLASSES.
        # NOTE(review): reshape(-1) flattens every axis, so a label array that
        # is not (T,) or (T, 1) yields more rows than X has frames.
        Y = np.stack([np.array(lab_npz[c]).reshape(-1) for c in CLASSES], axis=1)
    return X, Y

# %% ---------------------------------------------------------------------------
# 1. Build the frame-level dataset by concatenating all available files.
# glob() yields entries in a filesystem-dependent order; sorting pins the row
# order of the concatenated dataset so the seeded split below is reproducible.
file_ids = sorted(p.stem for p in AF_DIR.glob("*.npz"))
print(f"Feature files trovati: {len(file_ids)}")

def load_valid(fid):
    """Load one file via ``load_xy`` and keep it only if it is consistent.

    Returns the ``(x, y)`` pair when ``load_xy`` produced data and both arrays
    have the same length along axis 0; returns ``None`` otherwise.
    """
    loaded = load_xy(fid)
    if loaded is None:
        return None
    feats, labels = loaded
    # Feature rows and label rows must line up one-to-one.
    return (feats, labels) if feats.shape[0] == labels.shape[0] else None

# Load every file in parallel; load_valid returns None for files that are
# missing a label archive or whose feature/label lengths disagree.
pairs = Parallel(n_jobs=-1)(delayed(load_valid)(fid) for fid in file_ids)
pairs = [p for p in pairs if p is not None]

# Fail with an actionable message instead of the opaque unpacking error that
# zip(*[]) would otherwise trigger.
if not pairs:
    raise ValueError(
        f"No usable (features, labels) pair found: checked {len(file_ids)} "
        f"feature file(s) in {AF_DIR} against labels in {LABEL_DIR}"
    )

Xs, Ys = zip(*pairs)
X_all  = np.concatenate(Xs, axis=0)          # rows of all files stacked: (sum of T) × D
Y_all  = np.concatenate(Ys, axis=0).astype(int)

print("Dataset complessivo:", X_all.shape, Y_all.shape)

# %% ---------------------------------------------------------------------------
# 2. Train / validation split (80 / 20), stratified on the number of active
#    classes in each row of Y_all.
# NOTE(review): rows of X_all are split individually, so rows coming from the
# same file can land on both sides of the split — confirm this is intended.
strat = np.sum(Y_all, axis=1)
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_all,
    Y_all,
    stratify=strat,
    test_size=0.2,
    random_state=SEED,
)
print("Train:", X_tr.shape, "Val:", X_val.shape)

# %% ---------------------------------------------------------------------------
# 3. Train one binary classifier per class (10 in total).
# The active base model is a logistic regression; an alternative RandomForest
# configuration is kept here, commented out:
# base_clf = RandomForestClassifier(
#     n_estimators=100, n_jobs=-1, class_weight='balanced', random_state=SEED
# )
base_clf = LogisticRegression(
    solver='saga',
    penalty='l2',
    class_weight='balanced',
    random_state=SEED,
    max_iter=1000,
    n_jobs=-1,
)

classifiers = []

for i in range(len(CLASSES)):
    # clone() yields an unfitted copy with the same hyper-parameters; column i
    # of Y_tr is the binary target for CLASSES[i].
    clf_i = clone(base_clf).fit(X_tr, Y_tr[:, i])
    classifiers.append(clf_i)


Audio‑features dir: ../MLPC2025_classification/audio_features
Label dir        : ../MLPC2025_classification/labels  (solo in training set)
Salvataggio in   : /Users/Q540900/Desktop/Sparkling---Pattern-Classification-Project/04 - Model Training/frame_out
Feature files trovati: 8230




Dataset complessivo: (1416482, 768) (1416482, 10)
Train: (1133185, 768) Val: (283297, 768)




In [None]:
# ---- quick report -----------------------------------------------------------
# pred_val = np.column_stack([clf.predict(X_val) for clf in classifiers])
# # print(classification_report(Y_val, pred_val, target_names=CLASSES, zero_division=0))
# pred_val_binary = (pred_val > 0).astype(int)
# print(classification_report(Y_val, pred_val_binary, target_names=CLASSES, zero_division=0))


# try:
#     proba_val = np.column_stack([clf.predict_proba(X_val)[:, 1] for clf in classifiers])
#     print("Micro-AUC:", roc_auc_score(Y_val, proba_val, average="micro"))
# except Exception as e:
#     print("AUC skip:", e)

# %% ---------------------------------------------------------------------------
# 4. Inference on **all** files and saving of the .npz outputs ------------------
def save_probs(file_id: str):
    """Predict per-class probabilities for one file and store them as .npz.

    Parameters
    ----------
    file_id : str
        File stem. Embeddings are read from ``AF_DIR/<file_id>.npz``.

    Returns
    -------
    str
        Name of the written archive (``<file_id>.npz`` inside
        ``FRAME_OUT_DIR``), which holds a single array ``probs`` with one
        column per fitted model in ``classifiers``.
    """
    feat_path = AF_DIR / f"{file_id}.npz"
    out_path  = FRAME_OUT_DIR / f"{file_id}.npz"
    # Close the archive right after reading; this runs once per file and would
    # otherwise leave one open handle behind for each of them.
    with np.load(feat_path) as feat_npz:
        X = feat_npz["embeddings"]
    # Column 1 of predict_proba is the second entry of clf.classes_ —
    # presumably the positive label; verify against the training targets.
    probs = np.column_stack([clf.predict_proba(X)[:, 1] for clf in classifiers])
    np.savez_compressed(out_path, probs=probs)
    return out_path.name

# Run inference for every discovered feature file and collect the written file names.
saved = list(map(save_probs, file_ids))
print(f"Salvati {len(saved)} file in {FRAME_OUT_DIR}")

Salvati 8230 file in frame_out


In [None]:
# %% ---------------------------------------------------------------------------
# 3b. Persist the trained models (one pickle per class)
import joblib

MODEL_DIR = Path("./models")
MODEL_DIR.mkdir(exist_ok=True)

for i in range(len(classifiers)):
    clf = classifiers[i]
    # File name carries both the class index and the class name.
    model_path = MODEL_DIR.joinpath(f"classifier_{i}_{CLASSES[i]}.pkl")
    joblib.dump(clf, model_path)

print(f"Salvati {len(classifiers)} modelli in {MODEL_DIR.resolve()}")


Salvati 10 modelli in /Users/Q540900/Desktop/Sparkling---Pattern-Classification-Project/04 - Model Training/models
