In [3]:
import os
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
from pathlib import Path

# === CONFIGURATION ===
SAMPLING_RATE = 32000   # Hz; audio is resampled to this rate on load
SEGMENT_LENGTH = 1.2    # segment duration in seconds
N_MELS = 64             # number of mel bands
HOP_LENGTH = 320        # samples between frames (10 ms at 32 kHz)
WINDOW_SIZE = 1024      # FFT window size in samples

# === PATHS ===
# All dataset inputs live under one root, relative to the notebook's directory.
DATASET_ROOT = Path("../../MLPC2025_classification")
LABELS_DIR = DATASET_ROOT / "labels"
AUDIO_DIR = DATASET_ROOT / "audio"
METADATA_PATH = DATASET_ROOT / "metadata.csv"

# Destination for the generated per-segment .npz files; created if missing.
OUTPUT_DIR = Path("data") / "segments"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# === TARGET CLASSES ===
# The order here defines the column order of every saved label vector.
CLASSES = [
    'Speech',
    'Dog Bark',
    'Rooster Crow',
    'Shout',
    'Lawn Mower',
    'Chainsaw',
    'Jackhammer',
    'Power Drill',
    'Horn Honk',
    'Siren',
]

def extract_logmel(y, sr):
    """Compute a log-mel spectrogram of an audio signal.

    The power mel spectrogram is built with the module-level WINDOW_SIZE,
    HOP_LENGTH and N_MELS settings, then converted to decibels relative to
    the maximum power found in this particular signal (so the largest value
    of the result is 0 dB).

    Args:
        y: audio samples to analyse.
        sr: sampling rate of ``y`` in Hz.

    Returns:
        The dB-scaled mel spectrogram, transposed so that time is the first
        axis and frequency the second, i.e. shape (T, F).
    """
    stft_settings = dict(n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH, n_mels=N_MELS)
    power_mel = librosa.feature.melspectrogram(y=y, sr=sr, power=2.0, **stft_settings)
    # ref=np.max scales each signal against its own peak power.
    return librosa.power_to_db(power_mel, ref=np.max).T

def load_labels_for_file(file_stem, labels_dir=None, classes=None):
    """Load the per-class label arrays stored for one recording.

    Labels are read from ``<labels_dir>/<file_stem>_labels.npz``.

    Args:
        file_stem: recording name without its extension.
        labels_dir: directory holding the label archives; defaults to the
            module-level LABELS_DIR.
        classes: names of the archive entries to read; defaults to the
            module-level CLASSES.

    Returns:
        A dict mapping each requested class name to its array, or None when
        no label archive exists for ``file_stem``.

    Raises:
        KeyError: if the archive has no entry for one of the requested classes.
    """
    if labels_dir is None:
        labels_dir = LABELS_DIR
    if classes is None:
        classes = CLASSES

    label_path = Path(labels_dir) / f"{file_stem}_labels.npz"
    if not label_path.exists():
        return None

    # np.load on an .npz returns a lazy archive that keeps the file open.
    # Read every array inside the `with` so the handle is released on exit
    # instead of leaking one open file per processed recording.
    with np.load(label_path) as raw:
        return {cls: raw[cls] for cls in classes}

def process_file(filename):
    """Cut one recording into fixed-length segments and save each to disk.

    For every full, non-overlapping SEGMENT_LENGTH-second window of the
    recording, a log-mel spectrogram and a multi-hot label vector (one entry
    per name in CLASSES, in that order) are written to
    ``OUTPUT_DIR/<file_stem>_start-<start seconds, 1 decimal>.npz`` under the
    keys ``logmel`` and ``labels``. Trailing audio shorter than one segment
    is discarded.

    Args:
        filename: audio file name, resolved relative to AUDIO_DIR.

    Returns:
        None. If no label archive exists for the recording, a warning is
        printed and nothing is written.
    """
    file_stem = Path(filename).stem

    # Look up the labels first so recordings without labels are not decoded.
    labels = load_labels_for_file(file_stem)
    if labels is None:
        print(f"⚠️ Labels not found for {file_stem}")
        return

    y, _ = librosa.load(AUDIO_DIR / filename, sr=SAMPLING_RATE)

    # All boundaries are computed in integer samples. Multiplying float
    # seconds and truncating (e.g. int(3 * 1.2 * 32000) == 115199) shifts
    # segment starts by one sample and can miscount the segments.
    segment_samples = int(round(SEGMENT_LENGTH * SAMPLING_RATE))
    n_segments = len(y) // segment_samples  # only complete segments

    # Squeeze each class array once per file; atleast_1d keeps a single-frame
    # array sliceable after squeeze() would have reduced it to 0-d.
    class_tracks = [np.atleast_1d(np.squeeze(labels[cls])) for cls in CLASSES]

    for i in range(n_segments):
        start_sample = i * segment_samples
        end_sample = start_sample + segment_samples
        segment_audio = y[start_sample:end_sample]

        # Log-mel features for this segment.
        logmel = extract_logmel(segment_audio, SAMPLING_RATE)

        # Label frames covered by this segment.
        # NOTE(review): this assumes the label arrays hold one entry per
        # HOP_LENGTH audio samples -- TODO confirm against the dataset's label
        # resolution. If the labels use a coarser frame rate, these indices
        # run past the end of the arrays and the slice below is empty, which
        # silently yields an all-zero label vector.
        start_frame = start_sample // HOP_LENGTH
        end_frame = end_sample // HOP_LENGTH

        # A class is active in the segment if any covered frame exceeds 0.5.
        segment_label = np.array(
            [np.any(track[start_frame:end_frame] > 0.5) for track in class_tracks],
            dtype=np.float32,
        )

        # Save features and labels together; the name encodes the start time.
        start = start_sample / SAMPLING_RATE
        out_path = OUTPUT_DIR / f"{file_stem}_start-{start:.1f}.npz"
        np.savez_compressed(out_path, logmel=logmel, labels=segment_label)


In [4]:
# Segment every recording listed in the metadata table.
metadata_df = pd.read_csv(METADATA_PATH)
filenames = metadata_df['filename'].tolist()

# Best-effort batch: one bad recording must not abort the whole run, but each
# failure is kept in `failed_files` so it can be inspected afterwards instead
# of being lost in the progress output.
failed_files = []
for fname in tqdm(filenames):
    try:
        process_file(fname)
    except Exception as e:
        failed_files.append((fname, repr(e)))
        # tqdm.write prints the message without corrupting the progress bar.
        tqdm.write(f"Errore su {fname}: {e}")

if failed_files:
    print(f"{len(failed_files)} of {len(filenames)} files failed; see `failed_files`.")


100%|██████████| 8230/8230 [18:37<00:00,  7.36it/s]  
