# PROBABILE STRUTTURA
1. Caricamento dei file audio
2. Parsing del filename → estrazione emotion_id
3. Creazione di un DataFrame per verificare che il parsing sia corretto  ← qui va inserito il codice
4. Generazione dei mel-spectrogram
5. Scelta dei parametri (n_mels, hop_length, win_length, durata massima, padding)
6. Test di un paio di augmentations


In [2]:
import os
# Display the current working directory: the relative paths used by the
# following cells ("../src", "../data") are resolved against it.
os.getcwd()


'c:\\Users\\franc\\Politecnico Di Torino Studenti Dropbox\\Francesca Melloni\\POLITO\\2°anno, I semestre\\ML\\Project ML\\SER-Machine-Learning-Project\\notebooks'

In [9]:
import sys
sys.path.append("../src")

import pandas as pd
import os
from glob import glob
from preprocessing.dataset import extract_emotion_label, EMOTION_MAP

# Collect every .wav file stored under ../data/Actor_*/.
# The list is sorted because glob() yields paths in arbitrary,
# platform-dependent order, and positional picks such as audio_files[0]
# should refer to the same file on every machine.
audio_files = sorted(glob("../data/Actor_*/*.wav"))
print("Numero totale file:", len(audio_files))

rows = []

for f in audio_files:
    # File names are dash-separated fields, e.g. 03-01-01-01-01-01-01.wav.
    # Strip the extension first so the last field is a clean identifier.
    filename = os.path.basename(f)
    parts = os.path.splitext(filename)[0].split('-')

    # Third field is the emotion code; EMOTION_MAP translates it to a label.
    emotion_id = parts[2]
    emotion_label = EMOTION_MAP[emotion_id]

    # Last field (right before the extension) is the actor number.
    actor = parts[-1]

    rows.append([f, emotion_id, emotion_label, actor])

# One row per audio file; used by the following cells for sanity checks.
df = pd.DataFrame(rows, columns=["filepath", "emotion_id", "emotion_label", "actor"])
df.head()



Numero totale file: 1440


Unnamed: 0,filepath,emotion_id,emotion_label,actor
0,../data\Actor_01\03-01-01-01-01-01-01.wav,1,neutral,1
1,../data\Actor_01\03-01-01-01-01-02-01.wav,1,neutral,1
2,../data\Actor_01\03-01-01-01-02-01-01.wav,1,neutral,1
3,../data\Actor_01\03-01-01-01-02-02-01.wav,1,neutral,1
4,../data\Actor_01\03-01-02-01-01-01-01.wav,2,calm,1


In [None]:
# Sanity check: verify that the dataset table was populated correctly.

print("=== DATASET CHECK ===\n")

print(f"Totale file audio: {len(df)}\n")

# Number of files per emotion label, sorted alphabetically by label.
print("Distribuzione emozioni:")
print(df["emotion_label"].value_counts().sort_index(), "\n")

# Number of files per actor, sorted by actor id.
print("Distribuzione attori:")
print(df["actor"].value_counts().sort_index(), "\n")

# Missing-value check: report the affected columns, or confirm that the
# table is complete.
missing_per_column = df.isnull().sum()
if missing_per_column.any():
    print("✘ Valori mancanti per colonna:")
    print(missing_per_column[missing_per_column > 0])
else:
    print("✔ Nessun valore mancante nel dataset")


=== DATASET CHECK ===

Totale file audio: 1440

Distribuzione emozioni:
emotion_label
angry        192
calm         192
disgust      192
fearful      192
happy        192
neutral       96
sad          192
surprised    192
Name: count, dtype: int64 

Distribuzione attori:
actor
01    60
02    60
03    60
04    60
05    60
06    60
07    60
08    60
09    60
10    60
11    60
12    60
13    60
14    60
15    60
16    60
17    60
18    60
19    60
20    60
21    60
22    60
23    60
24    60
Name: count, dtype: int64 

✔ Nessun valore mancante nel dataset


In [None]:
import torch
import torchaudio
import torchaudio.transforms as T
import matplotlib.pyplot as plt
import numpy as np

# Feature-extraction settings.
SAMPLE_RATE = 16000   # target sampling rate in Hz
N_MELS = 64
N_FFT = 1024
HOP_LENGTH = 160      # 10 ms at 16 kHz
WIN_LENGTH = 400      # 25 ms at 16 kHz
MAX_DURATION = 4.0    # seconds
MAX_SAMPLES = int(SAMPLE_RATE * MAX_DURATION)
NORM_EPS = 1e-8       # lower bound for the std used in normalisation

# 1. AUDIO LOADING
audio_path = audio_files[0]  # any file will do for this test

# Load the audio (returns: waveform [channels, samples], sample_rate).
waveform, sr = torchaudio.load(audio_path)

# Down-mix to mono: stereo information (spatial / room cues) is not needed.
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Resample to SAMPLE_RATE when the file uses a different rate.
if sr != SAMPLE_RATE:
    resampler = T.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
    waveform = resampler(waveform)

# 2. PAD / CROP TO A FIXED DURATION
num_samples = waveform.shape[1]

if num_samples < MAX_SAMPLES:
    # Zero-pad at the end up to MAX_SAMPLES.
    padding = MAX_SAMPLES - num_samples
    waveform = torch.nn.functional.pad(waveform, (0, padding))
else:
    # Keep only the first MAX_SAMPLES samples.
    waveform = waveform[:, :MAX_SAMPLES]

# 3. MEL-SPECTROGRAM
mel_transform = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    win_length=WIN_LENGTH,
    n_mels=N_MELS,
    power=2.0  # power spectrogram
)

mel_spec = mel_transform(waveform)  # shape: [1, n_mels, time]

# 4. LOG-MEL (small offset avoids log(0) on empty bins)
log_mel_spec = torch.log(mel_spec + 1e-9)

# 5. PER-SAMPLE NORMALISATION
# The divisor is clamped so that a constant spectrogram (e.g. a silent clip,
# whose std is 0) yields zeros instead of NaNs.
mean = log_mel_spec.mean()
std = log_mel_spec.std()
log_mel_spec = (log_mel_spec - mean) / torch.clamp(std, min=NORM_EPS)

# 6. SHAPE AND VALUE CHECK
print("Shape log-mel:", log_mel_spec.shape)
print("Min:", log_mel_spec.min().item())
print("Max:", log_mel_spec.max().item())
print("Mean:", log_mel_spec.mean().item())
print("Std:", log_mel_spec.std().item())

# 7. VISUALISATION
plt.figure(figsize=(10, 4))
plt.imshow(
    log_mel_spec.squeeze(0).numpy(),  # drop the channel axis -> [n_mels, time]
    origin="lower",                   # low mel bins at the bottom
    aspect="auto"
)
plt.colorbar()
plt.title("Log-Mel Spectrogram (normalizzato)")
plt.xlabel("Time frames")
plt.ylabel("Mel bins")
plt.tight_layout()
plt.show()