# IEMOCAP Training Baseline

In [13]:
# Setup environment
import os
import sys
import numpy as np
import torch
from pathlib import Path

# Locate the project root. When the kernel's working directory is the
# 'iemocap_only_train' sub-folder, the root is its parent; otherwise the
# working directory itself is used as the root.
NOTEBOOK_DIR = Path(os.getcwd())
if NOTEBOOK_DIR.name == 'iemocap_only_train':
    PROJECT_ROOT = NOTEBOOK_DIR.parent
else:
    PROJECT_ROOT = NOTEBOOK_DIR

# Put the root first on sys.path so project-local packages are importable.
sys.path.insert(0, str(PROJECT_ROOT))

print(f"PyTorch: {torch.__version__}")
print(f"Project Root: {PROJECT_ROOT}")

# Prefer the GPU when CUDA is available; the GPU name is only queried in that case.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
gpu_label = '(' + torch.cuda.get_device_name(0) + ')' if cuda_available else ''
print(f"Device: {device} {gpu_label}")

PyTorch: 2.10.0+cu128
Project Root: d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25
Device: cuda (NVIDIA GeForce RTX 5060 Ti)


In [14]:
# Download IEMOCAP dataset
from utils.download_dataset_local import dowload_iemocap_local

# Fetch the local IEMOCAP copy through the project helper. A falsy return value
# is reported as a failed download; later cells use `iemocap_path` as the
# dataset root.
iemocap_path = dowload_iemocap_local()
if iemocap_path:
    print(f"✅ IEMOCAP: {iemocap_path}")
else:
    print("❌ Download failed")


--- Download IEMOCAP (locale) ---
✓ IEMOCAP già presente: d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\data\iemocap
Numero di file: 81249
✅ IEMOCAP: d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\data\iemocap\IEMOCAP_full_release


In [15]:
# Create IEMOCAP DataLoaders
from torch.utils.data import DataLoader
from dataset.custom_iemocap_dataset import CustomIEMOCAPDataset

# Build the three IEMOCAP splits in train / validation / test order. Only the
# training split is given spec_freq_mask and spec_time_mask; the other two
# rely on the dataset's defaults for those arguments.
train_dataset = CustomIEMOCAPDataset(
    dataset_root=iemocap_path,
    split='train',
    spec_freq_mask=30,
    spec_time_mask=15,
)
val_dataset = CustomIEMOCAPDataset(dataset_root=iemocap_path, split='validation')
test_dataset = CustomIEMOCAPDataset(dataset_root=iemocap_path, split='test')

# Wrap every split in a DataLoader with a shared batch size; only the
# training loader shuffles its samples.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

n_train, n_val, n_test = len(train_dataset), len(val_dataset), len(test_dataset)
print(f"Train: {n_train} | Val: {n_val} | Test: {n_test}")

✅ Caricate 5531 etichette
🔍 Raccogliendo campioni audio...
✅ Raccolti 2943 campioni audio validi
   - Solo campioni improvvisati
   - Emozioni: ['neutral', 'happy', 'sad', 'angry', 'happy']
📊 Statistiche del dataset IEMOCAP:

📊 ANALISI IEMOCAP TRAINING SET

🔹 SAMPLES TOTALI: 1678
🔹 SESSIONI: ['1', '2', '3']
🔹 SPEAKER UNICI (session, gender): 6
   Elenco: [('1', 'F'), ('1', 'M'), ('2', 'F'), ('2', 'M'), ('3', 'F'), ('3', 'M')]
🔹 IMPROVVISAZIONI UNICHE: 12
   Elenco: ['01', '02', '03', '04', '05', '05a', '05b', '06', '07', '08', '08a', '08b']

👥 SPEAKER INDEPENDENCE (per verificare leakage):
   - Sessione 1: (Ses1, F), (Ses1, M)
   - Sessione 2: (Ses2, F), (Ses2, M)
   - Sessione 3: (Ses3, F), (Ses3, M)

🎭 DISTRIBUZIONE EMOZIONI:
   - Angry     :  174 ( 10.4%) ██
   - Happy     :  472 ( 28.1%) █████
   - Neutral   :  638 ( 38.0%) ███████
   - Sad       :  394 ( 23.5%) ████

📋 DISTRIBUZIONE CAMPIONI PER SESSIONE:
   - S

In [16]:
# W&B Login
import wandb
import os
# SECURITY: never write the W&B API key into the notebook, because it ends up
# in version control and in every shared copy. The key that used to be hardcoded
# here must be treated as compromised and revoked in the W&B account settings.
# Provide the key outside the notebook instead (WANDB_API_KEY set in the
# environment before starting Jupyter, or credentials stored by a previous
# `wandb login`); wandb.login() uses those and otherwise prompts for the key.
wandb.login()

True

In [17]:
#start training
%cd {PROJECT_ROOT}/iemocap_only_train
!python train.py

d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\iemocap_only_train
Using device: cuda

✅ IEMOCAP trovato: d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\data\iemocap\IEMOCAP_full_release

✅ Caricate 5531 etichette
🔍 Raccogliendo campioni audio...
✅ Raccolti 2943 campioni audio validi
   - Solo campioni improvvisati
   - Emozioni: ['neutral', 'happy', 'sad', 'angry', 'happy']
📊 Statistiche del dataset IEMOCAP:

📊 ANALISI IEMOCAP TRAINING SET

🔹 SAMPLES TOTALI: 1678
🔹 SESSIONI: ['1', '2', '3']
🔹 SPEAKER UNICI (session, gender): 6
   Elenco: [('1', 'F'), ('1', 'M'), ('2', 'F'), ('2', 'M'), ('3', 'F'), ('3', 'M')]
🔹 IMPROVVISAZIONI UNICHE: 12
   Elenco: ['01', '02', '03', '04', '05', '05a', '05b', '06', '07', '08', '08a', '08b']

👥 SPEAKER INDEPENDENCE (per verificare leakage):
   - Sessione 1: (Ses1, F), (Ses1, M)
   - Sessione 2: (Ses2, F), (Ses2, M)
   - Sessione 3: (Ses3, F), (Ses3, M)

🎭 DISTRIBUZIONE EMOZIONI:
   - Angry     :  17

wandb: Currently logged in as: pagliarellomatteo (pagliarellomatteo-politecnico-di-torino) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: setting up run xa80iz5j
wandb: Tracking run with wandb version 0.23.1
wandb: Run data is saved locally in d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\iemocap_only_train\wandb\run-20260124_153517-xa80iz5j
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run train_20260124_163516
wandb:  View project at https://wandb.ai/pagliarellomatteo-politecnico-di-torino/speech-emotion-recognition
wandb:  View run at https://wandb.ai/pagliarellomatteo-politecnico-di-torino/speech-emotion-recognition/runs/xa80iz5j
  result = _VF.lstm(
wandb: updating run metadata
wandb: uploading output.log; uploading wandb-summary.json
wandb: 
wandb: Run history:
wandb:            epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb: swa_val_accuracy ▁▅▇

In [18]:
#start evaluation
%cd {PROJECT_ROOT}/iemocap_only_train
!python eval.py

d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\iemocap_only_train
Using device: cuda

Timestamp valutazione: 20260124_164020

✅ IEMOCAP trovato: d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\data\iemocap\IEMOCAP_full_release

Loading IEMOCAP test set...
✅ Caricate 5531 etichette
🔍 Raccogliendo campioni audio...
✅ Raccolti 2943 campioni audio validi
   - Solo campioni improvvisati
   - Emozioni: ['neutral', 'happy', 'sad', 'angry', 'happy']
📊 Statistiche del dataset IEMOCAP:

📊 ANALISI IEMOCAP TEST SET

🔹 SAMPLES TOTALI: 731
🔹 SESSIONI: ['5']
🔹 SPEAKER UNICI (session, gender): 2
   Elenco: [('5', 'F'), ('5', 'M')]
🔹 IMPROVVISAZIONI UNICHE: 8
   Elenco: ['01', '02', '03', '04', '05', '06', '07', '08']

👥 SPEAKER INDEPENDENCE (per verificare leakage):
   - Sessione 5: (Ses5, F), (Ses5, M)

🎭 DISTRIBUZIONE EMOZIONI:
   - Angry     :   31 (  4.2%) 
   - Happy     :  280 ( 38.3%) ███████
   - Neutral   :  287 ( 39.3%) ‚ñà‚

wandb: Currently logged in as: pagliarellomatteo (pagliarellomatteo-politecnico-di-torino) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.23.1
wandb: Run data is saved locally in d:\Roba da D\Poli\ML Vision\speech-emotion-recognition-25\iemocap_only_train\wandb\run-20260124_154020-jd0lnv2w
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run eval_20260124_164020
wandb:  View project at https://wandb.ai/pagliarellomatteo-politecnico-di-torino/speech-emotion-recognition
wandb:  View run at https://wandb.ai/pagliarellomatteo-politecnico-di-torino/speech-emotion-recognition/runs/jd0lnv2w
wandb: uploading artifact run-jd0lnv2w-classification_report; updating run metadata
wandb: uploading artifact run-jd0lnv2w-classification_report; uploading config.yaml
wandb: uploading artifact run-jd0lnv2w-classification_report
wandb: uploading history steps 0-1, summary, console lines 0-82
wandb: 
wandb: Run history:
wandb:    