# LAB 3: How to set up a project from scratch

In [None]:
# Delete any existing speech-emotion-recognition-25 directory (recursively, without prompting)
# so that the clone in the next step starts from a clean state.
!rm -rf speech-emotion-recognition-25

# Step 1: Clone your project from GitHub

In [None]:
# Disabled alternative: clone without -b (labelled "main" in the original note).
#!git clone https://github.com/MatteoPaglia/speech-emotion-recognition-25.git

# Clone the repository and check out the branch whose name follows -b.

!git clone -b RavdnessTrain https://github.com/MatteoPaglia/speech-emotion-recognition-25.git

In [None]:
# List the working directory to confirm the clone created the repository folder.
!ls

In [None]:
# Enter the cloned repository so the relative paths used by later cells
# (requirements.txt, utils/download_dataset.py, train.py, eval.py) resolve.
# Disabled alternative: %cd mldl_project_skeleton
%cd speech-emotion-recognition-25

In [None]:
# List the repository contents from inside the project directory.
!ls

# Step 2: Packages Installation


In [None]:
# Install the project's dependencies listed in requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

# Step 3: Dataset Setup
## Different options
- The first is downloading the data with a script that places it in the download folder (usually recommended)
- The second is uploading the dataset to your personal/institutional Google Drive and loading it from there ([Read More](https://saturncloud.io/blog/google-colab-how-to-read-data-from-my-google-drive/))
- The third is placing the download script directly here in Colab

You are free to do as you please in this phase.


In [None]:
# Run the project's dataset download script in a subprocess.
# NOTE(review): where the data ends up is decided by utils/download_dataset.py (not visible here);
# the next cell searches several candidate locations for it.
!python utils/download_dataset.py

In [None]:
import os
from pathlib import Path


def find_dataset_dir(dir_name, search_roots):
    """Locate the first directory named ``dir_name`` below any of ``search_roots``.

    The roots are visited in the order given; roots that do not exist are
    skipped. Each existing root is traversed with ``os.walk`` and the search
    stops at the first directory that contains a sub-directory called
    ``dir_name``.

    Args:
        dir_name: Exact name of the directory to look for.
        search_roots: Iterable of paths (``str`` or ``Path``) to search under.

    Returns:
        ``Path`` to the matching directory, or ``None`` when nothing matches.
    """
    for search_root in search_roots:
        search_root = Path(search_root)
        if not search_root.exists():
            continue
        for current_dir, subdirs, _files in os.walk(search_root):
            if dir_name in subdirs:
                return Path(current_dir) / dir_name
    return None


print("="*80)
print("üîç RICERCA PERCORSI DATASET")
print("="*80)

# Candidate locations where the datasets may have been downloaded.
possible_paths = [
    Path('/kaggle/input/'),
    Path.home() / '.cache' / 'kagglehub' / 'datasets',
    Path('/root/.cache/kagglehub/datasets'),
    Path('/tmp/kagglehub/datasets'),
    Path('./data'),
    Path('../data'),
    Path('../../data'),
]

# The current working directory is searched last.
possible_paths.append(Path.cwd())

print(f"\nüìÅ Directory corrente: {Path.cwd()}\n")

# Look for IEMOCAP.
print("üîé Ricerca IEMOCAP_full_release...")
iemocap_path = find_dataset_dir('IEMOCAP_full_release', possible_paths)
iemocap_found = iemocap_path is not None
if iemocap_found:
    print(f"‚úÖ IEMOCAP trovato a: {iemocap_path}")
else:
    print("‚ùå IEMOCAP non trovato nei percorsi standard")

# Look for RAVDESS.
print("\nüîé Ricerca ravdess-emotional-speech-audio...")
ravdess_path = find_dataset_dir('ravdess-emotional-speech-audio', possible_paths)
ravdess_found = ravdess_path is not None
if ravdess_found:
    print(f"‚úÖ RAVDESS trovato a: {ravdess_path}")
else:
    print("‚ùå RAVDESS non trovato nei percorsi standard")

# Show what is inside ./data, when that folder exists.
print("\nüìÇ Contenuto della cartella 'data/' (se presente):")
data_dir = Path('./data')
if data_dir.exists():
    for item in data_dir.iterdir():
        print(f"   - {item.name}")
else:
    print("   ‚ùå Cartella 'data/' non trovata")

# Summary of the variables that later cells read.
print("\n" + "="*80)
print("‚úÖ VARIABILI SALVATE:")
print(f"   - iemocap_path = {iemocap_path}")
print(f"   - ravdess_path = {ravdess_path}")
print("="*80)

In [None]:
from torch.utils.data import DataLoader
from dataset.custom_iemocap_dataset import CustomIEMOCAPDataset
from dataset.custom_ravdess_dataset import CustomRAVDESSDataset

# Split names passed to the dataset classes, in (train, validation, test) order.
SPLIT_NAMES = ('train', 'validation', 'test')
SECTION_RULE = "////////////////////////////////////////////////////////////////////////////////////////////"

# Batch size shared by every DataLoader created in this cell.
batch_size = 4


def print_section(title):
    """Print ``title`` framed by two separator lines."""
    print(SECTION_RULE)
    print(title)
    print(SECTION_RULE)


def resolve_dataset_root(found_path, fallback_root):
    """Pick the dataset root: the discovered path if usable, else the fallback.

    Args:
        found_path: ``Path`` produced by the dataset search cell, or ``None``.
        fallback_root: Path string used when ``found_path`` is missing or does
            not exist on disk.

    Returns:
        The chosen dataset root as a string.
    """
    if found_path and found_path.exists():
        dataset_root = str(found_path)
        print(f"‚úÖ Usando percorso trovato: {dataset_root}")
    else:
        dataset_root = fallback_root
        print(f"‚ö†Ô∏è  Percorso non trovato, usando fallback: {dataset_root}")
    return dataset_root


def build_split_datasets(dataset_cls, dataset_root):
    """Instantiate ``dataset_cls`` once per split and report the split sizes.

    Returns:
        Tuple ``(train, validation, test)`` of dataset instances.
    """
    datasets = tuple(
        dataset_cls(dataset_root=dataset_root, split=split_name)
        for split_name in SPLIT_NAMES
    )
    train_set, val_set, test_set = datasets
    print(f"Train samples: {len(train_set)}")
    print(f"Val samples: {len(val_set)}")
    print(f"Test samples: {len(test_set)}")
    return datasets


def build_split_dataloaders(datasets, batch_size):
    """Wrap ``(train, validation, test)`` datasets in DataLoaders.

    Only the training loader shuffles; validation and test keep dataset order.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    train_set, val_set, test_set = datasets
    return (
        DataLoader(train_set, batch_size=batch_size, shuffle=True),
        DataLoader(val_set, batch_size=batch_size, shuffle=False),
        DataLoader(test_set, batch_size=batch_size, shuffle=False),
    )


# ----- IEMOCAP -----
print_section("Dataset IEMOCAP")

# globals().get(...) keeps this cell runnable on a fresh kernel where the
# search cell was skipped: the fallback path is used instead of a NameError.
dataset_IEMOCAP_path = resolve_dataset_root(
    globals().get('iemocap_path'),
    '/kaggle/input/iemocapfullrelease/IEMOCAP_full_release',
)

(train_IEMOCAP_dataset,
 val_IEMOCAP_dataset,
 test_IEMOCAP_dataset) = build_split_datasets(CustomIEMOCAPDataset, dataset_IEMOCAP_path)

(train_IEMOCAP_dataloader,
 val_IEMOCAP_dataloader,
 test_IEMOCAP_dataloader) = build_split_dataloaders(
    (train_IEMOCAP_dataset, val_IEMOCAP_dataset, test_IEMOCAP_dataset), batch_size)


# ----- RAVDESS -----
print_section("Dataset RAVDESS")

dataset_RAVDESS_path = resolve_dataset_root(
    globals().get('ravdess_path'),
    '/kaggle/input/ravdess-emotional-speech-audio',
)

(train_RAVDESS_dataset,
 val_RAVDESS_dataset,
 test_RAVDESS_dataset) = build_split_datasets(CustomRAVDESSDataset, dataset_RAVDESS_path)

(train_RAVDESS_dataloader,
 val_RAVDESS_dataloader,
 test_RAVDESS_dataloader) = build_split_dataloaders(
    (train_RAVDESS_dataset, val_RAVDESS_dataset, test_RAVDESS_dataset), batch_size)

In [None]:
# Optional sanity check: plot two log-mel spectrograms per dataset split and
# print their statistics. This code used to be parked inside a string literal
# (never executed, and echoed verbatim as cell output); it is now real code
# behind an explicit switch that stays off by default.
SHOW_SAMPLE_SPECTROGRAMS = False


def format_sample_details(sample, sample_number, has_label=True, shape_note=""):
    """Build the text report lines for one dataset sample.

    Args:
        sample: Mapping with the keys 'audio_features', 'emotion',
            'emotion_id' and 'actor_id'. 'audio_features' must expose
            ``.shape`` plus ``.min()``, ``.max()`` and ``.mean()`` whose
            results have ``.item()``.
        sample_number: 1-based position shown in the sample heading.
        has_label: Selects the emotion line format: "(ID: ...)" when True,
            "| Audio ID: ..." when False.
        shape_note: Optional text appended after the spectrogram shape.

    Returns:
        List of strings, one per output line.
    """
    features = sample['audio_features']
    if has_label:
        emotion_line = f"   Emotion: {sample['emotion']} (ID: {sample['emotion_id']})"
    else:
        emotion_line = f"   Emotion: {sample['emotion']} | Audio ID: {sample['emotion_id']}"
    return [
        f"\nüìä Sample {sample_number}:",
        emotion_line,
        f"   Actor: {sample['actor_id']}",
        f"   Spectrogram Shape: {features.shape}{shape_note}",
        f"   Min value: {features.min().item():.2f} dB",
        f"   Max value: {features.max().item():.2f} dB",
        f"   Mean value: {features.mean().item():.2f} dB",
    ]


def sample_preview_groups():
    """Describe the dataset splits to preview, one entry per figure row.

    Reads the dataset variables created by the dataset-construction cell, so
    that cell must have been run first. The labelled/unlabelled flags follow
    the annotations of the original preview code.
    """
    return [
        {
            'name': 'IEMOCAP VALIDATION',
            'dataset': val_IEMOCAP_dataset,
            'has_label': True,
            'title_style': {'fontweight': 'bold', 'color': 'green'},
            'details_header': "\n‚úÖ IEMOCAP VALIDATION (con label):",
        },
        {
            'name': 'IEMOCAP TRAIN',
            'dataset': train_IEMOCAP_dataset,
            'has_label': False,
            'title_style': {'fontweight': 'bold', 'color': 'red'},
            'details_header': "\n‚ùå IEMOCAP TRAIN (SENZA label - Unsupervised):",
        },
        {
            'name': 'RAVDESS TRAIN',
            'dataset': train_RAVDESS_dataset,
            'has_label': True,
            'title_style': {'fontweight': 'bold'},
            'details_header': "\n‚úÖ RAVDESS TRAIN (con label):",
        },
    ]


def show_sample_spectrograms(samples_per_group=2):
    """Plot the first samples of each preview group and print their details.

    Args:
        samples_per_group: Number of leading samples taken from each dataset
            (one figure column per sample).
    """
    # Imported here so matplotlib is only required when the preview is enabled.
    import matplotlib.pyplot as plt

    groups = sample_preview_groups()
    fig, axes = plt.subplots(len(groups), samples_per_group, figsize=(16, 14), squeeze=False)
    fig.suptitle('Log-Mel Spectrograms: IEMOCAP (Train vs Validation) vs RAVDESS (Train)',
                 fontsize=16, fontweight='bold')

    for row, group in enumerate(groups):
        print(f"Loading {group['name']} samples...")
        # Each sample is fetched once and reused for both the plot and the report.
        group['samples'] = [group['dataset'][index] for index in range(samples_per_group)]
        label_note = '' if group['has_label'] else ' (NO LABEL)'

        for col, sample in enumerate(group['samples']):
            ax = axes[row, col]
            image = ax.imshow(sample['audio_features'].squeeze().numpy(),
                              aspect='auto', origin='lower', cmap='viridis')
            ax.set_title(
                f"{group['name']} Sample {col + 1}\nEmotion: {sample['emotion']}{label_note} | Actor: {sample['actor_id']}",
                **group['title_style'])
            ax.set_ylabel('Mel Frequency Bins')
            ax.set_xlabel('Time Frames')
            plt.colorbar(image, ax=ax, label='dB')

    plt.tight_layout()
    plt.show()

    # ===== Text report =====
    print("\n" + "="*80)
    print("SAMPLE DETAILS")
    print("="*80)

    first_sample = True
    for group in groups:
        print(group['details_header'])
        for sample_number, sample in enumerate(group['samples'], start=1):
            # Only the very first sample spells out the axis order, as in the original report.
            shape_note = " (channels, mel_bins, time_frames)" if first_sample else ""
            first_sample = False
            for line in format_sample_details(sample, sample_number,
                                              has_label=group['has_label'],
                                              shape_note=shape_note):
                print(line)

    print("\n" + "="*80)
    print("‚úÖ Datasets are ready for training!")
    print("="*80)


if SHOW_SAMPLE_SPECTROGRAMS:
    show_sample_spectrograms()

# Step 4: Train your model and visualize training

Weights & Biases: generates the plots and lets you compare experiments.

In [None]:
import getpass
import os

import wandb

# SECURITY: never commit an API key to the notebook. The key that used to be
# hardcoded here has been exposed and should be revoked in the W&B settings.
# The key is taken from the WANDB_API_KEY environment variable when present;
# otherwise it is requested interactively without echoing it to the output.
if not os.environ.get('WANDB_API_KEY'):
    os.environ['WANDB_API_KEY'] = getpass.getpass('Weights & Biases API key: ')

# The key stays in os.environ so that subprocesses started from this kernel
# (e.g. `!python train.py`) inherit it.
wandb.login()

In [None]:
# Launch the project's training script in a subprocess.
# NOTE(review): presumably logs to Weights & Biases using the login above — confirm in train.py.
!python train.py

# Step 5: Evaluate your model



In [None]:
# Launch the project's evaluation script in a subprocess.
# NOTE(review): presumably expects a model produced by train.py — confirm in eval.py.
!python eval.py