In [None]:
%matplotlib inline
import torch
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import os

In [1]:
# Backbone trained by this run; one of: yamnet / evo_cnn / whisper
MODEL_TYPE = "yamnet"

# Lookup from backbone key to the feature name selected for it.
# Presumably these names match features exposed by the dataset — verify.
_FEATURE_BY_MODEL = dict(
    yamnet="waveforms",
    evo_cnn="melspecs",
    whisper="whisper_inputs",
)

# An unknown MODEL_TYPE raises KeyError here.
FEATURE_TYPE = _FEATURE_BY_MODEL[MODEL_TYPE]


In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

NameError: name 'torch' is not defined

In [None]:
import sys

# Add the parent directory to the module search path before the imports below.
# NOTE(review): append() places ".." at the END of sys.path, so an installed
# package with the same name (e.g. a third-party `datasets`, `models` or `train`)
# would be found first — confirm this cannot happen in the target environment.
sys.path.append("..")

# Presumably project-local modules resolved through the path entry above — verify.
from python_helpers import get_project_root_dir
from datasets import SoundTracksDataset
from models import YAMNetMER, EvoCNN, WhisperMER
from train import ModelTrainer



In [None]:
# Build the dataset with its default constructor arguments.
full_dataset = SoundTracksDataset()

dataset_size = len(full_dataset)
print(f"Dataset size: {dataset_size} samples")

# The reported shape is always that of the first mel-spectrogram entry,
# whatever FEATURE_TYPE is selected.
first_melspec = full_dataset.melspecs[0]
print(f"Sample features shape: {first_melspec.shape}")

In [None]:
if MODEL_TYPE == "whisper":
    # Imported here, next to the other Whisper-only dependency, so the
    # non-Whisper paths do not need these packages. `torchaudio` was used
    # below without ever being imported, which raised NameError on this path.
    import torchaudio
    from transformers import WhisperProcessor

    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")

    def process_whisper(waveform, orig_sr=44100, target_sr=16000):
        """Resample one waveform and run it through the Whisper processor.

        Args:
            waveform: Audio tensor to convert. Assumed to be a CPU torch
                tensor, since ``.numpy()`` is called on the resampled
                result — TODO confirm.
            orig_sr: Sample rate of ``waveform`` in Hz. Defaults to 44100,
                the value previously hard-coded here.
                NOTE(review): confirm it matches the dataset's audio.
            target_sr: Sample rate the waveform is resampled to and that is
                reported to the processor. Defaults to 16000.

        Returns:
            Whatever ``processor(...)`` returns when called with
            ``return_tensors="pt"``.
        """
        resampled = torchaudio.functional.resample(waveform, orig_sr, target_sr)
        return processor(
            resampled.numpy(),
            sampling_rate=target_sr,
            return_tensors="pt"
        )

    # Convert all waveforms to Whisper inputs
    whisper_features = [process_whisper(wf) for wf in full_dataset.waveforms]
    # Register the converted inputs under the key that FEATURE_TYPE uses for whisper.
    full_dataset.features['whisper_inputs'] = whisper_features


# Two-stage split: first carve off `test`, then split the remainder into
# `train` and `val`.
# Presumably split_size is the fraction given to the second returned split — verify.
train_val, test = full_dataset.train_test_split(split_size=0.2)
train, val = train_val.train_test_split(split_size=0.25)

n_train, n_val, n_test = len(train), len(val), len(test)
print(f"Train: {n_train}, Val: {n_val}, Test: {n_test}")

In [None]:
# Map each model key to its class rather than to an instance, so only the
# selected architecture is constructed. Building all three up front would
# create (and then discard) two unused models on every run.
model_selector = {
    "yamnet": YAMNetMER,
    "evo_cnn": EvoCNN,
    "whisper": WhisperMER,
}

# Instantiate the chosen model with default arguments and move it to the
# selected device. An unknown MODEL_TYPE raises KeyError here.
model = model_selector[MODEL_TYPE]().to(device)
print(f"Selected {MODEL_TYPE} model:")
print(model)

# Training Configuration
trainer = ModelTrainer(task='multiclass', num_classes=4, device=device)

# Training Parameters: one set of hyper-parameters per model key.
hyperparams_by_model = {
    "yamnet": dict(batch_size=64, lr=0.001, epochs=20),
    "evo_cnn": dict(batch_size=32, lr=0.0005, epochs=50),
    "whisper": dict(batch_size=2, lr=2e-5, epochs=10),
}
config = hyperparams_by_model[MODEL_TYPE]
batch_size, max_epochs, learning_rate = (
    config["batch_size"],
    config["epochs"],
    config["lr"],
)

# Start Training
# Both splits are moved to the training device as they are passed in.
trainer.train(
    model=model,
    train_dset=train.to(device),
    val_dset=val.to(device),
    batch_size=batch_size,
    max_epochs=max_epochs,
    lr=learning_rate,
    take_best=True,
)


In [None]:
# Move the held-out split onto the compute device before scoring it.
test = test.to(device)

# The result is unpacked as (loss, accuracy, confusion matrix), in that order.
evaluation = trainer.evaluate_performance(model, test)
test_loss, test_acc, test_cm = evaluation

print(f"\nTest Accuracy: {test_acc:.2%}", "Confusion Matrix:", sep="\n")
print(test_cm)

In [None]:
# Tick labels for the four classes.
# Assumes the confusion-matrix rows/columns follow this order — TODO confirm.
emotion_labels = ['Happy', 'Sad', 'Anger', 'Neutral']
tick_positions = list(range(4))

# Draw the matrix as a heatmap using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = ax.imshow(test_cm.cpu().numpy(), cmap='Blues')
ax.set_title('Confusion Matrix')
ax.set_xticks(tick_positions)
ax.set_xticklabels(emotion_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(emotion_labels)
fig.colorbar(heatmap, ax=ax)
plt.show()