In [10]:
from datasets import load_from_disk
from sklearn.model_selection import train_test_split
import torch

dataset = load_from_disk("./iemocap_precomputed")

emotion_labels = ['neutral', 'happy', 'sad', 'angry', 'frustrated', 'excited', 'fear', 'disgust', 'surprise', 'other']
num_classes = len(emotion_labels)

label_to_idx = {label: idx for idx, label in enumerate(emotion_labels)}

full_data = dataset['train']
full_data_list = full_data.to_list()

import random
random.seed(42)
random.shuffle(full_data_list)

# Split into train (80%), val (10%), and test (10%)
train_data, temp_data = train_test_split(full_data_list, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

train_dataset = dataset["train"].select(range(len(train_data)))
val_dataset = dataset["train"].select(range(len(train_data), len(train_data) + len(val_data)))
test_dataset = dataset["train"].select(range(len(train_data) + len(val_data), len(full_data_list)))

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")
print(f"Test size: {len(test_dataset)}")


Train size: 8031
Validation size: 1004
Test size: 1004


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [12]:
from torch.utils.data import Dataset

class IEMOCAPDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)


    def __getitem__(self, idx):
        sample = self.dataset[idx]

        audio_emb = torch.tensor(sample['audio_embedding']).float() # (768,)
        text_emb = torch.tensor(sample["text_embedding"]).float() # (768,)
        label = torch.tensor(sample["label_id"], dtype=torch.long)

        input_features = torch.cat((audio_emb, text_emb), dim=0) # (1536,)

        return {"input_features": input_features, "labels": label}

train_ds = IEMOCAPDataset(train_dataset)
val_ds = IEMOCAPDataset(val_dataset)
test_ds = IEMOCAPDataset(test_dataset)

In [13]:
import torch.nn as nn

class Classifier(nn.Module):
    def __init__(self, num_classes=10):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(768 + 768, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)
    
    def forward(self, input_features, labels=None):
        x = self.fc1(input_features)
        x = self.relu(x)
        x = self.fc2(x)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(x, labels)  # Compute loss

        return {"loss": loss, "logits": x} if loss is not None else {"logits": x}
    
model = Classifier(num_classes).to(device)
model

Classifier(
  (fc1): Linear(in_features=1536, out_features=512, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

In [14]:
import numpy as np
import evaluate

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./classifier_model",
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True
)



In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
    
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,1.7605,1.612756,0.336653
200,1.6178,1.53363,0.381474
300,1.5538,1.46943,0.438247
400,1.5036,1.482029,0.39741
500,1.452,1.426551,0.416335
600,1.459,1.409524,0.449203
700,1.3799,1.371469,0.474104
800,1.3561,1.382987,0.471116
900,1.3661,1.357896,0.478088
1000,1.3858,1.360279,0.472112


TrainOutput(global_step=1506, training_loss=1.4303562337183857, metrics={'train_runtime': 59.1424, 'train_samples_per_second': 407.373, 'train_steps_per_second': 25.464, 'total_flos': 0.0, 'train_loss': 1.4303562337183857, 'epoch': 3.0})

In [17]:
test_results = trainer.evaluate(test_ds)
test_results

{'eval_loss': 1.4017748832702637,
 'eval_accuracy': 0.47410358565737054,
 'eval_runtime': 1.29,
 'eval_samples_per_second': 778.284,
 'eval_steps_per_second': 48.837,
 'epoch': 3.0}