In [None]:
# Mount Google Drive at /content/drive (only meaningful inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/Colab \Notebooks

In [22]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, BertTokenizer, BertModel
import pandas as pd
import numpy as np

In [2]:
from torch import cuda
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

cpu


# Data preparation

In [3]:
# Notebook-wide data settings (the trailing `@param` annotations are Colab form fields).
# DATASET selects the data files as well as the tokenizer/encoder pair,
# MAX_LENGTH is the padded token/label length, BATCH_SIZE is used by the DataLoaders.
DATASET = 'MaSaC'    # @param ['MELD', 'MaSaC']
MAX_LENGTH = 128    # @param [96, 128, 256] {type: 'raw'}
BATCH_SIZE = 16    # @param [8, 16, 32] {type: 'raw'}

In [4]:
def get_data(dataset_name, stage):
    """Load one split of an EDiReF emotion-flip dataset from disk.

    Reads ``data/EDiReF_{stage}_data/{dataset_name}_{stage}_efr.json``, drops every
    conversation whose ``triggers`` list contains a missing value (``None``/NaN),
    and converts the remaining trigger entries to floats.

    Args:
        dataset_name: Dataset prefix used in the file name (e.g. ``'MELD'``).
        stage: Split name used in both the directory and the file name
            (e.g. ``'train'``).

    Returns:
        A tuple ``(conversations, emotions, triggers)`` of three parallel lists
        taken from the ``utterances``, ``emotions`` and ``triggers`` columns.
    """
    def to_float(x):
        # Entries that cannot be parsed as a number fall back to 1.
        try:
            return float(x)
        except ValueError:
            return 1

    df = pd.read_json(f'data/EDiReF_{stage}_data/{dataset_name}_{stage}_efr.json')

    # pd.isna is True for both None and NaN, so no separate None -> NaN pass is needed.
    is_complete = df["triggers"].apply(lambda lst: not any(pd.isna(x) for x in lst))

    # Take an explicit copy: assigning a column on a filtered frame otherwise
    # triggers pandas' chained-assignment (SettingWithCopy) warning.
    df = df[is_complete].copy()
    df["triggers"] = df["triggers"].apply(lambda lst: [to_float(x) for x in lst])

    conversations = list(df['utterances'])
    emotions = list(df['emotions'])
    triggers = list(df['triggers'])

    return conversations, emotions, triggers

In [5]:
# Load the 'train' and 'val' files of the selected dataset.
train_conversations, train_emotions, train_triggers = get_data(DATASET, 'train')
val_conversations, val_emotions, val_triggers = get_data(DATASET, 'val')

In [6]:
# Pool both loaded splits (train first, then val) into single parallel lists.
conversations = [*train_conversations, *val_conversations]
emotions = [*train_emotions, *val_emotions]
triggers = [*train_triggers, *val_triggers]

In [7]:
flattened_emotions = [sent for conv in emotions for sent in conv]
unique_emotions = set(flattened_emotions)

# Enumerate the labels in sorted order rather than in set-iteration order.
# String hashing is randomised per Python process, so iterating the set directly
# can assign different ids in different kernel sessions; a checkpoint saved in one
# session would then be evaluated against a permuted label mapping in another.
# key=str keeps the sort well-defined even if a label is not a string.
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_emotions, key=str))}
emotions = [[labels_to_ids[emotion] for emotion in conv] for conv in emotions]

In [8]:
from sklearn.model_selection import train_test_split

def train_val_test_split(X, y1, y2, val_size = 0.2, test_size = 0.2, random_state = None):
    """Split three parallel sequences into train, validation and test parts.

    ``val_size`` and ``test_size`` are fractions of the full input. The test part
    is cut off first; the validation part is then cut from what remains, using
    the same ``random_state`` for both cuts.

    Returns:
        ``(X_train, X_val, X_test, y1_train, y1_val, y1_test,
        y2_train, y2_val, y2_test)``.
    """
    # First cut: hold out the test portion.
    first_cut = train_test_split(
        X, y1, y2, test_size=test_size, random_state=random_state
    )
    X_rest, X_test, y1_rest, y1_test, y2_rest, y2_test = first_cut

    # val_size refers to the full input, so rescale it to the remainder.
    val_fraction = val_size / (1 - test_size)

    # Second cut: carve the validation portion out of the remainder.
    second_cut = train_test_split(
        X_rest, y1_rest, y2_rest, test_size=val_fraction, random_state=random_state
    )
    X_train, X_val, y1_train, y1_val, y2_train, y2_val = second_cut

    return (X_train, X_val, X_test, y1_train, y1_val, y1_test, y2_train, y2_val, y2_test)

In [9]:
# Re-split the pooled data: 15% test, 15% validation, the rest for training,
# with a fixed random_state. y1 holds emotion ids, y2 holds trigger labels.
X_train, X_val, X_test, y1_train, y1_val, y1_test, y2_train, y2_val, y2_test = train_val_test_split(
    conversations, emotions, triggers, test_size=0.15, val_size=0.15, random_state=2024
    )

In [10]:
# MELD is tokenized with RoBERTa; any other dataset uses multilingual cased BERT.
if DATASET == 'MELD':
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [11]:
def tokenize_conversation(conversations, max_length = 128, tok = None):
    """Tokenize each conversation as one sequence with separators between utterances.

    The utterances of a conversation are joined with the tokenizer's own separator
    token and encoded as a single sequence that is truncated/padded to
    ``max_length``.

    Args:
        conversations: Iterable of conversations, each a list of utterance strings.
        max_length: Fixed output length passed to the tokenizer.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.

    Returns:
        ``(input_ids, attention_masks)``: two lists holding one tensor per
        conversation, taken from the tokenizer output with the leading batch
        dimension squeezed away.
    """
    tok = tokenizer if tok is None else tok

    # Use the tokenizer's real separator. A literal "[SEP]" is only a special token
    # for BERT-style vocabularies; RoBERTa's separator is "</s>", so "[SEP]" would be
    # split into ordinary word pieces there.
    separator = f" {getattr(tok, 'sep_token', None) or '[SEP]'} "

    input_ids = []
    attention_masks = []

    for conversation in conversations:
        dialogue = separator.join(conversation)
        encoded = tok(
            dialogue,
            truncation = True,
            padding = 'max_length',
            max_length = max_length,
            return_tensors = "pt"
        )
        input_ids.append(encoded["input_ids"].squeeze(0))
        attention_masks.append(encoded["attention_mask"].squeeze(0))

    return input_ids, attention_masks

In [12]:
def pad_labels(labels, max_length = 128):
    """Turn per-conversation label lists into fixed-length float tensors.

    Each label list is converted to a float tensor, cut to at most ``max_length``
    entries and right-padded with ``-1`` up to exactly ``max_length``. The ``-1``
    entries mark positions to be ignored when computing the loss.

    Label ``i`` ends up at index ``i`` of the padded tensor.
    NOTE(review): elsewhere in this notebook these tensors are compared against
    per-token logits, so position ``i`` is matched with token ``i`` rather than
    utterance ``i`` - confirm this alignment is intended.

    Args:
        labels: Iterable of label lists (one list per conversation).
        max_length: Length of every returned tensor.

    Returns:
        A list with one 1-D float tensor of length ``max_length`` per input list.
    """
    padded_labels = []
    for label_set in labels:
        # Truncate first: a list longer than max_length would otherwise request a
        # negative-sized padding tensor and raise.
        label_tensor = torch.tensor(label_set[:max_length], dtype = torch.float)
        pad_len = max_length - label_tensor.numel()
        # Build the padding as float explicitly so torch.cat never has to promote
        # an integer tensor.
        padding = torch.full((pad_len,), -1.0, dtype = torch.float)
        padded_labels.append(torch.cat([label_tensor, padding]))
    return padded_labels

In [13]:
class ConversationDataset(Dataset):
    """Map-style dataset over four parallel, index-aligned sequences.

    Item ``idx`` is a dict with the ``idx``-th entry of each sequence under the
    keys ``input_ids``, ``attention_mask``, ``emotion_labels`` and
    ``trigger_labels``.
    """

    def __init__(self, input_ids, attention_masks, emotion_labels, trigger_labels):
        # The sequences are stored as given; nothing is copied or validated.
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.emotion_labels = emotion_labels
        self.trigger_labels = trigger_labels

    def __len__(self):
        # The dataset size is defined by the number of tokenized inputs.
        return len(self.input_ids)

    def __getitem__(self, idx):
        columns = (
            ("input_ids", self.input_ids),
            ("attention_mask", self.attention_masks),
            ("emotion_labels", self.emotion_labels),
            ("trigger_labels", self.trigger_labels),
        )
        return {key: values[idx] for key, values in columns}

In [14]:
def create_dataloader(conversations, emotions, triggers, max_length = 128, batch_size = None, shuffle = False):
    """Tokenize conversations, pad their labels and wrap everything in a DataLoader.

    Args:
        conversations: List of conversations (lists of utterance strings).
        emotions: Per-conversation lists of emotion ids.
        triggers: Per-conversation lists of trigger labels.
        max_length: Length used for both token sequences and padded label tensors.
        batch_size: Batch size of the loader; defaults to the notebook-level
            ``BATCH_SIZE`` when omitted.
        shuffle: Whether the loader reshuffles the data each epoch. Defaults to
            ``False`` (the previous fixed behaviour); pass ``True`` for a
            training loader.

    Returns:
        A ``DataLoader`` over a ``ConversationDataset``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    input_ids, attention_masks = tokenize_conversation(conversations, max_length = max_length)
    emotion_labels = pad_labels(emotions, max_length = max_length)
    trigger_labels = pad_labels(triggers, max_length = max_length)

    dataset = ConversationDataset(input_ids, attention_masks, emotion_labels, trigger_labels)
    loader = DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)

    return loader

In [15]:
# One loader per split, all padded/truncated to MAX_LENGTH.
# NOTE(review): create_dataloader builds its DataLoader unshuffled by default, so the
# training batches are visited in the same order every epoch.
train_loader = create_dataloader(X_train, y1_train, y2_train, max_length = MAX_LENGTH)
val_loader = create_dataloader(X_val, y1_val, y2_val, max_length = MAX_LENGTH)
test_loader = create_dataloader(X_test, y1_test, y2_test, max_length = MAX_LENGTH)

# Model configuration

In [16]:
# Mixture-of-experts settings (Colab form fields): architecture of the gating
# networks and of the experts, size of the expert pool, and how many experts are
# selected per example.
GATE_TYPE = 'linear'  # @param ['linear', 'mlp']
EXPERT_TYPE = 'linear' # @param ['linear', 'mlp', 'rnn']
NUM_EXPERTS = 2 # @param {type: 'slider', min: 1, max: 8, step: 1}
TOP_K = 2 # @param {type: 'slider', min: 1, max: 8, step: 1}

In [17]:
# The model takes the top TOP_K entries out of NUM_EXPERTS gate weights, so TOP_K
# cannot exceed the pool size.
assert TOP_K <= NUM_EXPERTS, "Select different values for TOP_K and NUM_EXPERTS!"

In [23]:
class MoEForEmotionAndTriggerClassification(nn.Module):
    """Pretrained encoder followed by a shared pool of experts with one gate per task.

    The encoder (RoBERTa when the notebook-level ``DATASET`` is ``'MELD'``,
    multilingual BERT otherwise) produces one hidden vector per token. Two gating
    networks - one for emotion classification, one for trigger classification -
    each weight the experts from a pooled summary of the sequence. For every
    example the top-``k`` experts are applied to the token embeddings and combined
    using their gate weights; a linear head per task turns the result into logits.
    """

    def __init__(self, num_experts, k, num_classes, gate_type = 'linear', expert_type = 'linear'):
        """Build the encoder, the two gates, the expert pool and the task heads.

        Args:
            num_experts: Number of experts in the shared pool.
            k: Number of experts activated per example and task.
            num_classes: Number of emotion classes.
            gate_type: ``'linear'`` or ``'mlp'``.
            expert_type: ``'linear'``, ``'mlp'`` or ``'rnn'``.

        Raises:
            ValueError: If ``k`` is outside ``[1, num_experts]`` or a type name is
                not recognised.
        """
        super(MoEForEmotionAndTriggerClassification, self).__init__()

        # Fail before the (large) encoder download if the routing cannot work.
        if not 1 <= k <= num_experts:
            raise ValueError(
                f"k must satisfy 1 <= k <= num_experts, got k={k}, num_experts={num_experts}"
            )

        self.pretrained_model = RobertaModel.from_pretrained('roberta-base') if DATASET == 'MELD' else BertModel.from_pretrained("bert-base-multilingual-cased")
        # The encoder is fine-tuned together with the heads; set this to False to
        # freeze it instead.
        for param in self.pretrained_model.parameters():
            param.requires_grad = True
        hidden_size = self.pretrained_model.config.hidden_size

        # Every gate and every expert comes from its own factory call. Re-using one
        # module object for both gates, or repeating one object inside the
        # ModuleList, would make them share a single set of weights. The factories
        # also honour the gate_type/expert_type arguments of this constructor.
        self.gating_network_emotion = self._build_gate(gate_type, hidden_size, num_experts)
        self.gating_network_trigger = self._build_gate(gate_type, hidden_size, num_experts)
        self.experts = nn.ModuleList(
            [self._build_expert(expert_type, hidden_size) for _ in range(num_experts)]
        )

        self.emotion_classifier = nn.Linear(hidden_size, num_classes)
        self.trigger_classifier = nn.Linear(hidden_size, 1)

        self.k = k
        self.dropout = nn.Dropout(p = 0.1)

    @staticmethod
    def _build_gate(gate_type, hidden_size, num_experts):
        """Return a new gating network mapping ``hidden_size`` to ``num_experts`` scores."""
        if gate_type == 'linear':
            return nn.Linear(hidden_size, num_experts)
        if gate_type == 'mlp':
            return nn.Sequential(nn.Linear(hidden_size, 512), nn.ReLU(), nn.Linear(512, num_experts))
        raise ValueError(f"Unknown gate_type {gate_type!r}; expected 'linear' or 'mlp'")

    @staticmethod
    def _build_expert(expert_type, hidden_size):
        """Return a new expert mapping ``hidden_size`` features to ``hidden_size`` features."""
        if expert_type == 'linear':
            return nn.Linear(hidden_size, hidden_size)
        if expert_type == 'mlp':
            return nn.Sequential(nn.Linear(hidden_size, 512), nn.ReLU(), nn.Linear(512, hidden_size))
        if expert_type == 'rnn':
            # batch_first=True so (batch, seq_len, hidden) input is read as sequences
            # of tokens; with the default layout each token would be treated as its
            # own length-1 sequence.
            return nn.LSTM(hidden_size, hidden_size, batch_first = True)
        raise ValueError(f"Unknown expert_type {expert_type!r}; expected 'linear', 'mlp' or 'rnn'")

    @staticmethod
    def _masked_mean(embeddings, attention_mask):
        """Average token embeddings over the positions where ``attention_mask`` is non-zero."""
        mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)    # (batch_size, seq_len, 1)
        # clamp guards against a division by zero for an all-padding row
        return (embeddings * mask).sum(dim = 1) / mask.sum(dim = 1).clamp(min = 1.0)

    @staticmethod
    def _run_expert(expert, inputs):
        """Apply one expert to a ``(n, seq_len, hidden_size)`` batch and return its output."""
        if isinstance(expert, nn.LSTM):
            # nn.LSTM returns (output, (h_n, c_n)); only the per-token output is used.
            outputs, _ = expert(inputs)
            return outputs
        return expert(inputs)

    def forward(self, input_ids, attention_mask):
        """Compute per-token logits for both tasks.

        Returns:
            ``(emotion_logits, trigger_logits)`` with shapes
            ``(batch_size, seq_len, num_classes)`` and ``(batch_size, seq_len)``.
        """
        model_outputs = self.pretrained_model(input_ids = input_ids, attention_mask = attention_mask)
        embeddings = model_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Summarise each sequence for the gates, leaving padding positions out of
        # the average.
        pooled_embeddings = self._masked_mean(embeddings, attention_mask)    # (batch_size, hidden_size)
        pooled_embeddings = self.dropout(pooled_embeddings)

        # expert weights for emotion classification
        expert_weights_emotion = self.gating_network_emotion(pooled_embeddings)  # (batch_size, num_experts)
        expert_weights_emotion = torch.softmax(expert_weights_emotion, dim = -1)

        # expert weights for trigger classification
        expert_weights_trigger = self.gating_network_trigger(pooled_embeddings)  # (batch_size, num_experts)
        expert_weights_trigger = torch.softmax(expert_weights_trigger, dim = -1)

        # aggregate expert outputs for each task
        combined_output_emotion = self._compute_expert_output(embeddings, expert_weights_emotion)
        combined_output_trigger = self._compute_expert_output(embeddings, expert_weights_trigger)

        combined_output_emotion = self.dropout(combined_output_emotion)
        combined_output_trigger = self.dropout(combined_output_trigger)

        emotion_logits = self.emotion_classifier(combined_output_emotion)   # (batch_size, seq_len, num_classes)
        trigger_logits = self.trigger_classifier(combined_output_trigger).squeeze(-1)   # (batch_size, seq_len)

        return emotion_logits, trigger_logits

    def _compute_expert_output(self, embeddings, expert_weights):
        """Combine the outputs of each example's top-``k`` experts.

        Args:
            embeddings: ``(batch_size, seq_len, hidden_size)`` token embeddings.
            expert_weights: ``(batch_size, num_experts)`` gate weights.

        Returns:
            A tensor shaped like ``embeddings``: for every example, the sum over
            its ``k`` highest-weighted experts of ``weight * expert(embeddings)``.
        """
        combined_output = torch.zeros_like(embeddings)

        # top-k experts only are activated
        topk_weights, topk_indices = torch.topk(expert_weights, self.k, dim = -1)

        for slot in range(self.k):
            slot_experts = topk_indices[:, slot]                                # (batch_size,)
            slot_weights = topk_weights[:, slot].unsqueeze(-1).unsqueeze(-1)    # (batch_size, 1, 1)

            # Run each expert once on all examples routed to it in this slot rather
            # than once per example.
            slot_output = torch.zeros_like(embeddings)
            for expert_id, expert in enumerate(self.experts):
                routed = slot_experts == expert_id
                if routed.any():
                    slot_output[routed] = self._run_expert(expert, embeddings[routed])

            combined_output = combined_output + slot_weights * slot_output

        return combined_output

# Training parameters

In [20]:
# Optimisation settings (Colab form fields): AdamW learning rate and number of
# passes over the training loader.
LEARNING_RATE = 0.00002  # @param {type: 'slider', min: 1E-5, max: 5E-5, step: 1E-5}
NUM_EPOCHS = 10  # @param {type: 'slider', min: 5, max: 25, step: 5}

In [24]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss

# Build the model from the notebook-level configuration; one output class per
# distinct emotion label.
moe = MoEForEmotionAndTriggerClassification(
    num_experts = NUM_EXPERTS,
    k = TOP_K,
    num_classes = len(labels_to_ids),
    gate_type = GATE_TYPE,
    expert_type = EXPERT_TYPE,
)

# A single optimizer updates every parameter of the model.
optimizer = AdamW(moe.parameters(), lr = LEARNING_RATE)

# Multi-class loss for emotions, binary (logit-based) loss for triggers.
emotion_loss_fn = CrossEntropyLoss()
trigger_loss_fn = BCEWithLogitsLoss()

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [25]:
# Move the model's parameters to the selected device; batches are moved to the same
# device inside the training and evaluation loops.
moe.to(device)

MoEForEmotionAndTriggerClassification(
  (pretrained_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

In [None]:
def remove_padding(logits, labels, task):
    """Flatten logits and labels and drop every position whose label is -1.

    Args:
        logits: Model output; flattened to ``(N, num_classes)`` when ``task`` is
            ``'emotion'`` and to ``(N,)`` for any other task value.
        labels: Label tensor in which ``-1`` marks positions to discard.
        task: ``'emotion'`` keeps the last logits dimension; anything else
            flattens the logits completely.

    Returns:
        ``(logits, labels)`` restricted to the positions whose label is not ``-1``.
    """
    keep = (labels != -1).view(-1)

    if task == 'emotion':
        flat_logits = logits.view(-1, logits.size(-1))
    else:
        flat_logits = logits.view(-1)
    flat_labels = labels.view(-1)

    return flat_logits[keep], flat_labels[keep]

In [None]:
def evaluate(model, val_loader):
    """Run the model over ``val_loader`` without gradients and report loss and accuracy.

    Uses the notebook-level ``device``, ``emotion_loss_fn`` and ``trigger_loss_fn``.
    Positions labelled ``-1`` are removed before the loss and the accuracies are
    computed. Every 100th batch (starting with the first) the running mean loss is
    printed.

    Returns:
        ``(avg_val_loss, avg_val_accuracy)``: the summed batch loss divided by the
        number of batches, and the mean of the emotion and trigger accuracies.
    """
    batch_keys = ('input_ids', 'attention_mask', 'emotion_labels', 'trigger_labels')

    model.eval()
    running_loss, steps_done = 0.0, 0
    emotion_hits, emotion_seen = 0, 0
    trigger_hits, trigger_seen = 0, 0

    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            input_ids, attention_mask, emotion_labels, trigger_labels = (
                batch[key].to(device) for key in batch_keys
            )

            emotion_logits, trigger_logits = model(input_ids, attention_mask)

            # keep only the labelled positions
            emotion_logits, emotion_labels = remove_padding(emotion_logits, emotion_labels, 'emotion')
            trigger_logits, trigger_labels = remove_padding(trigger_logits, trigger_labels, 'trigger')

            # joint loss: unweighted sum of both task losses
            batch_loss = (
                emotion_loss_fn(emotion_logits, emotion_labels.long())
                + trigger_loss_fn(trigger_logits, trigger_labels)
            )
            running_loss += batch_loss.item()

            # hard predictions: arg-max class / sigmoid thresholded at 0.5
            emotion_preds = emotion_logits.argmax(dim=-1)
            trigger_preds = (torch.sigmoid(trigger_logits).squeeze(-1) > 0.5).long()

            emotion_hits += (emotion_preds == emotion_labels).sum().item()
            trigger_hits += (trigger_preds == trigger_labels).sum().item()
            emotion_seen += emotion_labels.numel()
            trigger_seen += trigger_labels.numel()

            steps_done += 1

            if step % 100 == 0:
                loss_step = running_loss / steps_done
                print(f'      Validation loss per 100 training steps: {loss_step}')

    avg_val_loss = running_loss / len(val_loader)
    emotion_accuracy = emotion_hits / emotion_seen
    trigger_accuracy = trigger_hits / trigger_seen

    return avg_val_loss, (emotion_accuracy + trigger_accuracy)/2

In [None]:
def train_and_validate(model, train_loader, val_loader, num_epochs = 3):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Uses the notebook-level ``optimizer``, ``device``, ``emotion_loss_fn`` and
    ``trigger_loss_fn``. The training objective is the sum of the emotion and the
    trigger loss on positions whose label is not ``-1``. Progress is printed:
    the running mean loss every 100th batch, and a loss/accuracy summary for the
    training and validation data at the end of every epoch.
    """
    batch_keys = ('input_ids', 'attention_mask', 'emotion_labels', 'trigger_labels')

    for epoch in range(num_epochs):
        print(f"Epoch [{epoch + 1}/{num_epochs}]")
        # evaluate() leaves the model in eval mode, so switch back every epoch
        model.train()
        train_loss, steps_done = 0.0, 0
        emotion_hits, emotion_seen = 0, 0
        trigger_hits, trigger_seen = 0, 0

        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()

            input_ids, attention_mask, emotion_labels, trigger_labels = (
                batch[key].to(device) for key in batch_keys
            )

            emotion_logits, trigger_logits = model(input_ids, attention_mask)

            # keep only the labelled positions
            emotion_logits, emotion_labels = remove_padding(emotion_logits, emotion_labels, 'emotion')
            trigger_logits, trigger_labels = remove_padding(trigger_logits, trigger_labels, 'trigger')

            # joint loss: unweighted sum of both task losses
            loss = (
                emotion_loss_fn(emotion_logits, emotion_labels.long())
                + trigger_loss_fn(trigger_logits, trigger_labels)
            )
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

            # accuracy bookkeeping uses the logits computed before this update
            emotion_preds = emotion_logits.argmax(dim=-1)
            trigger_preds = (torch.sigmoid(trigger_logits).squeeze(-1) > 0.5).long()

            emotion_hits += (emotion_preds == emotion_labels).sum().item()
            trigger_hits += (trigger_preds == trigger_labels).sum().item()
            emotion_seen += emotion_labels.numel()
            trigger_seen += trigger_labels.numel()
            steps_done += 1

            if step % 100 == 0:
                loss_step = train_loss / steps_done
                print(f'      Training loss per 100 training steps: {loss_step}')

        avg_train_loss = train_loss / len(train_loader)
        emotion_accuracy = emotion_hits / emotion_seen
        trigger_accuracy = trigger_hits / trigger_seen
        avg_train_accuracy = (emotion_accuracy + trigger_accuracy)/2

        val_loss, val_accuracy = evaluate(model, val_loader)

        print(f"   Training Loss: {avg_train_loss:.3f}, Training Accuracy: {avg_train_accuracy:.3f}")
        print(f"   Validation Loss: {val_loss:.3f}, Validation Accuracy: {val_accuracy:.3f}\n")

In [None]:
# Train for NUM_EPOCHS epochs; the validation loader is evaluated after every epoch.
train_and_validate(moe, train_loader, val_loader, num_epochs = NUM_EPOCHS)

In [None]:
import os

# Save only the weights (state dict). The file name encodes the configuration so
# that runs with different settings do not overwrite each other.
checkpoint_path = f'trained_models/{DATASET}/moe_model_{GATE_TYPE}_dual_gate_{NUM_EXPERTS}_{EXPERT_TYPE}_experts_{TOP_K}_active_{LEARNING_RATE}_lr_{NUM_EPOCHS}_epochs.pth'

# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(moe.state_dict(), checkpoint_path)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def get_metrics(model, data_loader, dev):
    """Compute accuracy, precision, recall and F1 for both tasks over a whole loader.

    Predictions and labels of all batches are collected first and each metric is
    computed once over the complete set. Averaging per-batch precision/recall/F1
    would weight a short final batch like a full one and would depend on which
    classes happen to occur in each batch, so it does not give the dataset-level
    score. Positions labelled ``-1`` are excluded. Precision, recall and F1 use
    ``average='weighted'``.

    Args:
        model: Model returning ``(emotion_logits, trigger_logits)``.
        data_loader: Loader yielding dicts with ``input_ids``, ``attention_mask``,
            ``emotion_labels`` and ``trigger_labels``.
        dev: Device the batches are moved to; the model is expected to already
            live there.

    Returns:
        ``(emotion_accuracy, emotion_precision, emotion_recall, emotion_f1,
        trigger_accuracy, trigger_precision, trigger_recall, trigger_f1)``.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()

    emotion_preds_all, emotion_labels_all = [], []
    trigger_preds_all, trigger_labels_all = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(dev)
            attention_mask = batch['attention_mask'].to(dev)
            emotion_labels = batch['emotion_labels'].to(dev)
            trigger_labels = batch['trigger_labels'].to(dev)

            # Forward pass
            emotion_logits, trigger_logits = model(input_ids, attention_mask)

            # Emotion predictions: arg-max class at every labelled position
            emotion_logits, emotion_labels = remove_padding(emotion_logits, emotion_labels, 'emotion')
            emotion_preds_all.append(torch.argmax(emotion_logits, dim = -1).cpu())
            emotion_labels_all.append(emotion_labels.cpu())

            # Trigger predictions: sigmoid probability thresholded at 0.5
            trigger_logits, trigger_labels = remove_padding(trigger_logits, trigger_labels, 'trigger')
            trigger_preds_all.append((torch.sigmoid(trigger_logits) > 0.5).long().cpu())
            trigger_labels_all.append(trigger_labels.cpu())

    if not emotion_labels_all:
        raise ValueError("data_loader yielded no batches; cannot compute metrics")

    def summarize(label_chunks, pred_chunks):
        # Score one task on the concatenation of all collected batches.
        labels_flat = torch.cat(label_chunks).numpy()
        preds_flat = torch.cat(pred_chunks).numpy()

        accuracy = accuracy_score(labels_flat, preds_flat)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels_flat, preds_flat, average='weighted', zero_division = 0
        )
        return accuracy, precision, recall, f1

    emotion_metrics = summarize(emotion_labels_all, emotion_preds_all)
    trigger_metrics = summarize(trigger_labels_all, trigger_preds_all)

    return emotion_metrics + trigger_metrics

In [None]:
# Score the model trained in this session on the test loader.
(
    avg_emotion_accuracy, avg_emotion_precision, avg_emotion_recall, avg_emotion_f1,
    avg_trigger_accuracy, avg_trigger_precision, avg_trigger_recall, avg_trigger_f1,
) = get_metrics(moe, test_loader, device)

# Report the emotion metrics first, then the trigger metrics.
print("Emotion classification:")
print(f"   Accuracy: {avg_emotion_accuracy:.3f}")
print(f"   Precision: {avg_emotion_precision:.3f}")
print(f"   Recall: {avg_emotion_recall:.3f}")
print(f"   F1-score: {avg_emotion_f1:.3f}")

print("\n Trigger classification:")
print(f"   Accuracy: {avg_trigger_accuracy:.3f}")
print(f"   Precision: {avg_trigger_precision:.3f}")
print(f"   Recall: {avg_trigger_recall:.3f}")
print(f"   F1-score: {avg_trigger_f1:.3f}")

# Load and test trained model

In [None]:
# Rebuild the architecture with the current configuration, then restore the saved
# weights onto the CPU. The file name must match the one used when saving.
moe_loaded = MoEForEmotionAndTriggerClassification(
    num_experts = NUM_EXPERTS,
    k = TOP_K,
    num_classes = len(labels_to_ids),
    gate_type = GATE_TYPE,
    expert_type = EXPERT_TYPE,
)
moe_loaded.load_state_dict(
    torch.load(
        f'trained_models/{DATASET}/moe_model_{GATE_TYPE}_dual_gate_{NUM_EXPERTS}_{EXPERT_TYPE}_experts_{TOP_K}_active_{LEARNING_RATE}_lr_{NUM_EPOCHS}_epochs.pth',
        map_location=torch.device('cpu'),
    )
)

In [None]:
# Score the reloaded model on the test loader, running on the CPU.
(
    avg_emotion_accuracy, avg_emotion_precision, avg_emotion_recall, avg_emotion_f1,
    avg_trigger_accuracy, avg_trigger_precision, avg_trigger_recall, avg_trigger_f1,
) = get_metrics(moe_loaded, test_loader, 'cpu')

# Report the emotion metrics first, then the trigger metrics.
print("Emotion classification:")
print(f"   Accuracy: {avg_emotion_accuracy:.3f}")
print(f"   Precision: {avg_emotion_precision:.3f}")
print(f"   Recall: {avg_emotion_recall:.3f}")
print(f"   F1-score: {avg_emotion_f1:.3f}")

print("\n Trigger classification:")
print(f"   Accuracy: {avg_trigger_accuracy:.3f}")
print(f"   Precision: {avg_trigger_precision:.3f}")
print(f"   Recall: {avg_trigger_recall:.3f}")
print(f"   F1-score: {avg_trigger_f1:.3f}")