In [4]:
pip install emoji contractions

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
import joblib
import re
import emoji
import contractions
import torch
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
import random
from nltk.corpus import wordnet
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from collections import defaultdict

In [6]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load datasets
train_df = pd.read_csv('/kaggle/input/elreg-datasets/train.csv', delimiter='\t')
test_df = pd.read_csv('/kaggle/input/elreg-datasets/test.csv', delimiter='\t')
dev_df = pd.read_csv('/kaggle/input/elreg-datasets/dev.csv', delimiter='\t')

print(f"Training set: {train_df.shape}")
print(f"Development set: {dev_df.shape}")
print(f"Test set: {test_df.shape}")

Using device: cuda
Training set: (6908, 6)
Development set: (893, 6)
Test set: (3289, 6)


In [7]:
# Text preprocessing functions
def convert_emojis(text):
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r':([a-zA-Z_]+):', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_text(text):
    # Lowercase
    text = text.lower()
    # expand contractions
    text = contractions.fix(text)
    # convert emojis
    text = convert_emojis(text)
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove user mentions and hashtags
    # text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'@\w+', '', text)
    # Remove special characters and numbers (except punctuation)
    text = re.sub(r"[^a-zA-Z\s.,!?']", '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply text cleaning
train_df["clean_text"] = train_df["Tweet"].apply(clean_text)
dev_df["clean_text"] = dev_df["Tweet"].apply(clean_text)
test_df["clean_text"] = test_df["Tweet"].apply(clean_text)

# Define emotion columns
emotion_cols = ["joy", "sadness", "anger", "fear"]

In [8]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # Exclude same word and avoid underscores (multi-word expressions)
            if lemma.name().lower() != word.lower() and '_' not in lemma.name():
                synonyms.add(lemma.name().lower())
    return list(synonyms)

def safe_synonym_replace(text, replacement_prob=0.3):
    words = text.split()
    new_words = []

    for word in words:
        if random.random() < replacement_prob:
            synonyms = get_synonyms(word)
            if synonyms:
                new_word = random.choice(synonyms)
                new_words.append(new_word)
            else:
                new_words.append(word)
        else:
            new_words.append(word)

    return ' '.join(new_words)

def inject_noise(text, deletion_prob=0.05, swap_prob=0.05):
    chars = list(text)
    new_text = []

    i = 0
    while i < len(chars):
        if random.random() < deletion_prob:
            i += 1  # Skip the character (deletion)
            continue

        if i < len(chars) - 1 and random.random() < swap_prob:
            # Swap characters
            new_text.append(chars[i+1])
            new_text.append(chars[i])
            i += 2
            continue

        new_text.append(chars[i])
        i += 1

    return ''.join(new_text)

def augment_with_oversampling_and_noise(df, emotion_cols, threshold=2000, replacement_prob=0.3, deletion_prob=0.05, swap_prob=0.05):
    emotion_counts = df[emotion_cols].gt(0).sum()
    underrepresented = emotion_counts[emotion_counts < threshold].index.tolist()
    print("Underrepresented emotions:", underrepresented)

    augmented_rows = []

    for emotion in underrepresented:
        current_count = emotion_counts[emotion]
        needed = threshold - current_count
        print(f"→ Augmenting {needed} rows for emotion: {emotion}")

        if needed <= 0:
            continue

        candidates = df[df[emotion] > 0].sample(n=needed, replace=True, random_state=42).copy()

        half = needed // 2
        oversample_part = candidates.iloc[:half].copy()
        noise_part = candidates.iloc[half:].copy()

        oversample_part['clean_text'] = oversample_part['clean_text'].apply(
            lambda x: inject_noise(
                safe_synonym_replace(x, replacement_prob),
                deletion_prob=deletion_prob,
                swap_prob=swap_prob
            )
        )

        # Apply synonym replacement + noise to noise_part
        noise_part['clean_text'] = noise_part['clean_text'].apply(
            lambda x: inject_noise(
                safe_synonym_replace(x, replacement_prob),
                deletion_prob=deletion_prob,
                swap_prob=swap_prob
            )
        )

        augmented_rows.append(pd.concat([oversample_part, noise_part], ignore_index=True))

    if augmented_rows:
        df_aug = pd.concat([df] + augmented_rows, ignore_index=True)
        df_aug = df_aug.sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"Final augmented dataset size: {len(df_aug)}")
        return df_aug
    else:
        print("No augmentation was necessary.")
        return df

emotion_cols = ['joy', 'sadness', 'anger', 'fear']
train_df = augment_with_oversampling_and_noise(
    train_df,
    emotion_cols,
    threshold = 2000,
    replacement_prob=0.2,
    deletion_prob=0.05,
    swap_prob=0.05
)

Underrepresented emotions: ['joy', 'sadness', 'anger']
→ Augmenting 385 rows for emotion: joy
→ Augmenting 467 rows for emotion: sadness
→ Augmenting 299 rows for emotion: anger
Final augmented dataset size: 8059


In [9]:
# Check for missing values in emotion columns
print("Missing values in emotion columns:")
for df, name in [(train_df, "train"), (dev_df, "dev"), (test_df, "test")]:
    print(f"\n{name} dataset:")
    for col in emotion_cols:
        missing = df[col].isna().sum()
        total = len(df)
        print(f"{col}: {missing} missing values ({missing/total*100:.1f}%)")

# Fill missing values with 0 (indicating absence of that emotion)
for df in [train_df, dev_df, test_df]:
    for col in emotion_cols:
        df[col] = df[col].fillna(0.0)

Missing values in emotion columns:

train dataset:
joy: 0 missing values (0.0%)
sadness: 0 missing values (0.0%)
anger: 0 missing values (0.0%)
fear: 0 missing values (0.0%)

dev dataset:
joy: 0 missing values (0.0%)
sadness: 0 missing values (0.0%)
anger: 0 missing values (0.0%)
fear: 0 missing values (0.0%)

test dataset:
joy: 0 missing values (0.0%)
sadness: 0 missing values (0.0%)
anger: 0 missing values (0.0%)
fear: 0 missing values (0.0%)


In [10]:
# Load EmoLex features
def load_lex(filepath):
    lexicon = defaultdict(dict)
    with open(filepath, 'r') as file:
        for line in file:
            word, emotion, value = line.strip().split('\t')
            if int(value) == 1:
                lexicon[word][emotion] = 1
    return lexicon

nrc_lexicon = load_lex("/kaggle/input/nrc-lexicons/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")

def extract_lex(text, lexicon):
    emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
              'sadness', 'surprise', 'trust', 'positive', 'negative']
    counts = dict.fromkeys(emotions, 0)

    for word in text.split():
        if word in lexicon:
            for emo in lexicon[word]:
                counts[emo] += 1
    return [counts[emo] for emo in emotions]

# Extract lexicon features
train_df['lexicons'] = train_df['clean_text'].apply(lambda x: extract_lex(x, nrc_lexicon))
test_df['lexicons'] = test_df['clean_text'].apply(lambda x: extract_lex(x, nrc_lexicon))
dev_df['lexicons'] = dev_df['clean_text'].apply(lambda x: extract_lex(x, nrc_lexicon))

train_lex = np.array(train_df['lexicons'].tolist())
test_lex = np.array(test_df['lexicons'].tolist())
dev_lex = np.array(dev_df['lexicons'].tolist())

In [11]:
# Load VAD Lexicons
def load_nrc_vad(filepath):
    vad_lex = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        next(f)  # skip header
        for line in f:
            word, val, aro, dom = line.strip().split('\t')
            vad_lex[word] = {
                'valence': float(val),
                'arousal': float(aro),
                'dominance': float(dom)
            }
    return vad_lex

nrc_vad_lexicon = load_nrc_vad("/kaggle/input/nrc-lexicons/NRC-VAD-Lexicon-v2.1.txt")

def extract_vad(text, lexicon):
    valence = []
    arousal = []
    dominance = []

    for word in text.split():
        if word in lexicon:
            valence.append(lexicon[word]['valence'])
            arousal.append(lexicon[word]['arousal'])
            dominance.append(lexicon[word]['dominance'])

    # If no word matched, return zeros
    if not valence:
        return [0.0, 0.0, 0.0]

    # Otherwise, return means
    return [
        np.mean(valence),
        np.mean(arousal),
        np.mean(dominance)
    ]

# Extract lexicon features
train_df['vad'] = train_df['clean_text'].apply(lambda x: extract_vad(x, nrc_vad_lexicon))
test_df['vad'] = test_df['clean_text'].apply(lambda x: extract_vad(x, nrc_vad_lexicon))
dev_df['vad'] = dev_df['clean_text'].apply(lambda x: extract_vad(x, nrc_vad_lexicon))

train_vad = np.array(train_df['vad'].tolist())
test_vad = np.array(test_df['vad'].tolist())
dev_vad = np.array(dev_df['vad'].tolist())

In [12]:
# Load HashEmo Lexicons
from collections import defaultdict

def load_nrc_hash_emo(filepath):
    lexicon = defaultdict(dict)
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            emotion, word, score = line.strip().split('\t')
            lexicon[word][emotion] = float(score)
    return lexicon

hash_emo_lex = load_nrc_hash_emo('/kaggle/input/nrc-lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2.txt')

def extract_hash_emo(text, lexicon):
    emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                'sadness', 'surprise', 'trust']
    scores = {emo: [] for emo in emotions}

    for word in text.split():
        if word in lexicon:
            for emo, value in lexicon[word].items():
                scores[emo].append(value)

    return [np.mean(scores[emo]) if scores[emo] else 0.0 for emo in emotions]

train_df['hash'] = train_df['clean_text'].apply(lambda x: extract_hash_emo(x, hash_emo_lex))
test_df['hash'] = test_df['clean_text'].apply(lambda x: extract_hash_emo(x, hash_emo_lex))
dev_df['hash'] = dev_df['clean_text'].apply(lambda x: extract_hash_emo(x, hash_emo_lex))

train_hash = np.array(train_df['hash'].tolist())
test_hash = np.array(test_df['hash'].tolist())
dev_hash = np.array(dev_df['hash'].tolist())

In [13]:
scaler_hash = StandardScaler()
train_hash = scaler_hash.fit_transform(train_hash)
test_hash = scaler_hash.transform(test_hash)
dev_hash = scaler_hash.transform(dev_hash)

scaler_lex = StandardScaler()
train_lex = scaler_lex.fit_transform(train_lex)
test_lex = scaler_lex.transform(test_lex)
dev_lex = scaler_lex.transform(dev_lex)

scaler_vad = StandardScaler()
train_vad = scaler_vad.fit_transform(train_vad)
test_vad = scaler_vad.transform(test_vad)
dev_vad = scaler_vad.transform(dev_vad)

In [14]:
joblib.dump(scaler_hash, 'hash_scaler.pkl')
joblib.dump(scaler_lex, 'lex_scaler.pkl')
joblib.dump(scaler_vad, 'vad_scaler.pkl')

['vad_scaler.pkl']

In [71]:
# NRC Hash-Emo + EmoLex + VAD
train_combined = np.concatenate([train_vad, train_lex, train_hash], axis=1)
test_combined = np.concatenate([test_vad, test_lex, test_hash], axis=1)
dev_combined = np.concatenate([dev_vad, dev_lex, dev_hash], axis=1)

In [72]:
# Tokenize texts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts, max_len=128):
    return tokenizer(
        texts.tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )

train_tokenized = tokenize_texts(train_df['clean_text'])
test_tokenized = tokenize_texts(test_df['clean_text'])
dev_tokenized = tokenize_texts(dev_df['clean_text'])

In [73]:
# Create dataset for multi-label and multi-output learning
class EmotionMultiTaskDataset(Dataset):
    def __init__(self, encodings, emotion_intensities, lexicon_feats=None, texts=None):
        self.encodings = encodings
        self.emotion_intensities = emotion_intensities  # DataFrame with emotion columns
        self.lexicon_feats = lexicon_feats
        self.texts = texts
        self.emotion_cols = emotion_intensities.columns.tolist()

    def __len__(self):
        return len(self.emotion_intensities)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}

        # Get emotion intensities
        emotions = self.emotion_intensities.iloc[idx].values
        item['emotion_intensities'] = torch.tensor(emotions, dtype=torch.float)

        # Binary labels for multi-label classification (1 if emotion present)
        item['emotion_labels'] = torch.tensor((emotions > 0).astype(int), dtype=torch.float)

        if self.lexicon_feats is not None:
            item['lexicon_feats'] = torch.tensor(self.lexicon_feats[idx], dtype=torch.float)

        if self.texts is not None:
            item['text'] = self.texts[idx]

        return item

In [74]:
# Create datasets
train_dataset = EmotionMultiTaskDataset(
    train_tokenized,
    train_df[emotion_cols],
    lexicon_feats=train_combined,
    texts=train_df['clean_text'].tolist()
)

dev_dataset = EmotionMultiTaskDataset(
    dev_tokenized,
    dev_df[emotion_cols],
    lexicon_feats=dev_combined,
    texts=dev_df['clean_text'].tolist()
)

test_dataset = EmotionMultiTaskDataset(
    test_tokenized,
    test_df[emotion_cols],
    lexicon_feats=test_combined,
    texts=test_df['clean_text'].tolist()
)

# Create dataloaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [75]:
# Multi-task learning model for multi-label classification and multi-output regression
class EmotionMultiTaskModel(nn.Module):
    def __init__(self, num_emotions=4, lex_dim=10):
        super(EmotionMultiTaskModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)

        # Shared representation
        hidden_size = self.bert.config.hidden_size
        self.shared_layer = nn.Linear(hidden_size + lex_dim, hidden_size)

        # Task-specific layers
        self.classifier = nn.Linear(hidden_size, num_emotions)  # Multi-label classification
        self.regressor = nn.Linear(hidden_size, num_emotions)   # Multi-output regression

    def forward(self, input_ids, attention_mask, lexicon_feats):
        # Get BERT embeddings
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        # Concatenate with lexicon features
        combined = torch.cat((pooled_output, lexicon_feats), dim=1)

        # Shared representation
        shared_repr = F.relu(self.shared_layer(combined))
        shared_repr = self.dropout(shared_repr)

        # Task-specific outputs
        cls_logits = self.classifier(shared_repr)  # For binary classification of each emotion
        reg_output = self.regressor(shared_repr)   # For regression of each emotion's intensity

        # Apply sigmoid to classification logits
        cls_probs = torch.sigmoid(cls_logits)

        # Scale regression outputs to [0,1]
        reg_output = (torch.tanh(reg_output) + 1) / 2

        return cls_probs, reg_output

In [76]:
# Initialize model
num_emotions = len(emotion_cols)
# lex_dim = train_lex.shape[1]
lex_dim = train_combined.shape[1]
model = EmotionMultiTaskModel(num_emotions=num_emotions, lex_dim=lex_dim).to(device)

# Loss functions
# Binary cross-entropy for multi-label classification
cls_criterion = nn.BCELoss()

# Huber loss for regression
reg_criterion = nn.HuberLoss(delta=0.3)

# Pearson correlation loss for regression
def pearson_loss(preds, targets, epsilon=1e-8):
    # Apply mask to consider only non-zero targets
    mask = (targets > 0)

    if not torch.any(mask):
        return torch.tensor(0.0, device=preds.device)

    preds_masked = preds[mask]
    targets_masked = targets[mask]

    if len(preds_masked) <= 1:
        return torch.tensor(0.0, device=preds.device)

    vx = preds_masked - torch.mean(preds_masked)
    vy = targets_masked - torch.mean(targets_masked)

    corr = torch.sum(vx * vy) / (torch.sqrt(torch.sum(vx ** 2)) * torch.sqrt(torch.sum(vy ** 2)) + epsilon)
    return 1 - corr

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Training parameters
num_epochs = 10
num_training_steps = num_epochs * len(train_loader)
num_warmup_steps = int(0.1 * num_training_steps)

# Learning rate scheduler
lr_scheduler = get_scheduler(
    name="cosine_with_restarts",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Early stopping parameters
patience = 2
best_val_loss = float('inf')
early_stop_count = 0

In [77]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    train_losses = []
    cls_losses = []
    reg_losses = []

    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        emotion_labels = batch['emotion_labels'].to(device)
        emotion_intensities = batch['emotion_intensities'].to(device)
        lexicon_feats = batch['lexicon_feats'].to(device)

        # Forward pass
        cls_probs, reg_output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            lexicon_feats=lexicon_feats
        )

        # Calculate classification loss (only for emotions that are present or not)
        cls_loss = cls_criterion(cls_probs, emotion_labels)

        # Calculate regression loss (only for emotions with intensity > 0)
        # First, standard MSE/Huber loss
        mask = (emotion_intensities > 0)
        if torch.any(mask):
            reg_l1_loss = reg_criterion(reg_output * mask, emotion_intensities)
            # For Pearson loss, calculate per batch
            reg_pearson_loss = pearson_loss(reg_output, emotion_intensities)
            reg_loss = 0.7 * reg_l1_loss + 0.3 * reg_pearson_loss
        else:
            reg_loss = torch.tensor(0.0, device=device)

        # Combined loss (with task weighting)
        loss = 0.3 * cls_loss + 0.7 * reg_loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Record losses
        train_losses.append(loss.item())
        cls_losses.append(cls_loss.item())
        reg_losses.append(reg_loss.item())

        # Update progress bar
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item(), cls_loss=cls_loss.item(), reg_loss=reg_loss.item())

    # Validation
    model.eval()
    val_losses = []
    val_cls_losses = []
    val_reg_losses = []

    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotion_labels = batch['emotion_labels'].to(device)
            emotion_intensities = batch['emotion_intensities'].to(device)
            lexicon_feats = batch['lexicon_feats'].to(device)

            # Forward pass
            cls_probs, reg_output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                lexicon_feats=lexicon_feats
            )

            # Calculate classification loss
            cls_loss = cls_criterion(cls_probs, emotion_labels)

            # Calculate regression loss
            mask = (emotion_intensities > 0)
            if torch.any(mask):
                reg_l1_loss = reg_criterion(reg_output * mask, emotion_intensities)
                reg_pearson_loss = pearson_loss(reg_output, emotion_intensities)
                reg_loss = 0.7 * reg_l1_loss + 0.3 * reg_pearson_loss
            else:
                reg_loss = torch.tensor(0.0, device=device)

            # Combined loss
            loss = 0.3 * cls_loss + 0.7 * reg_loss

            # Record losses
            val_losses.append(loss.item())
            val_cls_losses.append(cls_loss.item())
            val_reg_losses.append(reg_loss.item())

    # Calculate average losses
    avg_train_loss = sum(train_losses) / len(train_losses)
    avg_train_cls_loss = sum(cls_losses) / len(cls_losses)
    avg_train_reg_loss = sum(reg_losses) / len(reg_losses)

    avg_val_loss = sum(val_losses) / len(val_losses)
    avg_val_cls_loss = sum(val_cls_losses) / len(val_cls_losses)
    avg_val_reg_loss = sum(val_reg_losses) / len(val_reg_losses)

    # Print progress
    print(f"Train Loss: {avg_train_loss:.4f} (Cls: {avg_train_cls_loss:.4f}, Reg: {avg_train_reg_loss:.4f})")
    print(f"Val Loss: {avg_val_loss:.4f} (Cls: {avg_val_cls_loss:.4f}, Reg: {avg_val_reg_loss:.4f})")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        early_stop_count = 0
        # Save best model
        torch.save(model.state_dict(), "best_multitask_multilabel_model.pth")
        print("Model saved!")
    else:
        early_stop_count += 1
        print(f"Validation loss did not improve. Early stop counter: {early_stop_count}/{patience}")

    if early_stop_count >= patience:
        print("Early stopping triggered.")
        break


Epoch 1/10


Epoch 1: 100%|██████████| 504/504 [01:44<00:00,  4.83it/s, cls_loss=0.551, loss=0.251, reg_loss=0.122] 


Train Loss: 0.3333 (Cls: 0.6475, Reg: 0.1986)
Val Loss: 0.2689 (Cls: 0.6615, Reg: 0.1007)
Model saved!

Epoch 2/10


Epoch 2: 100%|██████████| 504/504 [01:44<00:00,  4.82it/s, cls_loss=0.474, loss=0.181, reg_loss=0.056] 


Train Loss: 0.2162 (Cls: 0.5211, Reg: 0.0855)
Val Loss: 0.2528 (Cls: 0.6148, Reg: 0.0977)
Model saved!

Epoch 3/10


Epoch 3: 100%|██████████| 504/504 [01:44<00:00,  4.83it/s, cls_loss=0.278, loss=0.147, reg_loss=0.0915]


Train Loss: 0.1710 (Cls: 0.4264, Reg: 0.0616)
Val Loss: 0.2465 (Cls: 0.5843, Reg: 0.1017)
Model saved!

Epoch 4/10


Epoch 4: 100%|██████████| 504/504 [01:44<00:00,  4.84it/s, cls_loss=0.411, loss=0.149, reg_loss=0.0359] 


Train Loss: 0.1205 (Cls: 0.2801, Reg: 0.0521)
Val Loss: 0.2580 (Cls: 0.6278, Reg: 0.0995)
Validation loss did not improve. Early stop counter: 1/2

Epoch 5/10


Epoch 5: 100%|██████████| 504/504 [01:44<00:00,  4.84it/s, cls_loss=0.19, loss=0.0822, reg_loss=0.0362]  


Train Loss: 0.0833 (Cls: 0.1776, Reg: 0.0430)
Val Loss: 0.2777 (Cls: 0.6994, Reg: 0.0970)
Validation loss did not improve. Early stop counter: 2/2
Early stopping triggered.


In [78]:
# Load best model for evaluation
model.load_state_dict(torch.load("best_multitask_multilabel_model.pth"))

# Evaluation functions for multi-label classification
def evaluate_classification(model, dataloader, device, threshold=0.3):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating classification"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotion_labels = batch['emotion_labels']
            lexicon_feats = batch['lexicon_feats'].to(device)

            # Get classification predictions
            cls_probs, _ = model(input_ids=input_ids, attention_mask=attention_mask, lexicon_feats=lexicon_feats)
            preds = (cls_probs > threshold).float().cpu().numpy()
            # preds = (cls_probs == cls_probs.max(dim=1, keepdim=True).values).float().cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(emotion_labels.numpy())

    # Convert to numpy arrays
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Calculate overall metrics
    accuracy = accuracy_score(all_labels.flatten(), all_preds.flatten())
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    f1_micro = f1_score(all_labels, all_preds, average='micro')

    # Calculate per-emotion metrics
    per_emotion_f1 = {}
    for i, emotion in enumerate(emotion_cols):
        f1 = f1_score(all_labels[:, i], all_preds[:, i])
        per_emotion_f1[emotion] = f1

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "per_emotion_f1": per_emotion_f1,
        "predictions": all_preds,
        "true_labels": all_labels
    }

# Evaluation function for regression
def evaluate_regression(model, dataloader, device):
    model.eval()
    all_preds = []
    all_true = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating regression"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotion_intensities = batch['emotion_intensities']
            lexicon_feats = batch['lexicon_feats'].to(device)

            # Get regression predictions
            _, reg_output = model(input_ids=input_ids, attention_mask=attention_mask, lexicon_feats=lexicon_feats)

            all_preds.extend(reg_output.cpu().numpy())
            all_true.extend(emotion_intensities.numpy())

    # Convert to numpy arrays
    all_preds = np.array(all_preds)
    all_true = np.array(all_true)

    # Calculate overall metrics (for non-zero values)
    mask = (all_true > 0)
    if np.any(mask):
        mse = mean_squared_error(all_true[mask], all_preds[mask])
        mae = mean_absolute_error(all_true[mask], all_preds[mask])
        r2 = r2_score(all_true[mask], all_preds[mask])
    else:
        mse = mae = r2 = 0

    # Calculate per-emotion metrics
    per_emotion_metrics = {}
    for i, emotion in enumerate(emotion_cols):
        emotion_mask = (all_true[:, i] > 0)
        if np.sum(emotion_mask) > 1:  # Need at least 2 points for correlation
            emotion_true = all_true[:, i][emotion_mask]
            emotion_pred = all_preds[:, i][emotion_mask]

            emotion_mse = mean_squared_error(emotion_true, emotion_pred)
            emotion_mae = mean_absolute_error(emotion_true, emotion_pred)
            try:
                emotion_pearson, _ = pearsonr(emotion_true, emotion_pred)
            except:
                emotion_pearson = float('nan')

            per_emotion_metrics[emotion] = {
                "mse": emotion_mse,
                "mae": emotion_mae,
                "pearson": emotion_pearson
            }
        else:
            per_emotion_metrics[emotion] = {
                "mse": float('nan'),
                "mae": float('nan'),
                "pearson": float('nan')
            }

    # Compute average Pearson (excluding NaNs)
    pearson_values = [metrics["pearson"] for metrics in per_emotion_metrics.values() if not np.isnan(metrics["pearson"])]
    avg_pearson = np.mean(pearson_values) if pearson_values else float('nan')

    return {
        "mse": mse,
        "mae": mae,
        "r2": r2,
        "avg_pearson": avg_pearson,
        "per_emotion_metrics": per_emotion_metrics,
        "predictions": all_preds,
        "true_values": all_true
    }

# Evaluate the model
print("\n--- Multi-Label Classification Results ---")
cls_results = evaluate_classification(model, test_loader, device)
print(f"Test Accuracy: {cls_results['accuracy']:.4f}")
print(f"F1 Macro: {cls_results['f1_macro']:.4f}")
print(f"F1 Micro: {cls_results['f1_micro']:.4f}")

print("\nPer-emotion F1 scores:")
for emotion, f1 in cls_results['per_emotion_f1'].items():
    print(f"{emotion}: {f1:.4f}")

print("\n--- Regression Results ---")
reg_results = evaluate_regression(model, test_loader, device)
print(f"MSE: {reg_results['mse']:.4f}")
print(f"MAE: {reg_results['mae']:.4f}")
print(f"R²: {reg_results['r2']:.4f}")

print("\nPer-emotion regression metrics:")
for emotion, metrics in reg_results['per_emotion_metrics'].items():
    print(f"{emotion}:")
    print(f"  MSE: {metrics['mse']:.4f}")
    print(f"  MAE: {metrics['mae']:.4f}")
    print(f"  Pearson: {metrics['pearson']:.4f}")

print(f"Avg Pearson: {reg_results['avg_pearson']:.4f}")


--- Multi-Label Classification Results ---


Evaluating classification: 100%|██████████| 206/206 [00:11<00:00, 18.20it/s]


Test Accuracy: 0.8000
F1 Macro: 0.6894
F1 Micro: 0.6855

Per-emotion F1 scores:
joy: 0.8002
sadness: 0.6208
anger: 0.7102
fear: 0.6265

--- Regression Results ---


Evaluating regression: 100%|██████████| 206/206 [00:11<00:00, 18.22it/s]


MSE: 0.0256
MAE: 0.1264
R²: 0.2557

Per-emotion regression metrics:
joy:
  MSE: 0.0295
  MAE: 0.1371
  Pearson: 0.7234
sadness:
  MSE: 0.0323
  MAE: 0.1443
  Pearson: 0.7181
anger:
  MSE: 0.0169
  MAE: 0.1033
  Pearson: 0.7578
fear:
  MSE: 0.0234
  MAE: 0.1202
  Pearson: 0.7430
Avg Pearson: 0.7356


In [79]:
# Generate predictions for new text
def extract_all_lexicons(text):
    vad_feats = extract_vad(text, nrc_vad_lexicon)
    lex_feats = extract_lex(text, nrc_lexicon)
    hash_feats = extract_hash_emo(text, hash_emo_lex)
    
    combined_feats = np.concatenate([vad_feats, lex_feats, hash_feats])
    return combined_feats

def predict_emotions(text, model, tokenizer, threshold=0.3):
    model.eval()

    # Clean and tokenize the text
    clean = clean_text(text)
    tokens = tokenizer(
        clean,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    # Create lexicon features
    lexicon_feats = torch.tensor([extract_all_lexicons(clean)], dtype=torch.float).to(device)

    # Move inputs to device
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Get predictions
    with torch.no_grad():
        cls_probs, intensities = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            lexicon_feats=lexicon_feats
        )

        # Convert to numpy
        cls_probs = cls_probs.cpu().numpy()[0]
        intensities = intensities.cpu().numpy()[0]

        # Apply threshold to classification probabilities
        # detected_emotions = cls_probs > threshold
        detected_emotions = np.zeros_like(cls_probs, dtype=bool)
        detected_emotions[cls_probs.argmax()] = True

    # Prepare results
    results = {}
    for i, emotion in enumerate(emotion_cols):
        results[emotion] = {
            "probability": float(cls_probs[i]),
            "detected": bool(detected_emotions[i]),
            "intensity": float(intensities[i]) if detected_emotions[i] else 0.0
        }

    return results

In [80]:
# Demo with sample texts
sample_texts = [
    "I'm so happy today! Everything is going well.",
    "This makes me so angry, I can't believe they did that.",
    "I'm angry, but I think I can tolerate their behavior.",
    "I might be happy today, but it's just a normal day.",
    "I'm feeling a bit down today, things aren't going as planned.",
    "My girlfriend just dumped me, I don't know what to do with my life anymore. I'm in agony.",
    "That movie was terrifying, I couldn't sleep all night."
]

print("\n--- Sample Predictions ---")
for text in sample_texts:
    result = predict_emotions(text, model, tokenizer)
    print(f"Text: {text}")
    print("Detected emotions:")

    # Sort emotions by intensity
    emotions_sorted = sorted(
        [(emotion, details) for emotion, details in result.items() if details["detected"]],
        key=lambda x: x[1]["intensity"],
        reverse=True
    )

    if emotions_sorted:
        for emotion, details in emotions_sorted:
            print(f"  {emotion}: intensity={details['intensity']:.2f}, probability={details['probability']:.2f}")
    else:
        print("  No emotions detected")
    print("---")


--- Sample Predictions ---
Text: I'm so happy today! Everything is going well.
Detected emotions:
  joy: intensity=0.87, probability=0.89
---
Text: This makes me so angry, I can't believe they did that.
Detected emotions:
  anger: intensity=0.76, probability=0.89
---
Text: I'm angry, but I think I can tolerate their behavior.
Detected emotions:
  anger: intensity=0.53, probability=0.88
---
Text: I might be happy today, but it's just a normal day.
Detected emotions:
  joy: intensity=0.50, probability=0.73
---
Text: I'm feeling a bit down today, things aren't going as planned.
Detected emotions:
  fear: intensity=0.80, probability=0.56
---
Text: My girlfriend just dumped me, I don't know what to do with my life anymore. I'm in agony.
Detected emotions:
  fear: intensity=0.88, probability=0.47
---
Text: That movie was terrifying, I couldn't sleep all night.
Detected emotions:
  fear: intensity=0.87, probability=0.70
---
