In [1]:
import torch
import torch.nn as nn
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
import numpy as np


In [2]:
# ============ Load CSV Labels ============
base_path = Path('emo_video')
train_df = pd.read_csv(base_path / 'train_full.csv').drop(columns=["text", "Other"])
test_df = pd.read_csv(base_path / 'test_full' / 'test_full.csv').drop(columns=["text", "Other"])

In [6]:
train_df.head()

Unnamed: 0,video_name,Neutral,Anger,Disgust,Fear,Happiness,Sadness,Surprise
0,-3g5yACwYnA_82.7645_100.5550,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.0
1,-3g5yACwYnA_119.9190_125.2990,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-3g5yACwYnA_4.8400_13.6315,0.0,0.0,0.0,0.2,0.4,0.4,0.0
3,-3g5yACwYnA_13.6315_27.0310,0.0,0.0,0.0,0.0,0.5,0.5,0.0
4,-3g5yACwYnA_27.0310_41.3000,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# ============ Load Embeddings ============
def load_emo_embeddings(df, emb_dir):
    X, y, lengths = [], [], []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        video_id = row['video_name']
        pt_file = Path(emb_dir) / f"{video_id}.pt"
        
        if not pt_file.exists():
            continue
        try:
            emb = torch.load(pt_file, weights_only=True)
            if isinstance(emb, dict):
                emb = emb['emb']
            if torch.isnan(emb).any() or emb.shape != (30, 1024):
                continue
        except Exception as e:
            print(e)
            continue
            
        X.append(emb)
        y.append(torch.tensor(row.drop('video_name').values.astype('float32')))
        lengths.append(torch.tensor(emb.shape[0]))
    return torch.stack(X), torch.stack(y), torch.stack(lengths)

X_train, y_train, lengths_train = load_emo_embeddings(train_df, 'Embeddings_emonext/cmu_mosei_embeddings/train')
X_test, y_test, lengths_test = load_emo_embeddings(test_df, 'Embeddings_emonext/cmu_mosei_embeddings/test')

100%|██████████| 16274/16274 [02:55<00:00, 92.68it/s] 
100%|██████████| 4653/4653 [00:40<00:00, 114.64it/s]


In [33]:
# ============ Data Loaders ============
train_loader = DataLoader(TensorDataset(X_train, y_train, lengths_train), batch_size=2048, shuffle=True)
val_loader = DataLoader(TensorDataset(X_test, y_test, lengths_test), batch_size=2048)

In [34]:
# ============ Model ============
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim=1024, d_model=512, num_layers=3, n_heads=8, dropout=0.2, num_classes=7, max_len=30):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        self.pos_embed = nn.Parameter(torch.randn(1, max_len, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=d_model * 4, dropout=dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x, lengths):
        x = self.input_proj(x) + self.pos_embed[:, :x.size(1)]
        x = self.encoder(x)
        mask = torch.arange(x.size(1), device=lengths.device)[None, :] < lengths[:, None]
        mask = mask.float().unsqueeze(2)
        summed = (x * mask).sum(dim=1)
        count = mask.sum(dim=1).clamp(min=1)
        pooled = summed / count
        return self.fc(self.dropout(pooled))

In [35]:
from sklearn.metrics import classification_report


def mf1(targets: list[np.ndarray] | np.ndarray, 
                         predicts: list[np.ndarray] | np.ndarray,
                         return_scores: bool = False) -> float | tuple[float, list[float]]:
    """Calculates mean Macro F1 score (emotional multilabel mMacroF1)
    
    Args:
        targets: Targets array (ground truth)
        predicts: Predicts array (model predictions)
        return_scores: If True, returns both mean and per-class scores
        
    Returns:
        float: Mean Macro F1 score across all classes
        or
        tuple[float, list[float]]: If return_scores=True, returns (mean, per_class_scores)
    """
    targets = np.array(targets)
    predicts = np.array(predicts)

    f1_macro_scores = []
    for i in range(predicts.shape[1]):
        cr = classification_report(targets[:, i], predicts[:, i], 
                                         output_dict=True, zero_division=0)
        f1_macro_scores.append(cr['macro avg']['f1-score'])

    if return_scores:
        return np.mean(f1_macro_scores), f1_macro_scores
    return np.mean(f1_macro_scores)


def uar(targets: list[np.ndarray] | np.ndarray,
                    predicts: list[np.ndarray] | np.ndarray,
                    return_scores: bool = False) -> float | tuple[float, list[float]]:
    """Calculates mean Unweighted Average Recall (emotional multilabel mUAR)
    
    Args:
        targets: Targets array (ground truth)
        predicts: Predicts array (model predictions)
        return_scores: If True, returns both mean and per-class scores
        
    Returns:
        float: Mean UAR across all classes
        or
        tuple[float, list[float]]: If return_scores=True, returns (mean, per_class_scores)
    """
    targets = np.array(targets)
    predicts = np.array(predicts)

    uar_scores = []
    for i in range(predicts.shape[1]):
        cr = classification_report(targets[:, i], predicts[:, i],
                                         output_dict=True, zero_division=0)
        uar_scores.append(cr['macro avg']['recall'])

    if return_scores:
        return np.mean(uar_scores), uar_scores
    return np.mean(uar_scores)


def transform_matrix(matrix):
    threshold1 = 1 - 1/7 
    threshold2 = 1/7
    mask1 = matrix[:, 0] >= threshold1
    result = np.zeros_like(matrix[:, 1:])
    transformed = (matrix[:, 1:] >= threshold2).astype(int)
    result[~mask1] = transformed[~mask1]
    return result

def process_predictions(pred_emo, true_emo):
    pred_emo = torch.nn.functional.softmax(pred_emo, dim=1).cpu().detach().numpy()
    pred_emo = transform_matrix(pred_emo).tolist()
    true_emo = true_emo.cpu().detach().numpy()
    true_emo = np.where(true_emo > 0, 1, 0)[:, 1:].tolist()
    return pred_emo, true_emo

In [44]:
# ============ Training ============
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerClassifier(input_dim=1024, num_classes=7).to(DEVICE)

def weighted_kl_div(log_probs, target_probs, class_weights):
    """
    Manually computes class-weighted KL divergence:
    L = sum_j w_j * q_j * (log q_j - log p_j)
    where q_j is target prob and p_j is predicted prob.
    """
    kl_per_class = target_probs * (torch.log(target_probs + 1e-8) - log_probs)  # (B, C)
    weighted_kl = kl_per_class * class_weights  # apply weights per class
    return weighted_kl.sum(dim=1).mean()  # average over batch

optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

class_weights = torch.tensor([3.0, 10.0, 13.0, 20.0, 2.0, 15.0, 18.0], device=DEVICE)

for epoch in range(1, 121):
    model.train()
    total_loss = 0
    for xb, yb, lb in train_loader:
        xb, yb, lb = xb.to(DEVICE), yb.to(DEVICE), lb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb, lb)
        
        log_probs = torch.log_softmax(logits, dim=1)
        loss = weighted_kl_div(log_probs, yb, class_weights)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_loss, raw_preds, raw_targets = 0, [], []
    with torch.no_grad():
        for xb, yb, lb in val_loader:
            xb, yb, lb = xb.to(DEVICE), yb.to(DEVICE), lb.to(DEVICE)
            logits = model(xb, lb)
            
            log_probs = torch.log_softmax(logits, dim=1)
            loss = weighted_kl_div(log_probs, yb, class_weights)

            val_loss += loss.item()
            raw_preds.append(logits)
            raw_targets.append(yb)

    # === Post-processing ===
    raw_preds = torch.cat(raw_preds)
    raw_targets = torch.cat(raw_targets)
    bin_preds, bin_targets = process_predictions(raw_preds, raw_targets) # Сюда лучше пихать предсказанные логиты 

    f1 = mf1(bin_targets, bin_preds)
    recall = uar(bin_targets, bin_preds)

    print(f"[Epoch {epoch}] Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f} | mF1: {f1:.4f} | mUAR: {recall:.4f}")


[Epoch 1] Train Loss: 107.1628 | Val Loss: 32.3877 | mF1: 0.3661 | mUAR: 0.5141
[Epoch 2] Train Loss: 82.0465 | Val Loss: 30.0806 | mF1: 0.3896 | mUAR: 0.5260
[Epoch 3] Train Loss: 78.1312 | Val Loss: 28.7276 | mF1: 0.4519 | mUAR: 0.5333
[Epoch 4] Train Loss: 77.0553 | Val Loss: 29.0115 | mF1: 0.4569 | mUAR: 0.5333
[Epoch 5] Train Loss: 75.9762 | Val Loss: 28.3438 | mF1: 0.4664 | mUAR: 0.5398
[Epoch 6] Train Loss: 75.1291 | Val Loss: 28.5183 | mF1: 0.4760 | mUAR: 0.5421
[Epoch 7] Train Loss: 74.2318 | Val Loss: 28.1862 | mF1: 0.4768 | mUAR: 0.5522
[Epoch 8] Train Loss: 73.6746 | Val Loss: 28.6274 | mF1: 0.4846 | mUAR: 0.5475
[Epoch 9] Train Loss: 73.1868 | Val Loss: 28.9858 | mF1: 0.4629 | mUAR: 0.5446
[Epoch 10] Train Loss: 72.5164 | Val Loss: 28.4995 | mF1: 0.4775 | mUAR: 0.5535
[Epoch 11] Train Loss: 72.1916 | Val Loss: 28.5801 | mF1: 0.4911 | mUAR: 0.5540
[Epoch 12] Train Loss: 71.3207 | Val Loss: 29.3425 | mF1: 0.4958 | mUAR: 0.5525
[Epoch 13] Train Loss: 71.2465 | Val Loss: 29.08

In [46]:
# Save weights and optimizer
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, "cmu_mosei_best_checkpoint.pth")