In [2]:
import pandas as pd
import torch
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader 
import torch.nn as nn
import torch.nn.functional as F
import math
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import torch.optim as optim
from tqdm import tqdm
import math
import random
import logging
from torch.utils.data import DataLoader, ConcatDataset, WeightedRandomSampler
from torch.nn.utils.rnn import pad_sequence
import pickle
import ast
from copy import deepcopy

import warnings
# Silence every UserWarning and FutureWarning for the whole session.
# NOTE(review): this filter is global, so it also hides warnings raised by torch
# and transformers further down the notebook — consider narrowing it.
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

# Let pandas display every column and the full content of each cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
class CustomMambaBlock(nn.Module):
    """Residual block built from four linear layers, a ReLU and a LayerNorm.

    The input is projected from ``d_input`` to ``d_model``, the projection is
    summed with two further linear maps of itself (``s_B`` and ``s_C``), passed
    through ReLU, projected back to ``d_input``, regularised with dropout and
    finally added to the block input before layer normalisation.

    Args:
        d_input: Size of the last dimension of the input and of the output.
        d_model: Width of the inner representation.
        dropout: Dropout probability applied to the projected-back update.
    """

    def __init__(self, d_input, d_model, dropout=0.1):
        super().__init__()
        # Attribute names are the state_dict keys, and the creation order fixes
        # the order in which random initialisation is drawn; keep both stable.
        self.in_proj = nn.Linear(d_input, d_model)
        self.s_B = nn.Linear(d_model, d_model)
        self.s_C = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_input)
        self.norm = nn.LayerNorm(d_input)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return ``norm(dropout(out_proj(relu(h + s_B(h) + s_C(h)))) + x)`` with ``h = in_proj(x)``."""
        skip = x
        hidden = self.in_proj(x)
        mixed = hidden + self.s_B(hidden) + self.s_C(hidden)
        update = self.dropout(self.out_proj(self.activation(mixed)))
        return self.norm(update + skip)

In [4]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    Args:
        input_dim: Size of the input and of the output features.
        hidden_dim: Width of the intermediate layer.
        dropout: Dropout probability applied after the GELU activation.
    """

    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        self.layer_2 = nn.Linear(hidden_dim, input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand with ``layer_1``, apply GELU and dropout, then project back with ``layer_2``."""
        expanded = F.gelu(self.layer_1(x))
        return self.layer_2(self.dropout(expanded))
class AddAndNorm(nn.Module):
    """Add two tensors and layer-normalise the sum.

    Args:
        input_dim: Size of the normalised (last) dimension.
        dropout: Dropout probability applied to the ``residual`` argument.
    """

    def __init__(self, input_dim, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, residual):
        """Return ``norm(x + dropout(residual))``."""
        # NOTE(review): dropout is applied to `residual`, not to `x`. The callers
        # in this file pass the sublayer output as `x` and the skip connection as
        # `residual`, so the skip path is the one being dropped during training —
        # confirm this is intended before retraining.
        regularised_residual = self.dropout(residual)
        return self.norm(x + regularised_residual)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position codes to a batch-first sequence tensor.

    A ``(max_len, d_model)`` table is precomputed and registered as the buffer
    ``pe`` (it is therefore part of the module's ``state_dict``). Even feature
    columns hold sines and odd feature columns hold cosines.

    Args:
        d_model: Feature dimension of the inputs; may be even or odd.
        dropout: Dropout probability applied after the codes are added.
        max_len: Number of positions for which codes are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine columns,
        # so only the first d_model // 2 frequencies are used here. For an even
        # d_model the slice is the full vector and the table is unchanged.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` position codes to ``x`` and apply dropout.

        ``x`` is indexed along dimension 1 as the sequence axis, and the
        ``(seq, d_model)`` slice of the table broadcasts over dimension 0.
        """
        x = x + self.pe[: x.size(1)].detach()  # the table never receives gradients
        return self.dropout(x)

class TransformerEncoderLayer(nn.Module):
    """Attention layer followed by a feed-forward layer, each wrapped in add-and-norm.

    Query, key and value are passed in separately, so the layer can be used for
    self-attention or for cross-attention between two sequences.

    Args:
        input_dim: Embedding size used by the attention, feed-forward and norms.
        num_heads: Number of attention heads.
        dropout: Dropout probability for attention, feed-forward and add-and-norm.
        positional_encoding: When true, sinusoidal position codes are added to
            query, key and value on every forward pass.
    """

    def __init__(self, input_dim, num_heads, dropout=0.1, positional_encoding=False):
        super().__init__()
        self.input_dim = input_dim
        self.self_attention = nn.MultiheadAttention(input_dim, num_heads, dropout=dropout, batch_first=True)
        # The feed-forward hidden width is the same as the embedding size.
        self.feed_forward = PositionWiseFeedForward(input_dim, input_dim, dropout=dropout)
        self.add_norm_after_attention = AddAndNorm(input_dim, dropout=dropout)
        self.add_norm_after_ff = AddAndNorm(input_dim, dropout=dropout)
        # The positional encoder is built with its own default dropout; the
        # `dropout` argument of this layer is not forwarded to it.
        self.positional_encoding = PositionalEncoding(input_dim) if positional_encoding else None

    def forward(self, query, key, value):
        """Attend from ``query`` to ``key``/``value`` and return the transformed query."""
        encoder = self.positional_encoding
        if encoder is not None:
            # Evaluated left to right (key, value, query) so that the dropout
            # masks are drawn in the same order as before.
            key, value, query = encoder(key), encoder(value), encoder(query)

        attended = self.self_attention(query, key, value, need_weights=False)[0]
        # The skip connection uses the (possibly position-encoded) query.
        after_attention = self.add_norm_after_attention(attended, query)

        transformed = self.feed_forward(after_attention)
        return self.add_norm_after_ff(transformed, after_attention)

In [5]:
class EmotionMamba(nn.Module):
    """Sequence classifier producing emotion logits from token-level embeddings.

    The input is projected to ``hidden_dim``, passed through a stack of
    ``CustomMambaBlock`` layers, mean-pooled over dimension 1 and fed to a
    small classification head.

    Args:
        input_dim_emotion: Feature size of ``emotion_input``.
        hidden_dim: Width of the projected sequence representation.
        out_features: Width of the hidden layer in the classification head.
        mamba_layer_number: Number of ``CustomMambaBlock`` layers.
        mamba_d_model: Inner width of every ``CustomMambaBlock``.
        dropout: Dropout probability used in the projection, blocks and head.
        num_emotions: Number of output logits.

    ``input_dim_personality``, ``positional_encoding``,
    ``num_transformer_heads``, ``transformer_dropout``, ``tr_layer_number`` and
    ``num_traits`` are accepted for signature compatibility and are not used.
    """

    def __init__(self, input_dim_emotion=1024, input_dim_personality=1024, hidden_dim=128, out_features=512, mamba_layer_number=2, mamba_d_model=256, positional_encoding=True, num_transformer_heads=4, transformer_dropout=0.1, tr_layer_number=1, dropout=0.1, num_emotions=7, num_traits=5):
        super().__init__()

        # Read by FusionTransformer to size its own input projection.
        self.hidden_dim = hidden_dim

        self.emo_proj = nn.Sequential(
            nn.Linear(input_dim_emotion, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout)
        )

        self.emotion_encoder = nn.ModuleList([
            CustomMambaBlock(hidden_dim, mamba_d_model, dropout=dropout)
            for _ in range(mamba_layer_number)
        ])

        self.emotion_fc_out = nn.Sequential(
            nn.Linear(hidden_dim, out_features),
            # The norm follows a layer that outputs `out_features` values, so it
            # must be sized by `out_features`. Sizing it by `hidden_dim` only
            # worked when both were equal; in that case the parameter shapes
            # (and therefore existing checkpoints) are unchanged.
            nn.LayerNorm(out_features),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(out_features, num_emotions)
        )

    def forward(self, emotion_input=None, personality_input=None, return_features=False):
        """Classify ``emotion_input``.

        Args:
            emotion_input: Tensor whose last dimension is ``input_dim_emotion``
                and whose dimension 1 is pooled over.
            personality_input: Ignored; present so both sub-models share a
                call signature.
            return_features: When true, also return the encoder output.

        Returns:
            Dict with ``'emotion_logits'`` and, if requested,
            ``'last_encoder_features'`` (the un-pooled encoder output).
        """
        emo = self.emo_proj(emotion_input)
        for layer in self.emotion_encoder:
            emo = layer(emo)

        outputs = {'emotion_logits': self.emotion_fc_out(emo.mean(dim=1))}
        if return_features:
            outputs['last_encoder_features'] = emo
        return outputs

In [6]:
class PersonalityMamba(nn.Module):
    """Sequence regressor producing personality-trait scores from token-level embeddings.

    The input is projected to ``hidden_dim``, passed through a stack of
    ``CustomMambaBlock`` layers, mean-pooled over dimension 1, fed to a small
    head and squashed by the chosen output activation.

    Args:
        input_dim_personality: Feature size of ``personality_input``.
        hidden_dim: Width of the projected sequence representation.
        out_features: Width of the hidden layer in the regression head.
        mamba_layer_number: Number of ``CustomMambaBlock`` layers.
        mamba_d_model: Inner width of every ``CustomMambaBlock``.
        per_activation: Output activation, ``"sigmoid"`` or ``"relu"``.
        dropout: Dropout probability used in the projection, blocks and head.
        num_traits: Number of output scores.

    ``input_dim_emotion``, ``positional_encoding``, ``num_transformer_heads``,
    ``tr_layer_number``, ``num_emotions`` and ``device`` are accepted for
    signature compatibility and are not used.

    Raises:
        ValueError: If ``per_activation`` is not one of the supported names.
    """

    def __init__(self, input_dim_emotion=1024, input_dim_personality=1024, hidden_dim=128, out_features=512, mamba_layer_number=2, mamba_d_model=256, per_activation="sigmoid", positional_encoding=True, num_transformer_heads=4, tr_layer_number=1, dropout=0.1, num_emotions=7, num_traits=5, device='cpu'):
        super().__init__()

        # Fail at construction time: an unknown name previously left
        # `self.activation` undefined and only surfaced on the first forward.
        if per_activation not in ("sigmoid", "relu"):
            raise ValueError(
                f"Unknown per_activation {per_activation!r}; expected 'sigmoid' or 'relu'"
            )

        # Read by FusionTransformer to size its own input projection.
        self.hidden_dim = hidden_dim

        self.per_proj = nn.Sequential(
            nn.Linear(input_dim_personality, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout)
        )

        self.personality_encoder = nn.ModuleList([
            CustomMambaBlock(hidden_dim, mamba_d_model, dropout=dropout)
            for _ in range(mamba_layer_number)
        ])

        self.personality_fc_out = nn.Sequential(
            nn.Linear(hidden_dim, out_features),
            nn.LayerNorm(out_features),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(out_features, num_traits)
        )

        self.activation = nn.Sigmoid() if per_activation == "sigmoid" else nn.ReLU()

    def forward(self, emotion_input=None, personality_input=None, return_features=False):
        """Score ``personality_input``.

        Args:
            emotion_input: Ignored; present so both sub-models share a call
                signature.
            personality_input: Tensor whose last dimension is
                ``input_dim_personality`` and whose dimension 1 is pooled over.
            return_features: When true, also return the encoder output.

        Returns:
            Dict with ``'personality_scores'`` (after the output activation)
            and, if requested, ``'last_encoder_features'``.
        """
        per = self.per_proj(personality_input)
        for layer in self.personality_encoder:
            per = layer(per)

        scores = self.activation(self.personality_fc_out(per.mean(dim=1)))

        outputs = {'personality_scores': scores}
        if return_features:
            outputs['last_encoder_features'] = per
        return outputs

In [7]:
class FusionTransformer(nn.Module):
    """Fuse a frozen emotion model and a frozen personality model with cross-attention.

    Both sub-models are run with ``return_features=True``. Their encoder
    outputs are projected to a common width, zero-padded to a common length,
    exchanged through two stacks of ``TransformerEncoderLayer`` and pooled into
    two heads. The final predictions are the average of each fusion head and
    the corresponding sub-model prediction.

    Args:
        emo_model: Module exposing ``hidden_dim`` and returning a dict with
            ``'emotion_logits'`` and ``'last_encoder_features'``.
        per_model: Module exposing ``hidden_dim`` and returning a dict with
            ``'personality_scores'`` and ``'last_encoder_features'``.
        hidden_dim: Common width used inside the fusion layers.
        out_features: Width of the hidden layer of both output heads.
        per_activation: Activation of the personality head, ``"sigmoid"`` or
            ``"relu"``.
        positional_encoding: Forwarded to every ``TransformerEncoderLayer``.
        num_transformer_heads: Attention heads per fusion layer.
        tr_layer_number: Number of fusion layers in each direction.
        dropout: Dropout probability used throughout the fusion part.
        num_emotions: Number of emotion logits.
        num_traits: Number of personality scores.

    ``input_dim_emotion``, ``input_dim_personality``, ``mamba_layer_number``,
    ``mamba_d_model`` and ``device`` are accepted for signature compatibility
    and are not used.

    Raises:
        ValueError: If ``per_activation`` is not one of the supported names.
    """

    def __init__(self, emo_model, per_model, input_dim_emotion=512, input_dim_personality=512, hidden_dim=128, out_features=512, mamba_layer_number=2, mamba_d_model=256, per_activation="sigmoid", positional_encoding=True, num_transformer_heads=4, tr_layer_number=1, dropout=0.1, num_emotions=7, num_traits=5, device='cpu'):
        super().__init__()

        # Fail early instead of leaving `self.activation` undefined.
        if per_activation not in ("sigmoid", "relu"):
            raise ValueError(
                f"Unknown per_activation {per_activation!r}; expected 'sigmoid' or 'relu'"
            )

        # Kept as a public attribute for existing readers. forward() no longer
        # consults it: all tensors stay on whatever device the inputs are on.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.hidden_dim = hidden_dim

        self.emo_model = emo_model
        self.per_model = per_model

        # The two pre-trained sub-models are frozen; only the fusion part trains.
        for param in self.emo_model.parameters():
            param.requires_grad = False

        for param in self.per_model.parameters():
            param.requires_grad = False

        self.emo_proj = nn.Sequential(
            nn.Linear(self.emo_model.hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout)
        )

        self.per_proj = nn.Sequential(
            nn.Linear(self.per_model.hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout)
        )

        self.emotion_to_personality_attn = nn.ModuleList([
            TransformerEncoderLayer(
                input_dim=hidden_dim,
                num_heads=num_transformer_heads,
                dropout=dropout,
                positional_encoding=positional_encoding
            ) for _ in range(tr_layer_number)
        ])

        self.personality_to_emotion_attn = nn.ModuleList([
            TransformerEncoderLayer(
                input_dim=hidden_dim,
                num_heads=num_transformer_heads,
                dropout=dropout,
                positional_encoding=positional_encoding
            ) for _ in range(tr_layer_number)
        ])

        self.emotion_personality_fc_out = nn.Sequential(
            nn.Linear(hidden_dim*2, out_features),
            nn.LayerNorm(out_features),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(out_features, num_emotions)
        )

        self.personality_emotion_fc_out = nn.Sequential(
            nn.Linear(hidden_dim*2, out_features),
            nn.LayerNorm(out_features),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(out_features, num_traits)
        )

        self.activation = nn.Sigmoid() if per_activation == "sigmoid" else nn.ReLU()

    def forward(self, emotion_input=None, personality_input=None, return_features=False):
        """Run both sub-models, fuse their features and return averaged predictions.

        Args:
            emotion_input: Input forwarded to ``emo_model``.
            personality_input: Input forwarded to ``per_model``.
            return_features: When true, also return the fused sequences.

        Returns:
            Dict with ``'emotion_logits'`` and ``'personality_scores'`` and, if
            requested, ``'last_emo_encoder_features'`` and
            ``'last_per_encoder_features'``.
        """
        emo_features = self.emo_model(emotion_input=emotion_input, return_features=True)
        per_features = self.per_model(personality_input=personality_input, return_features=True)

        emo_emd = self.emo_proj(emo_features['last_encoder_features'])
        per_emd = self.per_proj(per_features['last_encoder_features'])

        # Zero-pad the shorter sequence at the end of dimension 1. Padding is
        # done on the tensors themselves so they keep their device and their
        # autograd history; the earlier NumPy round-trip detached them, which
        # meant the two projections above could never receive gradients.
        max_len = max(emo_emd.shape[1], per_emd.shape[1])
        emo_emd = F.pad(emo_emd, (0, 0, 0, max_len - emo_emd.shape[1]))
        per_emd = F.pad(per_emd, (0, 0, 0, max_len - per_emd.shape[1]))

        # Out-of-place adds: the layer inputs may be saved for backward, so
        # they must not be overwritten.
        for layer in self.emotion_to_personality_attn:
            emo_emd = emo_emd + layer(emo_emd, per_emd, per_emd) # or per_emd, emo_emd, emo_emd

        for layer in self.personality_to_emotion_attn:
            per_emd = per_emd + layer(per_emd, emo_emd, emo_emd) # or emo_emd, per_emd, per_emd

        # Both heads read the same pooled vector, so it is computed once.
        pooled = torch.cat([emo_emd, per_emd], dim=-1).mean(dim=1)
        emotion_logits = self.emotion_personality_fc_out(pooled)
        personality_scores = self.personality_emotion_fc_out(pooled)

        outputs = {
            'emotion_logits': (emotion_logits+emo_features['emotion_logits'])/2,
            'personality_scores': (self.activation(personality_scores)+per_features['personality_scores'])/2,
        }
        if return_features:
            outputs['last_emo_encoder_features'] = emo_emd
            outputs['last_per_encoder_features'] = per_emd
        return outputs

In [8]:
# Notebook-wide compute device: "cuda" when torch reports an available GPU, else "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
def transform_matrix(matrix):
    """Binarise per-class scores, dropping column 0 and gating on it.

    Column 0 acts as a gate: rows where it is at least ``1 - 1/7`` yield an
    all-zero row. For every other row, each remaining column becomes 1 when its
    value is at least ``1/7`` and 0 otherwise.

    Args:
        matrix: 2-D NumPy array with the gating column first.

    Returns:
        Array with one column fewer than ``matrix`` and the same dtype,
        containing only zeros and ones.
    """
    gate_cutoff = 1 - 1/7
    class_cutoff = 1/7

    gated_rows = matrix[:, 0] >= gate_cutoff
    class_scores = matrix[:, 1:]
    active = (class_scores >= class_cutoff) & ~gated_rows[:, np.newaxis]
    return active.astype(class_scores.dtype)
def process_predictions(pred_emo):
    """Turn emotion logits into binarised predictions as nested lists.

    Softmax is taken over dimension 1 and the probabilities are binarised with
    ``transform_matrix``.

    NOTE(review): a two-argument function with the same name is defined further
    down in this notebook and replaces this one once that cell has run.
    """
    probabilities = F.softmax(pred_emo, dim=1)
    return transform_matrix(probabilities.cpu().detach().numpy()).tolist()

### Load emotion classifier, personality regressor and fusion model

In [10]:
# Checkpoint files for the two single-task models and for the fusion model,
# all resolved relative to ./best_models.
emo_path, pers_path, fusion_path = (
    os.path.join(".", "best_models", checkpoint_file)
    for checkpoint_file in (
        "Mamba_bge-small_emotion.pt",
        "Mamba_bge-small_personality.pt",
        "Mamba_bge-small_fusion_best.pt",
    )
)

In [11]:
def load_models(emo_path, pers_path, fusion_path, device):
    """Build the emotion, personality and fusion models and restore their weights.

    Args:
        emo_path: Checkpoint holding the ``EmotionMamba`` state dict.
        pers_path: Checkpoint holding the ``PersonalityMamba`` state dict.
        fusion_path: Checkpoint holding the ``FusionTransformer`` state dict.
        device: Device the models are moved to and the checkpoints are mapped to.

    Returns:
        The ``FusionTransformer``, which holds the two sub-models as children.
    """

    def _restore(module, checkpoint_path):
        # NOTE(review): torch.load unpickles the file, so only load trusted
        # checkpoints; consider weights_only=True if the torch version allows it.
        state = torch.load(checkpoint_path, map_location=device)
        module.load_state_dict(state)
        return module

    emo_model = _restore(
        EmotionMamba(
            input_dim_emotion=384,
            input_dim_personality=384,
            hidden_dim=256,
            out_features=256,
            mamba_layer_number=4,
            mamba_d_model=256,
            dropout=0.2,
        ).to(device),
        emo_path,
    )

    per_model = _restore(
        PersonalityMamba(
            input_dim_emotion=384,
            input_dim_personality=384,
            hidden_dim=128,
            out_features=128,
            mamba_layer_number=1,
            mamba_d_model=128,
            dropout=0.1,
        ).to(device),
        pers_path,
    )

    # The fusion checkpoint is loaded last, on top of the two sub-models.
    return _restore(
        FusionTransformer(
            emo_model,
            per_model,
            input_dim_emotion=384,
            input_dim_personality=384,
            hidden_dim=256,
            out_features=512,
            tr_layer_number=2,
            num_transformer_heads=8,
        ).to(device),
        fusion_path,
    )

In [12]:
# Restore the fusion model (with both sub-models inside it) on the selected device.
model = load_models(emo_path, pers_path, fusion_path, device)

In [13]:
# Tokenizer/encoder pairs already loaded, keyed by (checkpoint name, device).
_FEATURE_EXTRACTOR_CACHE = {}


def _get_feature_extractor(device, name="BAAI/bge-small-en-v1.5"):
    """Return a cached ``(tokenizer, encoder)`` pair for ``name`` on ``device``."""
    cache_key = (name, str(device))
    if cache_key not in _FEATURE_EXTRACTOR_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(name)
        encoder = AutoModel.from_pretrained(name).to(device)
        encoder.eval()
        _FEATURE_EXTRACTOR_CACHE[cache_key] = (tokenizer, encoder)
    return _FEATURE_EXTRACTOR_CACHE[cache_key]


def inference(model, text, device):
    """Run the fusion model on a single piece of text.

    The text is embedded with the ``BAAI/bge-small-en-v1.5`` encoder and the
    resulting token-level features are fed to ``model`` as both the emotion and
    the personality input.

    Args:
        model: Model called with ``emotion_input`` and ``personality_input``.
        text: Text passed to the tokenizer.
        device: Device used for the encoder and for the model inputs.

    Returns:
        Whatever ``model`` returns for the embedded text. The call runs under
        ``torch.no_grad()``, so returned tensors carry no autograd history.
    """
    # The encoder is loaded once per device instead of on every call.
    tokenizer, encoder = _get_feature_extractor(device)
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)

    model.eval()
    with torch.no_grad():
        # First output of the encoder, first (only) item of the batch.
        features = encoder(**encoded_input)[0][0]
        batch = features.unsqueeze(0).to(device)
        return model(emotion_input=batch, personality_input=batch)

In [14]:
# Despite the name, `logits` is the dict returned by the model (it is indexed
# with 'emotion_logits' and 'personality_scores' in the next cell).
logits = inference(model, 'You are the best!', device)

In [34]:
# Show the prediction for the example sentence: softmax probabilities for the
# emotion classes, then the personality scores as returned by the model.
print("Emotion")
prob_emo = torch.nn.functional.softmax(logits['emotion_logits'], dim=1).cpu().detach().numpy()
# Display names, paired positionally with the model outputs.
emo_names = ['Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise']
for name, v in zip(emo_names, prob_emo[0]):
    print(f"  {name}: {v:.4f}")
print("Personality")   
# NOTE(review): the FirstImpressionV2 loader in this notebook reads a column
# called 'non-neuroticism' for the fifth trait — confirm that the last score
# should be displayed as 'Neuroticism'.
pers_names = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']
for name, v in zip(pers_names, logits['personality_scores'].tolist()[0]):
    print(f"  {name}: {v:.4f}")

Emotion
  Neutral: 0.1192
  Anger: 0.0658
  Disgust: 0.0078
  Fear: 0.0506
  Happiness: 0.6279
  Sadness: 0.0335
  Surprise: 0.0953
Personality
  Openness: 0.8092
  Conscientiousness: 0.6119
  Extraversion: 0.7637
  Agreeableness: 0.6649
  Neuroticism: 0.7665


### Test

In [35]:
def get_cmu_mosei_data(path, part='train'):
    """Read one split of the CMU-MOSEI CSV export.

    Args:
        path: Directory containing ``<part>_full.csv``.
        part: One of ``'train'``, ``'dev'`` or ``'test'``.

    Returns:
        Tuple ``(texts, labels)``: the ``text`` column as an array and the seven
        emotion columns stacked with ``np.dstack`` (shape ``(1, n_rows, 7)``).

    Raises:
        ValueError: If ``part`` is not a known split name.
    """
    if part not in ('train', 'dev', 'test'):
        raise ValueError('Unknown part of train / dev / test')

    df = pd.read_csv(os.path.join(path, part + "_full.csv"))
    emotion_columns = ("Neutral", "Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise")
    labels = np.dstack([df[column].to_numpy() for column in emotion_columns])
    return df["text"].values, labels

def get_first_imp_data(path, part='train'):
    """Read one split of the First Impressions V2 transcripts with their trait labels.

    The transcript file of the split is joined to ``data_true_traits_fi.csv``
    on the video name; rows without a transcript or without labels are dropped.

    Args:
        path: Directory containing the transcript CSVs and the trait CSV.
        part: One of ``'train'``, ``'dev'`` or ``'test'``; ``'dev'`` is stored
            on disk under the name ``'val'``.

    Returns:
        Tuple ``(texts, labels)``: the transcripts as an array and the five
        trait columns stacked with ``np.dstack`` (shape ``(1, n_rows, 5)``).

    Raises:
        ValueError: If ``part`` is not a known split name.
    """
    if part not in ('train', 'dev', 'test'):
        raise ValueError('Unknown part of train / dev / test')

    subset = 'val' if part == 'dev' else part

    transcripts = pd.read_csv(os.path.join(path, "FirstImpressionV2_text_" + subset + ".csv")).dropna()
    transcripts.columns = ["NAME_VIDEO", "text"]
    # The trait table identifies videos by file name including the extension.
    transcripts["NAME_VIDEO"] = transcripts["NAME_VIDEO"] + ".mp4"

    traits = pd.read_csv(os.path.join(path, "data_true_traits_fi.csv"))
    traits = traits[traits["Subset"] == subset]

    df = transcripts.merge(traits, on='NAME_VIDEO', how='left').dropna()

    trait_columns = ("openness", "conscientiousness", "extraversion", "agreeableness", "non-neuroticism")
    labels = np.dstack([df[column].to_numpy() for column in trait_columns])
    return df["text"].values, labels

In [63]:
class DatasetEmotionPersonality(Dataset):
    """Texts, token-level text embeddings and labels of one dataset split.

    Embeddings are either computed in the constructor with a pre-trained text
    encoder or read from a pickle file.

    Args:
        dataset: ``'CMU-MOSEI'`` or ``'FirstImpressionV2'``.
        part: Split name forwarded to the dataset reader.
        path: Directory forwarded to the dataset reader.
        path_to_emb: Pickle file with precomputed embeddings; when ``None`` the
            embeddings are computed with the encoder selected by ``model``.
        model: Encoder name: ``'jina'``, ``'xlm-roberta-base'`` or ``'bge-small'``.

    Raises:
        ValueError: For an unknown ``dataset`` or ``model`` name.
    """

    def __init__(self, dataset='CMU-MOSEI', part='train', path='data', path_to_emb=None, model='jina'):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if dataset == 'CMU-MOSEI':
            texts, labels = get_cmu_mosei_data(path, part)
        elif dataset == 'FirstImpressionV2':
            texts, labels = get_first_imp_data(path, part)
        else:
            raise ValueError('Unknown dataset (CMU-MOSEI / FirstImpressionV2)')

        self.x = texts
        # The readers return labels with a leading singleton axis; drop it.
        self.y = labels[0]

        if path_to_emb is not None:
            # NOTE(review): pickle.load can execute arbitrary code; only point
            # this at embedding files you created yourself.
            with open(path_to_emb, 'rb') as file:
                self.text_embedding = pickle.load(file)
        else:
            self._load_feature_extractor(model)
            self.text_embedding = [self._embed(text) for text in tqdm(texts)]

        self.n_samples = len(texts)

    def _load_feature_extractor(self, model):
        """Load the tokenizer and encoder selected by ``model`` onto ``self.device``."""
        if model == 'jina':
            checkpoint = "jinaai/jina-embeddings-v3"
            load_kwargs = dict(code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True)
        elif model == 'xlm-roberta-base':
            checkpoint, load_kwargs = 'xlm-roberta-base', {}
        elif model == 'bge-small':
            checkpoint, load_kwargs = "BAAI/bge-small-en-v1.5", {}
        else:
            raise ValueError('Unknown name of model')

        self.feature_extractor_tokenizer = AutoTokenizer.from_pretrained(checkpoint, **load_kwargs)
        self.feature_extractor_model = AutoModel.from_pretrained(checkpoint, **load_kwargs).to(self.device)

    def _embed(self, text):
        """Return the first encoder output for ``text``, without the batch axis."""
        encoded_input = self.feature_extractor_tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            return self.feature_extractor_model(**encoded_input)[0][0]

    def __getitem__(self, index):
        """Return the text, embedding and label stored at ``index`` as a dict."""
        return {
            "text": self.x[index],
            "text_embedding": self.text_embedding[index],
            "label": self.y[index],
        }

    def __len__(self):
        """Return the number of texts in the split."""
        return self.n_samples

In [37]:
def custom_collate_fn(batch):
    """Collate a list of samples into one batch, discarding ``None`` (invalid) samples.

    Args:
        batch: List of dicts with the keys ``"text"``, ``"text_embedding"`` and
            ``"label"``; entries may be ``None``.

    Returns:
        ``None`` when no valid sample remains, otherwise a dict with
        ``"text"`` (list), ``"text_embedding"`` (float tensor, zero-padded
        along dimension 1 to the longest sequence in the batch) and ``"label"``
        (tensor). Tensors are placed on CUDA when it is available, else on CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return None

    text = [sample["text"] for sample in samples]
    label_tensor = torch.tensor([sample["label"] for sample in samples], device=device)

    # torch.as_tensor reuses an embedding that is already a tensor on the right
    # device instead of copy-constructing it (which torch.tensor warns about);
    # detach() keeps the result free of autograd history, as before.
    embeddings = [
        torch.as_tensor(sample["text_embedding"], device=device).detach()
        for sample in samples
    ]
    # pad_sequence allocates a new tensor on the embeddings' device, so no
    # further device transfer is needed.
    text_tensor = pad_sequence(embeddings, batch_first=True)

    return {
        "text": text,
        "text_embedding": text_tensor.float(),
        "label": label_tensor,
    }

In [64]:
# Build the two test splits; embeddings are computed here with the bge-small
# encoder because no `path_to_emb` is given.
# NOTE(review): the chained `... = dataset=...` form also binds a module-level
# name `dataset` (left pointing at the personality split). It looks like a
# copy-paste leftover — confirm nothing relies on `dataset` before removing it.
emotion_test_dataset = dataset=DatasetEmotionPersonality(dataset='CMU-MOSEI', part='test', model='bge-small')
personality_test_dataset = dataset=DatasetEmotionPersonality(dataset='FirstImpressionV2', part='test', model='bge-small')

100%|██████████| 4653/4653 [00:57<00:00, 80.88it/s]
100%|██████████| 1991/1991 [00:23<00:00, 84.64it/s] 


In [65]:
# Evaluation loaders: fixed order (no shuffling), variable-length embeddings are
# padded per batch by `custom_collate_fn`.
BATCH_SIZE = 32
emotion_test_dataloader = DataLoader(dataset=emotion_test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
personality_test_dataloader = DataLoader(dataset=personality_test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
# Loaders keyed by corpus name, as used by the evaluation cells below.
test_loaders = {'cmu_mosei' : emotion_test_dataloader, 'fiv2' : personality_test_dataloader}

In [57]:
from utils.measures import uar, mf1, acc_func, ccc

In [58]:
def process_predictions(pred_emo, true_emo):
    """Binarise predicted logits and ground-truth labels for metric computation.

    Predictions: softmax over dimension 1, then ``transform_matrix``.
    Targets: 1 where the label is greater than 0, else 0, with column 0 dropped.

    Returns:
        Tuple ``(pred_emo, true_emo)`` of nested Python lists.
    """
    probabilities = F.softmax(pred_emo, dim=1).cpu().detach().numpy()
    predicted = transform_matrix(probabilities).tolist()

    targets = true_emo.cpu().detach().numpy()
    present = (targets[:, 1:] > 0).astype(int).tolist()
    return predicted, present

def run_emo_eval(model, loader, device="cuda", mode = "emotion", disable_print=True):
    """Evaluate a model on the emotion task.

    Args:
        model: Model returning a dict that contains ``'emotion_logits'``.
        loader: Iterable of batches with the keys ``"label"`` and
            ``"text_embedding"``; ``None`` batches are skipped.
        device: Device the batch tensors are moved to.
        mode: ``"emotion"`` feeds the embeddings as ``emotion_input`` only;
            ``"fusion"`` feeds them as both ``emotion_input`` and
            ``personality_input``.
        disable_print: When true, the progress bar is hidden.

    Returns:
        Tuple ``(uar, mf1)`` as computed by ``utils.measures.uar`` and
        ``utils.measures.mf1`` (presumably unweighted average recall and macro
        F1 — confirm against that module).

    Raises:
        ValueError: If ``mode`` is not ``"emotion"`` or ``"fusion"``.
    """
    # Validate up front: an unknown mode used to fall through both branches and
    # then read an undefined (or stale notebook-level) `logits`.
    if mode not in ("emotion", "fusion"):
        raise ValueError(f"Unknown mode {mode!r}; expected 'emotion' or 'fusion'")

    model.eval()
    total_preds = []
    total_targets = []

    with torch.no_grad():
        for batch in tqdm(loader, disable=disable_print):
            if batch is None:
                continue

            labels = batch["label"].to(device)
            text = batch["text_embedding"].to(device)

            if mode == "emotion":
                outputs = model(emotion_input=text)
            else:  # "fusion"
                outputs = model(emotion_input=text, personality_input=text)

            preds, target = process_predictions(outputs['emotion_logits'], labels)
            total_preds.extend(preds)
            total_targets.extend(target)

    uar_m = uar(total_targets, total_preds)
    mf1_m = mf1(total_targets, total_preds)

    return uar_m, mf1_m

def run_per_eval(model, loader, device="cuda", mode="personality", disable_print=True):
    """Evaluate a model on the personality-trait task.

    Args:
        model: Model returning a dict that contains ``'personality_scores'``.
        loader: Iterable of batches with the keys ``"label"`` and
            ``"text_embedding"``; ``None`` batches are skipped.
        device: Device the batch tensors are moved to.
        mode: ``"personality"`` feeds the embeddings as ``personality_input``
            only; ``"fusion"`` feeds them as both ``emotion_input`` and
            ``personality_input``.
        disable_print: When true, the progress bar is hidden.

    Returns:
        Tuple ``(m_acc, m_ccc)`` as computed by ``utils.measures.acc_func`` and
        ``utils.measures.ccc`` (presumably a mean accuracy and a mean
        concordance correlation coefficient — confirm against that module).

    Raises:
        ValueError: If ``mode`` is not ``"personality"`` or ``"fusion"``.
    """
    # Validate up front: an unknown mode used to fall through both branches and
    # then read an undefined (or stale notebook-level) `logits`.
    if mode not in ("personality", "fusion"):
        raise ValueError(f"Unknown mode {mode!r}; expected 'personality' or 'fusion'")

    model.eval()
    total_preds = []
    total_targets = []

    with torch.no_grad():
        for batch in tqdm(loader, disable=disable_print):
            if batch is None:
                continue

            labels = batch["label"].to(device)  # one row of trait targets per sample
            text = batch["text_embedding"].to(device)

            if mode == "personality":
                outputs = model(personality_input=text)
            else:  # "fusion"
                outputs = model(emotion_input=text, personality_input=text)

            preds = outputs['personality_scores']
            total_preds.extend(preds.detach().cpu().numpy())
            total_targets.extend(labels.detach().cpu().numpy())

    total_preds = np.array(total_preds)
    total_targets = np.array(total_targets)

    m_acc = acc_func(total_targets, total_preds)
    m_ccc = ccc(total_targets, total_preds)

    return m_acc, m_ccc

In [66]:
# Emotion metrics of the fusion model on the CMU-MOSEI test split. The notebook's
# `device` is used instead of a hard-coded "cuda" so the cell also runs on CPU.
run_emo_eval(model, test_loaders['cmu_mosei'], device=device, mode = "fusion", disable_print=False)

100%|██████████| 146/146 [00:02<00:00, 53.47it/s]


(0.6516559495186202, 0.6082743589709423)

In [67]:
# Personality metrics of the fusion model on the First Impressions V2 test split.
# The notebook's `device` is used instead of a hard-coded "cuda" so the cell also
# runs on CPU.
run_per_eval(model, test_loaders['fiv2'], device=device, mode = "fusion", disable_print=False)

100%|██████████| 63/63 [00:01<00:00, 52.71it/s]


(0.8729764261853946, 0.33992263064351597)