In [1]:
# Подавление предупреждений
import warnings
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

# Импорт необходимых библиотек
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM
import torch
import torch.nn.functional as F
from torch import Tensor
from einops import rearrange
from typing import Tuple, Callable
from torch.autograd import Function
import gc
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
pd.set_option('display.max_columns', None)

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Данные

In [3]:
from torch.utils.data import Dataset, DataLoader 
import numpy as np 
import math 

class Dataset_MELD_RESD(Dataset):
    """Map-style dataset of (text, emotion id) pairs read from the MELD / RESD CSV files.

    Supported parts (see ``PARTS``):
        * ``train``     - all of ``train_sent_emo.csv`` plus the first 70% of ``train.csv``
        * ``dev_meld``  - ``dev_sent_emo.csv``
        * ``dev_resd``  - the remaining 30% of ``train.csv``
        * ``test_meld`` - ``test_sent_emo.csv``
        * ``test_resd`` - ``test.csv``

    Args:
        part: Name of the split to load.
        transform: Accepted for interface compatibility; currently unused.

    Raises:
        ValueError: If ``part`` is not one of ``PARTS``.
        KeyError: If a row carries an emotion label missing from ``EMOTION_MAPPING``.
    """

    PARTS = ('train', 'dev_meld', 'dev_resd', 'test_meld', 'test_resd')

    # Fraction of ``train.csv`` rows (taken from the top) used for training;
    # the rest of that file becomes the ``dev_resd`` split.
    RESD_TRAIN_FRACTION = 0.7

    # Label vocabulary: 'joy'/'happiness' share id 3 and 'surprise'/'enthusiasm'
    # share id 6, so both corpora end up with the same 7 class ids.
    EMOTION_MAPPING = {
        'anger': 0,
        'disgust': 1,
        'fear': 2,
        'joy': 3,
        'happiness': 3,
        'neutral': 4,
        'sadness': 5,
        'surprise': 6,
        'enthusiasm': 6
    }

    def __init__(self, part='train', transform=None):
        df = self._load_part(part)
        self.x = list(df['text'].values)
        # Labels are kept as a single tensor on the notebook-level `device`.
        self.y = torch.tensor(df['emotion'].apply(lambda x: self.EMOTION_MAPPING[x]).values).to(device)
        self.n_samples = df.shape[0]

    @staticmethod
    def _read_meld(path):
        """Read a MELD CSV and rename its columns to the common ('text', 'emotion') schema."""
        df = pd.read_csv(path)[['Utterance', 'Emotion']]
        df.columns = ['text', 'emotion']
        return df

    @classmethod
    def _load_part(cls, part):
        """Return the DataFrame (with 'text' and 'emotion' columns) for the requested split."""
        if part == 'train':
            df_meld = cls._read_meld("train_sent_emo.csv")
            df_resd = pd.read_csv("train.csv")[['text', 'emotion']]
            split = int(len(df_resd) * cls.RESD_TRAIN_FRACTION)
            return pd.concat([df_meld, df_resd[0:split]], axis=0)
        if part == 'dev_meld':
            return cls._read_meld("dev_sent_emo.csv")
        if part == 'dev_resd':
            df = pd.read_csv("train.csv")
            return df[int(len(df) * cls.RESD_TRAIN_FRACTION):]
        if part == 'test_meld':
            return cls._read_meld("test_sent_emo.csv")
        if part == 'test_resd':
            return pd.read_csv("test.csv")
        raise ValueError('Unknown part of Dataset (train / dev_meld / dev_resd / test_meld / test_resd)')

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return self.n_samples

In [4]:
BATCH_SIZE = 32

# Only the training split is shuffled; the evaluation splits keep file order.
train_dataloader = DataLoader(dataset=Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
(
    dev_meld_dataloader,
    dev_resd_dataloader,
    test_meld_dataloader,
    test_resd_dataloader,
) = (
    DataLoader(dataset=Dataset_MELD_RESD(part), batch_size=BATCH_SIZE, shuffle=False)
    for part in ('dev_meld', 'dev_resd', 'test_meld', 'test_resd')
)

### Feature Extractor

In [5]:
class Embedding():
    """Frozen pretrained text encoder that turns a batch of strings into tensors.

    Args:
        model_name: One of 'jina', 'xlm-roberta-base', 'canine-c'.
        pooling: ``None`` for per-token features padded/cropped to a fixed length,
            or 'mean' for one L2-normalised mean-pooled vector per text.

    Raises:
        ValueError: If ``model_name`` is not a known encoder.
    """

    # model_name -> (hub checkpoint id, extra from_pretrained keyword arguments)
    _CHECKPOINTS = {
        'jina': (
            "jinaai/jina-embeddings-v3",
            {'code_revision': 'da863dd04a4e5dce6814c6625adfba87b83838aa', 'trust_remote_code': True},
        ),
        'xlm-roberta-base': ('xlm-roberta-base', {}),
        'canine-c': ('google/canine-c', {}),
    }

    def __init__(self, model_name='jina', pooling=None):
        self.model_name = model_name
        self.pooling = pooling
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if model_name not in self._CHECKPOINTS:
            raise ValueError('Unknown name of Embedding')
        checkpoint, extra_kwargs = self._CHECKPOINTS[model_name]
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, **extra_kwargs)
        self.model = AutoModel.from_pretrained(checkpoint, **extra_kwargs).to(self.device)

    def _mean_pooling(self, X):
        """Return mask-weighted mean token embeddings, L2-normalised, with a length-1 axis at dim 1."""
        encoded = self.tokenizer(X, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded)
        token_embeddings = model_output[0]
        # Broadcast the attention mask over the feature axis so padded tokens contribute zero.
        mask = encoded['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)  # guard against division by zero
        pooled = F.normalize(summed / counts, p=2, dim=1)
        return pooled.unsqueeze(1)

    def get_embeddings(self, X):
        """Encode the texts ``X`` according to the configured pooling mode.

        Returns:
            For ``pooling=None``: a CPU tensor whose second axis is exactly ``max_len``
            (329 for 'canine-c', 95 otherwise), zero-padded or cropped as needed.
            For ``pooling='mean'``: the result of ``_mean_pooling``.

        Raises:
            ValueError: If the pooling mode is neither ``None`` nor 'mean'.
        """
        if self.pooling == 'mean':
            return self._mean_pooling(X)
        if self.pooling is not None:
            raise ValueError('Unknown type of pooling')

        max_len = 329 if self.model_name == 'canine-c' else 95
        encoded = self.tokenizer(X, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            features = self.model(**encoded)[0].detach().cpu().float().numpy()
        cropped = features[:, :max_len, :]
        missing = max(0, max_len - features.shape[1])
        res = np.pad(cropped, ((0, 0), (0, missing), (0, 0)), "constant")
        return torch.tensor(res)

### Метрики

In [6]:
def evaluate_metrics(model, test_dataloader):
    """Run ``model`` over ``test_dataloader`` and score its predictions.

    The model is switched to eval mode and left there. Predictions are the
    arg-max over dim 1 of the model output.

    Returns:
        dict with keys 'uar' (macro recall), 'war' (weighted recall),
        'mf1' (macro F1) and 'wf1' (weighted F1), each scaled to percent.
    """
    model.eval()
    true_labels, predicted_labels = [], []
    with torch.no_grad():
        for batch_X, targets in test_dataloader:
            true_labels.extend(int(label) for label in targets)
            predictions = torch.max(model(batch_X), dim=1)[1]
            predicted_labels.extend(int(label) for label in predictions)

    # (result key, scoring function, averaging mode)
    scorers = (
        ('uar', recall_score, 'macro'),     # Unweighted Average Recall
        ('war', recall_score, 'weighted'),  # Weighted Average Recall
        ('mf1', f1_score, 'macro'),         # Macro F1-score
        ('wf1', f1_score, 'weighted'),      # Weighted F1-score
    )
    return {
        key: 100.0 * scorer(true_labels, predicted_labels, average=average)
        for key, scorer, average in scorers
    }

### Обучение

In [7]:
from dataclasses import dataclass
from typing import ClassVar
from typing import List, Dict, Any, Tuple, Optional
@dataclass
class ModelTrainer:
    """Training loop with validation on two dev sets, best-checkpoint saving and early stopping.

    Each epoch trains on ``train_dataloader``, then evaluates on ``dev_meld_dataloader``
    and ``dev_resd_dataloader``. Four metrics (UAR, WAR, macro F1, weighted F1) are
    computed per batch and averaged over batches; the dev score is the mean of the
    two dev sets. The model is checkpointed whenever the dev score beats every
    previous epoch, and training stops after ``patience`` epochs without improvement.

    Attributes set at runtime:
        _best_model_name: File name of the last saved checkpoint, or ``None`` if
            no checkpoint has been saved yet.
    """

    model: Any
    train_dataloader: DataLoader
    dev_meld_dataloader: DataLoader
    dev_resd_dataloader: DataLoader
    test_meld_dataloader: DataLoader
    test_resd_dataloader: DataLoader
    device: torch.device
    epochs: int
    round_loss: int  # decimals kept when rounding losses
    round_acc: int   # decimals kept when rounding metrics

    optimizer: torch.optim.Optimizer
    loss_fn: Any

    patience: int = 10  # epochs without dev improvement before early stopping

    class_names: ClassVar[Optional[List[str]]] = None  # list of class names

    # Metric keys, in the order they are reported and averaged.
    _METRIC_NAMES = ("uar", "war", "mf1", "wf1")

    def __post_init__(self):
        # Per-epoch training / validation history
        self.__history = pd.DataFrame({
            "train_avg": [],   # mean of the four metrics on the training set
            "dev_avg": [],     # mean of the four metrics on the dev sets
            "train_loss": [],  # training loss
            "dev_loss": [],    # dev loss
        })

        # Number of batches per epoch
        self.__train_steps = len(self.train_dataloader)
        self.__dev_steps = len(self.dev_meld_dataloader) + len(self.dev_resd_dataloader)
        self.__test_steps = len(self.test_meld_dataloader) + len(self.test_resd_dataloader)

        self.__best_dev_avg = 0
        self.__no_improvement_count = 0

        # Defined up front so callers can detect "no checkpoint saved yet".
        self._best_model_name = None

    @property
    def history(self) -> pd.DataFrame:
        """Return the DataFrame holding the per-epoch training / validation history.

        Returns:
            pd.DataFrame: columns ``train_avg``, ``dev_avg``, ``train_loss``, ``dev_loss``.
        """

        return self.__history

    def get_model_logits(self, logits: torch.Tensor) -> torch.Tensor:
        """Adapt raw logits to what the configured loss function expects.

        This is an instance method because ``loss_fn`` is a per-instance dataclass
        field; it does not exist on the class itself.

        Args:
            logits (torch.Tensor): Raw model outputs.

        Returns:
            torch.Tensor: Log-probabilities (log-softmax over dim 1) for ``nn.NLLLoss``;
            the logits unchanged for any other loss (e.g. ``nn.CrossEntropyLoss``).
        """

        if isinstance(self.loss_fn, nn.NLLLoss):
            return nn.LogSoftmax(dim = 1)(logits)
        return logits

    def _is_best_model(self, dev_avg: float) -> bool:
        """Check whether ``dev_avg`` beats every dev score recorded so far.

        Args:
            dev_avg (float): Mean dev metric of the current epoch.

        Returns:
            bool: True if ``dev_avg`` is strictly greater than the best value in the
            history (or than 0 when the history is still empty).
        """

        dev_history = self.__history["dev_avg"]
        best_so_far = max(dev_history) if len(dev_history) else 0
        return dev_avg > best_so_far

    def _save_model(self, epoch: int, path_to_model: str, test_accuracy: float, loss: torch.Tensor) -> None:
        """Save a checkpoint and remember its file name in ``_best_model_name``.

        Args:
            epoch (int): Current epoch.
            path_to_model (str): Directory for checkpoints (created if missing).
            test_accuracy (float): Score embedded in the checkpoint file name.
            loss (torch.Tensor): Loss value stored under the ``test_loss`` key.
        """

        os.makedirs(path_to_model, exist_ok = True)
        self._best_model_name = f"{self.model.__class__.__name__}_{self.model.model_name}_{epoch}_{test_accuracy}_checkpoint.pth"

        torch.save({
            "epoch": epoch,
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "test_loss": loss,
        }, os.path.join(path_to_model, self._best_model_name))

    @staticmethod
    def _batch_metrics(targets: torch.Tensor, logits: torch.Tensor) -> Dict[str, float]:
        """Compute the four metrics (in percent) for a single batch."""
        y_true = targets.cpu()
        y_pred = logits.argmax(1).cpu()  # computed once and shared by all four metrics
        return {
            "uar": 100.0 * recall_score(y_true, y_pred, average='macro'),
            "war": 100.0 * recall_score(y_true, y_pred, average='weighted'),
            "mf1": 100.0 * f1_score(y_true, y_pred, average='macro'),
            "wf1": 100.0 * f1_score(y_true, y_pred, average='weighted'),
        }

    @staticmethod
    def _mean_metric(metrics: Dict[str, float]) -> float:
        """Return the plain average of the four metrics."""
        return 0.25 * (metrics["uar"] + metrics["war"] + metrics["mf1"] + metrics["wf1"])

    def _train_one_epoch(self, epoch: int) -> Tuple[float, float]:
        """Run one optimisation pass over the training set.

        Returns:
            (rounded mean batch loss, mean of the four rounded batch-averaged metrics)
        """
        self.model.train()
        total_loss = 0.0
        sums = dict.fromkeys(self._METRIC_NAMES, 0.0)

        with tqdm(total = self.__train_steps, desc = f"Эпоха {epoch}", unit = "batch") as pbar:
            for batch_X, targets in self.train_dataloader:
                targets = targets.to(self.device)
                logits = self.model(batch_X)
                loss = self.loss_fn(logits, targets)

                # Backpropagation and weight update
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()
                for name, value in self._batch_metrics(targets, logits).items():
                    sums[name] += value

                pbar.update(1)
                torch.cuda.empty_cache()

            avg_loss = round(total_loss / self.__train_steps, self.round_loss)
            metrics = {name: round(sums[name] / self.__train_steps, self.round_acc) for name in self._METRIC_NAMES}
            avg_metric = self._mean_metric(metrics)
            pbar.set_postfix({**metrics, "avg": avg_metric, "Средняя потеря": avg_loss})

        return avg_loss, avg_metric

    def _evaluate_loader(self, loader: DataLoader, pbar) -> Tuple[float, Dict[str, float]]:
        """Evaluate the model on one loader, advancing ``pbar`` once per batch.

        Returns:
            (unrounded mean batch loss, rounded batch-averaged metrics)
        """
        total_loss = 0.0
        sums = dict.fromkeys(self._METRIC_NAMES, 0.0)
        for batch_X, targets in loader:
            targets = targets.to(self.device)
            logits = self.model(batch_X)
            total_loss += self.loss_fn(logits, targets).item()
            for name, value in self._batch_metrics(targets, logits).items():
                sums[name] += value

            pbar.update(1)
            torch.cuda.empty_cache()

        n_batches = len(loader)
        metrics = {name: round(sums[name] / n_batches, self.round_acc) for name in self._METRIC_NAMES}
        return total_loss / n_batches, metrics

    def _validate(self, epoch: int) -> Tuple[float, float]:
        """Evaluate on both dev sets and average their results with equal weight.

        Returns:
            (rounded mean of the two dev losses, mean of the four averaged dev metrics)
        """
        self.model.eval()
        with torch.no_grad():
            with tqdm(total = self.__dev_steps, desc = f"Тестирование {epoch}", unit = "batch") as pbar:
                loss_meld, metrics_meld = self._evaluate_loader(self.dev_meld_dataloader, pbar)
                loss_resd, metrics_resd = self._evaluate_loader(self.dev_resd_dataloader, pbar)

                # Each dev set counts equally, regardless of its size.
                avg_loss = round(0.5 * (loss_meld + loss_resd), self.round_loss)
                metrics = {name: 0.5 * (metrics_meld[name] + metrics_resd[name]) for name in self._METRIC_NAMES}
                avg_metric = self._mean_metric(metrics)
                pbar.set_postfix({**metrics, "avg": avg_metric, "Средняя потеря": avg_loss})

        return avg_loss, avg_metric

    # Training process
    def train(self, path_to_model: str) -> None:
        """Train the model with per-epoch validation, checkpointing and early stopping.

        Args:
            path_to_model (str): Directory where the best checkpoints are saved.

        Returns:
            None
        """

        for epoch in range(1, self.epochs + 1):
            torch.cuda.empty_cache()

            avg_train_loss, train_avg_metrics = self._train_one_epoch(epoch)
            avg_dev_loss, dev_avg_metrics = self._validate(epoch)

            # Must run before the history is extended: the check compares
            # against previous epochs only.
            if self._is_best_model(dev_avg_metrics):
                self._save_model(epoch, path_to_model, round(dev_avg_metrics, self.round_acc), avg_dev_loss)
                self.__best_dev_avg = dev_avg_metrics
                self.__no_improvement_count = 0
            else:
                self.__no_improvement_count += 1

            # Append this epoch to the history
            new_row = pd.Series([train_avg_metrics, dev_avg_metrics, avg_train_loss, avg_dev_loss], index = self.__history.columns)
            self.__history = pd.concat([self.__history, new_row.to_frame().T], ignore_index = True)

            if self.__no_improvement_count >= self.patience:
                print(f"Ранняя остановка на эпохе {epoch} из-за отсутствия улучшения точности на тестовой выборке")
                break

    # Identity-based hash (the dataclass-generated __eq__ would otherwise disable hashing)
    def __hash__(self):
        return id(self)

In [7]:
# Training configuration
EPOCHS = 50           # number of epochs
BATCH_SIZE = 32       # batch size
LEARNING_RATE = 1e-4  # learning rate
ROUND_ACC = 2         # decimals kept for metrics
ROUND_LOSS = 7        # decimals kept for losses

# Checkpoints and result tables are written below the current directory.
ROOT_DIR = os.curdir
PATH_TO_MODEL = os.path.join(ROOT_DIR, "Models_mamba")

In [9]:
from sklearn.utils.class_weight import compute_class_weight

# The training labels are already stored as one tensor on the dataset, so read
# them directly instead of iterating (and collating) the whole shuffled loader.
# "balanced" weights depend only on per-class counts, not on sample order.
y = train_dataloader.dataset.y.tolist()
class_weights = torch.tensor(
    compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y),
    dtype=torch.float,
).to(device)

### Mamba

In [10]:
from torch.nn.functional import silu
from torch.nn.functional import softplus
from einops import rearrange, repeat, einsum
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable per-feature gain."""

    def __init__(self, d_model: int, eps: float = 1e-8) -> None:
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x: Tensor) -> Tensor:
        # eps keeps the reciprocal square root finite for all-zero rows.
        mean_square = x.pow(2).mean(-1, keepdim = True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalised = x * inv_rms
        return normalised * self.weight

class Mamba(nn.Module):
    """Text classifier: pretrained text embeddings -> residual Mamba blocks -> linear head.

    The embedding extractor is stored as a plain bound method (``Embedding.get_embeddings``),
    not as a sub-module, so the encoder is not part of ``parameters()`` / ``state_dict()``.

    Args:
        num_layers: Number of (MambaBlock, RMSNorm) pairs.
        d_input: Feature size of the embeddings; also the residual stream width.
        d_model: Inner width of each MambaBlock.
        d_state: State size of each MambaBlock.
        d_discr: Bottleneck width of the step-size projection (block default if None).
        ker_size: Kernel size of the depthwise convolution in each block.
        num_classes: Number of output classes.
        model_name: Encoder name passed to ``Embedding``; also used in checkpoint file names.
        pooling: Pooling mode passed to ``Embedding``.
    """

    def __init__(self, num_layers, d_input, d_model, d_state=16, d_discr=None, ker_size=4, num_classes=7, model_name='jina', pooling=None):
        super().__init__()
        mamba_par = {
            'd_input' : d_input,
            'd_model' : d_model,
            'd_state' : d_state,
            'd_discr' : d_discr,
            'ker_size': ker_size
        }
        self.model_name = model_name
        embed = Embedding(model_name, pooling)
        self.embedding = embed.get_embeddings
        self.layers = nn.ModuleList([nn.ModuleList([MambaBlock(**mamba_par), RMSNorm(d_input)]) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_input, num_classes)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, seq, cache=None):
        """Classify a batch of raw texts.

        Args:
            seq: Batch of texts accepted by the embedding extractor.
            cache: Optional block cache; the cache returned by each layer is
                passed on to the next one. With the default ``None`` no cache is used.

        Returns:
            Logits obtained from the mean over dim 1 of the final residual stream.
        """
        # The embedding extractor already returns a tensor: move it to wherever
        # this module's parameters live instead of copy-constructing it with
        # torch.tensor(...) (which warns and makes an extra copy).
        seq = torch.as_tensor(self.embedding(seq), device=self.fc_out.weight.device)
        for mamba, norm in self.layers:
            out, cache = mamba(norm(seq), cache)
            seq = out + seq  # pre-norm residual connection
        return self.fc_out(seq.mean(dim = 1))
        
class MambaBlock(nn.Module):
    """Gated selective state-space block.

    The input is projected to two ``d_model``-wide branches. The first goes through
    a depthwise 1-D convolution, SiLU and the state-space recurrence; the second is
    a SiLU gate. Their product is projected back to ``d_input``.

    Args:
        d_input: Feature size of the block input and output.
        d_model: Inner width of both branches.
        d_state: Size of the recurrent state per inner channel.
        d_discr: Bottleneck width of the step-size projection (``d_model // 16`` if None).
        ker_size: Kernel size of the depthwise convolution.
    """

    def __init__(self, d_input, d_model, d_state=16, d_discr=None, ker_size=4):
        super().__init__()
        d_discr = d_discr if d_discr is not None else d_model // 16
        self.in_proj  = nn.Linear(d_input, 2 * d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_input, bias=False)
        self.s_B = nn.Linear(d_model, d_state, bias=False)
        self.s_C = nn.Linear(d_model, d_state, bias=False)
        self.s_D = nn.Sequential(nn.Linear(d_model, d_discr, bias=False), nn.Linear(d_discr, d_model, bias=False),)
        # Depthwise convolution; padding ker_size - 1 plus the crop in forward()
        # means each output position only sees current and earlier inputs.
        self.conv = nn.Conv1d(
            in_channels=d_model,
            out_channels=d_model,
            kernel_size=ker_size,
            padding=ker_size - 1,
            groups=d_model,
            bias=True,
        )
        self.A = nn.Parameter(torch.arange(1, d_state + 1, dtype=torch.float).repeat(d_model, 1))
        self.D = nn.Parameter(torch.ones(d_model, dtype=torch.float))
        # Kept for compatibility; computations follow the input tensors' device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, seq, cache=None):
        """Apply the block to a rank-3 input (unpacked as batch, length, features).

        Args:
            seq: Input tensor.
            cache: Optional ``(prev_hid, prev_inp)`` tuple from a previous call.

        Returns:
            ``(out, cache)``. ``cache`` is refreshed only when a cache was passed in;
            otherwise the received value (``None``) is returned unchanged.
        """
        _, seq_len, _ = seq.shape
        (prev_hid, prev_inp) = cache if cache is not None else (None, None)
        ssm_in, gate = self.in_proj(seq).chunk(2, dim=-1)
        x = rearrange(ssm_in, 'b l d -> b d l')
        x = x if prev_inp is None else torch.cat((prev_inp, x), dim=-1)
        ssm_in = self.conv(x)[..., :seq_len]  # drop the right-hand padding positions
        ssm_in = rearrange(ssm_in, 'b d l -> b l d')
        ssm_in = silu(ssm_in)
        ssm_out, hid = self.ssm(ssm_in, prev_hid=prev_hid)
        gate = silu(gate)
        out = self.out_proj(ssm_out * gate)
        if cache:
            cache = (hid.squeeze(), x[..., 1:])
        return out, cache

    def ssm(self, seq, prev_hid):
        """Input-dependent state-space layer.

        Returns:
            ``(out, hid)``: the layer output and the hidden states it was read from.
        """
        A = -self.A
        D = +self.D
        B = self.s_B(seq)
        C = self.s_C(seq)
        s = softplus(D + self.s_D(seq))  # positive, input-dependent step size
        # NOTE(review): A_bar is exp(A) scaled by s, not exp(s * A) — confirm this
        # discretisation is intentional before changing it (affects checkpoints).
        A_bar = einsum(torch.exp(A), s, 'd s,   b l d -> b l d s')
        B_bar = einsum(          B,  s, 'b l s, b l d -> b l d s')
        X_bar = einsum(B_bar, seq, 'b l d s, b l d -> b l d s')
        hid = self._hid_states(A_bar, X_bar, prev_hid=prev_hid)
        out = einsum(hid, C, 'b l d s, b l s -> b l d')
        out = out + D * seq  # skip connection scaled by D
        return out, hid

    def _hid_states(self, A, X, prev_hid=None):
        """Compute hidden states ``h_t = A_t * h_{t-1} + X_t`` along the length axis.

        With ``prev_hid`` given, a single update from that state is applied to all
        positions at once; otherwise the recurrence is unrolled from a zero state.
        """
        b, l, d, s = A.shape
        A = rearrange(A, 'b l d s -> l b d s')
        X = rearrange(X, 'b l d s -> l b d s')
        if prev_hid is not None:
            return rearrange(A * prev_hid + X, 'l b d s -> b l d s')
        # Allocate the initial state next to the inputs rather than on a device
        # chosen at construction time, so the block works wherever it is moved.
        h = torch.zeros(b, d, s, device=A.device)
        return torch.stack([h := A_t * h + X_t for A_t, X_t in zip(A, X)], dim=1)

### Mamba + jina

### Model hyperparameters  
d_model, num_layers, ker_size

In [12]:
%%capture --no-stdout
result = []
ker_size = 4
num_layers = 1
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=1, ker_size=4
Ранняя остановка на эпохе 24 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 40.1453592524256, 'war': 51.75834084761046, 'mf1': 39.115122572672384, 'wf1': 51.694737671216316}
Метрики на валидационной выборке RESD:  {'uar': 30.871058568354876, 'war': 31.044776119402982, 'mf1': 30.719674862246784, 'wf1': 31.292495874503068}
Метрики на тестовой выборке MELD:  {'uar': 36.56062782494112, 'war': 51.91570881226054, 'mf1': 34.95508193450415, 'wf1': 52.95587794935126}
Метрики на тестовой выборке RESD:  {'uar': 28.617272860693916, 'war': 28.92857142857143, 'mf1': 28.003793275614036, 'wf1': 28.358252039735817}
d_model=128, num_layers=1, ker_size=4
Ранняя остановка на эпохе 18 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 40.03732086702748, 'war': 48.69251577998197, 'mf1': 36.8872681279535, 'wf1': 49.79697284835936}
Метрики на валидационной выборке 

In [13]:
# Flatten the per-run dicts (hyperparameters + four metric dicts) into one wide
# table with one row per run, then store it next to the checkpoints.
dict_columns = ["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd"]
results_raw = pd.DataFrame(result, columns=dict_columns + ["путь"])
expanded = [results_raw[column].apply(pd.Series) for column in dict_columns]
df = pd.concat(expanded + [results_raw["путь"]], axis=1)
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [f"{metric}_{split}"
       for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
       for metric in ("uar", "war", "mf1", "wf1")]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_ker_size_4_d_model.csv"))

In [14]:
%%capture --no-stdout
result = []
ker_size = 4
num_layers = 2
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=2, ker_size=4
Ранняя остановка на эпохе 18 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.76370101544592, 'war': 51.12714156898106, 'mf1': 40.16349389364534, 'wf1': 52.202720863128135}
Метрики на валидационной выборке RESD:  {'uar': 31.820101953825002, 'war': 32.537313432835816, 'mf1': 31.139894370659267, 'wf1': 31.819726057646424}
Метрики на тестовой выборке MELD:  {'uar': 40.59738150619937, 'war': 53.63984674329502, 'mf1': 38.520091112382204, 'wf1': 55.188357863518235}
Метрики на тестовой выборке RESD:  {'uar': 30.718786113522956, 'war': 31.428571428571427, 'mf1': 30.135800630050692, 'wf1': 30.647716371474626}
d_model=128, num_layers=2, ker_size=4
Ранняя остановка на эпохе 17 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 42.189028169429285, 'war': 49.77457168620379, 'mf1': 39.163589067629296, 'wf1': 51.44236558208918}
Метрики на валидационной вы

In [15]:
# Flatten the sweep records: every dict-valued column is expanded into one column per key,
# the checkpoint file name stays last, and the table is written next to the checkpoints.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
df = pd.concat(
    [
        *(
            df[name].apply(pd.Series)
            for name in ("параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd")
        ),
        df["путь"],
    ],
    axis=1,
)
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_ker_size_4_d_model.csv"))

In [16]:
%%capture --no-stdout
result = []
ker_size = 4
num_layers = 3
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=3, ker_size=4
Ранняя остановка на эпохе 19 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.24013517936903, 'war': 52.209197475202885, 'mf1': 38.746034687077724, 'wf1': 52.5657535793767}
Метрики на валидационной выборке RESD:  {'uar': 31.846686500687234, 'war': 32.23880597014925, 'mf1': 31.61177058355803, 'wf1': 32.109376801299476}
Метрики на тестовой выборке MELD:  {'uar': 38.59887085742035, 'war': 53.14176245210728, 'mf1': 36.90749865066169, 'wf1': 54.51596823231611}
Метрики на тестовой выборке RESD:  {'uar': 27.648410543147385, 'war': 28.214285714285715, 'mf1': 27.719212866504993, 'wf1': 28.497896823782593}
d_model=128, num_layers=3, ker_size=4
Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.516372530320176, 'war': 50.85662759242561, 'mf1': 38.258880721913066, 'wf1': 51.02863024719397}
Метрики на валидационной выборк

In [17]:
# Flatten the sweep records: every dict-valued column is expanded into one column per key,
# the checkpoint file name stays last, and the table is written next to the checkpoints.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
df = pd.concat(
    [
        *(
            df[name].apply(pd.Series)
            for name in ("параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd")
        ),
        df["путь"],
    ],
    axis=1,
)
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_ker_size_4_d_model.csv"))

In [14]:
# Stack the per-depth sweep tables (num_layers = 1, 2, 3) into one frame.
# Each file keeps its own 0-based row index, so index labels repeat after the concat.
columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df = pd.concat([
    pd.read_csv(os.path.join(PATH_TO_MODEL, file_name), index_col=0)
    for file_name in (
        "result_num_layers_1_ker_size_4_d_model.csv",
        "result_num_layers_2_ker_size_4_d_model.csv",
        "result_num_layers_3_ker_size_4_d_model.csv",
    )
])
df.columns = columns

In [15]:
# One summary score per split: the plain mean of its four metric columns (uar, war, mf1, wf1).
for split in ("dev_meld", "dev_resd", "test_meld", "test_resd"):
    df[f"average_{split}"] = (
        df[f"uar_{split}"] + df[f"war_{split}"] + df[f"mf1_{split}"] + df[f"wf1_{split}"]
    ) / 4.0

In [33]:
# Rank configurations by the RESD test average, breaking ties with the MELD test average.
# NOTE(review): this ranks on test-split scores; ranking on the dev averages would keep the test sets held out.
df.sort_values(by=['average_test_resd', 'average_test_meld'], ascending=False)

Unnamed: 0,d_model,num_layers,ker_size,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd
3,512,2,4,39.408059,54.914337,40.489086,53.02849,35.5694,35.820896,35.508898,...,55.268566,37.200911,37.5,37.175183,37.356274,Mamba_jina_25_41.19_checkpoint.pth,46.959993,35.694853,46.159247,37.308092
2,256,2,4,36.618373,49.413886,36.077703,49.244611,35.923668,36.41791,35.455143,...,54.136911,36.247523,36.428571,36.69278,36.864663,Mamba_jina_12_38.87_checkpoint.pth,42.838643,35.923612,45.331295,36.558384
3,512,3,4,42.546295,53.471596,41.691935,53.289745,34.613148,34.925373,34.855382,...,54.887694,34.519766,35.0,34.314357,34.668531,Mamba_jina_5_41.4_checkpoint.pth,47.749893,34.885006,46.653132,34.625663
1,128,3,4,41.516373,50.856628,38.258881,51.02863,32.73407,32.835821,32.228419,...,53.305432,32.917755,33.214286,32.042812,32.539638,Mamba_jina_6_38.54_checkpoint.pth,45.415128,32.674703,44.8539,32.678623
1,128,1,4,40.037321,48.692516,36.887268,49.796973,33.293573,33.432836,32.88729,...,49.912187,32.41753,32.857143,31.723123,32.311382,Mamba_jina_8_37.64_checkpoint.pth,43.853519,33.19922,42.792389,32.327295
2,256,3,4,41.209792,47.971145,38.08253,49.577856,36.362515,37.014925,35.983411,...,48.737094,32.215697,32.5,30.97197,31.168165,Mamba_jina_6_40.1_checkpoint.pth,44.210331,36.480246,42.070172,31.713958
3,512,1,4,38.897907,51.66817,36.979896,50.7704,33.072608,32.835821,32.881661,...,54.539324,31.514808,31.785714,31.523172,31.974245,Mamba_jina_13_38.3_checkpoint.pth,44.579093,32.974173,45.527194,31.699485
2,256,1,4,38.140976,50.135257,37.543032,49.555923,33.650738,33.134328,32.920493,...,53.716002,31.009193,31.071429,31.019941,31.411611,Mamba_jina_9_38.34_checkpoint.pth,43.843797,33.199782,45.196504,31.128043
0,64,2,4,41.763701,51.127142,40.163494,52.202721,31.820102,32.537313,31.139894,...,55.188358,30.718786,31.428571,30.135801,30.647716,Mamba_jina_8_38.23_checkpoint.pth,46.314264,31.829259,46.986419,30.732719
0,64,1,4,40.145359,51.758341,39.115123,51.694738,30.871059,31.044776,30.719675,...,52.955878,28.617273,28.928571,28.003793,28.358252,Mamba_jina_14_38.13_checkpoint.pth,45.67839,30.982001,44.096824,28.476972


In [13]:
%%capture --no-stdout
result = []
for ker_size in [2, 8]:
    for (d_model, num_layers) in [(512, 2), (256, 2)]:
        print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
        model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
        optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
        trainer.train(PATH_TO_MODEL)
        checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
        model_mamba.load_state_dict(checkpoint['model_state_dict'])
        metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
        metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
        print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
        print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
        metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
        metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
        print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
        print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
        result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=512, num_layers=2, ker_size=2
Ранняя остановка на эпохе 17 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.62485082227551, 'war': 49.95491433724076, 'mf1': 39.387680121808145, 'wf1': 50.70561248038333}
Метрики на валидационной выборке RESD:  {'uar': 39.84149988497814, 'war': 39.701492537313435, 'mf1': 38.8124310751948, 'wf1': 39.22532756878456}
Метрики на тестовой выборке MELD:  {'uar': 38.41799053621324, 'war': 49.88505747126437, 'mf1': 35.75268408394602, 'wf1': 51.53520604639389}
Метрики на тестовой выборке RESD:  {'uar': 32.353272616430516, 'war': 33.214285714285715, 'mf1': 31.87809939681777, 'wf1': 32.663925575095575}
d_model=256, num_layers=2, ker_size=2
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 38.15311220636128, 'war': 51.307484220018026, 'mf1': 38.127662108208604, 'wf1': 51.21426943011797}
Метрики на валидационной выборке 

In [14]:
# Flatten the sweep records: every dict-valued column is expanded into one column per key,
# the checkpoint file name stays last, and the table is written next to the checkpoints.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
df = pd.concat(
    [
        *(
            df[name].apply(pd.Series)
            for name in ("параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd")
        ),
        df["путь"],
    ],
    axis=1,
)
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_ker_size_d_model.csv"))

In [16]:
# Stack the three per-depth sweep tables plus the kernel-size sweep into one frame.
# Each file keeps its own 0-based row index, so index labels repeat after the concat.
columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df = pd.concat([
    pd.read_csv(os.path.join(PATH_TO_MODEL, file_name), index_col=0)
    for file_name in (
        "result_num_layers_1_ker_size_4_d_model.csv",
        "result_num_layers_2_ker_size_4_d_model.csv",
        "result_num_layers_3_ker_size_4_d_model.csv",
        "result_num_layers_ker_size_d_model.csv",
    )
])
df.columns = columns

In [17]:
# One summary score per split: the plain mean of its four metric columns (uar, war, mf1, wf1).
for split in ("dev_meld", "dev_resd", "test_meld", "test_resd"):
    df[f"average_{split}"] = (
        df[f"uar_{split}"] + df[f"war_{split}"] + df[f"mf1_{split}"] + df[f"wf1_{split}"]
    ) / 4.0

#### Results with different model hyperparameters

In [19]:
# Rank configurations by the RESD test average, breaking ties with the MELD test average.
# NOTE(review): this ranks on test-split scores; ranking on the dev averages would keep the test sets held out.
df.sort_values(by=['average_test_resd', 'average_test_meld'], ascending=False)

Unnamed: 0,d_model,num_layers,ker_size,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd
3,512,2,4,39.408059,54.914337,40.489086,53.02849,35.5694,35.820896,35.508898,...,55.268566,37.200911,37.5,37.175183,37.356274,Mamba_jina_25_41.19_checkpoint.pth,46.959993,35.694853,46.159247,37.308092
2,256,2,4,36.618373,49.413886,36.077703,49.244611,35.923668,36.41791,35.455143,...,54.136911,36.247523,36.428571,36.69278,36.864663,Mamba_jina_12_38.87_checkpoint.pth,42.838643,35.923612,45.331295,36.558384
3,512,3,4,42.546295,53.471596,41.691935,53.289745,34.613148,34.925373,34.855382,...,54.887694,34.519766,35.0,34.314357,34.668531,Mamba_jina_5_41.4_checkpoint.pth,47.749893,34.885006,46.653132,34.625663
1,256,2,2,38.153112,51.307484,38.127662,51.214269,36.164114,35.820896,35.745036,...,53.932402,33.600899,33.928571,33.197634,33.489637,Mamba_jina_10_40.53_checkpoint.pth,44.700632,35.831813,45.047066,33.554185
1,128,3,4,41.516373,50.856628,38.258881,51.02863,32.73407,32.835821,32.228419,...,53.305432,32.917755,33.214286,32.042812,32.539638,Mamba_jina_6_38.54_checkpoint.pth,45.415128,32.674703,44.8539,32.678623
0,512,2,2,41.624851,49.954914,39.38768,50.705612,39.8415,39.701493,38.812431,...,51.535206,32.353273,33.214286,31.878099,32.663926,Mamba_jina_7_41.92_checkpoint.pth,45.418264,39.395188,43.897735,32.527396
2,512,2,8,42.243621,52.479711,39.895981,52.980979,35.506404,34.626866,34.12241,...,54.837972,33.072658,33.571429,31.392407,31.956568,Mamba_jina_6_40.1_checkpoint.pth,46.900073,34.600171,46.167395,32.498265
1,128,1,4,40.037321,48.692516,36.887268,49.796973,33.293573,33.432836,32.88729,...,49.912187,32.41753,32.857143,31.723123,32.311382,Mamba_jina_8_37.64_checkpoint.pth,43.853519,33.19922,42.792389,32.327295
2,256,3,4,41.209792,47.971145,38.08253,49.577856,36.362515,37.014925,35.983411,...,48.737094,32.215697,32.5,30.97197,31.168165,Mamba_jina_6_40.1_checkpoint.pth,44.210331,36.480246,42.070172,31.713958
3,512,1,4,38.897907,51.66817,36.979896,50.7704,33.072608,32.835821,32.881661,...,54.539324,31.514808,31.785714,31.523172,31.974245,Mamba_jina_13_38.3_checkpoint.pth,44.579093,32.974173,45.527194,31.699485


Best model hyperparameters: d_model = 512, num_layers = 2, ker_size = 4

### Training hyperparameters

This section tunes the batch size, learning rate and optimiser.

1) BATCH_SIZE=32

In [11]:
# Rebuild every dataloader with batch size 32; only the training split is shuffled.
BATCH_SIZE = 32
train_dataloader = DataLoader(Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader = (
    DataLoader(Dataset_MELD_RESD(part), batch_size=BATCH_SIZE, shuffle=False)
    for part in ('dev_meld', 'dev_resd', 'test_meld', 'test_resd')
)

In [22]:
# Baseline record for lr=1e-4, batch_size=32, optimizer=Adam, entered by hand instead of retraining.
# The numbers and checkpoint name match the d_model=512, num_layers=2, ker_size=4 row of the architecture table.
metrics_dev_meld = dict(uar=39.40805881324705, war=54.91433724075744, mf1=40.48908578091043, wf1=53.0284896324136)
metrics_dev_resd = dict(uar=35.56939974057256, war=35.82089552238806, mf1=35.508897820893885, wf1=35.88021880583667)
metrics_test_meld = dict(uar=36.168594927705044, war=56.43678160919541, mf1=36.763044570154726, wf1=55.26856554959636)
metrics_test_resd = dict(uar=37.20091075354233, war=37.5, mf1=37.175183397267936, wf1=37.3562736499167)

# `result` starts with this single record so the learning-rate runs can append to it.
result = [[
    dict(lr=1e-4, batch_size=32, optimizer="Adam"),
    metrics_dev_meld,
    metrics_dev_resd,
    metrics_test_meld,
    metrics_test_resd,
    "Mamba_jina_25_41.19_checkpoint.pth",
]]

In [13]:
%%capture --no-stdout
optimizer_ = "Adam"
for lr in [1e-3, 1e-5]:
    print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
    model_mamba = Mamba(num_layers = 2, d_input = 1024, d_model = 512, num_classes=7, model_name='jina', pooling=None).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = lr)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.001, batch_size=32, optimizer=Adam
Ранняя остановка на эпохе 12 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.63253111555437, 'war': 45.716862037871955, 'mf1': 35.67381435478196, 'wf1': 46.3805170100662}
Метрики на валидационной выборке RESD:  {'uar': 23.71884889221389, 'war': 24.17910447761194, 'mf1': 22.35091759405637, 'wf1': 23.077599541190214}
Метрики на тестовой выборке MELD:  {'uar': 41.29866900833727, 'war': 47.547892720306514, 'mf1': 36.08387281775006, 'wf1': 49.002335321693295}
Метрики на тестовой выборке RESD:  {'uar': 31.824928568349627, 'war': 32.142857142857146, 'mf1': 30.61793167845584, 'wf1': 30.99964688971703}
lr=1e-05, batch_size=32, optimizer=Adam
Ранняя остановка на эпохе 25 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.65757186038673, 'war': 49.59422903516681, 'mf1': 37.96290459574824, 'wf1': 50.09501777990961}
Метрики на валидационной выбор

In [32]:
# Flatten the sweep records: every dict-valued column is expanded into one column per key,
# the checkpoint file name stays last, and the table is written next to the checkpoints.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
df = pd.concat(
    [
        *(
            df[name].apply(pd.Series)
            for name in ("параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd")
        ),
        df["путь"],
    ],
    axis=1,
)
df.columns = (
    ["lr", "batch_size", "optimizer"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_Adam_32_lr.csv"))

In [34]:
%%capture --no-stdout
result = []
optimizer_ = "AdamW"
for lr in [1e-3, 1e-4, 1e-5]:
    print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
    model_mamba = Mamba(num_layers = 2, d_input = 1024, d_model = 512, num_classes=7, model_name='jina', pooling=None).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = lr)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.001, batch_size=32, optimizer=AdamW
Ранняя остановка на эпохе 12 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 36.540695571811916, 'war': 48.78268710550045, 'mf1': 35.45383999995558, 'wf1': 49.24461676561281}
Метрики на валидационной выборке RESD:  {'uar': 22.560007577910387, 'war': 22.388059701492537, 'mf1': 20.12194215718674, 'wf1': 20.70075953233984}
Метрики на тестовой выборке MELD:  {'uar': 37.20982436323415, 'war': 49.08045977011494, 'mf1': 34.18835672625199, 'wf1': 50.81910465364593}
Метрики на тестовой выборке RESD:  {'uar': 30.0791970199865, 'war': 30.357142857142854, 'mf1': 26.50784385936874, 'wf1': 27.105194921005516}
lr=0.0001, batch_size=32, optimizer=AdamW
Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.27218080353518, 'war': 53.471596032461676, 'mf1': 39.39816504118077, 'wf1': 52.79967981830771}
Метрики на валидационной вы

In [35]:
# Flatten the sweep records: every dict-valued column is expanded into one column per key,
# the checkpoint file name stays last, and the table is written next to the checkpoints.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
df = pd.concat(
    [
        *(
            df[name].apply(pd.Series)
            for name in ("параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd")
        ),
        df["путь"],
    ],
    axis=1,
)
df.columns = (
    ["lr", "batch_size", "optimizer"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_AdamW_32_lr.csv"))

In [None]:
%%capture --no-stdout
result = []
optimizer_ = "SGD"
for lr in [1e-3, 1e-4, 1e-5]:
    print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
    model_mamba = Mamba(num_layers = 2, d_input = 1024, d_model = 512, num_classes=7, model_name='jina', pooling=None).to(device)
    optimizer = optim.SGD(params = model_mamba.parameters(), lr = lr)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.001, batch_size=32, optimizer=SGD
Ранняя остановка на эпохе 38 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.81903422747807, 'war': 50.76645626690712, 'mf1': 38.31111909322741, 'wf1': 52.278558901980595}
Метрики на валидационной выборке RESD:  {'uar': 27.690630841708664, 'war': 27.761194029850746, 'mf1': 26.166557158402526, 'wf1': 26.829859017950277}
Метрики на тестовой выборке MELD:  {'uar': 42.546045830737995, 'war': 53.14176245210728, 'mf1': 39.62152417469816, 'wf1': 55.2464241173081}
Метрики на тестовой выборке RESD:  {'uar': 28.961127480864324, 'war': 29.28571428571429, 'mf1': 27.960000884651077, 'wf1': 28.19765289339128}
lr=0.0001, batch_size=32, optimizer=SGD
Метрики на валидационной выборке MELD:  {'uar': 34.178048323034005, 'war': 43.01172227231741, 'mf1': 31.96548437040661, 'wf1': 44.244232912321365}
Метрики на валидационной выборке RESD:  {'uar': 22.54896451334157, 'war': 22.686567164179106, 'mf1': 22.06252363

In [None]:
# Flatten the sweep records: every dict-valued column is expanded into one column per key,
# the checkpoint file name stays last, and the table is written next to the checkpoints.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
df = pd.concat(
    [
        *(
            df[name].apply(pd.Series)
            for name in ("параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd")
        ),
        df["путь"],
    ],
    axis=1,
)
df.columns = (
    ["lr", "batch_size", "optimizer"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_SGD_32_lr.csv"))

In [32]:
# Reload the per-optimizer learning-rate tables (Adam, AdamW, SGD) saved at batch size 32
# and stack them into one frame. Each file keeps its own 0-based row index, so index
# labels repeat after the concat.
columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
# The Adam table is written as "result_Adam_32_lr.csv"; the previous name here had a stray
# trailing underscore ("result_Adam_32_lr_.csv"), which no cell in this notebook produces.
df = pd.concat([
    pd.read_csv(os.path.join(PATH_TO_MODEL, "result_Adam_32_lr.csv"), index_col=0),
    pd.read_csv(os.path.join(PATH_TO_MODEL, "result_AdamW_32_lr.csv"), index_col=0),
    pd.read_csv(os.path.join(PATH_TO_MODEL, "result_SGD_32_lr.csv"), index_col=0),
])
df.columns=columns

In [33]:
# One summary score per split: the plain mean of its four metric columns (uar, war, mf1, wf1).
for split in ("dev_meld", "dev_resd", "test_meld", "test_resd"):
    df[f"average_{split}"] = (
        df[f"uar_{split}"] + df[f"war_{split}"] + df[f"mf1_{split}"] + df[f"wf1_{split}"]
    ) / 4.0

In [34]:
# Rank training setups by the RESD test average, breaking ties with the MELD test average.
# NOTE(review): this ranks on test-split scores; ranking on the dev averages would keep the test sets held out.
df.sort_values(by=['average_test_resd', 'average_test_meld'], ascending=False)

Unnamed: 0,lr,batch_size,optimizer,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd
0,0.0001,32,Adam,39.408059,54.914337,40.489086,53.02849,35.5694,35.820896,35.508898,...,55.268566,37.200911,37.5,37.175183,37.356274,Mamba_jina_25_41.19_checkpoint.pth,46.959993,35.694853,46.159247,37.308092
1,0.0001,32,AdamW,39.272181,53.471596,39.398165,52.79968,30.955142,31.940299,31.139891,...,55.929145,33.53713,34.285714,33.357609,33.914529,Mamba_jina_6_38.57_checkpoint.pth,46.235405,31.48226,46.434589,33.773746
2,1e-05,32,Adam,39.657572,49.594229,37.962905,50.095018,31.12401,31.044776,31.048774,...,51.399651,32.760868,33.214286,32.477401,32.990102,Mamba_jina_15_37.89_checkpoint.pth,44.327431,31.153082,43.475837,32.860664
1,0.001,32,Adam,39.632531,45.716862,35.673814,46.380517,23.718849,24.179104,22.350918,...,49.002335,31.824929,32.142857,30.617932,30.999647,Mamba_jina_2_32.39_checkpoint.pth,41.850931,23.331618,43.483192,31.396341
2,1e-05,32,AdamW,38.928585,46.528404,36.497938,47.304779,34.25331,34.626866,34.119602,...,50.373476,30.904365,31.071429,30.414992,30.752693,Mamba_jina_14_37.81_checkpoint.pth,42.314926,34.391793,42.325121,30.785869
0,0.001,32,SGD,39.819034,50.766456,38.311119,52.278559,27.690631,27.761194,26.166557,...,55.246424,28.961127,29.285714,27.960001,28.197653,Mamba_jina_28_35.5_checkpoint.pth,45.293792,27.11206,47.638939,28.601124
0,0.001,32,AdamW,36.540696,48.782687,35.45384,49.244617,22.560008,22.38806,20.121942,...,50.819105,30.079197,30.357143,26.507844,27.105195,Mamba_jina_2_31.62_checkpoint.pth,42.50546,21.442692,42.824436,28.512345
1,0.0001,32,SGD,34.178048,43.011722,31.965484,44.244233,22.548965,22.686567,22.062524,...,45.3811,26.000256,26.428571,26.121467,26.682866,Mamba_jina_48_30.24_checkpoint.pth,38.349872,22.409986,38.532712,26.30829


2) BATCH_SIZE=16

In [12]:
# Rebuild every dataloader with batch size 16; only the training split is shuffled.
BATCH_SIZE = 16
train_dataloader = DataLoader(Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader = (
    DataLoader(Dataset_MELD_RESD(part), batch_size=BATCH_SIZE, shuffle=False)
    for part in ('dev_meld', 'dev_resd', 'test_meld', 'test_resd')
)

In [38]:
%%capture --no-stdout
# Batch-size experiment, BATCH_SIZE=16: train with Adam at lr=1e-4, then score
# the best checkpoint on the dev and test splits of both corpora.
# `result` is reset here; the BATCH_SIZE=64 cell further down appends to it.
result = []
lr = 1e-4
optimizer_ = "Adam"  # label stored in the results row; the optimizer itself is built below
print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")

# Fresh model, optimizer and weighted loss for this run
# (`class_weights` is defined earlier in the notebook).
model_mamba = Mamba(num_layers = 2, d_input = 1024, d_model = 512, num_classes=7, model_name='jina', pooling=None).to(device)
optimizer = optim.Adam(params = model_mamba.parameters(), lr = lr)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

# Reload the checkpoint the trainer recorded in `_best_model_name`.
# map_location=device keeps this working when the checkpoint was written on a
# GPU but the notebook is re-run on a machine where `device` fell back to CPU.
checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name), map_location=device)
model_mamba.load_state_dict(checkpoint['model_state_dict'])

metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
print("Метрики на тестовой выборке RESD: ", metrics_test_resd)

# One row per run: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.0001, batch_size=16, optimizer=Adam
Ранняя остановка на эпохе 26 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 36.84301610314956, 'war': 53.02073940486925, 'mf1': 36.92701779859716, 'wf1': 51.451910196097195}
Метрики на валидационной выборке RESD:  {'uar': 34.54804143879774, 'war': 34.626865671641795, 'mf1': 33.7910661718222, 'wf1': 34.11829786626451}
Метрики на тестовой выборке MELD:  {'uar': 36.760082847273004, 'war': 55.51724137931034, 'mf1': 36.48022271564247, 'wf1': 55.139519159102434}
Метрики на тестовой выборке RESD:  {'uar': 33.52375230664704, 'war': 33.92857142857143, 'mf1': 33.90281582067809, 'wf1': 34.37690804806097}


3) BATCH_SIZE=64

In [39]:
# Rebuild every loader with the batch size used for this experiment.
BATCH_SIZE = 64

# Only the training split is shuffled; evaluation splits keep their file order.
train_dataloader = DataLoader(Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
(
    dev_meld_dataloader,
    dev_resd_dataloader,
    test_meld_dataloader,
    test_resd_dataloader,
) = (
    DataLoader(Dataset_MELD_RESD(part), batch_size=BATCH_SIZE, shuffle=False)
    for part in ('dev_meld', 'dev_resd', 'test_meld', 'test_resd')
)

In [40]:
%%capture --no-stdout
# Batch-size experiment, BATCH_SIZE=64: same setup as the BATCH_SIZE=16 run.
# This cell does not reset `result`; it appends to the list created by that run.
lr = 1e-4
optimizer_ = "Adam"  # label stored in the results row; the optimizer itself is built below
print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")

# Fresh model, optimizer and weighted loss for this run
# (`class_weights` is defined earlier in the notebook).
model_mamba = Mamba(num_layers = 2, d_input = 1024, d_model = 512, num_classes=7, model_name='jina', pooling=None).to(device)
optimizer = optim.Adam(params = model_mamba.parameters(), lr = lr)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

# Reload the checkpoint the trainer recorded in `_best_model_name`.
# map_location=device keeps this working when the checkpoint was written on a
# GPU but the notebook is re-run on a machine where `device` fell back to CPU.
checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name), map_location=device)
model_mamba.load_state_dict(checkpoint['model_state_dict'])

metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
print("Метрики на тестовой выборке RESD: ", metrics_test_resd)

# One row per run: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.0001, batch_size=64, optimizer=Adam
Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 40.18126949396865, 'war': 53.02073940486925, 'mf1': 39.173454122954226, 'wf1': 53.05150496464572}
Метрики на валидационной выборке RESD:  {'uar': 33.79082164266673, 'war': 34.32835820895522, 'mf1': 33.53628666025902, 'wf1': 34.12665598889447}
Метрики на тестовой выборке MELD:  {'uar': 37.8756085415619, 'war': 54.32950191570881, 'mf1': 36.918258466589506, 'wf1': 55.079649085872596}
Метрики на тестовой выборке RESD:  {'uar': 32.126511764669665, 'war': 32.857142857142854, 'mf1': 32.50367057376912, 'wf1': 33.13118514216283}


In [49]:
# Flatten the collected runs into one row per run and save them next to the checkpoints.
# Each entry of `result` is [params dict, dev MELD metrics, dev RESD metrics,
# test MELD metrics, test RESD metrics, checkpoint file name].
raw_columns = ["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"]
df = pd.DataFrame(result, columns=raw_columns)

# Expand every dict-valued column into its own columns; the checkpoint name stays as is.
expanded_parts = [df[name].apply(pd.Series) for name in raw_columns[:-1]]
expanded_parts.append(df[raw_columns[-1]])
df = pd.concat(expanded_parts, axis=1)

# Metric columns are named <metric>_<split>, in the same order as the parts above.
metric_columns = [
    f"{metric}_{split}"
    for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
    for metric in ("uar", "war", "mf1", "wf1")
]
df.columns = ["lr", "batch_size", "optimizer"] + metric_columns + ["путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_batch_size.csv"))

In [50]:
# Merge the per-optimizer learning-rate results and the batch-size results
# into a single comparison table.
columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]

# NOTE(review): "result_Adam_32_lr_.csv" ends with an underscore, unlike its
# siblings - confirm it matches the file name written earlier in the notebook.
result_files = ["result_Adam_32_lr_.csv", "result_AdamW_32_lr.csv", "result_SGD_32_lr.csv", "result_batch_size.csv"]

# Each CSV carries its own 0-based row index, so a plain concat yields duplicate
# row labels (0, 0, 1, 2, 1, ...). ignore_index=True gives every run a unique label.
df = pd.concat(
    [pd.read_csv(os.path.join(PATH_TO_MODEL, file_name), index_col=0) for file_name in result_files],
    ignore_index=True,
)
df.columns = columns

In [51]:
# Per-split summary score: plain mean of the four metrics (uar, war, mf1, wf1).
for split in ("dev_meld", "dev_resd", "test_meld", "test_resd"):
    metric_sum = df[f"uar_{split}"] + df[f"war_{split}"] + df[f"mf1_{split}"] + df[f"wf1_{split}"]
    df[f"average_{split}"] = metric_sum / 4.0

In [52]:
# Best runs first: rank by the RESD test average, ties broken by the MELD test average.
# NOTE(review): this ranks configurations on test-set metrics.
df.sort_values(by=['average_test_resd', 'average_test_meld'], ascending=False)

Unnamed: 0,lr,batch_size,optimizer,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd
0,0.0001,32,Adam,39.408059,54.914337,40.489086,53.02849,35.5694,35.820896,35.508898,...,55.268566,37.200911,37.5,37.175183,37.356274,Mamba_jina_25_41.19_checkpoint.pth,46.959993,35.694853,46.159247,37.308092
0,0.0001,16,Adam,36.843016,53.020739,36.927018,51.45191,34.548041,34.626866,33.791066,...,55.139519,33.523752,33.928571,33.902816,34.376908,Mamba_jina_16_38.2_checkpoint.pth,44.560671,34.271068,45.974267,33.933012
1,0.0001,32,AdamW,39.272181,53.471596,39.398165,52.79968,30.955142,31.940299,31.139891,...,55.929145,33.53713,34.285714,33.357609,33.914529,Mamba_jina_6_38.57_checkpoint.pth,46.235405,31.48226,46.434589,33.773746
2,1e-05,32,Adam,39.657572,49.594229,37.962905,50.095018,31.12401,31.044776,31.048774,...,51.399651,32.760868,33.214286,32.477401,32.990102,Mamba_jina_15_37.89_checkpoint.pth,44.327431,31.153082,43.475837,32.860664
1,0.0001,64,Adam,40.181269,53.020739,39.173454,53.051505,33.790822,34.328358,33.536287,...,55.079649,32.126512,32.857143,32.503671,33.131185,Mamba_jina_6_40.5_checkpoint.pth,46.356742,33.945531,46.050755,32.654628
1,0.001,32,Adam,39.632531,45.716862,35.673814,46.380517,23.718849,24.179104,22.350918,...,49.002335,31.824929,32.142857,30.617932,30.999647,Mamba_jina_2_32.39_checkpoint.pth,41.850931,23.331618,43.483192,31.396341
2,1e-05,32,AdamW,38.928585,46.528404,36.497938,47.304779,34.25331,34.626866,34.119602,...,50.373476,30.904365,31.071429,30.414992,30.752693,Mamba_jina_14_37.81_checkpoint.pth,42.314926,34.391793,42.325121,30.785869
0,0.001,32,SGD,39.819034,50.766456,38.311119,52.278559,27.690631,27.761194,26.166557,...,55.246424,28.961127,29.285714,27.960001,28.197653,Mamba_jina_28_35.5_checkpoint.pth,45.293792,27.11206,47.638939,28.601124
0,0.001,32,AdamW,36.540696,48.782687,35.45384,49.244617,22.560008,22.38806,20.121942,...,50.819105,30.079197,30.357143,26.507844,27.105195,Mamba_jina_2_31.62_checkpoint.pth,42.50546,21.442692,42.824436,28.512345
1,0.0001,32,SGD,34.178048,43.011722,31.965484,44.244233,22.548965,22.686567,22.062524,...,45.3811,26.000256,26.428571,26.121467,26.682866,Mamba_jina_48_30.24_checkpoint.pth,38.349872,22.409986,38.532712,26.30829


In [3]:
# Merge the per-optimizer learning-rate results and the batch-size results
# into a single comparison table.
columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]

# NOTE(review): "result_Adam_32_lr_.csv" ends with an underscore, unlike its
# siblings - confirm it matches the file name written earlier in the notebook.
result_files = ["result_Adam_32_lr_.csv", "result_AdamW_32_lr.csv", "result_SGD_32_lr.csv", "result_batch_size.csv"]

# Each CSV carries its own 0-based row index, so a plain concat yields duplicate
# row labels (0, 0, 1, 2, 1, ...). ignore_index=True gives every run a unique label.
df = pd.concat(
    [pd.read_csv(os.path.join(PATH_TO_MODEL, file_name), index_col=0) for file_name in result_files],
    ignore_index=True,
)
df.columns = columns

In [4]:
# Per-split summary score: plain mean of the four metrics (uar, war, mf1, wf1).
for split in ("dev_meld", "dev_resd", "test_meld", "test_resd"):
    metric_sum = df[f"uar_{split}"] + df[f"war_{split}"] + df[f"mf1_{split}"] + df[f"wf1_{split}"]
    df[f"average_{split}"] = metric_sum / 4.0

# Overall score: mean of all eight test metrics (MELD first, then RESD),
# accumulated left to right in that order.
test_metric_names = [
    f"{metric}_{split}"
    for split in ("test_meld", "test_resd")
    for metric in ("uar", "war", "mf1", "wf1")
]
running_total = df[test_metric_names[0]]
for metric_name in test_metric_names[1:]:
    running_total = running_total + df[metric_name]
df['avg'] = running_total / 8.0

In [9]:
# Best runs first, ranked by the mean of all eight test metrics.
# NOTE(review): this ranks configurations on test-set metrics.
df.sort_values(by='avg', ascending=False)

Unnamed: 0,lr,batch_size,optimizer,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,wf1_dev_resd,uar_test_meld,war_test_meld,mf1_test_meld,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg
0,0.0001,32,Adam,39.408059,54.914337,40.489086,53.02849,35.5694,35.820896,35.508898,35.880219,36.168595,56.436782,36.763045,55.268566,37.200911,37.5,37.175183,37.356274,Mamba_jina_25_41.19_checkpoint.pth,46.959993,35.694853,46.159247,37.308092,41.733669
1,0.0001,32,AdamW,39.272181,53.471596,39.398165,52.79968,30.955142,31.940299,31.139891,31.893709,37.60565,55.670498,36.533064,55.929145,33.53713,34.285714,33.357609,33.914529,Mamba_jina_6_38.57_checkpoint.pth,46.235405,31.48226,46.434589,33.773746,40.104167
0,0.0001,16,Adam,36.843016,53.020739,36.927018,51.45191,34.548041,34.626866,33.791066,34.118298,36.760083,55.517241,36.480223,55.139519,33.523752,33.928571,33.902816,34.376908,Mamba_jina_16_38.2_checkpoint.pth,44.560671,34.271068,45.974267,33.933012,39.953639
1,0.0001,64,Adam,40.181269,53.020739,39.173454,53.051505,33.790822,34.328358,33.536287,34.126656,37.875609,54.329502,36.918258,55.079649,32.126512,32.857143,32.503671,33.131185,Mamba_jina_6_40.5_checkpoint.pth,46.356742,33.945531,46.050755,32.654628,39.352691
2,1e-05,32,Adam,39.657572,49.594229,37.962905,50.095018,31.12401,31.044776,31.048774,31.394767,37.37854,49.961686,35.163473,51.399651,32.760868,33.214286,32.477401,32.990102,Mamba_jina_15_37.89_checkpoint.pth,44.327431,31.153082,43.475837,32.860664,38.168251
0,0.001,32,SGD,39.819034,50.766456,38.311119,52.278559,27.690631,27.761194,26.166557,26.829859,42.546046,53.141762,39.621524,55.246424,28.961127,29.285714,27.960001,28.197653,Mamba_jina_28_35.5_checkpoint.pth,45.293792,27.11206,47.638939,28.601124,38.120032
1,0.001,32,Adam,39.632531,45.716862,35.673814,46.380517,23.718849,24.179104,22.350918,23.0776,41.298669,47.547893,36.083873,49.002335,31.824929,32.142857,30.617932,30.999647,Mamba_jina_2_32.39_checkpoint.pth,41.850931,23.331618,43.483192,31.396341,37.439767
2,1e-05,32,AdamW,38.928585,46.528404,36.497938,47.304779,34.25331,34.626866,34.119602,34.567394,36.276241,48.582375,34.068392,50.373476,30.904365,31.071429,30.414992,30.752693,Mamba_jina_14_37.81_checkpoint.pth,42.314926,34.391793,42.325121,30.785869,36.555495
0,0.001,32,AdamW,36.540696,48.782687,35.45384,49.244617,22.560008,22.38806,20.121942,20.70076,37.209824,49.08046,34.188357,50.819105,30.079197,30.357143,26.507844,27.105195,Mamba_jina_2_31.62_checkpoint.pth,42.50546,21.442692,42.824436,28.512345,35.668391
1,0.0001,32,SGD,34.178048,43.011722,31.965484,44.244233,22.548965,22.686567,22.062524,22.34189,34.25085,43.48659,31.012309,45.3811,26.000256,26.428571,26.121467,26.682866,Mamba_jina_48_30.24_checkpoint.pth,38.349872,22.409986,38.532712,26.30829,32.420501


Best training hyperparameters: optimizer=Adam, batch_size=32, lr=1e-4

### Mamba + xlm-roberta-base

In [18]:
# Folder used below for the xlm-roberta-base checkpoints and result CSVs.
PATH_TO_MODEL = os.path.join(
    ROOT_DIR,
    "Models_mamba_xlm-roberta-base",
)

In [12]:
%%capture --no-stdout
# d_model sweep for Mamba with model_name='xlm-roberta-base', num_layers=1, ker_size=4.
# `result` is reset so the CSV written by the next cell holds only this sweep.
result = []
ker_size = 4
num_layers = 1
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")

    # Fresh model, optimizer and weighted loss for every configuration
    # (`class_weights` and `LEARNING_RATE` are defined earlier in the notebook).
    model_mamba = Mamba(model_name='xlm-roberta-base', pooling=None, num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)

    # Reload the checkpoint the trainer recorded in `_best_model_name`.
    # map_location=device keeps this working when the checkpoint was written on a
    # GPU but the notebook is re-run on a machine where `device` fell back to CPU.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name), map_location=device)
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)

    # One row per configuration: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=1, ker_size=4
Ранняя остановка на эпохе 37 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.96650369997422, 'war': 51.848512173128945, 'mf1': 38.51876799811108, 'wf1': 52.77032221495045}
Метрики на валидационной выборке RESD:  {'uar': 27.550740102804404, 'war': 27.46268656716418, 'mf1': 26.121217113467242, 'wf1': 26.20332442244656}
Метрики на тестовой выборке MELD:  {'uar': 39.717429102273044, 'war': 52.37547892720307, 'mf1': 37.0623593632568, 'wf1': 53.958732943952135}
Метрики на тестовой выборке RESD:  {'uar': 29.13272380377644, 'war': 29.28571428571429, 'mf1': 27.457276706369, 'wf1': 27.632554889433365}
d_model=128, num_layers=1, ker_size=4
Ранняя остановка на эпохе 23 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.23575575396102, 'war': 51.03697024346258, 'mf1': 37.78538873207924, 'wf1': 52.2489344296831}
Метрики на валидационной выборке RESD:

In [13]:
# Flatten the collected runs into one row per configuration and save them next to the checkpoints.
# Each entry of `result` is [params dict, dev MELD metrics, dev RESD metrics,
# test MELD metrics, test RESD metrics, checkpoint file name].
raw_columns = ["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"]
df = pd.DataFrame(result, columns=raw_columns)

# Expand every dict-valued column into its own columns; the checkpoint name stays as is.
expanded_parts = [df[name].apply(pd.Series) for name in raw_columns[:-1]]
expanded_parts.append(df[raw_columns[-1]])
df = pd.concat(expanded_parts, axis=1)

# Metric columns are named <metric>_<split>, in the same order as the parts above.
metric_columns = [
    f"{metric}_{split}"
    for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
    for metric in ("uar", "war", "mf1", "wf1")
]
df.columns = ["d_model", "num_layers", "ker_size"] + metric_columns + ["путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_ker_size_4_d_model.csv"))

In [14]:
%%capture --no-stdout
# d_model sweep for Mamba with model_name='xlm-roberta-base', num_layers=2, ker_size=4.
# `result` is reset so the CSV written by the next cell holds only this sweep.
result = []
ker_size = 4
num_layers = 2
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")

    # Fresh model, optimizer and weighted loss for every configuration
    # (`class_weights` and `LEARNING_RATE` are defined earlier in the notebook).
    model_mamba = Mamba(model_name='xlm-roberta-base', pooling=None, num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)

    # Reload the checkpoint the trainer recorded in `_best_model_name`.
    # map_location=device keeps this working when the checkpoint was written on a
    # GPU but the notebook is re-run on a machine where `device` fell back to CPU.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name), map_location=device)
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)

    # One row per configuration: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=2, ker_size=4
Ранняя остановка на эпохе 23 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 38.33757707731309, 'war': 51.21731289449954, 'mf1': 37.666483776004576, 'wf1': 52.14221156896745}
Метрики на валидационной выборке RESD:  {'uar': 27.180954507406824, 'war': 25.37313432835821, 'mf1': 23.0728851083921, 'wf1': 23.038648380990022}
Метрики на тестовой выборке MELD:  {'uar': 39.186695658347446, 'war': 52.29885057471264, 'mf1': 37.01929288377187, 'wf1': 53.94019561482847}
Метрики на тестовой выборке RESD:  {'uar': 26.429017876386297, 'war': 26.071428571428573, 'mf1': 23.219080987445498, 'wf1': 23.29551011566644}
d_model=128, num_layers=2, ker_size=4
Ранняя остановка на эпохе 29 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 35.7478449325198, 'war': 49.77457168620379, 'mf1': 35.598220143146506, 'wf1': 50.91208775834735}
Метрики на валидационной выборке R

In [15]:
# Flatten the collected runs into one row per configuration and save them next to the checkpoints.
# Each entry of `result` is [params dict, dev MELD metrics, dev RESD metrics,
# test MELD metrics, test RESD metrics, checkpoint file name].
raw_columns = ["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"]
df = pd.DataFrame(result, columns=raw_columns)

# Expand every dict-valued column into its own columns; the checkpoint name stays as is.
expanded_parts = [df[name].apply(pd.Series) for name in raw_columns[:-1]]
expanded_parts.append(df[raw_columns[-1]])
df = pd.concat(expanded_parts, axis=1)

# Metric columns are named <metric>_<split>, in the same order as the parts above.
metric_columns = [
    f"{metric}_{split}"
    for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
    for metric in ("uar", "war", "mf1", "wf1")
]
df.columns = ["d_model", "num_layers", "ker_size"] + metric_columns + ["путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_ker_size_4_d_model.csv"))

In [16]:
%%capture --no-stdout
# d_model sweep for Mamba with model_name='xlm-roberta-base', num_layers=3, ker_size=4.
# `result` is reset so the CSV written by the next cell holds only this sweep.
result = []
ker_size = 4
num_layers = 3
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")

    # Fresh model, optimizer and weighted loss for every configuration
    # (`class_weights` and `LEARNING_RATE` are defined earlier in the notebook).
    model_mamba = Mamba(model_name='xlm-roberta-base', pooling=None, num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)

    # Reload the checkpoint the trainer recorded in `_best_model_name`.
    # map_location=device keeps this working when the checkpoint was written on a
    # GPU but the notebook is re-run on a machine where `device` fell back to CPU.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name), map_location=device)
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)

    # One row per configuration: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=3, ker_size=4
Ранняя остановка на эпохе 27 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 35.91860287522427, 'war': 49.23354373309288, 'mf1': 34.698950612271915, 'wf1': 50.129076392490376}
Метрики на валидационной выборке RESD:  {'uar': 24.116170139553407, 'war': 25.07462686567164, 'mf1': 21.219354614065026, 'wf1': 22.115798946043437}
Метрики на тестовой выборке MELD:  {'uar': 37.997732025170905, 'war': 52.10727969348659, 'mf1': 35.42693626029635, 'wf1': 53.656482949431584}
Метрики на тестовой выборке RESD:  {'uar': 23.136460110144323, 'war': 24.285714285714285, 'mf1': 21.68176220529647, 'wf1': 22.342058441833135}
d_model=128, num_layers=3, ker_size=4
Ранняя остановка на эпохе 19 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.51535411954111, 'war': 53.11091073038774, 'mf1': 38.98685460324184, 'wf1': 53.509163467224084}
Метрики на валидационной выбо

In [17]:
# Flatten the collected runs into one row per configuration and save them next to the checkpoints.
# Each entry of `result` is [params dict, dev MELD metrics, dev RESD metrics,
# test MELD metrics, test RESD metrics, checkpoint file name].
raw_columns = ["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"]
df = pd.DataFrame(result, columns=raw_columns)

# Expand every dict-valued column into its own columns; the checkpoint name stays as is.
expanded_parts = [df[name].apply(pd.Series) for name in raw_columns[:-1]]
expanded_parts.append(df[raw_columns[-1]])
df = pd.concat(expanded_parts, axis=1)

# Metric columns are named <metric>_<split>, in the same order as the parts above.
metric_columns = [
    f"{metric}_{split}"
    for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
    for metric in ("uar", "war", "mf1", "wf1")
]
df.columns = ["d_model", "num_layers", "ker_size"] + metric_columns + ["путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_ker_size_4_d_model.csv"))

In [19]:
# Merge the d_model sweeps for num_layers = 1, 2 and 3 into a single comparison table.
columns = ["d_model", "num_layers", "ker_size", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
result_files = [
    "result_num_layers_1_ker_size_4_d_model.csv",
    "result_num_layers_2_ker_size_4_d_model.csv",
    "result_num_layers_3_ker_size_4_d_model.csv",
]

# Each CSV carries its own 0-based row index, so a plain concat yields duplicate
# row labels. ignore_index=True gives every configuration a unique label.
df = pd.concat(
    [pd.read_csv(os.path.join(PATH_TO_MODEL, file_name), index_col=0) for file_name in result_files],
    ignore_index=True,
)
df.columns = columns

In [20]:
# Per-split summary score: plain mean of the four metrics (uar, war, mf1, wf1).
for split in ("dev_meld", "dev_resd", "test_meld", "test_resd"):
    metric_sum = df[f"uar_{split}"] + df[f"war_{split}"] + df[f"mf1_{split}"] + df[f"wf1_{split}"]
    df[f"average_{split}"] = metric_sum / 4.0

# Overall test score: both corpora weighted equally.
test_average_sum = df['average_test_meld'] + df['average_test_resd']
df['avg_test'] = 0.5 * test_average_sum

In [21]:
# Best configurations first, ranked by the equally weighted MELD/RESD test average.
# NOTE(review): this ranks configurations on test-set metrics.
df.sort_values(by='avg_test', ascending=False)

Unnamed: 0,d_model,num_layers,ker_size,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,wf1_dev_resd,uar_test_meld,war_test_meld,mf1_test_meld,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
3,512,2,4,39.072163,53.291253,38.484684,53.416099,26.278339,27.462687,25.134453,26.146727,39.441668,55.517241,38.092454,56.599351,31.709136,32.5,31.056948,31.704467,Mamba_xlm-roberta-base_12_35.38_checkpoint.pth,46.06605,26.255551,47.412679,31.742638,39.577658
3,512,1,4,36.018227,51.217313,35.771728,51.01278,29.946215,28.656716,28.048422,27.807791,38.538147,55.402299,37.366364,55.745763,33.206415,32.5,32.09727,31.688049,Mamba_xlm-roberta-base_21_35.19_checkpoint.pth,43.505012,28.614786,46.763143,32.372934,39.568038
1,128,2,4,35.747845,49.774572,35.59822,50.912088,27.990647,28.656716,26.970871,27.655881,38.639018,52.068966,36.842625,53.735555,29.646533,30.357143,29.338808,29.958893,Mamba_xlm-roberta-base_19_35.11_checkpoint.pth,43.008181,27.818529,45.321541,29.825344,37.573442
0,64,1,4,39.966504,51.848512,38.518768,52.770322,27.55074,27.462687,26.121217,26.203324,39.717429,52.375479,37.062359,53.958733,29.132724,29.285714,27.457277,27.632555,Mamba_xlm-roberta-base_27_35.17_checkpoint.pth,45.776027,26.834492,45.7785,28.377067,37.077784
1,128,3,4,39.515354,53.110911,38.986855,53.509163,24.888504,25.671642,21.705818,22.446848,40.68459,55.938697,39.150052,57.10905,27.424668,28.928571,22.940517,23.988273,Mamba_xlm-roberta-base_9_34.09_checkpoint.pth,46.280571,23.678203,48.220598,25.820508,37.020553
3,512,3,4,36.387782,49.594229,35.571475,48.981559,30.121622,31.641791,28.978625,30.094919,36.526124,53.716475,35.575741,53.788356,28.656183,29.642857,28.145169,28.789287,Mamba_xlm-roberta-base_17_35.16_checkpoint.pth,42.633761,30.209239,44.901674,28.808374,36.855024
1,128,1,4,39.235756,51.03697,37.785389,52.248934,26.563831,26.865672,23.667477,23.786079,39.508366,52.56705,37.109702,54.356915,26.551457,27.5,24.348392,24.808219,Mamba_xlm-roberta-base_13_33.53_checkpoint.pth,45.076762,25.220765,45.885508,25.802017,35.843763
2,256,3,4,37.918016,48.782687,36.642228,50.737371,28.961514,28.955224,27.520561,27.957079,38.034978,50.0,35.794887,52.440412,27.655016,28.571429,25.996219,26.710026,Mamba_xlm-roberta-base_12_34.21_checkpoint.pth,43.520076,28.348594,44.067569,27.233173,35.650371
2,256,2,4,38.822184,49.774572,36.788205,51.409359,26.787572,25.970149,23.783688,24.236441,39.10167,50.881226,35.777244,53.278374,27.739781,27.5,24.784697,25.00711,Mamba_xlm-roberta-base_11_33.28_checkpoint.pth,44.19858,25.194463,44.759629,26.257897,35.508763
2,256,1,4,37.541469,51.487827,36.686548,51.786169,26.730686,28.059701,24.110561,24.935932,39.102554,53.678161,37.185821,54.791004,24.883109,26.428571,22.999795,23.960296,Mamba_xlm-roberta-base_8_34.61_checkpoint.pth,44.375503,25.95922,46.189385,24.567943,35.378664


### Mamba + canine-c

In [22]:
# Folder used below for the canine-c checkpoints and result CSVs.
PATH_TO_MODEL = os.path.join(
    ROOT_DIR,
    "Models_mamba_canine-c",
)

In [None]:
%%capture --no-stdout
# d_model sweep, first half ([64, 128]), for Mamba with model_name='canine-c',
# num_layers=1, ker_size=4. `result` is reset here; the following cell covers
# [256, 512] and appends to the same list.
result = []
ker_size = 4
num_layers = 1
for d_model in [64, 128]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")

    # Fresh model, optimizer and weighted loss for every configuration
    # (`class_weights` and `LEARNING_RATE` are defined earlier in the notebook).
    model_mamba = Mamba(model_name='canine-c', pooling=None, num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)

    # Reload the checkpoint the trainer recorded in `_best_model_name`.
    # map_location=device keeps this working when the checkpoint was written on a
    # GPU but the notebook is re-run on a machine where `device` fell back to CPU.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name), map_location=device)
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)

    # One row per configuration: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=1, ker_size=4
Ранняя остановка на эпохе 33 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 29.696756988690158, 'war': 44.63480613165014, 'mf1': 29.514830994117947, 'wf1': 44.748154233762556}
Метрики на валидационной выборке RESD:  {'uar': 17.097854600777506, 'war': 17.91044776119403, 'mf1': 15.85179854555259, 'wf1': 16.574049752507456}
Метрики на тестовой выборке MELD:  {'uar': 30.263335996910545, 'war': 46.934865900383144, 'mf1': 29.421193100654286, 'wf1': 47.68815871527711}
Метрики на тестовой выборке RESD:  {'uar': 19.398411569464198, 'war': 20.0, 'mf1': 18.89859900921925, 'wf1': 19.365486260254748}
d_model=128, num_layers=1, ker_size=4
Ранняя остановка на эпохе 26 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 30.084844770656023, 'war': 42.20018034265104, 'mf1': 29.435609148123614, 'wf1': 43.52543486238246}
Метрики на валидационной выборке RESD:  {

In [18]:
%%capture --no-stdout
# d_model sweep, second half ([256, 512]), for Mamba with model_name='canine-c',
# num_layers=1, ker_size=4. This cell does not reset `result`; it appends to the
# list started by the [64, 128] cell, so that cell must have been run first.
ker_size = 4
num_layers = 1
for d_model in [256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")

    # Fresh model, optimizer and weighted loss for every configuration
    # (`class_weights` and `LEARNING_RATE` are defined earlier in the notebook).
    model_mamba = Mamba(model_name='canine-c', pooling=None, num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)

    # Reload the checkpoint the trainer recorded in `_best_model_name`.
    # map_location=device keeps this working when the checkpoint was written on a
    # GPU but the notebook is re-run on a machine where `device` fell back to CPU.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name), map_location=device)
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)

    # One row per configuration: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=256, num_layers=1, ker_size=4
Ранняя остановка на эпохе 35 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 31.20267795519452, 'war': 48.69251577998197, 'mf1': 31.577473803259444, 'wf1': 47.398886700535186}
Метрики на валидационной выборке RESD:  {'uar': 22.01530563916388, 'war': 22.686567164179106, 'mf1': 21.661445557331938, 'wf1': 22.445869607974902}
Метрики на тестовой выборке MELD:  {'uar': 29.279947201357786, 'war': 48.275862068965516, 'mf1': 29.04034519922844, 'wf1': 48.04598037248346}
Метрики на тестовой выборке RESD:  {'uar': 21.499135841241106, 'war': 21.785714285714285, 'mf1': 21.81891101631794, 'wf1': 22.105044171377234}
d_model=512, num_layers=1, ker_size=4
Ранняя остановка на эпохе 33 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 31.909293312995796, 'war': 49.864743011722275, 'mf1': 32.304131661803446, 'wf1': 48.54405507706126}
Метрики на валидационной в

In [19]:
# Flatten the sweep log into one table: every entry of `result` is
# [hyperparameter dict, dev MELD metrics, dev RESD metrics, test MELD metrics,
#  test RESD metrics, checkpoint file name].
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
# Expand each dict-valued column into one column per key; the checkpoint name
# column is appended unchanged.
df = pd.concat(
    [df[name].apply(pd.Series) for name in df.columns[:-1]] + [df["путь"]],
    axis=1,
)
# Positional relabelling: hyperparameters first, then uar/war/mf1/wf1 per split.
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_ker_size_4_d_model.csv"))

In [None]:
%%capture --no-stdout
# Hyperparameter sweep over d_model (64, 128) for a 2-layer Mamba classifier
# with ker_size=4 and model_name='canine-c' (presumably selects the text
# encoder; see the Mamba definition).
# `result` is reset here and receives one row per configuration:
# [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
result = []
ker_size = 4
num_layers = 2
for d_model in [64, 128]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    # Fresh model, optimizer and weighted loss for every configuration.
    model_mamba = Mamba(model_name='canine-c', pooling=None,  num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    # Evaluate the checkpoint the trainer recorded as best, not the weights
    # left in memory after the last epoch.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    # Keep the checkpoint file name next to the metrics so the row can be traced back to its weights.
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=2, ker_size=4
Ранняя остановка на эпохе 32 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 29.183566179634884, 'war': 41.92966636609558, 'mf1': 28.480214088953293, 'wf1': 42.67541766386388}
Метрики на валидационной выборке RESD:  {'uar': 19.111583763940356, 'war': 20.0, 'mf1': 18.845898022019547, 'wf1': 19.657489482933368}
Метрики на тестовой выборке MELD:  {'uar': 31.537178251316067, 'war': 45.24904214559387, 'mf1': 29.96074005718703, 'wf1': 46.79118460074581}
Метрики на тестовой выборке RESD:  {'uar': 21.713060035428455, 'war': 22.5, 'mf1': 21.1358442974372, 'wf1': 21.871272636044274}
d_model=128, num_layers=2, ker_size=4
Метрики на валидационной выборке MELD:  {'uar': 29.38531284221449, 'war': 46.979260595130754, 'mf1': 29.528804661653712, 'wf1': 45.39370748033987}
Метрики на валидационной выборке RESD:  {'uar': 24.47847254843967, 'war': 25.671641791044774, 'mf1': 24.010960418078554, 'wf1': 25.11132247417

In [13]:
%%capture --no-stdout
# Second half of the 2-layer d_model sweep (256, 512), same setup as the
# d_model 64/128 cell.
# NOTE: `result` is NOT reset in this cell. Rows are appended to whatever list
# is already bound in the kernel, so the cell that creates `result` must have
# been run first in the same session (otherwise this raises NameError or mixes
# rows from an unrelated sweep).
ker_size = 4
num_layers = 2
for d_model in [256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    # Fresh model, optimizer and weighted loss for every configuration.
    model_mamba = Mamba(model_name='canine-c', pooling=None,  num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    # Evaluate the checkpoint the trainer recorded as best, not the weights
    # left in memory after the last epoch.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    # Row layout: [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=256, num_layers=2, ker_size=4
Ранняя остановка на эпохе 28 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 32.30754835604839, 'war': 48.06131650135257, 'mf1': 32.348482539993526, 'wf1': 47.67677370432132}
Метрики на валидационной выборке RESD:  {'uar': 23.46943028419097, 'war': 24.47761194029851, 'mf1': 23.14494594485508, 'wf1': 24.14850180795207}
Метрики на тестовой выборке MELD:  {'uar': 32.01968921919479, 'war': 49.50191570881226, 'mf1': 31.46190676553262, 'wf1': 49.50040393932791}
Метрики на тестовой выборке RESD:  {'uar': 25.91081512134144, 'war': 26.071428571428573, 'mf1': 25.976772938879318, 'wf1': 26.122826873965877}
d_model=512, num_layers=2, ker_size=4
Ранняя остановка на эпохе 23 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 29.197200861221074, 'war': 47.52028854824166, 'mf1': 28.987009240636542, 'wf1': 45.73279069758367}
Метрики на валидационной выборке 

In [14]:
# Flatten the sweep log into one table: every entry of `result` is
# [hyperparameter dict, dev MELD metrics, dev RESD metrics, test MELD metrics,
#  test RESD metrics, checkpoint file name].
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
# Expand each dict-valued column into one column per key; the checkpoint name
# column is appended unchanged.
df = pd.concat(
    [df[name].apply(pd.Series) for name in df.columns[:-1]] + [df["путь"]],
    axis=1,
)
# Positional relabelling: hyperparameters first, then uar/war/mf1/wf1 per split.
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_ker_size_4_d_model.csv"))

In [11]:
%%capture --no-stdout
# Hyperparameter sweep over d_model (64..512) for a 3-layer Mamba classifier
# with ker_size=4 and model_name='canine-c'.
# `result` is reset here and receives one row per configuration:
# [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
# NOTE(review): the aggregation cell that follows reads
# "result_num_layers_3_ker_size_4_d_model.csv" from ROOT_DIR/Models_mamba, but
# no save step for this sweep follows this cell. Confirm where that CSV is written.
result = []
ker_size = 4
num_layers = 3
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    # Fresh model, optimizer and weighted loss for every configuration.
    model_mamba = Mamba(model_name='canine-c', pooling=None,  num_layers = num_layers, d_input = 768, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    # Evaluate the checkpoint the trainer recorded as best, not the weights
    # left in memory after the last epoch.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    # Keep the checkpoint file name next to the metrics so the row can be traced back to its weights.
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=3, ker_size=4
Ранняя остановка на эпохе 35 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 31.103250283679035, 'war': 46.077547339945895, 'mf1': 30.567915824794728, 'wf1': 45.24413522245491}
Метрики на валидационной выборке RESD:  {'uar': 24.160599505502685, 'war': 24.47761194029851, 'mf1': 24.20491007410784, 'wf1': 24.753292130830797}
Метрики на тестовой выборке MELD:  {'uar': 29.700312284817315, 'war': 47.89272030651341, 'mf1': 29.132206702243202, 'wf1': 48.01777915900259}
Метрики на тестовой выборке RESD:  {'uar': 22.32668074773338, 'war': 22.5, 'mf1': 21.413435804046184, 'wf1': 21.648223654690742}
d_model=128, num_layers=3, ker_size=4
Ранняя остановка на эпохе 22 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 33.12235447286162, 'war': 44.8151487826871, 'mf1': 32.00166503170652, 'wf1': 44.84040828920593}
Метрики на валидационной выборке RESD:  {'uar

In [23]:
# Column layout shared by all saved sweep tables: hyperparameters, then
# uar/war/mf1/wf1 for each evaluation split, then the checkpoint file name.
columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
# Stack the per-depth result files; each file keeps its own 0..n-1 row index,
# so index labels repeat in the combined frame.
# NOTE(review): the 3-layer file is read from ROOT_DIR/Models_mamba while the
# other two come from PATH_TO_MODEL. Confirm this is intentional.
df = pd.concat(
    [
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (
            os.path.join(PATH_TO_MODEL, "result_num_layers_1_ker_size_4_d_model.csv"),
            os.path.join(PATH_TO_MODEL, "result_num_layers_2_ker_size_4_d_model.csv"),
            os.path.join(ROOT_DIR, "Models_mamba", "result_num_layers_3_ker_size_4_d_model.csv"),
        )
    ]
)
df.columns = columns

In [24]:
# Add one summary column per evaluation split: the plain mean of its four
# metrics (uar, war, mf1, wf1), added in that order.
for _split in ('dev_meld', 'dev_resd', 'test_meld', 'test_resd'):
    df[f'average_{_split}'] = (
        df[f'uar_{_split}'] + df[f'war_{_split}'] + df[f'mf1_{_split}'] + df[f'wf1_{_split}']
    ) / 4.0
# Overall test score: MELD and RESD weighted equally regardless of corpus size.
df['avg_test'] = 0.5 * (df['average_test_meld'] + df['average_test_resd'])

In [25]:
# Show all configurations ranked by the combined test score, best first.
# NOTE(review): this ranks on test metrics; if the ranking is used to pick
# hyperparameters, consider ranking on the dev averages instead.
df.sort_values(by='avg_test', ascending=False)

Unnamed: 0,d_model,num_layers,ker_size,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,wf1_dev_resd,uar_test_meld,war_test_meld,mf1_test_meld,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
3,512,2,4,29.197201,47.520289,28.987009,45.732791,25.989318,26.865672,25.557837,26.746069,32.880575,49.578544,32.523851,49.567624,27.344766,27.5,26.94778,27.012609,Mamba_canine-c_13_31.89_checkpoint.pth,37.859322,26.289724,41.137648,27.201289,34.169469
2,256,2,4,32.307548,48.061317,32.348483,47.676774,23.46943,24.477612,23.144946,24.148502,32.019689,49.501916,31.461907,49.500404,25.910815,26.071429,25.976773,26.122827,Mamba_canine-c_18_31.63_checkpoint.pth,40.09853,23.810122,40.620979,26.020461,33.32072
3,512,3,4,32.633288,52.209197,33.490617,49.636138,22.804199,23.283582,22.841738,23.612597,33.100846,54.827586,33.780349,52.998455,23.128833,23.214286,22.53851,22.775042,Mamba_canine-c_25_32.49_checkpoint.pth,41.99231,23.135529,43.676809,22.914168,33.295489
2,256,3,4,31.342846,49.6844,31.957986,47.304297,26.504984,26.865672,25.574932,26.158468,31.216765,52.835249,31.61894,51.076225,24.953985,25.357143,23.797335,24.247665,Mamba_canine-c_30_32.78_checkpoint.pth,40.072382,26.276014,41.686795,24.589032,33.137913
3,512,1,4,31.909293,49.864743,32.304132,48.544055,23.506373,23.58209,23.370838,23.685629,31.15039,49.54023,31.086705,49.26867,23.044748,22.857143,22.093909,22.268563,Mamba_canine-c_23_31.71_checkpoint.pth,40.655556,23.536232,40.261499,22.566091,31.413795
1,128,2,4,29.385313,46.979261,29.528805,45.393707,24.478473,25.671642,24.01096,25.111322,29.4127,49.885057,28.999728,49.166067,23.089375,23.928571,22.478247,23.190633,Mamba_canine-c_47_31.28_checkpoint.pth,37.821771,24.818099,39.365888,23.171707,31.268798
1,128,3,4,33.122354,44.815149,32.001665,44.840408,21.963026,22.38806,21.800712,22.643391,31.170214,44.521073,29.394649,45.815873,25.442008,25.714286,23.731061,23.972679,Mamba_canine-c_12_29.72_checkpoint.pth,38.694894,22.198797,37.725452,24.715008,31.22023
0,64,3,4,31.10325,46.077547,30.567916,45.244135,24.1606,24.477612,24.20491,24.753292,29.700312,47.89272,29.132207,48.017779,22.326681,22.5,21.413436,21.648224,Mamba_canine-c_25_30.98_checkpoint.pth,38.248212,24.399103,38.685755,21.972085,30.32892
2,256,1,4,31.202678,48.692516,31.577474,47.398887,22.015306,22.686567,21.661446,22.44587,29.279947,48.275862,29.040345,48.04598,21.499136,21.785714,21.818911,22.105044,Mamba_canine-c_25_30.65_checkpoint.pth,39.717889,22.202297,38.660534,21.802201,30.231368
0,64,2,4,29.183566,41.929666,28.480214,42.675418,19.111584,20.0,18.845898,19.657489,31.537178,45.249042,29.96074,46.791185,21.71306,22.5,21.135844,21.871273,Mamba_canine-c_22_27.7_checkpoint.pth,35.567216,19.403743,38.384536,21.805044,30.09479


### Взвешивание корпусов

In [11]:
from torch.utils.data import DataLoader, ConcatDataset, WeightedRandomSampler

In [12]:
from torch.utils.data import Dataset, DataLoader 
import numpy as np 
import math 

class Dataset_MELD_RESD_():
    """Map-style dataset of (text, emotion id) pairs for one split of one corpus.

    Unlike a combined training set, this variant exposes the two corpora
    separately so they can be re-weighted when sampling.

    Supported ``part`` values and the CSV each one reads (relative to the
    current working directory):

    * ``'train_meld'`` - ``train_sent_emo.csv``
    * ``'train_resd'`` - first 70% of the rows of ``train.csv``
    * ``'dev_meld'``   - ``dev_sent_emo.csv``
    * ``'dev_resd'``   - remaining 30% of the rows of ``train.csv``
    * ``'test_meld'``  - ``test_sent_emo.csv``
    * ``'test_resd'``  - ``test.csv``

    Args:
        part: Name of the split to load. The default ``'train'`` is kept only
            for signature compatibility; it is not a valid split for this
            class and raises ``ValueError``.
        transform: Accepted for interface compatibility; currently unused.

    Raises:
        ValueError: If ``part`` is not one of the supported names.
        KeyError: If the data contains an emotion label missing from the mapping.
    """

    _VALID_PARTS = ('train_meld', 'train_resd', 'dev_meld', 'dev_resd', 'test_meld', 'test_resd')

    @staticmethod
    def _read_meld(path):
        """Read a MELD-style CSV and rename its columns to the common text/emotion schema."""
        df = pd.read_csv(path)[['Utterance', 'Emotion']]
        df.columns = ['text', 'emotion']
        return df

    def __init__(self, part='train', transform=None):
        if part == 'train_meld':
            df = self._read_meld("train_sent_emo.csv")
        elif part == 'train_resd':
            df = pd.read_csv("train.csv")[['text', 'emotion']]
            # First 70% of the rows (in file order) are used for training.
            df = df[0:int(len(df)*0.7)]
        elif part == 'dev_meld':
            df = self._read_meld("dev_sent_emo.csv")
        elif part == 'dev_resd':
            df = pd.read_csv("train.csv")
            # Remaining 30% of the same file serve as the dev split.
            df = df[int(len(df)*0.7):]
        elif part == 'test_meld':
            df = self._read_meld("test_sent_emo.csv")
        elif part == 'test_resd':
            df = pd.read_csv("test.csv")
        else:
            raise ValueError(
                f"Unknown part of Dataset: {part!r} "
                f"(expected one of {' / '.join(self._VALID_PARTS)})"
            )
        self.x = list(df['text'].values)
        # Seven shared classes; 'happiness' is folded into 'joy' and
        # 'enthusiasm' into 'surprise' (presumably the alternative label names
        # used by one of the corpora - verify against the data).
        emotion_mapping = {
            'anger': 0,
            'disgust': 1,
            'fear': 2,
            'joy': 3,
            'happiness': 3,
            'neutral': 4,
            'sadness': 5,
            'surprise': 6,
            'enthusiasm': 6
        }

        # Labels are materialised once as a tensor on the module-level `device`.
        self.y = torch.tensor(df['emotion'].apply(lambda x : emotion_mapping[x]).values).to(device)
        self.n_samples = df.shape[0]

    def __getitem__(self, index):
        """Return the (raw text, label tensor) pair at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples in this split."""
        return self.n_samples

In [13]:
BATCH_SIZE = 32

# The two training corpora are kept separate so they can be re-weighted.
train_datasets = [Dataset_MELD_RESD_(part) for part in ('train_meld', 'train_resd')]
lengths = list(map(len, train_datasets))
total = sum(lengths)

# Every sample gets weight 1 / (size of its corpus), so each corpus carries the
# same total sampling mass regardless of how many samples it contains.
weights = []
for d_len in lengths:
    w = 1.0 / d_len
    weights.extend([w] * d_len)

# Draw `total` indices per epoch with replacement; weights follow the
# concatenation order of `train_datasets`.
sampler = WeightedRandomSampler(weights, num_samples=total, replacement=True)
train_dataloader = DataLoader(ConcatDataset(train_datasets), batch_size=BATCH_SIZE, sampler=sampler)

# Evaluation loaders iterate each split once, in file order.
dev_meld_dataloader = DataLoader(Dataset_MELD_RESD_('dev_meld'), batch_size=BATCH_SIZE, shuffle=False)
dev_resd_dataloader = DataLoader(Dataset_MELD_RESD_('dev_resd'), batch_size=BATCH_SIZE, shuffle=False)
test_meld_dataloader = DataLoader(Dataset_MELD_RESD_('test_meld'), batch_size=BATCH_SIZE, shuffle=False)
test_resd_dataloader = DataLoader(Dataset_MELD_RESD_('test_resd'), batch_size=BATCH_SIZE, shuffle=False)

In [15]:
%%capture --no-stdout
# Single training run: 2-layer Mamba classifier, d_model=512, ker_size=4,
# model_name='jina' with d_input=1024.
# Uses whatever `train_dataloader` is currently bound in the kernel; in
# document order that is the corpus-balanced loader built in the cell above.
model_mamba = Mamba(model_name='jina', pooling=None,  num_layers=2, d_input=1024, d_model=512, num_classes=7, ker_size=4).to(device)
optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)
# Evaluate the checkpoint the trainer recorded as best, not the weights left
# in memory after the last epoch.
checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
model_mamba.load_state_dict(checkpoint['model_state_dict'])
metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
# Print the checkpoint file name so the reported metrics can be traced back to its weights.
print(trainer._best_model_name)

Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 43.36808384034978, 'war': 48.602344454463484, 'mf1': 39.67644990967112, 'wf1': 50.528662743478826}
Метрики на валидационной выборке RESD:  {'uar': 39.52483022194385, 'war': 39.701492537313435, 'mf1': 39.34319575297019, 'wf1': 39.600495093817955}
Метрики на тестовой выборке MELD:  {'uar': 37.90417258874719, 'war': 47.77777777777778, 'mf1': 35.01740097146559, 'wf1': 50.57866230507139}
Метрики на тестовой выборке RESD:  {'uar': 39.09825495351811, 'war': 39.64285714285714, 'mf1': 39.29648531428141, 'wf1': 39.55313782074682}
Mamba_jina_6_41.14_checkpoint.pth


In [3]:
# Mean of the four test metrics (uar, war, mf1, wf1) per corpus. The literals
# are copied by hand from the printed test metrics of the jina run, so they
# must be updated manually if that run is repeated.
meld_avg_test = 0.25 * (
    37.90417258874719
    + 47.77777777777778
    + 35.01740097146559
    + 50.57866230507139
)
resd_avg_test = 0.25 * (
    39.09825495351811
    + 39.64285714285714
    + 39.29648531428141
    + 39.55313782074682
)
# The combined score weights both corpora equally.
print(f"meld_avg_test = {meld_avg_test}, resd_avg_test = {resd_avg_test}, avg_test = {(meld_avg_test + resd_avg_test) / 2.0}")

meld_avg_test = 42.81950341076549, resd_avg_test = 39.39768380785087, avg_test = 41.10859360930818


In [16]:
%%capture --no-stdout
# Hyperparameter sweep over d_model (64..512) for a 3-layer Mamba classifier
# with ker_size=4, model_name='jina' and d_input=1024.
# `result` is reset here and receives one row per configuration:
# [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
result = []
ker_size = 4
num_layers = 3
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    # Fresh model, optimizer and weighted loss for every configuration.
    model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    # Evaluate the checkpoint the trainer recorded as best, not the weights
    # left in memory after the last epoch.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    # Keep the checkpoint file name next to the metrics so the row can be traced back to its weights.
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=3, ker_size=4
Ранняя остановка на эпохе 28 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 36.327986434185135, 'war': 48.06131650135257, 'mf1': 34.709939059945896, 'wf1': 49.09321922332876}
Метрики на валидационной выборке RESD:  {'uar': 38.34288014722798, 'war': 39.1044776119403, 'mf1': 37.85761066836365, 'wf1': 38.48047409721354}
Метрики на тестовой выборке MELD:  {'uar': 37.92637081692081, 'war': 49.69348659003832, 'mf1': 35.053003630232546, 'wf1': 51.38018627931397}
Метрики на тестовой выборке RESD:  {'uar': 35.97125620151936, 'war': 36.78571428571429, 'mf1': 35.86463472973893, 'wf1': 36.6090699972011}
d_model=128, num_layers=3, ker_size=4
Ранняя остановка на эпохе 24 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 34.765948606316556, 'war': 51.487826871055006, 'mf1': 35.099919741690655, 'wf1': 50.62738224591493}
Метрики на валидационной выборке RES

In [17]:
# Flatten the sweep log into one table: every entry of `result` is
# [hyperparameter dict, dev MELD metrics, dev RESD metrics, test MELD metrics,
#  test RESD metrics, checkpoint file name].
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
# Expand each dict-valued column into one column per key; the checkpoint name
# column is appended unchanged.
df = pd.concat(
    [df[name].apply(pd.Series) for name in df.columns[:-1]] + [df["путь"]],
    axis=1,
)
# Positional relabelling: hyperparameters first, then uar/war/mf1/wf1 per split.
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
# The trailing underscore in the file name keeps this table apart from the
# similarly named file without it.
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_ker_size_4_d_model_.csv"))

In [None]:
%%capture --no-stdout
# Hyperparameter sweep over d_model (64..512) for a 2-layer Mamba classifier
# with ker_size=4, model_name='jina' and d_input=1024.
# `result` is reset here and receives one row per configuration:
# [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
result = []
ker_size = 4
num_layers = 2
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    # Fresh model, optimizer and weighted loss for every configuration.
    model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    # Evaluate the checkpoint the trainer recorded as best, not the weights
    # left in memory after the last epoch.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    # Keep the checkpoint file name next to the metrics so the row can be traced back to its weights.
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=2, ker_size=4
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.612011617570715, 'war': 48.602344454463484, 'mf1': 37.774463482913184, 'wf1': 49.7380210404323}
Метрики на валидационной выборке RESD:  {'uar': 36.672497984701124, 'war': 36.71641791044776, 'mf1': 36.672667147039505, 'wf1': 36.87112953139489}
Метрики на тестовой выборке MELD:  {'uar': 39.0066663922664, 'war': 50.191570881226056, 'mf1': 36.17621636596297, 'wf1': 52.169702386354146}
Метрики на тестовой выборке RESD:  {'uar': 36.32307247438827, 'war': 36.42857142857142, 'mf1': 35.9314377759824, 'wf1': 36.1992739340633}
d_model=128, num_layers=2, ker_size=4
Ранняя остановка на эпохе 14 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 38.64667144171817, 'war': 46.79891794409377, 'mf1': 36.09402145442258, 'wf1': 47.53895628079115}
Метрики на валидационной выборке RESD

In [None]:
# Flatten the sweep log into one table: every entry of `result` is
# [hyperparameter dict, dev MELD metrics, dev RESD metrics, test MELD metrics,
#  test RESD metrics, checkpoint file name].
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
# Expand each dict-valued column into one column per key; the checkpoint name
# column is appended unchanged.
df = pd.concat(
    [df[name].apply(pd.Series) for name in df.columns[:-1]] + [df["путь"]],
    axis=1,
)
# Positional relabelling: hyperparameters first, then uar/war/mf1/wf1 per split.
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
# The trailing underscore in the file name keeps this table apart from the
# similarly named file without it.
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_ker_size_4_d_model_.csv"))

In [12]:
%%capture --no-stdout
# Hyperparameter sweep over d_model (64..512) for a 1-layer Mamba classifier
# with ker_size=4, model_name='jina' and d_input=1024.
# `result` is reset here and receives one row per configuration:
# [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
result = []
ker_size = 4
num_layers = 1
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    # Fresh model, optimizer and weighted loss for every configuration.
    model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    # Evaluate the checkpoint the trainer recorded as best, not the weights
    # left in memory after the last epoch.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    # Keep the checkpoint file name next to the metrics so the row can be traced back to its weights.
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=1, ker_size=4
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.30845376921761, 'war': 49.864743011722275, 'mf1': 39.31859609191632, 'wf1': 51.070143912108946}
Метрики на валидационной выборке RESD:  {'uar': 28.836133487211306, 'war': 29.55223880597015, 'mf1': 27.716939952071673, 'wf1': 28.41207467913695}
Метрики на тестовой выборке MELD:  {'uar': 39.82516996803059, 'war': 49.88505747126437, 'mf1': 37.06654099374234, 'wf1': 52.3679255443976}
Метрики на тестовой выборке RESD:  {'uar': 28.455349573770626, 'war': 28.92857142857143, 'mf1': 28.087293600936476, 'wf1': 28.5194163707421}
d_model=128, num_layers=1, ker_size=4
Ранняя остановка на эпохе 27 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 38.31620058209276, 'war': 52.209197475202885, 'mf1': 38.68373560605177, 'wf1': 51.53035785532614}
Метрики на валидационной выборке RE

In [13]:
# Flatten the sweep log into one table: every entry of `result` is
# [hyperparameter dict, dev MELD metrics, dev RESD metrics, test MELD metrics,
#  test RESD metrics, checkpoint file name].
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"],
)
# Expand each dict-valued column into one column per key; the checkpoint name
# column is appended unchanged.
df = pd.concat(
    [df[name].apply(pd.Series) for name in df.columns[:-1]] + [df["путь"]],
    axis=1,
)
# Positional relabelling: hyperparameters first, then uar/war/mf1/wf1 per split.
df.columns = (
    ["d_model", "num_layers", "ker_size"]
    + [
        f"{metric}_{split}"
        for split in ("dev_meld", "dev_resd", "test_meld", "test_resd")
        for metric in ("uar", "war", "mf1", "wf1")
    ]
    + ["путь"]
)
# The trailing underscore in the file name keeps this table apart from the
# similarly named file without it.
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_ker_size_4_d_model_.csv"))

In [12]:
%%capture --no-stdout
# Hyperparameter sweep over d_model (64..512) for a 3-layer Mamba classifier
# with ker_size=4, model_name='jina' and d_input=1024.
# NOTE(review): this repeats the configuration grid of the earlier jina
# num_layers=3 sweep; what differs between the two runs (e.g. which
# `train_dataloader` was bound in the kernel) is not visible in this cell.
# `result` is reset here and receives one row per configuration:
# [hyperparameters, dev MELD, dev RESD, test MELD, test RESD, checkpoint name].
result = []
ker_size = 4
num_layers = 3
for d_model in [64, 128, 256, 512]:
    print(f"d_model={d_model}, num_layers={num_layers}, ker_size={ker_size}")
    # Fresh model, optimizer and weighted loss for every configuration.
    model_mamba = Mamba(model_name='jina', pooling=None,  num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7, ker_size=ker_size).to(device)
    optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    # Evaluate the checkpoint the trainer recorded as best, not the weights
    # left in memory after the last epoch.
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_mamba, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_mamba, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_mamba, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_mamba, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    # Keep the checkpoint file name next to the metrics so the row can be traced back to its weights.
    result.append([{"d_model" : d_model, "num_layers": num_layers, "ker_size" : ker_size}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

d_model=64, num_layers=3, ker_size=4
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 40.802261908203384, 'war': 52.93056807935077, 'mf1': 39.60917711381529, 'wf1': 52.731416008129315}
Метрики на валидационной выборке RESD:  {'uar': 30.24905034477559, 'war': 30.447761194029848, 'mf1': 29.563577961868948, 'wf1': 30.072389576463188}
Метрики на тестовой выборке MELD:  {'uar': 40.13173625572798, 'war': 55.32567049808429, 'mf1': 38.9353657473957, 'wf1': 56.143925374097456}
Метрики на тестовой выборке RESD:  {'uar': 27.700629542734806, 'war': 28.57142857142857, 'mf1': 27.311526162217337, 'wf1': 28.153406957451168}
d_model=128, num_layers=3, ker_size=4
Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.57772179580114, 'war': 52.750225428313804, 'mf1': 40.10517150666552, 'wf1': 53.485670728772114}
Метрики на валидационной выбор

In [13]:
# Flatten the grid-search log `result` into one wide row per run and persist it as CSV.
# Each entry of `result` is a 6-item list: a hyper-parameter dict, four metric dicts
# (dev MELD, dev RESD, test MELD, test RESD) and the best checkpoint file name.
raw_columns = ["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"]
runs_raw = pd.DataFrame(result, columns=raw_columns)

# Every column except the last one holds dicts; expand each of them into its own
# sub-frame (one column per dict key). The checkpoint name is appended unchanged.
pieces = [runs_raw[name].apply(pd.Series) for name in raw_columns[:-1]]
pieces.append(runs_raw["путь"])
df = pd.concat(pieces, axis=1)

# The expanded metric frames all share the same key names, so the columns are
# renamed positionally to make them unique per evaluation split.
df.columns = [
    "d_model", "num_layers", "ker_size",
    "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld",
    "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd",
    "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld",
    "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd",
    "путь",
]

# The index is written as the first CSV column.
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_ker_size_4_d_model__.csv"))

In [11]:
# Merge the per-experiment result tables, add averaged scores and rank the runs.
columns = [
    "d_model", "num_layers", "ker_size",
    "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld",
    "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd",
    "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld",
    "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd",
    "путь",
]

# Result files are concatenated in exactly this order; the first CSV column is
# used as the index, so index labels repeat across the stacked tables.
result_files = [
    "result_num_layers_2_ker_size_4_d_model_.csv",
    "result_num_layers_3_ker_size_4_d_model_.csv",
    "result_num_layers_1_ker_size_4_d_model_.csv",
    "result_num_layers_3_ker_size_4_d_model__.csv",
]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, file_name), index_col=0) for file_name in result_files])
df.columns = columns

# Target column -> source columns whose arithmetic mean it stores.
# Dict order fixes the order in which the new columns are appended to `df`.
average_of = {
    'average_dev_meld': ['uar_dev_meld', 'war_dev_meld', 'mf1_dev_meld', 'wf1_dev_meld'],
    'average_dev_resd': ['uar_dev_resd', 'war_dev_resd', 'mf1_dev_resd', 'wf1_dev_resd'],
    'average_test_meld': ['uar_test_meld', 'war_test_meld', 'mf1_test_meld', 'wf1_test_meld'],
    'average_test_resd': ['uar_test_resd', 'war_test_resd', 'mf1_test_resd', 'wf1_test_resd'],
    # NOTE(review): the ranking score below is built from test-split metrics only.
    'avg': ['uar_test_meld', 'war_test_meld', 'mf1_test_meld', 'wf1_test_meld',
            'uar_test_resd', 'war_test_resd', 'mf1_test_resd', 'wf1_test_resd'],
}
for target, parts in average_of.items():
    # Add the columns left to right, then divide by their count (4.0 or 8.0).
    total = df[parts[0]]
    for part in parts[1:]:
        total = total + df[part]
    df[target] = total / float(len(parts))

# Last expression of the cell: display the runs, best overall score first.
df.sort_values(by='avg', ascending=False)

Unnamed: 0,d_model,num_layers,ker_size,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,wf1_dev_resd,uar_test_meld,war_test_meld,mf1_test_meld,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg
1,128,3,4,34.765949,51.487827,35.09992,50.627382,37.398598,37.313433,37.152694,37.246492,37.87812,55.095785,37.617919,55.186434,35.949369,36.428571,36.037299,36.548087,Mamba_jina_14_40.17_checkpoint.pth,42.995269,37.277804,46.444565,36.240832,41.342698
2,256,2,4,41.713066,55.004509,41.310083,54.625087,36.849412,37.014925,35.905482,36.341286,37.579763,55.862069,37.74154,56.09824,34.800181,35.357143,34.256209,34.985298,Mamba_jina_5_41.43_checkpoint.pth,48.163186,36.527776,46.820403,34.849708,40.835055
3,512,3,4,36.944364,51.848512,36.882821,51.259909,38.429058,38.80597,37.583553,38.084248,37.465551,54.789272,36.912279,55.290512,35.385284,35.357143,34.644151,34.688032,Mamba_jina_15_41.04_checkpoint.pth,44.233901,38.225707,46.114403,35.018652,40.566528
1,128,2,4,38.646671,46.798918,36.094021,47.538956,40.25298,40.298507,39.938442,40.336743,37.988764,48.697318,34.661389,50.481666,37.687384,37.857143,37.625343,37.922521,Mamba_jina_4_41.2_checkpoint.pth,42.269642,40.206668,42.957284,37.773097,40.365191
0,64,2,4,39.612012,48.602344,37.774463,49.738021,36.672498,36.716418,36.672667,36.87113,39.006666,50.191571,36.176216,52.169702,36.323072,36.428571,35.931438,36.199274,Mamba_jina_10_39.96_checkpoint.pth,43.93171,36.733178,44.386039,36.220589,40.303314
2,256,3,4,42.71486,48.241659,38.949803,49.982883,38.677262,38.80597,38.080587,38.706673,39.31883,49.54023,36.10628,51.798191,35.837356,36.428571,35.725523,36.424176,Mamba_jina_5_40.11_checkpoint.pth,44.972301,38.567623,44.190883,36.103907,40.147395
0,64,3,4,36.327986,48.061317,34.709939,49.093219,38.34288,39.104478,37.857611,38.480474,37.926371,49.693487,35.053004,51.380186,35.971256,36.785714,35.864635,36.60907,Mamba_jina_18_40.62_checkpoint.pth,42.048115,38.446361,43.513262,36.307669,39.910465
2,256,3,4,37.765499,50.405771,37.408455,50.062605,34.588318,34.626866,35.045869,35.095291,38.657489,53.716475,37.522825,54.309152,32.759241,33.214286,33.012691,33.485543,Mamba_jina_18_39.97_checkpoint.pth,43.910583,34.839086,46.051485,33.11794,39.584713
3,512,3,4,38.73736,52.479711,38.367624,52.705176,36.297541,37.313433,35.837044,36.864917,37.124988,53.371648,36.329934,54.169054,32.529223,33.214286,32.486865,33.122483,Mamba_jina_16_41.01_checkpoint.pth,45.572468,36.578234,45.248906,32.838214,39.04356
1,128,3,4,41.577722,52.750225,40.105172,53.485671,33.676577,34.029851,33.595092,33.931655,41.226675,53.218391,38.899671,54.610206,29.227445,30.0,28.528594,29.123737,Mamba_jina_6_39.37_checkpoint.pth,46.979697,33.808293,46.988736,29.219944,38.10434
