In [1]:
# Подавление предупреждений
import warnings
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

# Импорт необходимых библиотек
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM, RobertaModel, RobertaTokenizer
import torch
import torch.nn.functional as F
from torch import Tensor
from einops import rearrange
from typing import Tuple, Callable
from torch.autograd import Function
import gc
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
# Notebook-wide compute device: CUDA when torch reports it as available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Данные

In [3]:
from torch.utils.data import Dataset, DataLoader 
import numpy as np 
import math 

class MELDDataset(Dataset):
    """MELD utterance/emotion pairs read from the ``*_sent_emo.csv`` files.

    Each item is ``(utterance, label)`` where ``utterance`` is the raw text
    (optionally passed through ``transform``) and ``label`` is a scalar
    ``torch.long`` tensor stored on the notebook-level ``device``.

    Args:
        part: Which split to load: ``'train'``, ``'valid'`` or ``'test'``.
        transform: Optional callable applied to the utterance on access.

    Raises:
        ValueError: If ``part`` is not one of the known splits.
        KeyError: If the CSV contains an emotion label missing from
            ``EMOTION_TO_INDEX``.
    """

    # Split name -> CSV file, resolved relative to the working directory.
    SPLIT_FILES = {
        'train': "train_sent_emo.csv",
        'valid': "dev_sent_emo.csv",
        'test': "test_sent_emo.csv",
    }
    # Emotion name -> class index used as the training target.
    EMOTION_TO_INDEX = {'anger' : 0, 'disgust' : 1, 'fear' : 2, 'sadness' : 3, 'neutral' : 4, 'joy' : 5, 'surprise' : 6}

    def __init__(self, part='train', transform=None):
        if part not in self.SPLIT_FILES:
            raise ValueError(
                f"Unknown part of MELDDataset: {part!r} (expected 'train', 'valid' or 'test')"
            )
        df = pd.read_csv(self.SPLIT_FILES[part])
        self.x = list(df['Utterance'].values)
        # Plain dict indexing keeps the KeyError on unexpected labels instead of
        # silently producing NaN targets.
        labels = [self.EMOTION_TO_INDEX[emotion] for emotion in df['Emotion']]
        self.y = torch.tensor(labels, dtype=torch.long).to(device)
        self.n_samples = df.shape[0]
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(utterance, label)`` for ``index``, applying ``transform`` if set."""
        text = self.x[index]
        if self.transform is not None:
            text = self.transform(text)
        return text, self.y[index]

    def __len__(self):
        """Number of rows in the loaded split."""
        return self.n_samples

### Feature Extractor

In [4]:
class Embedding():
    """Frozen text feature extractor on top of a pretrained Hugging Face encoder.

    Args:
        model_name: One of ``'jina'``, ``'xlm-roberta-base'`` or ``'canine-c'``.
        pooling: ``None`` to return per-token features padded/truncated to a
            fixed length, or ``'mean'`` to return one L2-normalised,
            mask-weighted mean vector per input.

    Raises:
        ValueError: From the constructor for an unknown ``model_name``; from
            ``get_embeddings`` for an unknown ``pooling``.
    """

    # Fixed sequence length of the un-pooled features. The original code checked
    # for the name 'canine-c_emb', which the constructor never accepts, so the
    # 300-step length was unreachable; it is keyed on 'canine-c' here.
    _DEFAULT_MAX_LEN = 88
    _MAX_LEN_BY_MODEL = {'canine-c': 300}

    def __init__(self, model_name='jina', pooling=None):
        self.model_name = model_name
        self.pooling = pooling
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if model_name == 'jina':
            self.tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3", code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True)
            self.model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True).to(self.device)
        elif model_name == 'xlm-roberta-base':
            self.tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
            self.model = AutoModel.from_pretrained('xlm-roberta-base').to(self.device)
        elif model_name == 'canine-c':
            self.tokenizer = AutoTokenizer.from_pretrained('google/canine-c')
            self.model = AutoModel.from_pretrained('google/canine-c').to(self.device)
        else:
            raise ValueError('Unknown name of Embedding')

    def _encode(self, X):
        """Tokenize ``X`` and run the encoder without tracking gradients."""
        encoded_input = self.tokenizer(X, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        return encoded_input, model_output

    def _mean_pooling(self, X):
        """Return attention-mask-weighted mean embeddings with a singleton axis 1."""
        encoded_input, model_output = self._encode(X)
        token_embeddings = model_output[0]
        mask = encoded_input['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        sentence_embeddings = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.unsqueeze(1)

    def get_embeddings(self, X):
        """Embed a batch of texts according to the configured pooling mode.

        Returns:
            With ``pooling=None``: a float CPU tensor whose axis 1 is truncated
            or zero-padded to the model's fixed length.
            With ``pooling='mean'``: the output of ``_mean_pooling``.
        """
        if self.pooling is None:
            max_len = self._MAX_LEN_BY_MODEL.get(self.model_name, self._DEFAULT_MAX_LEN)
            _, model_output = self._encode(X)
            features = model_output[0].detach().cpu().float()[:, :max_len, :]
            missing = max_len - features.shape[1]
            if missing > 0:
                # F.pad takes (last-dim left, last-dim right, seq left, seq right).
                features = F.pad(features, (0, 0, 0, missing))
            return features.contiguous()
        elif self.pooling == 'mean':
            return self._mean_pooling(X)
        else:
            raise ValueError('Unknown type of pooling')

In [5]:
class PScan(Function):
    """Custom autograd ``Function`` wrapping an in-place scan over axis 2 of ``(b, d, l, s)`` tensors.

    NOTE(review): the update pattern in ``_forward`` looks like a Blelloch-style
    parallel scan for the recurrence ``H[t] = A[t] * H[t-1] + X[t]`` — confirm
    against the reference implementation this was adapted from.
    It is not called by the other classes visible in this part of the notebook.
    """

    @staticmethod
    def forward(ctx, A_inp, X_inp):
        """Run the scan on copies of the inputs and return the scanned ``X``.

        Both inputs are cloned, rearranged with the pattern ``"l b d s -> b d l s"``,
        scanned in place by ``_forward`` and the result is returned rearranged with
        ``"b d l s -> b l d s"``.

        NOTE(review): the input pattern names the leading axis ``l`` while the output
        pattern puts ``b`` first — verify which layout callers actually pass.
        """
        # Work on clones so the caller's tensors are not modified by the in-place scan.
        A, X = A_inp.clone(), X_inp.clone()
        A, X = rearrange(A, "l b d s -> b d l s"), rearrange(X, "l b d s -> b d l s")
        PScan._forward(A, X)
        # Saved for backward: a copy of the post-scan A and the scanned X itself.
        ctx.save_for_backward(A.clone(), X)
        return rearrange(X, "b d l s -> b l d s")

    @staticmethod
    def backward(ctx, grad_inp: Tensor) -> Tuple[Tensor, Tensor]:
        """Return gradients for ``(A_inp, X_inp)`` by re-running the scan on the flipped sequence axis."""
        A, X = ctx.saved_tensors
        # Keep the first step in place and reverse the remaining steps along the sequence axis.
        A = torch.cat((A[:, :, :1], A[:, :, 1:].flip(2)), dim = 2)
        grad_out = rearrange(grad_inp, "b l d s -> b d l s")
        # Scan the incoming gradient in reversed order, then restore the original order.
        grad_out = grad_out.flip(2)
        PScan._forward(A, grad_out)
        grad_out = grad_out.flip(2)
        # Gradient w.r.t. A: previous scanned state times the gradient at the current step
        # (position 0 stays zero because it has no previous state).
        Q = torch.zeros_like(X)
        Q[:, :, 1:].add_(X[:, :, :-1] * grad_out[:, :, 1:])
        return rearrange(Q, "b d l s -> b l d s"), rearrange(grad_out, "b d l s -> b l d s")

    @staticmethod
    def _forward(A: Tensor, X: Tensor) -> None:
        """Scan ``X`` (and update ``A``) in place along axis 2; returns nothing.

        NOTE(review): the pairwise ``reshape(..., T // 2, 2, -1)`` presumably assumes
        the sequence length is a power of two, and the in-place updates rely on
        ``reshape`` returning views — confirm for the shapes and layouts used.
        """
        b, d, l, s = A.shape
        num_steps = int(math.log2(l))
        Av, Xv = A, X
        # First pass: repeatedly pair adjacent steps, fold the left element into the
        # right one, and continue on the right elements only.
        for _ in range(num_steps):
            T = Xv.size(2)
            Av, Xv = Av[:, :, :T].reshape(b, d, T // 2, 2, -1), Xv[:, :, :T].reshape(b, d, T // 2, 2, -1)
            Xv[:, :, :, 1].add_(Av[:, :, :, 1].mul(Xv[:, :, :, 0]))
            Av[:, :, :, 1].mul_(Av[:, :, :, 0])
            Av, Xv = Av[:, :, :, 1], Xv[:, :, :, 1]
        # Second pass: walk strides 2**k from coarse to fine and propagate the
        # accumulated values into the left element of each following pair.
        for k in range(num_steps - 1, -1, -1):
            Av, Xv = A[:, :, 2**k - 1 : l : 2**k], X[:, :, 2**k - 1 : l : 2**k]
            T = 2 * (Xv.size(2) // 2)
            # Odd number of strided elements: update the trailing one from its neighbour first.
            if T < Xv.size(2):
                Xv[:, :, -1].add_(Av[:, :, -1].mul(Xv[:, :, -2]))
                Av[:, :, -1].mul_(Av[:, :, -2])
            Av, Xv = Av[:, :, :T].reshape(b, d, T // 2, 2, -1), Xv[:, :, :T].reshape(b, d, T // 2, 2, -1)
            Xv[:, :, 1:, 0].add_(Av[:, :, 1:, 0].mul(Xv[:, :, :-1, 1]))
            Av[:, :, 1:, 0].mul_(Av[:, :, :-1, 1])

# Functional alias so the scan can be called as ``pscan(A, X)``.
pscan: Callable[[Tensor, Tensor], Tensor] = PScan.apply

class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned per-feature gain.

    Args:
        d_model: Size of the last axis (length of the gain vector).
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, d_model: int, eps: float = 1e-8) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(d_model))
        self.eps = eps

    def forward(self, x: Tensor) -> Tensor:
        """Scale ``x`` by the inverse RMS of its last axis, then by ``weight``."""
        mean_square = x.pow(2).mean(-1, keepdim = True)
        inverse_rms = torch.rsqrt(mean_square + self.eps)
        normalised = x * inverse_rms
        return normalised * self.weight

class MambaBlock(nn.Module):
    """Projection block: ``out_proj(h + s_B(h) + s_C(h))`` with ``h = in_proj(x)``.

    The forward pass uses only linear layers; no selective scan or
    non-linearity is applied here.

    Args:
        d_input: Feature size of the input and of the output.
        d_model: Hidden feature size used between the projections.
    """

    def __init__(self, d_input, d_model):
        super(MambaBlock, self).__init__()
        self.in_proj = nn.Linear(d_input, d_model)
        self.s_B = nn.Linear(d_model, d_model)
        self.s_C = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_input)

    def forward(self, x):
        """Map ``x`` of shape ``(..., d_input)`` to a tensor of the same shape."""
        # Follow the block's own parameters rather than a notebook-level global,
        # so the block keeps working after ``.to(...)`` moves it elsewhere.
        x = x.to(self.in_proj.weight.device)
        hidden = self.in_proj(x)
        B, C = self.s_B(hidden), self.s_C(hidden)
        return self.out_proj(hidden + B + C)

class Mamba(nn.Module):
    """Classifier: frozen text embeddings -> stack of ``MambaBlock`` -> mean over axis 1 -> linear head.

    Args:
        num_layers: Number of ``MambaBlock`` layers applied in sequence.
        d_input: Feature size produced by the embedding model.
        d_model: Hidden size inside each block.
        num_classes: Number of output logits.
        model_name: Embedding backbone name forwarded to ``Embedding``.
        pooling: Pooling mode forwarded to ``Embedding``.
    """

    def __init__(self, num_layers, d_input, d_model, num_classes, model_name='jina', pooling=None):
        super(Mamba, self).__init__()
        self.model_name = model_name
        # Only the bound method is stored, so the embedding backbone is not a
        # registered submodule: it stays out of parameters() and state_dict().
        embed = Embedding(model_name, pooling)
        self.embedding = embed.get_embeddings
        self.layers = nn.ModuleList([MambaBlock(d_input, d_model) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_input, num_classes)

    def forward(self, seq):
        """Return logits of shape ``(batch, num_classes)`` for a batch of raw texts."""
        # as_tensor avoids re-copying an existing tensor (and the warning that
        # torch.tensor(tensor) emits); the target device is taken from the head.
        features = torch.as_tensor(self.embedding(seq)).to(self.fc_out.weight.device)
        for mamba in self.layers:
            features = mamba(features)
        return self.fc_out(features.mean(dim = 1))

In [6]:
from dataclasses import dataclass
from typing import ClassVar
from typing import List, Dict, Any, Tuple, Optional
@dataclass
class ModelTrainer:
    """Training loop with per-epoch validation, best-checkpoint saving and early stopping.

    The "test" metrics tracked during training (history columns, progress bars,
    early-stopping message) are computed on ``val_dataloader``; ``test_dataloader``
    is stored but not used by ``train``.

    Attributes:
        model: Module to train; must expose ``state_dict()`` and a ``model_name`` attribute.
        train_dataloader: Batches of ``(inputs, targets)`` used for optimisation.
        val_dataloader: Batches of ``(inputs, targets)`` used for model selection.
        test_dataloader: Held-out loader (kept for the caller's convenience).
        device: Device descriptor supplied by the caller.
        epochs: Maximum number of epochs.
        round_loss: Decimal places kept for averaged losses.
        round_acc: Decimal places kept for accuracies (in percent).
        optimizer: Optimizer stepping the model parameters.
        loss_fn: Loss callable ``loss_fn(logits, targets)``.
        patience: Epochs without validation-accuracy improvement before stopping.
    """

    model: Any
    train_dataloader: DataLoader
    val_dataloader: DataLoader
    test_dataloader: DataLoader
    device: torch.device
    epochs: int
    round_loss: int
    round_acc: int

    optimizer: torch.optim.Optimizer
    loss_fn: Any

    patience: int = 5 # Early-stopping patience, in epochs

    class_names: ClassVar[Optional[List[str]]] = None # Optional list of class names

    def __post_init__(self):
        # One row per finished epoch.
        self.__history = pd.DataFrame({
            "train_acc": [], # Accuracy on the training split
            "test_acc": [], # Accuracy on the validation split
            "train_loss": [], # Mean batch loss on the training split
            "test_loss": [] # Mean batch loss on the validation split
        })

        # Number of batches per epoch (used as progress-bar totals).
        self.__train_steps = len(self.train_dataloader)
        self.__test_steps = len(self.val_dataloader)

        self.__best_test_accuracy = 0
        self.__no_improvement_count = 0

        # Set when a checkpoint is written; both stay None if none ever is.
        self._epoch = None
        self._best_model_name = None

    @property
    def history(self) -> pd.DataFrame:
        """Per-epoch training/validation history.

        Returns:
            pd.DataFrame: Columns ``train_acc``, ``test_acc``, ``train_loss``, ``test_loss``.
        """

        return self.__history

    def get_model_logits(self, logits: torch.Tensor) -> torch.Tensor:
        """Adapt raw logits to what the configured loss expects.

        Previously a classmethod that read ``cls.loss_fn``; ``loss_fn`` is an
        instance field, so that lookup always failed.

        Args:
            logits (torch.Tensor): Raw model outputs.

        Returns:
            torch.Tensor: Log-probabilities over dim 1 for ``nn.NLLLoss``;
            the logits unchanged for any other loss.
        """

        if isinstance(self.loss_fn, nn.NLLLoss):
            return torch.log_softmax(logits, dim = 1)
        return logits

    def _is_best_model(self, test_accuracy: float) -> bool:
        """Tell whether ``test_accuracy`` beats every accuracy recorded so far.

        Args:
            test_accuracy (float): Validation accuracy of the current epoch.

        Returns:
            bool: True if strictly greater than the best recorded value (0 when empty).
        """

        recorded = self.__history["test_acc"]
        max_test_acc = max(recorded) if len(recorded) else 0
        return test_accuracy > max_test_acc

    def _checkpoint_name(self, epoch: int, test_accuracy: float) -> str:
        """Build the checkpoint file name for ``epoch`` and ``test_accuracy``."""
        return f"{self.model.__class__.__name__}_{self.model.model_name}_{epoch}_{test_accuracy}_checkpoint.pth"

    def _save_model(self, epoch: int, path_to_model: str, test_accuracy: float, loss: torch.Tensor) -> None:
        """Write a checkpoint with model/optimizer state to ``path_to_model``.

        Args:
            epoch (int): Current epoch.
            path_to_model (str): Target directory (created if missing).
            test_accuracy (float): Validation accuracy, embedded in the file name.
            loss (torch.Tensor): Validation loss stored under ``"test_loss"``.
        """

        os.makedirs(path_to_model, exist_ok = True)

        torch.save({
            "epoch": epoch,
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "test_loss": loss,
        }, os.path.join(path_to_model, self._checkpoint_name(epoch, test_accuracy)))

    def _run_epoch(self, dataloader, total_steps: int, description: str, training: bool):
        """Run one pass over ``dataloader`` and return ``(accuracy_percent, mean_batch_loss)``.

        With ``training=True`` the model is put in train mode and the optimizer is
        stepped after every batch; otherwise the model is put in eval mode and
        gradients are disabled.
        """
        self.model.train(training)
        total_loss = 0.0
        correct = 0
        num_batches = 0

        with torch.set_grad_enabled(training), tqdm(total = total_steps, desc = description, unit = "batch") as progress:
            for batch_X, targets in dataloader:
                logits = self.model(batch_X)
                # Targets may arrive on another device than the model output.
                targets = targets.to(logits.device)
                loss = self.loss_fn(logits, targets)

                if training:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                total_loss += loss.item()
                correct += (logits.argmax(1) == targets).sum().item()
                num_batches += 1

                progress.update(1)
                torch.cuda.empty_cache()

            # max(..., 1) keeps an empty loader from raising on division.
            avg_loss = round(total_loss / max(num_batches, 1), self.round_loss)
            accuracy = round(correct / max(len(dataloader.dataset), 1) * 100, self.round_acc)

            progress.set_postfix({
                "Точность": accuracy,
                "Средняя потеря": avg_loss
            })

        return accuracy, avg_loss

    @staticmethod
    def _plot_curves(losses_train, losses_val, accuracy_train, accuracy_val) -> None:
        """Plot loss and accuracy curves for the training and validation splits."""
        plt.figure(figsize=(12, 6))

        plt.subplot(1, 2, 1)
        plt.plot(losses_train, label = 'Потери на тренировочной выборке')
        plt.plot(losses_val, label = 'Потери на валидационной выборке')
        plt.title('Потери во время обучения')
        plt.xlabel('Эпоха')
        plt.ylabel('Потери')
        plt.legend()

        plt.subplot(1, 2, 2)
        # Legends now match the plotted series (train curve was labelled as
        # validation and the validation curve as test).
        plt.plot(accuracy_train, label = 'Точность на тренировочной выборке')
        plt.plot(accuracy_val, label = 'Точность на валидационной выборке')
        plt.title('Точность во время обучения')
        plt.xlabel('Эпоха')
        plt.ylabel('Точность')
        plt.legend()

        plt.tight_layout()
        plt.show()

    def train(self, path_to_model: str) -> None:
        """Train for up to ``epochs`` epochs, checkpointing the best validation accuracy.

        After the call ``_best_model_name`` holds the file name of the last
        checkpoint written (or None if no epoch ever improved on 0 accuracy).

        Args:
            path_to_model (str): Directory where checkpoints are written.

        Returns:
            None
        """

        losses_train_list = []
        losses_test_list = []
        accuracy_train_list = []
        accuracy_test_list = []

        for epoch in range(1, self.epochs + 1):
            gc.collect()
            torch.cuda.empty_cache()

            train_accuracy, avg_train_loss = self._run_epoch(
                self.train_dataloader, self.__train_steps, f"Эпоха {epoch}", training = True
            )
            test_accuracy, avg_test_loss = self._run_epoch(
                self.val_dataloader, self.__test_steps, f"Тестирование {epoch}", training = False
            )

            losses_train_list.append(avg_train_loss)
            accuracy_train_list.append(train_accuracy)
            losses_test_list.append(avg_test_loss)
            accuracy_test_list.append(test_accuracy)

            if self._is_best_model(test_accuracy):
                self._save_model(epoch, path_to_model, test_accuracy, avg_test_loss)
                self._epoch = epoch
                # Recorded at save time so it always names a file that exists.
                self._best_model_name = self._checkpoint_name(epoch, test_accuracy)
                self.__best_test_accuracy = test_accuracy
                self.__no_improvement_count = 0
            else:
                self.__no_improvement_count += 1

            # Append this epoch to the history.
            new_row = pd.Series([train_accuracy, test_accuracy, avg_train_loss, avg_test_loss], index = self.__history.columns)
            self.__history = pd.concat([self.__history, new_row.to_frame().T], ignore_index = True)

            if self.__no_improvement_count >= self.patience:
                print(f"Ранняя остановка на эпохе {epoch} из-за отсутствия улучшения точности на тестовой выборке")
                break

        self._plot_curves(losses_train_list, losses_test_list, accuracy_train_list, accuracy_test_list)

    # Identity-based hash (dataclass equality would otherwise make instances unhashable).
    def __hash__(self):
        return id(self)

In [7]:
def evaluate_metrics(model, test_dataloader):
    """Compute recall and F1 scores (in percent) of ``model`` over a dataloader.

    Args:
        model: Callable model with an ``eval()`` method returning per-class scores.
        test_dataloader: Iterable of ``(batch_X, targets)`` pairs.

    Returns:
        dict: ``uar``/``war`` (macro/weighted recall) and ``mf1``/``wf1``
        (macro/weighted F1), each multiplied by 100.
    """
    model.eval()
    y_test, y_predict = [], []
    with torch.no_grad():
        for batch_X, targets in test_dataloader:
            y_test += [int(label) for label in targets]
            predictions = torch.max(model(batch_X), dim=1)[1]
            y_predict += [int(label) for label in predictions]
    scores = {
        'uar': recall_score(y_test, y_predict, average='macro'),     # Unweighted Average Recall
        'war': recall_score(y_test, y_predict, average='weighted'),  # Weighted Average Recall
        'mf1': f1_score(y_test, y_predict, average='macro'),         # Macro F1-score
        'wf1': f1_score(y_test, y_predict, average='weighted'),      # Weighted F1-score
    }
    return {name: 100.0 * value for name, value in scores.items()}

In [8]:
BATCH_SIZE = 32
# Only the training split is shuffled. The trainer averages per-batch losses and
# the loss is class-weighted, so shuffling the validation split would make the
# reported validation loss depend on the random batch composition.
train_dataloader = DataLoader(dataset=MELDDataset('train'), batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=MELDDataset('valid'), batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(dataset=MELDDataset('test'), batch_size=BATCH_SIZE, shuffle=False)

In [9]:
# --- Optimisation settings ---
EPOCHS = 50           # maximum number of training epochs
LEARNING_RATE = 1e-4  # optimizer step size

# --- Reporting precision ---
ROUND_ACC = 2   # decimal places kept for accuracy
ROUND_LOSS = 7  # decimal places kept for loss

# --- Runtime environment ---
# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Output locations ---
ROOT_DIR = os.path.join(".")
PATH_TO_MODEL = os.path.join(ROOT_DIR, "models_MELD_jina_experiments")

In [10]:
from sklearn.utils.class_weight import compute_class_weight
# Balanced class weights depend only on the label counts, so read the labels
# straight from the dataset instead of iterating every shuffled batch and
# converting tensor elements one by one.
y = train_dataloader.dataset.y.cpu().tolist()
class_weights = torch.tensor(compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y), dtype=torch.float).to(device)

In [12]:
%%capture --no-stdout
result = []
d_model = 32
for num_layers in [1, 2, 4, 6, 8, 10]:
    print(f"d_model={d_model}, num_layers={num_layers}")
    model_mamba = Mamba(model_name='jina', pooling=None, num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7).to(device)
    optimizer = optim.AdamW(params = model_mamba.parameters(), lr = 1e-4)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, val_dataloader, test_dataloader, DEVICE, 50, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn, patience=10)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)
    result.append([{"d_model" : d_model, "num_layers": num_layers}, metrics_val, metrics_test, trainer._best_model_name])

d_model=32, num_layers=1
Ранняя остановка на эпохе 15 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 42.93131778180958, 'war': 50.85662759242561, 'mf1': 39.876425014511945, 'wf1': 52.592043384214}
Метрики на тестовой выборке:  {'uar': 43.88226682601091, 'war': 51.64750957854406, 'mf1': 38.658057789695405, 'wf1': 54.18625152814411}
d_model=32, num_layers=2
Ранняя остановка на эпохе 18 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 43.760799813154364, 'war': 50.49594229035167, 'mf1': 40.68484953825469, 'wf1': 51.78515767762656}
Метрики на тестовой выборке:  {'uar': 41.298507364217436, 'war': 48.160919540229884, 'mf1': 36.939714714356484, 'wf1': 50.42749390267017}
d_model=32, num_layers=4
Ранняя остановка на эпохе 21 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 39.68627268597707, 'war': 49.50405770964833, 'mf1': 37.026146181648585, '

In [13]:
# Flatten the sweep records into one table: parameter columns, validation
# metrics, test metrics, then the checkpoint name. The validation and test
# metric columns share the same names (uar, war, mf1, wf1).
df = pd.DataFrame(result, columns=["параметры", "метрики val", "метрики test", "путь"])
df = pd.concat(
    [df[column].apply(pd.Series) for column in ("параметры", "метрики val", "метрики test")] + [df["путь"]],
    axis=1,
)
df

Unnamed: 0,d_model,num_layers,uar,war,mf1,wf1,uar.1,war.1,mf1.1,wf1.1,путь
0,32,1,42.931318,50.856628,39.876425,52.592043,43.882267,51.64751,38.658058,54.186252,Mamba_jina_5_50.23_checkpoint.pth
1,32,2,43.7608,50.495942,40.68485,51.785158,41.298507,48.16092,36.939715,50.427494,Mamba_jina_8_50.14_checkpoint.pth
2,32,4,39.686273,49.504058,37.026146,51.039128,38.363627,50.344828,35.358811,52.870411,Mamba_jina_11_50.05_checkpoint.pth
3,32,6,20.634027,43.733093,18.234502,34.681796,20.772794,47.777778,18.667851,39.008135,Mamba_jina_1_44.0_checkpoint.pth
4,32,8,24.137222,33.904418,22.254444,33.41567,25.605845,36.666667,22.610534,37.080615,Mamba_jina_7_34.81_checkpoint.pth
5,32,10,23.914271,37.240757,20.914489,34.281506,25.801594,39.118774,21.422852,37.509207,Mamba_jina_19_36.25_checkpoint.pth


In [15]:
# Write the current results table to the experiment directory (the row index is saved as the first column).
df.to_csv(os.path.join(PATH_TO_MODEL, "result_d_model32_num_layers.csv"))

In [14]:
%%capture --no-stdout
result = []
d_model = 64
for num_layers in [1, 2, 4, 6, 8, 10]:
    print(f"d_model={d_model}, num_layers={num_layers}")
    model_mamba = Mamba(model_name='jina', pooling=None, num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7).to(device)
    optimizer = optim.AdamW(params = model_mamba.parameters(), lr = 1e-4)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, val_dataloader, test_dataloader, DEVICE, 50, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn, patience=10)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)
    result.append([{"d_model" : d_model, "num_layers": num_layers}, metrics_val, metrics_test, trainer._best_model_name])

d_model=64, num_layers=1
Ранняя остановка на эпохе 11 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 36.569355509453175, 'war': 52.479711451758334, 'mf1': 35.82084701223917, 'wf1': 51.66351481592708}
Метрики на тестовой выборке:  {'uar': 39.924187439817324, 'war': 55.632183908045974, 'mf1': 37.98091109135523, 'wf1': 55.300113677036975}
d_model=64, num_layers=2
Ранняя остановка на эпохе 26 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 44.41362934836372, 'war': 51.12714156898106, 'mf1': 41.03095419506542, 'wf1': 52.51586385432475}
Метрики на тестовой выборке:  {'uar': 42.524652697682605, 'war': 52.10727969348659, 'mf1': 38.44102342797285, 'wf1': 54.65596100842095}
d_model=64, num_layers=4
Ранняя остановка на эпохе 24 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 44.13146414465548, 'war': 53.471596032461676, 'mf1': 41.3323431375288, 

In [16]:
# Flatten the sweep records into one table (parameters, validation metrics,
# test metrics, checkpoint name), save it, then display it.
df = pd.DataFrame(result, columns=["параметры", "метрики val", "метрики test", "путь"])
df = pd.concat(
    [df[column].apply(pd.Series) for column in ("параметры", "метрики val", "метрики test")] + [df["путь"]],
    axis=1,
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_d_model64_num_layers.csv"))
df

Unnamed: 0,d_model,num_layers,uar,war,mf1,wf1,uar.1,war.1,mf1.1,wf1.1,путь
0,64,1,36.569356,52.479711,35.820847,51.663515,39.924187,55.632184,37.980911,55.300114,Mamba_jina_1_52.57_checkpoint.pth
1,64,2,44.413629,51.127142,41.030954,52.515864,42.524653,52.10728,38.441023,54.655961,Mamba_jina_16_51.58_checkpoint.pth
2,64,4,44.131464,53.471596,41.332343,53.494907,41.431457,53.180077,37.460736,54.036947,Mamba_jina_14_53.74_checkpoint.pth
3,64,6,35.160007,44.18395,33.272429,45.90919,36.284913,46.206897,33.38119,48.803532,Mamba_jina_10_44.45_checkpoint.pth
4,64,8,22.499359,37.150586,21.020504,36.019382,21.534045,38.62069,19.871811,38.031699,Mamba_jina_5_37.42_checkpoint.pth
5,64,10,19.7662,41.929666,16.061796,33.665498,20.318499,47.89272,17.787428,40.50051,Mamba_jina_1_41.75_checkpoint.pth


In [13]:
%%capture --no-stdout
result = []
d_model = 128
for num_layers in [1, 2, 4, 6, 8, 10]:
    print(f"d_model={d_model}, num_layers={num_layers}")
    model_mamba = Mamba(model_name='jina', pooling=None, num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7).to(device)
    optimizer = optim.AdamW(params = model_mamba.parameters(), lr = 1e-4)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, val_dataloader, test_dataloader, DEVICE, 50, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn, patience=10)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)
    result.append([{"d_model" : d_model, "num_layers": num_layers}, metrics_val, metrics_test, trainer._best_model_name])

d_model=128, num_layers=1
Ранняя остановка на эпохе 18 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 45.1862251598732, 'war': 51.39765554553652, 'mf1': 41.76178109199485, 'wf1': 53.0600837998231}
Метрики на тестовой выборке:  {'uar': 43.782256609916445, 'war': 51.8007662835249, 'mf1': 39.4144798351026, 'wf1': 54.55131871767654}
d_model=128, num_layers=2
Ранняя остановка на эпохе 21 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 42.956185522568354, 'war': 51.93868349864743, 'mf1': 39.780789328595006, 'wf1': 52.1089293856312}
Метрики на тестовой выборке:  {'uar': 42.799447978482654, 'war': 52.87356321839081, 'mf1': 38.334911119211725, 'wf1': 54.108953527511375}
d_model=128, num_layers=4
Ранняя остановка на эпохе 35 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 42.18522496864673, 'war': 54.102795311091064, 'mf1': 40.995637465418916, 

In [14]:
# Flatten the sweep records into one table (parameters, validation metrics,
# test metrics, checkpoint name) and save it.
df = pd.DataFrame(result, columns=["параметры", "метрики val", "метрики test", "путь"])
df = pd.concat(
    [df[column].apply(pd.Series) for column in ("параметры", "метрики val", "метрики test")] + [df["путь"]],
    axis=1,
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_d_model128_num_layers.csv"))

In [11]:
# Reload the saved d_model=128 table from disk. read_csv brings the saved index back as an
# "Unnamed: 0" column and renames the repeated metric columns with a ".1" suffix.
df = pd.read_csv(os.path.join(PATH_TO_MODEL, "result_d_model128_num_layers.csv"))

In [15]:
# Display the current results table.
df

Unnamed: 0,d_model,num_layers,uar,war,mf1,wf1,uar.1,war.1,mf1.1,wf1.1,путь
0,128,1,45.186225,51.397656,41.761781,53.060084,43.782257,51.800766,39.41448,54.551319,Mamba_jina_8_51.76_checkpoint.pth
1,128,2,42.956186,51.938683,39.780789,52.108929,42.799448,52.873563,38.334911,54.108954,Mamba_jina_11_51.85_checkpoint.pth
2,128,4,42.185225,54.102795,40.995637,54.603087,40.217596,54.559387,38.474267,55.910219,Mamba_jina_25_53.83_checkpoint.pth
3,128,6,34.145839,45.175834,30.14081,44.33155,38.293822,48.390805,32.882919,48.216406,Mamba_jina_3_44.91_checkpoint.pth
4,128,8,30.948365,46.708747,30.267642,44.515658,32.199881,50.383142,31.320981,48.929041,Mamba_jina_5_46.35_checkpoint.pth
5,128,10,21.052894,40.937782,15.975845,32.607298,21.365498,44.176245,16.275717,36.948601,Mamba_jina_6_40.4_checkpoint.pth


In [16]:
%%capture --no-stdout
result = []
d_model = 256
for num_layers in [1, 2, 4, 6, 8, 10]:
    print(f"d_model={d_model}, num_layers={num_layers}")
    model_mamba = Mamba(model_name='jina', pooling=None, num_layers = num_layers, d_input = 1024, d_model = d_model, num_classes=7).to(device)
    optimizer = optim.AdamW(params = model_mamba.parameters(), lr = 1e-4)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_mamba, train_dataloader, val_dataloader, test_dataloader, DEVICE, 50, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn, patience=10)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_mamba.load_state_dict(checkpoint['model_state_dict'])
    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)
    result.append([{"d_model" : d_model, "num_layers": num_layers}, metrics_val, metrics_test, trainer._best_model_name])

d_model=256, num_layers=1
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 43.60059412894245, 'war': 51.93868349864743, 'mf1': 40.46608495158181, 'wf1': 53.16470280896313}
Метрики на тестовой выборке:  {'uar': 41.440683343280014, 'war': 52.60536398467433, 'mf1': 38.26106870998215, 'wf1': 55.02460878077714}
d_model=256, num_layers=2
Ранняя остановка на эпохе 21 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 42.39339323049022, 'war': 52.38954012623985, 'mf1': 40.7586817027804, 'wf1': 53.299239082549065}
Метрики на тестовой выборке:  {'uar': 42.16307714569944, 'war': 53.9080459770115, 'mf1': 39.22602064830651, 'wf1': 55.568019241905034}
d_model=256, num_layers=4
Ранняя остановка на эпохе 22 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 41.7817854912229, 'war': 53.56176735798016, 'mf1': 39.93485512761682, 'wf

In [17]:
# Flatten the sweep records into one table (parameters, validation metrics,
# test metrics, checkpoint name) and save it.
df = pd.DataFrame(result, columns=["параметры", "метрики val", "метрики test", "путь"])
df = pd.concat(
    [df[column].apply(pd.Series) for column in ("параметры", "метрики val", "метрики test")] + [df["путь"]],
    axis=1,
)
df.to_csv(os.path.join(PATH_TO_MODEL, "result_d_model256_num_layers.csv"))

In [18]:
# Display the current results table.
df

Unnamed: 0,d_model,num_layers,uar,war,mf1,wf1,uar.1,war.1,mf1.1,wf1.1,путь
0,256,1,43.600594,51.938683,40.466085,53.164703,41.440683,52.605364,38.261069,55.024609,Mamba_jina_10_52.03_checkpoint.pth
1,256,2,42.393393,52.38954,40.758682,53.299239,42.163077,53.908046,39.226021,55.568019,Mamba_jina_11_52.66_checkpoint.pth
2,256,4,41.781785,53.561767,39.934855,53.430985,40.544381,53.94636,37.41156,54.372451,Mamba_jina_12_53.38_checkpoint.pth
3,256,6,28.209693,46.979261,27.128984,44.013258,32.400841,51.800766,30.827228,50.055218,Mamba_jina_5_46.98_checkpoint.pth
4,256,8,24.435028,37.150586,21.2896,34.226855,26.28466,39.233716,21.900067,37.760437,Mamba_jina_23_37.78_checkpoint.pth
5,256,10,25.709751,34.174932,23.428707,33.97134,25.651594,36.704981,23.767199,38.01024,Mamba_jina_12_34.36_checkpoint.pth


In [11]:
%%capture --no-stdout
# Depth sweep at a fixed hidden width: build, train and score one Mamba
# classifier per `num_layers` value, collecting one record per run in `result`.
result = []
d_model = 512
for num_layers in (1, 2, 4, 6, 8, 10):
    print(f"d_model={d_model}, num_layers={num_layers}")

    # A fresh model, optimizer and class-weighted loss for every configuration.
    model_mamba = Mamba(
        model_name='jina',
        pooling=None,
        num_layers=num_layers,
        d_input=1024,
        d_model=d_model,
        num_classes=7,
    ).to(device)
    optimizer = optim.AdamW(params=model_mamba.parameters(), lr=1e-4)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # NOTE(review): 50 and patience=10 look like the epoch budget and the
    # early-stopping patience -- confirm against ModelTrainer's signature.
    trainer = ModelTrainer(
        model_mamba,
        train_dataloader,
        val_dataloader,
        test_dataloader,
        DEVICE,
        50,
        ROUND_LOSS,
        ROUND_ACC,
        optimizer,
        loss_fn,
        patience=10,
    )
    trainer.train(PATH_TO_MODEL)

    # Load the checkpoint file named by trainer._best_model_name back into
    # the model before scoring it.
    checkpoint = torch.load(
        os.path.join(PATH_TO_MODEL, trainer._best_model_name)
    )
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)

    # One record per run: [hyper-parameters, val metrics, test metrics, checkpoint name].
    result.append([
        {"d_model" : d_model, "num_layers": num_layers},
        metrics_val,
        metrics_test,
        trainer._best_model_name,
    ])

d_model=512, num_layers=1
Ранняя остановка на эпохе 19 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 43.917892441737436, 'war': 51.75834084761046, 'mf1': 41.39487410232722, 'wf1': 53.2704695917091}
Метрики на тестовой выборке:  {'uar': 43.41123425795647, 'war': 51.724137931034484, 'mf1': 39.4281025570958, 'wf1': 54.26205699236203}
d_model=512, num_layers=2
Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 42.30671917587694, 'war': 51.75834084761046, 'mf1': 40.542665177953204, 'wf1': 52.99913589265653}
Метрики на тестовой выборке:  {'uar': 42.21757727276531, 'war': 51.685823754789276, 'mf1': 38.561841665834244, 'wf1': 54.06801714372623}
d_model=512, num_layers=4
Ранняя остановка на эпохе 35 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 44.19337248905267, 'war': 52.84039675383229, 'mf1': 40.98817780535831, 

In [12]:
# Flatten the records collected in `result` (one per sweep run) into a table:
# the three dict-valued columns are expanded to one column per key, and the
# checkpoint name stays as the last column.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики val", "метрики test", "путь"],
)
# The validation and test metric columns are not renamed here, so they share
# labels in the resulting frame.
df = pd.concat(
    [df[dict_col].apply(pd.Series) for dict_col in ("параметры", "метрики val", "метрики test")]
    + [df["путь"]],
    axis=1,
)
# Persist the table next to the checkpoints.
df.to_csv(
    os.path.join(PATH_TO_MODEL, "result_d_model512_num_layers.csv")
)

In [13]:
# Show the flattened results table `df` as this cell's output.
df

Unnamed: 0,d_model,num_layers,uar,war,mf1,wf1,uar.1,war.1,mf1.1,wf1.1,путь
0,512,1,43.917892,51.758341,41.394874,53.27047,43.411234,51.724138,39.428103,54.262057,Mamba_jina_9_52.03_checkpoint.pth
1,512,2,42.306719,51.758341,40.542665,52.999136,42.217577,51.685824,38.561842,54.068017,Mamba_jina_6_52.57_checkpoint.pth
2,512,4,44.193372,52.840397,40.988178,53.579737,40.464605,52.413793,36.979897,54.230565,Mamba_jina_25_53.2_checkpoint.pth
3,512,6,31.849374,48.33183,28.395337,44.750966,32.95376,52.720307,29.929172,49.74671,Mamba_jina_12_48.96_checkpoint.pth
4,512,8,16.674772,44.274121,12.297702,29.400266,17.195579,49.157088,13.600158,34.892426,Mamba_jina_4_44.72_checkpoint.pth
5,512,10,14.285714,42.380523,8.504478,25.229697,14.285714,48.122605,9.282389,31.268491,Mamba_jina_3_42.38_checkpoint.pth


In [26]:
# Combine the per-width depth-sweep summaries (one CSV per d_model value)
# into a single table with unambiguous column names.
columns = [
    "d_model", "num_layers",
    "uar_val", "war_val", "mf1_val", "wf1_val",
    "uar_test", "war_test", "mf1_test", "wf1_test",
    "path",
]
# Each file's columns are renamed positionally *before* concatenating:
# pd.concat aligns frames by column label, so a file whose header differs
# from the others would otherwise yield a NaN-padded union of columns.
# A file with a different number of columns raises ValueError in set_axis.
# The row index is kept exactly as read from each file (first CSV column).
df = pd.concat([
    pd.read_csv(
        os.path.join(PATH_TO_MODEL, f"result_d_model{width}_num_layers.csv"),
        index_col=0,
    ).set_axis(columns, axis=1)
    for width in (32, 64, 128, 256, 512)
])

In [28]:
# Add one summary score per split: the unweighted mean of the four metrics
# (UAR, WAR, macro-F1, weighted-F1), summed left to right and divided by 4.
df['average_val'] = (
    df['uar_val']
    .add(df['war_val'])
    .add(df['mf1_val'])
    .add(df['wf1_val'])
    .div(4.0)
)
df['average_test'] = (
    df['uar_test']
    .add(df['war_test'])
    .add(df['mf1_test'])
    .add(df['wf1_test'])
    .div(4.0)
)

In [29]:
# Show the merged sweep table `df`, including the two average columns.
df

Unnamed: 0,d_model,num_layers,uar_val,war_val,mf1_val,wf1_val,uar_test,war_test,mf1_test,wf1_test,path,average_val,average_test
0,32,1,42.931318,50.856628,39.876425,52.592043,43.882267,51.64751,38.658058,54.186252,Mamba_jina_5_50.23_checkpoint.pth,46.564103,47.093521
1,32,2,43.7608,50.495942,40.68485,51.785158,41.298507,48.16092,36.939715,50.427494,Mamba_jina_8_50.14_checkpoint.pth,46.681687,44.206659
2,32,4,39.686273,49.504058,37.026146,51.039128,38.363627,50.344828,35.358811,52.870411,Mamba_jina_11_50.05_checkpoint.pth,44.313901,44.234419
3,32,6,20.634027,43.733093,18.234502,34.681796,20.772794,47.777778,18.667851,39.008135,Mamba_jina_1_44.0_checkpoint.pth,29.320855,31.556639
4,32,8,24.137222,33.904418,22.254444,33.41567,25.605845,36.666667,22.610534,37.080615,Mamba_jina_7_34.81_checkpoint.pth,28.427939,30.490915
5,32,10,23.914271,37.240757,20.914489,34.281506,25.801594,39.118774,21.422852,37.509207,Mamba_jina_19_36.25_checkpoint.pth,29.087756,30.963107
0,64,1,36.569356,52.479711,35.820847,51.663515,39.924187,55.632184,37.980911,55.300114,Mamba_jina_1_52.57_checkpoint.pth,44.133357,47.209349
1,64,2,44.413629,51.127142,41.030954,52.515864,42.524653,52.10728,38.441023,54.655961,Mamba_jina_16_51.58_checkpoint.pth,47.271897,46.932229
2,64,4,44.131464,53.471596,41.332343,53.494907,41.431457,53.180077,37.460736,54.036947,Mamba_jina_14_53.74_checkpoint.pth,48.107578,46.527304
3,64,6,35.160007,44.18395,33.272429,45.90919,36.284913,46.206897,33.38119,48.803532,Mamba_jina_10_44.45_checkpoint.pth,39.631394,41.169133


### Подбор оптимизатора и learning rate (d_model=256, num_layers=2)

In [30]:
# Registry mapping an optimizer's name to its torch.optim class, so that an
# optimizer can be selected by string key.
optim_dict = {
    optimizer_name: getattr(optim, optimizer_name)
    for optimizer_name in ("Adam", "AdamW", "Rprop", "RMSprop", "RAdam")
}

### Adam

In [31]:
%%capture --no-stdout
# Learning-rate sweep for the Adam optimizer on a 2-layer Mamba classifier
# with d_model=64: train, reload the best checkpoint, score it on validation
# and test, and collect one record per run in `result`.
result = []
optimizer_ = "Adam"
for lr in (1e-3, 1e-4, 1e-5):
    print(f"lr={lr}, optimizer={optimizer_}")

    # A fresh model, optimizer and class-weighted loss for every learning rate.
    model_mamba = Mamba(
        model_name='jina',
        pooling=None,
        num_layers=2,
        d_input=1024,
        d_model=64,
        num_classes=7,
    ).to(device)
    optimizer = optim_dict[optimizer_](
        params=model_mamba.parameters(),
        lr=lr,
    )
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # NOTE(review): 50 and patience=10 look like the epoch budget and the
    # early-stopping patience -- confirm against ModelTrainer's signature.
    trainer = ModelTrainer(
        model_mamba,
        train_dataloader,
        val_dataloader,
        test_dataloader,
        DEVICE,
        50,
        ROUND_LOSS,
        ROUND_ACC,
        optimizer,
        loss_fn,
        patience=10,
    )
    trainer.train(PATH_TO_MODEL)

    # Load the checkpoint file named by trainer._best_model_name back into
    # the model before scoring it.
    checkpoint = torch.load(
        os.path.join(PATH_TO_MODEL, trainer._best_model_name)
    )
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)

    # One record per run: [hyper-parameters, val metrics, test metrics, checkpoint name].
    result.append([
        {"lr" : lr, "optimizer": optimizer_},
        metrics_val,
        metrics_test,
        trainer._best_model_name,
    ])

lr=0.001, optimizer=Adam
Ранняя остановка на эпохе 24 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 39.41372781689666, 'war': 51.75834084761046, 'mf1': 37.489084370875474, 'wf1': 51.00421180998561}
Метрики на тестовой выборке:  {'uar': 39.31945278586066, 'war': 54.09961685823754, 'mf1': 37.19815185000767, 'wf1': 54.24793397991136}
lr=0.0001, optimizer=Adam
Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 42.52625246059524, 'war': 53.29125338142471, 'mf1': 40.688080451521316, 'wf1': 53.737677644248514}
Метрики на тестовой выборке:  {'uar': 41.7956691903345, 'war': 54.94252873563218, 'mf1': 39.18510381013996, 'wf1': 56.275111509053666}
lr=1e-05, optimizer=Adam
Ранняя остановка на эпохе 24 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 39.41591703737254, 'war': 47.8809738503156, 'mf1': 36.56823571280333, 'wf

In [32]:
# Flatten the records collected in `result` (one per sweep run) into a table:
# the three dict-valued columns are expanded to one column per key, and the
# checkpoint name stays as the last column.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики val", "метрики test", "путь"],
)
df = pd.concat(
    [df[dict_col].apply(pd.Series) for dict_col in ("параметры", "метрики val", "метрики test")]
    + [df["путь"]],
    axis=1,
)
# Give the validation and test metric columns distinct, split-suffixed names.
df.columns = [
    "lr", "optimizer",
    "uar_val", "war_val", "mf1_val", "wf1_val",
    "uar_test", "war_test", "mf1_test", "wf1_test",
    "путь",
]
# Persist the table next to the checkpoints.
df.to_csv(
    os.path.join(PATH_TO_MODEL, "result_lr_optimizer_Adam.csv")
)

In [35]:
%%capture --no-stdout
# Learning-rate sweep for the Adam optimizer on a 2-layer Mamba classifier
# with d_model=256: train, reload the best checkpoint, score it on validation
# and test, and collect one record per run in `result`.
result = []
optimizer_ = "Adam"
for lr in (1e-3, 1e-4, 1e-5):
    print(f"lr={lr}, optimizer={optimizer_}")

    # A fresh model, optimizer and class-weighted loss for every learning rate.
    model_mamba = Mamba(
        model_name='jina',
        pooling=None,
        num_layers=2,
        d_input=1024,
        d_model=256,
        num_classes=7,
    ).to(device)
    optimizer = optim_dict[optimizer_](
        params=model_mamba.parameters(),
        lr=lr,
    )
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # NOTE(review): 50 and patience=10 look like the epoch budget and the
    # early-stopping patience -- confirm against ModelTrainer's signature.
    trainer = ModelTrainer(
        model_mamba,
        train_dataloader,
        val_dataloader,
        test_dataloader,
        DEVICE,
        50,
        ROUND_LOSS,
        ROUND_ACC,
        optimizer,
        loss_fn,
        patience=10,
    )
    trainer.train(PATH_TO_MODEL)

    # Load the checkpoint file named by trainer._best_model_name back into
    # the model before scoring it.
    checkpoint = torch.load(
        os.path.join(PATH_TO_MODEL, trainer._best_model_name)
    )
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)

    # One record per run: [hyper-parameters, val metrics, test metrics, checkpoint name].
    result.append([
        {"lr" : lr, "optimizer": optimizer_},
        metrics_val,
        metrics_test,
        trainer._best_model_name,
    ])

lr=0.001, optimizer=Adam
Ранняя остановка на эпохе 19 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 33.862030827044606, 'war': 51.03697024346258, 'mf1': 33.11345356315619, 'wf1': 48.80285717471203}
Метрики на тестовой выборке:  {'uar': 36.907013847936284, 'war': 55.44061302681992, 'mf1': 36.42903213839121, 'wf1': 54.17744560806482}
lr=0.0001, optimizer=Adam
Ранняя остановка на эпохе 19 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 43.263453281036604, 'war': 52.479711451758334, 'mf1': 40.53144136307762, 'wf1': 53.590765924812914}
Метрики на тестовой выборке:  {'uar': 40.95013570320743, 'war': 52.720306513409966, 'mf1': 38.08806521339098, 'wf1': 54.69726145015555}
lr=1e-05, optimizer=Adam
Ранняя остановка на эпохе 31 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 45.44569951446262, 'war': 50.586113615870154, 'mf1': 41.44522912902339

In [36]:
# Flatten the records collected in `result` (one per sweep run) into a table:
# the three dict-valued columns are expanded to one column per key, and the
# checkpoint name stays as the last column.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики val", "метрики test", "путь"],
)
df = pd.concat(
    [df[dict_col].apply(pd.Series) for dict_col in ("параметры", "метрики val", "метрики test")]
    + [df["путь"]],
    axis=1,
)
# Give the validation and test metric columns distinct, split-suffixed names.
df.columns = [
    "lr", "optimizer",
    "uar_val", "war_val", "mf1_val", "wf1_val",
    "uar_test", "war_test", "mf1_test", "wf1_test",
    "путь",
]
# Persist the table next to the checkpoints.
df.to_csv(
    os.path.join(PATH_TO_MODEL, "result_lr_with_optimizer_Adam.csv")
)

In [37]:
# Show the learning-rate sweep table `df` as this cell's output.
df

Unnamed: 0,lr,optimizer,uar_val,war_val,mf1_val,wf1_val,uar_test,war_test,mf1_test,wf1_test,путь
0,0.001,Adam,33.862031,51.03697,33.113454,48.802857,36.907014,55.440613,36.429032,54.177446,Mamba_jina_9_50.5_checkpoint.pth
1,0.0001,Adam,43.263453,52.479711,40.531441,53.590766,40.950136,52.720307,38.088065,54.697261,Mamba_jina_9_52.66_checkpoint.pth
2,1e-05,Adam,45.4457,50.586114,41.445229,52.58597,44.200167,50.536398,38.775381,53.385819,Mamba_jina_21_50.77_checkpoint.pth


### AdamW

In [33]:
%%capture --no-stdout
# Learning-rate sweep for the AdamW optimizer on a 2-layer Mamba classifier
# with d_model=64: train, reload the best checkpoint, score it on validation
# and test, and collect one record per run in `result`.
result = []
optimizer_ = "AdamW"
for lr in (1e-3, 1e-4, 1e-5):
    print(f"lr={lr}, optimizer={optimizer_}")

    # A fresh model, optimizer and class-weighted loss for every learning rate.
    model_mamba = Mamba(
        model_name='jina',
        pooling=None,
        num_layers=2,
        d_input=1024,
        d_model=64,
        num_classes=7,
    ).to(device)
    optimizer = optim_dict[optimizer_](
        params=model_mamba.parameters(),
        lr=lr,
    )
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # NOTE(review): 50 and patience=10 look like the epoch budget and the
    # early-stopping patience -- confirm against ModelTrainer's signature.
    trainer = ModelTrainer(
        model_mamba,
        train_dataloader,
        val_dataloader,
        test_dataloader,
        DEVICE,
        50,
        ROUND_LOSS,
        ROUND_ACC,
        optimizer,
        loss_fn,
        patience=10,
    )
    trainer.train(PATH_TO_MODEL)

    # Load the checkpoint file named by trainer._best_model_name back into
    # the model before scoring it.
    checkpoint = torch.load(
        os.path.join(PATH_TO_MODEL, trainer._best_model_name)
    )
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)

    # One record per run: [hyper-parameters, val metrics, test metrics, checkpoint name].
    result.append([
        {"lr" : lr, "optimizer": optimizer_},
        metrics_val,
        metrics_test,
        trainer._best_model_name,
    ])

lr=0.001, optimizer=AdamW
Ранняя остановка на эпохе 14 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 34.85594672498548, 'war': 54.283137962128045, 'mf1': 37.2952315914008, 'wf1': 51.28705605937237}
Метрики на тестовой выборке:  {'uar': 34.77925390124557, 'war': 57.8544061302682, 'mf1': 37.123378014901725, 'wf1': 55.162182793724}
lr=0.0001, optimizer=AdamW
Ранняя остановка на эпохе 17 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 41.9878001366672, 'war': 51.21731289449954, 'mf1': 39.72301852823247, 'wf1': 52.30674923795529}
Метрики на тестовой выборке:  {'uar': 43.17025032221932, 'war': 54.48275862068965, 'mf1': 40.03068471549209, 'wf1': 56.312189853717584}
lr=1e-05, optimizer=AdamW
Ранняя остановка на эпохе 15 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 36.50768809810788, 'war': 45.98737601442741, 'mf1': 33.55853788905785, 'wf1

In [34]:
# Flatten the records collected in `result` (one per sweep run) into a table:
# the three dict-valued columns are expanded to one column per key, and the
# checkpoint name stays as the last column.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики val", "метрики test", "путь"],
)
df = pd.concat(
    [df[dict_col].apply(pd.Series) for dict_col in ("параметры", "метрики val", "метрики test")]
    + [df["путь"]],
    axis=1,
)
# Give the validation and test metric columns distinct, split-suffixed names.
df.columns = [
    "lr", "optimizer",
    "uar_val", "war_val", "mf1_val", "wf1_val",
    "uar_test", "war_test", "mf1_test", "wf1_test",
    "путь",
]
# Persist the table next to the checkpoints.
df.to_csv(
    os.path.join(PATH_TO_MODEL, "result_lr_optimizer_AdamW.csv")
)

In [38]:
%%capture --no-stdout
# Learning-rate sweep for the AdamW optimizer on a 2-layer Mamba classifier
# with d_model=256: train, reload the best checkpoint, score it on validation
# and test, and collect one record per run in `result`.
result = []
optimizer_ = "AdamW"
for lr in (1e-3, 1e-4, 1e-5):
    print(f"lr={lr}, optimizer={optimizer_}")

    # A fresh model, optimizer and class-weighted loss for every learning rate.
    model_mamba = Mamba(
        model_name='jina',
        pooling=None,
        num_layers=2,
        d_input=1024,
        d_model=256,
        num_classes=7,
    ).to(device)
    optimizer = optim_dict[optimizer_](
        params=model_mamba.parameters(),
        lr=lr,
    )
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # NOTE(review): 50 and patience=10 look like the epoch budget and the
    # early-stopping patience -- confirm against ModelTrainer's signature.
    trainer = ModelTrainer(
        model_mamba,
        train_dataloader,
        val_dataloader,
        test_dataloader,
        DEVICE,
        50,
        ROUND_LOSS,
        ROUND_ACC,
        optimizer,
        loss_fn,
        patience=10,
    )
    trainer.train(PATH_TO_MODEL)

    # Load the checkpoint file named by trainer._best_model_name back into
    # the model before scoring it.
    checkpoint = torch.load(
        os.path.join(PATH_TO_MODEL, trainer._best_model_name)
    )
    model_mamba.load_state_dict(checkpoint['model_state_dict'])

    metrics_val = evaluate_metrics(model_mamba, val_dataloader)
    print("Метрики на валидационной выборке: ", metrics_val)
    metrics_test = evaluate_metrics(model_mamba, test_dataloader)
    print("Метрики на тестовой выборке: ", metrics_test)

    # One record per run: [hyper-parameters, val metrics, test metrics, checkpoint name].
    result.append([
        {"lr" : lr, "optimizer": optimizer_},
        metrics_val,
        metrics_test,
        trainer._best_model_name,
    ])

lr=0.001, optimizer=AdamW
Ранняя остановка на эпохе 11 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 30.86510302601736, 'war': 48.512173128944994, 'mf1': 26.905084493292676, 'wf1': 42.79956507634093}
Метрики на тестовой выборке:  {'uar': 28.79735408083514, 'war': 50.72796934865901, 'mf1': 25.227974054305502, 'wf1': 45.79891675586527}
lr=0.0001, optimizer=AdamW
Ранняя остановка на эпохе 15 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 40.35005961808637, 'war': 52.84039675383229, 'mf1': 39.512088516682184, 'wf1': 53.21768690271795}
Метрики на тестовой выборке:  {'uar': 43.061347382614116, 'war': 55.28735632183908, 'mf1': 40.20593683000967, 'wf1': 56.61343156710402}
lr=1e-05, optimizer=AdamW
Ранняя остановка на эпохе 26 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке:  {'uar': 45.000601302282085, 'war': 49.413886384129846, 'mf1': 40.31163163822

In [39]:
# Flatten the records collected in `result` (one per sweep run) into a table:
# the three dict-valued columns are expanded to one column per key, and the
# checkpoint name stays as the last column.
df = pd.DataFrame(
    result,
    columns=["параметры", "метрики val", "метрики test", "путь"],
)
df = pd.concat(
    [df[dict_col].apply(pd.Series) for dict_col in ("параметры", "метрики val", "метрики test")]
    + [df["путь"]],
    axis=1,
)
# Give the validation and test metric columns distinct, split-suffixed names.
df.columns = [
    "lr", "optimizer",
    "uar_val", "war_val", "mf1_val", "wf1_val",
    "uar_test", "war_test", "mf1_test", "wf1_test",
    "путь",
]
# Persist the table next to the checkpoints.
df.to_csv(
    os.path.join(PATH_TO_MODEL, "result_lr_with_optimizer_AdamW.csv")
)

In [40]:
# Show the learning-rate sweep table `df` as this cell's output.
df

Unnamed: 0,lr,optimizer,uar_val,war_val,mf1_val,wf1_val,uar_test,war_test,mf1_test,wf1_test,путь
0,0.001,AdamW,30.865103,48.512173,26.905084,42.799565,28.797354,50.727969,25.227974,45.798917,Mamba_jina_1_48.6_checkpoint.pth
1,0.0001,AdamW,40.35006,52.840397,39.512089,53.217687,43.061347,55.287356,40.205937,56.613432,Mamba_jina_5_53.02_checkpoint.pth
2,1e-05,AdamW,45.000601,49.413886,40.311632,51.774946,44.681665,49.54023,38.147014,52.695483,Mamba_jina_16_50.05_checkpoint.pth
