In [1]:
# Подавление предупреждений
import warnings
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

# Импорт необходимых библиотек
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM
import torch
import torch.nn.functional as F
from torch import Tensor
from einops import rearrange
from typing import Tuple, Callable
from torch.autograd import Function
import gc
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
pd.set_option('display.max_columns', None)

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Данные

In [3]:
from torch.utils.data import Dataset, DataLoader 
import numpy as np 
import math 

class Dataset_MELD_RESD(): 
    def __init__(self, part='train', transform=None): 
        if part == 'train':
            df_meld = pd.read_csv("train_sent_emo.csv")[['Utterance', 'Emotion']]
            df_meld.columns = ['text', 'emotion']
            df_resd = pd.read_csv("train.csv")[['text', 'emotion']]
            df = pd.concat([df_meld, df_resd[0:int(len(df_resd)*0.7)]], axis=0)
        elif part == 'dev_meld':
            df = pd.read_csv("dev_sent_emo.csv")[['Utterance', 'Emotion']]
            df.columns = ['text', 'emotion']
        elif part == 'dev_resd':
            df = pd.read_csv("train.csv")
            df = df[int(len(df)*0.7):]
        elif part == 'test_resd':
            df = pd.read_csv("test.csv")
        elif part == 'test_meld':
            df = pd.read_csv("test_sent_emo.csv")[['Utterance', 'Emotion']]
            df.columns = ['text', 'emotion']
        elif part == 'test_resd':
            df = pd.read_csv("test.csv")
        else:
            raise ValueError('Unknown part of Dataset (train / test_meld / test_resd)')
        self.x = list(df['text'].values)
        emotion_mapping = {
            'anger': 0,
            'disgust': 1,
            'fear': 2,
            'joy': 3,
            'happiness': 3,
            'neutral': 4,
            'sadness': 5,
            'surprise': 6,
            'enthusiasm': 6
        }

        self.y = torch.tensor(df['emotion'].apply(lambda x : emotion_mapping[x]).values).to(device)
        self.n_samples = df.shape[0]

    def __getitem__(self, index): 
        return self.x[index], self.y[index] 
        
    def __len__(self): 
        return self.n_samples 

In [4]:
BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
dev_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_meld'), batch_size=BATCH_SIZE, shuffle=False)
dev_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_resd'), batch_size=BATCH_SIZE, shuffle=False)
test_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_meld'), batch_size=BATCH_SIZE, shuffle=False)
test_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_resd'), batch_size=BATCH_SIZE, shuffle=False)

### Feature Extractor

In [6]:
class Embedding():
    def __init__(self, model_name='jina', pooling=None):
        self.model_name = model_name
        self.pooling = pooling
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if model_name == 'jina':
            self.tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3", code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True)
            self.model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True).to(self.device)
        elif model_name == 'xlm-roberta-base':
            self.tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
            self.model = AutoModel.from_pretrained('xlm-roberta-base').to(self.device)
        elif model_name == 'canine-c':
            self.tokenizer = AutoTokenizer.from_pretrained('google/canine-c')
            self.model = AutoModel.from_pretrained('google/canine-c').to(self.device)
        else:
            raise ValueError('Unknown name of Embedding')
    def _mean_pooling(self, X):
        def mean_pooling(model_output, attention_mask):
            token_embeddings = model_output[0]
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        encoded_input = self.tokenizer(X, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.unsqueeze(1)
    
    def get_embeddings(self, X):
        if self.pooling is None:
            if self.model_name == 'canine-c':
                max_len = 329
            else:
                max_len = 95
            encoded_input = self.tokenizer(X, padding=True, truncation=True, return_tensors='pt').to(self.device)
            with torch.no_grad():
                features = self.model(**encoded_input)[0].detach().cpu().float().numpy()
            res = np.pad(features[:, :max_len, :], ((0, 0), (0, max(0, max_len - features.shape[1])), (0, 0)), "constant")
            return torch.tensor(res)
        elif self.pooling == 'mean':
            return self._mean_pooling(X)
        else:
            raise ValueError('Unknown type of pooling')

### Метрики

In [7]:
def evaluate_metrics(model, test_dataloader):
    model.eval()
    y_test = []
    y_predict = []
    with torch.no_grad():
        for batch, (batch_X, targets) in enumerate(test_dataloader, 1):
            y_test.extend(list(map(int, targets)))
            output = model(batch_X)
            _, predictions = torch.max(output, dim=1)
            y_predict.extend(list(map(int, predictions)))
        # Unweighted Average Recall (UAR)
        uar = recall_score(y_test, y_predict, average='macro')
        # Weighted Average Recall (WAR)
        war = recall_score(y_test, y_predict, average='weighted')
        # Macro F1-score (MF1)
        mf1 = f1_score(y_test, y_predict, average='macro')
        # Weighted F1-score (WF1)
        wf1 = f1_score(y_test, y_predict, average='weighted')
    return {'uar': 100.0 * uar, 'war': 100.0 * war, 'mf1': 100.0 * mf1, 'wf1': 100.0 * wf1}

# Обучение

In [8]:
from dataclasses import dataclass
from typing import ClassVar
from typing import List, Dict, Any, Tuple, Optional
@dataclass
class ModelTrainer:
    model: 'typing.Any'
    train_dataloader: DataLoader
    dev_meld_dataloader: DataLoader
    dev_resd_dataloader: DataLoader
    test_meld_dataloader: DataLoader
    test_resd_dataloader: DataLoader
    device: torch.device
    epochs: int
    round_loss: int
    round_acc: int

    optimizer: torch.optim
    loss_fn: 'typing.Any'
    
    patience: int = 10 # Ранняя остановка обучения

    class_names: ClassVar[Optional[List[str]]] = None # Список имен классов

    def __post_init__(self):
        
        # История обучения и тестирования
        self.__history = pd.DataFrame({
            "train_avg": [], # Средние метрики на тренировочной выборке
            "dev_avg": [], # Средние метрики на валидационной выборке
            "train_loss": [], # Loss на тренировочной выборке
            "dev_loss": [], # Loss на валидационной выборке
        })

        # Количество шагов в одной эпохе
        self.__train_steps = len(self.train_dataloader)
        self.__dev_steps = len(self.dev_meld_dataloader) + len(self.dev_resd_dataloader)
        self.__test_steps = len(self.test_meld_dataloader) + len(self.test_resd_dataloader)

        self.__best_dev_avg = 0
        self.__no_improvement_count = 0
        
        self.loss_fn = self.loss_fn

    @property
    def history(self) -> pd.DataFrame:
        """Получение DataFrame историей обучения и тестирования

        Returns:
            pd.DataFrame: **DataFrame** c историей обучения и тестирования
        """

        return self.__history

    @classmethod
    def get_model_logits(cls, logits: torch.Tensor) -> torch.Tensor:
        """Получение логитов модели в зависимости от функции потерь

        Args:
            logits (torch.Tensor): Входные логиты

        Returns:
            torch.Tensor: Обработанные логиты
        """

        if isinstance(cls.loss_fn, nn.NLLLoss):
            log_softmax = nn.LogSoftmax(dim = 1)
            return log_softmax(logits)
        elif isinstance(cls.loss_fn, nn.CrossEntropyLoss):
            return logits

    def _is_best_model(self, dev_avg: float) -> bool:
        """Проверка, является ли текущая модель лучшей на основе метрик валидации

        Args:
            test_accuracy (float): Текущая точность тестирования

        Returns:
            bool: True, если текущая модель лучшая, иначе False
        """

        try:
            max_dev_avg = max(self.__history["dev_avg"])
        except ValueError:
            max_dev_avg = 0
        return dev_avg > max_dev_avg

    def _save_model(self, epoch: int, path_to_model: str, test_accuracy: float, loss: torch.Tensor) -> None:
        """Сохранение модели

        Args:
            epoch (int): Текущая эпоха
            path_to_model (str): Путь для сохранения модели
            test_accuracy (float): Точность на тестовой выборке
            loss (torch.Tensor): Значение потерь
        """
        
        os.makedirs(path_to_model, exist_ok = True)
        self._best_model_name = f"{self.model.__class__.__name__}_{self.model.model_name}_{epoch}_{test_accuracy}_checkpoint.pth"

        torch.save({
            "epoch": epoch,
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "test_loss": loss,
        }, os.path.join(path_to_model, f"{self.model.__class__.__name__}_{self.model.model_name}_{epoch}_{test_accuracy}_checkpoint.pth"))
    
    # Процесс обучения
    def train(self, path_to_model: str) -> None:
        """Процесс обучения

        Args:
            path_to_model (str): Путь для сохранения моделей

        Returns:
            None
        """
        
        losses_train_list = []
        losses_dev_list = []
        accuracy_train_list = []
        accuracy_dev_list = []

        for epoch in range(1, self.epochs + 1):
            with torch.no_grad():
                torch.cuda.empty_cache()
            self.model.train() # Установка модели в режим обучения
            # Сумма Loss
            total_train_loss = 0
            total_dev_loss = 0
            total_dev_loss_meld = 0
            total_dev_loss_resd = 0
            # Сумма точности
            train_accuracy = 0
            dev_accuracy = 0
            dev_accuracy_meld = 0
            dev_accuracy_resd = 0
            # Сумма метрик
            train_uar = 0
            train_war = 0
            train_mf1 = 0
            train_wf1 = 0
            dev_uar_meld = 0
            dev_war_meld = 0
            dev_mf1_meld = 0
            dev_wf1_meld = 0
            dev_uar_resd = 0
            dev_war_resd = 0
            dev_mf1_resd = 0
            dev_wf1_resd = 0

            # Проход по всем тренировочным пакетам
            with tqdm(total = self.__train_steps, desc = f"Эпоха {epoch}", unit = "batch") as pbar_train:
                for batch, (batch_X, targets) in enumerate(self.train_dataloader, 1):
                    targets = targets.to(device)
                    logits = self.model(batch_X)
                    loss = self.loss_fn(logits, targets) # Ошибка предсказаний

                    # Обратное распространение для обновления весов
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
        
                    total_train_loss += loss.item() # Потеря
                    # Метрики
                    train_uar += 100.0 * recall_score(targets.cpu(), logits.argmax(1).cpu(), average='macro')
                    train_war += 100.0 * recall_score(targets.cpu(), logits.argmax(1).cpu(), average='weighted')
                    train_mf1 += 100.0 * f1_score(targets.cpu(), logits.argmax(1).cpu(), average='macro')
                    train_wf1 += 100.0 * f1_score(targets.cpu(), logits.argmax(1).cpu(), average='weighted')
                    train_accuracy += (logits.argmax(1) == targets).type(torch.float).sum().item()
        
                    pbar_train.update(1)
                    with torch.no_grad():
                        torch.cuda.empty_cache()

                # Средняя потеря
                avg_train_loss = round(total_train_loss / batch, self.round_loss)
                losses_train_list.append(avg_train_loss)
        
                # Точность
                train_accuracy = round(train_accuracy / len(self.train_dataloader.dataset) * 100, self.round_acc)
                
                
                train_uar = round(train_uar / len(self.train_dataloader), self.round_acc)
                train_war = round(train_war / len(self.train_dataloader), self.round_acc)
                train_mf1 = round(train_mf1 / len(self.train_dataloader), self.round_acc)
                train_wf1 = round(train_wf1 / len(self.train_dataloader), self.round_acc)
                
                train_avg_metrics = 0.25 * (train_uar + train_war + train_mf1 + train_wf1)
                accuracy_train_list.append(train_avg_metrics)
        
                pbar_train.set_postfix({
                    "uar": train_uar,
                    "war" : train_war,
                    "mf1" : train_mf1,
                    "wf1" : train_wf1,
                    "avg" : train_avg_metrics,
                    "Средняя потеря": avg_train_loss
                })
            
            
            # Установка модели в режим предсказаний
            self.model.eval()
        
            # Предсказания на валидационной выборке
            with torch.no_grad():
                with tqdm(total = self.__dev_steps, desc = f"Тестирование {epoch}", unit = "batch") as pbar_dev:
                    num_batches = 0
                    for batch, (batch_X, targets) in enumerate(self.dev_meld_dataloader, 1):
                        targets = targets.to(device)
                        logits = self.model(batch_X)
                        loss = self.loss_fn(logits, targets) # Ошибка предсказаний
                        
                        total_dev_loss += loss.item() # Потеря
                        total_dev_loss_meld += loss.item()
                        dev_accuracy_meld += (logits.argmax(1) == targets).type(torch.float).sum().item()
                        # Метрики
                        dev_uar_meld += 100.0 * recall_score(targets.cpu(), logits.argmax(1).cpu(), average='macro')
                        dev_war_meld += 100.0 * recall_score(targets.cpu(), logits.argmax(1).cpu(), average='weighted')
                        dev_mf1_meld += 100.0 * f1_score(targets.cpu(), logits.argmax(1).cpu(), average='macro')
                        dev_wf1_meld += 100.0 * f1_score(targets.cpu(), logits.argmax(1).cpu(), average='weighted')
        
                        pbar_dev.update(1)
                        with torch.no_grad():
                            torch.cuda.empty_cache()
                    num_batches += batch
                    batch_meld = batch
                    for batch, (batch_X, targets) in enumerate(self.dev_resd_dataloader, 1):
                        targets = targets.to(device)
                        logits = self.model(batch_X)
                        loss = self.loss_fn(logits, targets) # Ошибка предсказаний
                        
                        total_dev_loss += loss.item() # Потеря
                        total_dev_loss_resd += loss.item()
                        # Количество правильных предсказаний
                        dev_accuracy_resd += (logits.argmax(1) == targets).type(torch.float).sum().item()
                        # Метрики
                        dev_uar_resd += 100.0 * recall_score(targets.cpu(), logits.argmax(1).cpu(), average='macro')
                        dev_war_resd += 100.0 * recall_score(targets.cpu(), logits.argmax(1).cpu(), average='weighted')
                        dev_mf1_resd += 100.0 * f1_score(targets.cpu(), logits.argmax(1).cpu(), average='macro')
                        dev_wf1_resd += 100.0 * f1_score(targets.cpu(), logits.argmax(1).cpu(), average='weighted')
        
                        pbar_dev.update(1)
                        with torch.no_grad():
                            torch.cuda.empty_cache()
                    num_batches += batch
                    # Средняя потеря
                    avg_dev_loss = round(total_dev_loss / num_batches, self.round_loss)
                    avg_dev_loss = round(0.5 * (total_dev_loss_meld / batch_meld + total_dev_loss_resd / batch), self.round_loss)
                    losses_dev_list.append(avg_dev_loss)
        
                    # Точность
                    dev_accuracy = round(0.5 * (dev_accuracy_meld / len(self.dev_meld_dataloader.dataset) * 100 + dev_accuracy_resd / len(self.dev_resd_dataloader.dataset) * 100), self.round_acc)
                
                    dev_uar_meld = round(dev_uar_meld / len(self.dev_meld_dataloader), self.round_acc)
                    dev_war_meld = round(dev_war_meld / len(self.dev_meld_dataloader), self.round_acc)
                    dev_mf1_meld = round(dev_mf1_meld / len(self.dev_meld_dataloader), self.round_acc)
                    dev_wf1_meld = round(dev_wf1_meld / len(self.dev_meld_dataloader), self.round_acc)
                    
                    dev_uar_resd = round(dev_uar_resd / len(self.dev_resd_dataloader), self.round_acc)
                    dev_war_resd = round(dev_war_resd / len(self.dev_resd_dataloader), self.round_acc)
                    dev_mf1_resd = round(dev_mf1_resd / len(self.dev_resd_dataloader), self.round_acc)
                    dev_wf1_resd = round(dev_wf1_resd / len(self.dev_resd_dataloader), self.round_acc)
                    
                    
                    dev_uar = 0.5 * (dev_uar_meld + dev_uar_resd)
                    dev_war = 0.5 * (dev_war_meld + dev_war_resd)
                    dev_mf1 = 0.5 * (dev_mf1_meld + dev_mf1_resd)
                    dev_wf1 = 0.5 * (dev_wf1_meld + dev_wf1_resd)
                    
                    dev_avg_metrics = 0.25 * (dev_uar + dev_war + dev_mf1 + dev_wf1)
                    accuracy_dev_list.append(dev_avg_metrics)
                    
                    pbar_dev.set_postfix({
                        "uar": dev_uar,
                        "war" : dev_war,
                        "mf1" : dev_mf1,
                        "wf1" : dev_wf1,
                        "avg" : dev_avg_metrics,
                        "Средняя потеря": avg_dev_loss
                    })
            
            if self._is_best_model(dev_avg_metrics):
                self._save_model(epoch, path_to_model, round(dev_avg_metrics, self.round_acc), avg_dev_loss)
                self.__best_dev_avg = dev_avg_metrics
                self.__no_improvement_count = 0
            else:
                self.__no_improvement_count += 1

            # Добавлениие данных в историю обучения
            new_row = pd.Series([train_avg_metrics, dev_avg_metrics, avg_train_loss, avg_dev_loss], index = self.__history.columns)
            self.__history = pd.concat([self.__history, new_row.to_frame().T], ignore_index = True)

            if self.__no_improvement_count >= self.patience:
                print(f"Ранняя остановка на эпохе {epoch} из-за отсутствия улучшения точности на тестовой выборке")
                break
        '''checkpoint = torch.load(os.path.join(path_to_model, self._best_model_name))
        self.model.load_state_dict(checkpoint['model_state_dict'])
        metrics_dev_meld = evaluate_metrics(self.model, dev_meld_dataloader)
        metrics_dev_resd = evaluate_metrics(self.model, dev_resd_dataloader)
        print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
        print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
        metrics_test_meld = evaluate_metrics(self.model, test_meld_dataloader)
        metrics_test_resd = evaluate_metrics(self.model, test_resd_dataloader)
        print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
        print("Метрики на тестовой выборке RESD: ", metrics_test_resd)'''
        '''# Визуализация графиков потерь и точности
        plt.figure(figsize=(12, 6))

        plt.subplot(1, 2, 1)
        plt.plot(losses_train_list, label = 'Потери на тренировочной выборке')
        plt.plot(losses_dev_list, label = 'Потери на валидационной выборке')
        plt.title('Потери во время обучения')
        plt.xlabel('Эпоха')
        plt.ylabel('Потери')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(accuracy_train_list, label = 'Средние метрики на тренировочной выборке')
        plt.plot(accuracy_dev_list, label = 'Средние метрики на валидационной выборке')
        plt.title('Средние метрики во время обучения')
        plt.xlabel('Эпоха')
        plt.ylabel('Точность')
        plt.legend()

        plt.tight_layout()
        plt.show()'''

    # Получение хэш-значения
    def __hash__(self):
        return id(self)

In [2]:
EPOCHS = 50 # Количество эпох
BATCH_SIZE = 32 # Размер выборки (пакета)
LEARNING_RATE = 1e-4 # Скорость обучения
ROUND_ACC = 2 # Знаков Accuracy после запятой
ROUND_LOSS = 7 # Знаков Loss после запятой
ROOT_DIR = os.path.join(".")
PATH_TO_MODEL = os.path.join(ROOT_DIR, "Models_transformer")

class_weight

In [10]:
from sklearn.utils.class_weight import compute_class_weight
y = []
for batch, (batch_X, targets) in enumerate(train_dataloader, 1):
    y.extend(list(map(int, targets)))
class_weights = torch.tensor(compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y), dtype=torch.float).to(device)

In [11]:
class TransformerModelWithAttention(nn.Module):
    def __init__(self, num_classes, model_name='jina', pooling=None, input_dim = 1024, hidden_dim=128, num_heads = 4, num_layers = 8, dropout = 0.1):
        super(TransformerModelWithAttention, self).__init__()
        self.model_name = model_name
        embed = Embedding(model_name, pooling)
        self.embedding = embed.get_embeddings
        self.in_layer = nn.Linear(input_dim, hidden_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 500, hidden_dim))
        encoder_layer = nn.TransformerEncoderLayer(d_model = hidden_dim, nhead = num_heads, dim_feedforward = hidden_dim, dropout = dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers = num_layers)
        self.fc_out = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = torch.tensor(self.embedding(x)).to(device)
        x = self.in_layer(x)
        batch_size, seq_len, _ = x.size()
        x = x + self.positional_encoding[:, :seq_len, :]
        encoder_output = self.transformer_encoder(x).to(device)
        x = encoder_output.mean(dim = 1) # Глобальное усреднение для классификации
        return self.fc_out(x)

#### Transformer + jina

In [17]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 1
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=num_layers, input_dim=1024, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=1, num_heads=4
Ранняя остановка на эпохе 27 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 43.46663897219806, 'war': 49.864743011722275, 'mf1': 40.23581729535014, 'wf1': 51.757774191446146}
Метрики на валидационной выборке RESD:  {'uar': 31.262822037027366, 'war': 31.343283582089555, 'mf1': 29.671806020638343, 'wf1': 30.112172261083085}
Метрики на тестовой выборке MELD:  {'uar': 42.115623724456256, 'war': 50.57471264367817, 'mf1': 37.834699960123, 'wf1': 53.226001720734196}
Метрики на тестовой выборке RESD:  {'uar': 32.25226764700449, 'war': 32.857142857142854, 'mf1': 30.783826642031293, 'wf1': 31.223810752744264}
hidden_dim=128, num_layers=1, num_heads=4
Ранняя остановка на эпохе 25 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 42.119619490504874, 'war': 51.21731289449954, 'mf1': 39.291794877489274, 'wf1': 52.45689829434617}
Метрики на валидацион

In [18]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"))

In [21]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 2
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=num_layers, input_dim=1024, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=2, num_heads=4
Ранняя остановка на эпохе 37 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 44.330698623123205, 'war': 52.209197475202885, 'mf1': 41.61987185609153, 'wf1': 53.62548211380194}
Метрики на валидационной выборке RESD:  {'uar': 33.40454442646624, 'war': 33.43283582089553, 'mf1': 33.05372691703453, 'wf1': 33.300285090027316}
Метрики на тестовой выборке MELD:  {'uar': 40.366638836108706, 'war': 52.1455938697318, 'mf1': 37.836189873677995, 'wf1': 54.72746148638371}
Метрики на тестовой выборке RESD:  {'uar': 30.749697493118543, 'war': 31.071428571428573, 'mf1': 30.534011057814897, 'wf1': 30.804202620098692}
hidden_dim=128, num_layers=2, num_heads=4
Ранняя остановка на эпохе 30 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.24194511797497, 'war': 52.569882777276824, 'mf1': 39.983268965555666, 'wf1': 53.03049552620104}
Метрики на валидационн

In [22]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"))

In [27]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 3
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=num_layers, input_dim=1024, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=3, num_heads=4
Ранняя остановка на эпохе 30 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 42.577722886413966, 'war': 49.32371505861136, 'mf1': 39.47999022588684, 'wf1': 51.10059826371444}
Метрики на валидационной выборке RESD:  {'uar': 31.646654797184574, 'war': 32.23880597014925, 'mf1': 31.229030557051452, 'wf1': 31.83764587305881}
Метрики на тестовой выборке MELD:  {'uar': 42.425191554633955, 'war': 49.57854406130268, 'mf1': 37.751790934995576, 'wf1': 52.2778985029318}
Метрики на тестовой выборке RESD:  {'uar': 32.45302804513331, 'war': 33.214285714285715, 'mf1': 31.93203157523915, 'wf1': 32.55048340794771}
hidden_dim=128, num_layers=3, num_heads=4
Ранняя остановка на эпохе 26 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 42.86118877437781, 'war': 49.413886384129846, 'mf1': 39.8704072595689, 'wf1': 50.94731594087768}
Метрики на валидационной вы

In [28]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"))

In [11]:
columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"), index_col=0)])
df.columns=columns

In [12]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0

In [14]:
df.sort_values(['average_test_resd', 'average_test_meld'] , ascending=False)

Unnamed: 0,hidden_dim,num_layers,num_heads,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd
0,64,3,4,42.577723,49.323715,39.47999,51.100598,31.646655,32.238806,31.229031,...,52.277899,32.453028,33.214286,31.932032,32.550483,TransformerModelWithAttention_jina_20_37.76_ch...,45.620507,31.738034,45.508356,32.537457
3,512,3,4,43.020185,50.045086,39.72668,51.697128,34.905402,35.223881,34.822099,...,52.73965,31.675392,32.5,31.384693,32.178948,TransformerModelWithAttention_jina_9_39.73_che...,46.12227,35.050744,44.820289,31.934758
3,512,1,4,41.098528,48.602344,38.872358,50.177844,33.389364,34.328358,32.628789,...,53.749205,31.960182,32.5,31.285603,31.614452,TransformerModelWithAttention_jina_10_38.61_ch...,44.687768,33.426201,46.173178,31.840059
0,64,1,4,43.466639,49.864743,40.235817,51.757774,31.262822,31.343284,29.671806,...,53.226002,32.252268,32.857143,30.783827,31.223811,TransformerModelWithAttention_jina_17_37.69_ch...,46.331243,30.597521,45.93776,31.779262
2,256,3,4,39.451823,51.397656,38.518519,51.744103,32.429783,33.134328,32.430134,...,54.802795,31.792135,32.142857,31.2181,31.702578,TransformerModelWithAttention_jina_19_38.97_ch...,45.278025,32.762989,47.010831,31.713917
0,64,2,4,44.330699,52.209197,41.619872,53.625482,33.404544,33.432836,33.053727,...,54.727461,30.749697,31.071429,30.534011,30.804203,TransformerModelWithAttention_jina_27_39.39_ch...,47.946313,33.297848,46.268971,30.789835
1,128,1,4,42.119619,51.217313,39.291795,52.456898,31.214663,31.343284,30.348491,...,54.131643,30.752306,31.071429,29.969587,30.33499,TransformerModelWithAttention_jina_15_37.98_ch...,46.271406,30.936837,46.686892,30.532078
1,128,3,4,42.861189,49.413886,39.870407,50.947316,34.412596,34.626866,33.632314,...,51.477209,30.51718,30.357143,29.774488,29.826229,TransformerModelWithAttention_jina_16_38.89_ch...,45.7732,34.172809,44.520899,30.11876
3,512,2,4,40.869248,48.061317,36.714518,49.117176,32.592706,32.537313,32.615003,...,53.010923,29.633582,30.0,29.312895,29.909241,TransformerModelWithAttention_jina_12_37.91_ch...,43.690565,32.672798,45.597863,29.71393
2,256,1,4,43.283556,52.569883,40.992918,53.602777,33.952395,33.432836,33.06582,...,54.368431,29.343185,29.642857,28.750894,28.967176,TransformerModelWithAttention_jina_17_39.55_ch...,47.612283,33.394952,46.062207,29.176028


In [20]:
%%capture --no-stdout
result = []
for num_heads in [1, 2, 8]:
    for (hidden_dim, num_layers) in [(256, 3), (64, 3)]:
        print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
        model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=num_layers, input_dim=1024, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
        optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
        trainer.train(PATH_TO_MODEL)
        checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
        model_transformer.load_state_dict(checkpoint['model_state_dict'])
        metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
        metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
        print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
        print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
        metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
        metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
        print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
        print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
        result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=256, num_layers=3, num_heads=1
Ранняя остановка на эпохе 21 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 42.999125198553934, 'war': 48.151487826871055, 'mf1': 38.71840246546759, 'wf1': 50.32936563509476}
Метрики на валидационной выборке RESD:  {'uar': 32.09865105462475, 'war': 32.83582089552239, 'mf1': 31.499666293694474, 'wf1': 31.950044874262346}
Метрики на тестовой выборке MELD:  {'uar': 39.75984505599082, 'war': 48.35249042145594, 'mf1': 36.13982278098257, 'wf1': 51.977547887751165}
Метрики на тестовой выборке RESD:  {'uar': 29.28365395470659, 'war': 30.357142857142854, 'mf1': 28.57165457776051, 'wf1': 29.224151173345565}
hidden_dim=64, num_layers=3, num_heads=1
Ранняя остановка на эпохе 38 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 42.74885783902807, 'war': 50.3155996393147, 'mf1': 39.301317775209775, 'wf1': 51.74606070793991}
Метрики на валидационной 

In [23]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_heads.csv"))

In [24]:
columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_heads.csv"), index_col=0)])
df.columns=columns

In [26]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0

In [27]:
df.sort_values(['average_test_resd', 'average_test_meld'] , ascending=False)

Unnamed: 0,hidden_dim,num_layers,num_heads,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd
4,256,3,8,41.382602,52.38954,39.443991,52.827297,28.474246,28.656716,28.083259,...,55.603465,33.801203,33.928571,33.438886,33.732468,TransformerModelWithAttention_jina_10_37.51_ch...,46.510857,28.426474,47.382241,33.725282
0,64,3,4,42.577723,49.323715,39.47999,51.100598,31.646655,32.238806,31.229031,...,52.277899,32.453028,33.214286,31.932032,32.550483,TransformerModelWithAttention_jina_20_37.76_ch...,45.620507,31.738034,45.508356,32.537457
3,512,3,4,43.020185,50.045086,39.72668,51.697128,34.905402,35.223881,34.822099,...,52.73965,31.675392,32.5,31.384693,32.178948,TransformerModelWithAttention_jina_9_39.73_che...,46.12227,35.050744,44.820289,31.934758
3,512,1,4,41.098528,48.602344,38.872358,50.177844,33.389364,34.328358,32.628789,...,53.749205,31.960182,32.5,31.285603,31.614452,TransformerModelWithAttention_jina_10_38.61_ch...,44.687768,33.426201,46.173178,31.840059
0,64,1,4,43.466639,49.864743,40.235817,51.757774,31.262822,31.343284,29.671806,...,53.226002,32.252268,32.857143,30.783827,31.223811,TransformerModelWithAttention_jina_17_37.69_ch...,46.331243,30.597521,45.93776,31.779262
2,256,3,4,39.451823,51.397656,38.518519,51.744103,32.429783,33.134328,32.430134,...,54.802795,31.792135,32.142857,31.2181,31.702578,TransformerModelWithAttention_jina_19_38.97_ch...,45.278025,32.762989,47.010831,31.713917
0,64,2,4,44.330699,52.209197,41.619872,53.625482,33.404544,33.432836,33.053727,...,54.727461,30.749697,31.071429,30.534011,30.804203,TransformerModelWithAttention_jina_27_39.39_ch...,47.946313,33.297848,46.268971,30.789835
3,64,3,2,41.5192,51.487827,40.052388,52.585926,32.710997,33.134328,32.620004,...,54.574755,30.839495,31.071429,30.454192,30.631533,TransformerModelWithAttention_jina_29_38.95_ch...,46.411335,32.888586,46.79424,30.749162
1,128,1,4,42.119619,51.217313,39.291795,52.456898,31.214663,31.343284,30.348491,...,54.131643,30.752306,31.071429,29.969587,30.33499,TransformerModelWithAttention_jina_15_37.98_ch...,46.271406,30.936837,46.686892,30.532078
2,256,3,2,39.59138,54.643823,40.088174,53.573242,31.4773,31.641791,31.928812,...,56.606427,30.186894,30.357143,30.000341,30.348258,TransformerModelWithAttention_jina_15_39.1_che...,46.974155,31.729772,47.256614,30.223159


### Training hyperparameters

1) BATCH_SIZE=32

In [11]:
BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
dev_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_meld'), batch_size=BATCH_SIZE, shuffle=False)
dev_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_resd'), batch_size=BATCH_SIZE, shuffle=False)
test_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_meld'), batch_size=BATCH_SIZE, shuffle=False)
test_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_resd'), batch_size=BATCH_SIZE, shuffle=False)

In [12]:
%%capture --no-stdout
optimizer_ = "Adam"
result = []
for lr in [1e-3, 1e-4, 1e-5]:
    print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=3, input_dim=1024, hidden_dim=256, num_heads=8, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = lr)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.001, batch_size=32, optimizer=Adam
Ранняя остановка на эпохе 11 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 14.285714285714285, 'war': 42.38052299368801, 'mf1': 8.504478422147834, 'wf1': 25.229697032341182}
Метрики на валидационной выборке RESD:  {'uar': 14.285714285714285, 'war': 13.73134328358209, 'mf1': 3.4495688038995125, 'wf1': 3.315704939867591}
Метрики на тестовой выборке MELD:  {'uar': 14.285714285714285, 'war': 48.122605363984675, 'mf1': 9.28238858916562, 'wf1': 31.26849060381001}
Метрики на тестовой выборке RESD:  {'uar': 14.285714285714285, 'war': 13.571428571428571, 'mf1': 3.4141958670260557, 'wf1': 3.2434860736747533}
lr=0.0001, batch_size=32, optimizer=Adam
Ранняя остановка на эпохе 21 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 43.50291053613458, 'war': 48.78268710550045, 'mf1': 38.36700358905479, 'wf1': 50.51233985079461}
Метрики на валидационной 

In [29]:
%%capture --no-stdout
optimizer_ = "Adam"
result = []
for lr in [1e-3, 1e-4, 1e-5]:
    print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=3, input_dim=1024, hidden_dim=256, num_heads=8, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = lr)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.001, batch_size=32, optimizer=Adam
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 14.241147183321834, 'war': 41.56898106402164, 'mf1': 9.02155541719091, 'wf1': 25.616211417903035}
Метрики на валидационной выборке RESD:  {'uar': 15.82608695652174, 'war': 15.522388059701491, 'mf1': 6.773242253899733, 'wf1': 6.782722614719859}
Метрики на тестовой выборке MELD:  {'uar': 14.085664193207686, 'war': 47.01149425287356, 'mf1': 9.507348617958087, 'wf1': 31.23819188416357}
Метрики на тестовой выборке RESD:  {'uar': 14.377563226247435, 'war': 13.214285714285715, 'mf1': 7.808755038620436, 'wf1': 7.335075623169621}
lr=0.0001, batch_size=32, optimizer=Adam
Ранняя остановка на эпохе 17 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.453520988499534, 'war': 53.56176735798016, 'mf1': 41.3434749844828, 'wf1': 53.65369980318479}
Метрики на валидационной выбо

In [30]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_Adam_32_lr.csv"))

In [32]:
%%capture --no-stdout
optimizer_ = "AdamW"
result = []
for lr in [1e-3, 1e-4, 1e-5]:
    print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=3, input_dim=1024, hidden_dim=256, num_heads=8, num_classes=7).to(device)
    optimizer = optim.AdamW(params = model_transformer.parameters(), lr = lr)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.001, batch_size=32, optimizer=AdamW
Ранняя остановка на эпохе 12 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 14.285714285714285, 'war': 42.38052299368801, 'mf1': 8.504478422147834, 'wf1': 25.229697032341182}
Метрики на валидационной выборке RESD:  {'uar': 14.285714285714285, 'war': 13.73134328358209, 'mf1': 3.4495688038995125, 'wf1': 3.315704939867591}
Метрики на тестовой выборке MELD:  {'uar': 14.285714285714285, 'war': 48.122605363984675, 'mf1': 9.28238858916562, 'wf1': 31.26849060381001}
Метрики на тестовой выборке RESD:  {'uar': 14.285714285714285, 'war': 13.571428571428571, 'mf1': 3.4141958670260557, 'wf1': 3.2434860736747533}
lr=0.0001, batch_size=32, optimizer=AdamW
Ранняя остановка на эпохе 29 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 39.21906743163759, 'war': 51.577998196573496, 'mf1': 38.211701458933966, 'wf1': 51.937296743290375}
Метрики на валидацио

In [33]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_AdamW_32_lr.csv"))

In [11]:
%%capture --no-stdout
optimizer_ = "SGD"
result = []
for lr in [1e-3, 1e-4, 1e-5]:
    print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None,  num_layers=3, input_dim=1024, hidden_dim=256, num_heads=8, num_classes=7).to(device)
    optimizer = optim.SGD(params = model_transformer.parameters(), lr = lr)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.001, batch_size=32, optimizer=SGD
Ранняя остановка на эпохе 24 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 40.388577068099785, 'war': 50.40577096483319, 'mf1': 38.24995657023816, 'wf1': 51.6220605070061}
Метрики на валидационной выборке RESD:  {'uar': 29.169689131691324, 'war': 28.955223880597014, 'mf1': 27.641496185882474, 'wf1': 27.89296831004985}
Метрики на тестовой выборке MELD:  {'uar': 42.035941595221544, 'war': 51.11111111111111, 'mf1': 38.40161772403036, 'wf1': 53.10966847376359}
Метрики на тестовой выборке RESD:  {'uar': 32.270597665334506, 'war': 32.5, 'mf1': 30.83420248528586, 'wf1': 30.985777659130726}
lr=0.0001, batch_size=32, optimizer=SGD
Метрики на валидационной выборке MELD:  {'uar': 37.42553486826235, 'war': 47.790802524797115, 'mf1': 35.66021246290333, 'wf1': 48.735330171732585}
Метрики на валидационной выборке RESD:  {'uar': 22.810930169169115, 'war': 22.686567164179106, 'mf1': 21.35741892570025, 'wf1'

In [12]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_SGD_32_lr.csv"))

Результаты

In [17]:
columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_Adam_32_lr.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_AdamW_32_lr.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_SGD_32_lr.csv"), index_col=0)])
df.columns=columns

In [18]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0

In [19]:
df.sort_values(['average_test_resd', 'average_test_meld'] , ascending=False)

Unnamed: 0,lr,batch_size,optimizer,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd
3,0.0001,32,Adam,41.382602,52.38954,39.443991,52.827297,28.474246,28.656716,28.083259,...,55.603465,33.801203,33.928571,33.438886,33.732468,TransformerModelWithAttention_jina_10_37.51_ch...,46.510857,28.426474,47.382241,33.725282
1,0.0001,32,Adam,43.502911,48.782687,38.367004,50.51234,31.87673,31.343284,31.286905,...,54.018038,32.040132,32.142857,32.170436,32.487462,TransformerModelWithAttention_jina_11_37.3_che...,45.291235,31.487727,45.546935,32.210222
0,0.001,32,SGD,40.388577,50.405771,38.249957,51.622061,29.169689,28.955224,27.641496,...,53.109668,32.270598,32.5,30.834202,30.985778,TransformerModelWithAttention_jina_14_36.14_ch...,45.166591,28.414844,46.164585,31.647644
1,0.0001,32,AdamW,39.219067,51.577998,38.211701,51.937297,33.790102,34.925373,33.806318,...,52.757467,30.944185,31.785714,30.616526,31.371769,TransformerModelWithAttention_jina_19_39.29_ch...,45.236516,34.321957,44.457817,31.179548
2,1e-05,32,Adam,42.553934,51.307484,40.241506,53.268034,28.510823,28.358209,26.896834,...,54.718359,29.59422,30.0,27.228509,27.736673,TransformerModelWithAttention_jina_18_36.21_ch...,46.84274,27.82575,46.921477,28.63985
2,1e-05,32,AdamW,39.122692,49.413886,37.257977,50.082217,26.945327,28.059701,25.515062,...,51.868827,28.59272,30.0,27.127282,28.049372,TransformerModelWithAttention_jina_6_34.93_che...,43.969193,26.754706,44.753342,28.442343
1,0.0001,32,SGD,37.425535,47.790803,35.660212,48.73533,22.81093,22.686567,21.357419,...,50.944993,27.774324,28.214286,25.253754,25.837215,TransformerModelWithAttention_jina_50_31.64_ch...,42.40297,22.176515,43.562317,26.769895
2,1e-05,32,SGD,18.919281,27.772768,18.569101,28.892428,19.619336,19.104478,18.497631,...,31.219034,20.800739,20.357143,18.702759,18.668624,TransformerModelWithAttention_jina_48_20.13_ch...,23.538395,18.896666,24.57663,19.632316
0,0.001,32,Adam,14.285714,42.380523,8.504478,25.229697,14.285714,13.731343,3.449569,...,31.268491,14.285714,13.571429,3.414196,3.243486,TransformerModelWithAttention_jina_1_16.63_che...,22.600103,8.695583,25.7398,8.628706
0,0.001,32,AdamW,14.285714,42.380523,8.504478,25.229697,14.285714,13.731343,3.449569,...,31.268491,14.285714,13.571429,3.414196,3.243486,TransformerModelWithAttention_jina_2_16.63_che...,22.600103,8.695583,25.7398,8.628706


2) BATCH_SIZE=16

In [21]:
BATCH_SIZE = 16
train_dataloader = DataLoader(dataset=Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
dev_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_meld'), batch_size=BATCH_SIZE, shuffle=False)
dev_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_resd'), batch_size=BATCH_SIZE, shuffle=False)
test_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_meld'), batch_size=BATCH_SIZE, shuffle=False)
test_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_resd'), batch_size=BATCH_SIZE, shuffle=False)

In [26]:
%%capture --no-stdout
result = []
lr = 1e-4
optimizer_ = "Adam"
print(f"lr={lr}, batch_size={BATCH_SIZE}, optimizer={optimizer_}")
model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None, num_layers=3, input_dim=1024, hidden_dim=256, num_heads=8, num_classes=7).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = lr)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)
checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
model_transformer.load_state_dict(checkpoint['model_state_dict'])
metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

lr=0.0001, batch_size=64, optimizer=Adam
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 44.76951371437193, 'war': 49.6844003606853, 'mf1': 39.40577050664712, 'wf1': 51.393272593736306}
Метрики на валидационной выборке RESD:  {'uar': 30.785122744750076, 'war': 31.343283582089555, 'mf1': 29.761715367701342, 'wf1': 30.407446757221035}
Метрики на тестовой выборке MELD:  {'uar': 42.10489691009946, 'war': 50.26819923371647, 'mf1': 37.0694461138311, 'wf1': 53.30478076960284}
Метрики на тестовой выборке RESD:  {'uar': 35.03450990293096, 'war': 35.714285714285715, 'mf1': 34.31341234359091, 'wf1': 34.63083624103096}


3) BATCH_SIZE=64

In [12]:
BATCH_SIZE = 64
train_dataloader = DataLoader(dataset=Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
dev_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_meld'), batch_size=BATCH_SIZE, shuffle=False)
dev_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_resd'), batch_size=BATCH_SIZE, shuffle=False)
test_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_meld'), batch_size=BATCH_SIZE, shuffle=False)
test_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_resd'), batch_size=BATCH_SIZE, shuffle=False)

In [None]:
%%capture --no-stdout
lr = 1e-4
optimizer_ = "Adam"
model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None, num_layers=3, input_dim=1024, hidden_dim=256, num_heads=8, num_classes=7).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = lr)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

In [15]:
checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
model_transformer.load_state_dict(checkpoint['model_state_dict'])
metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
result.append([{"lr" : lr, "batch_size": BATCH_SIZE, "optimizer" : optimizer_}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

Метрики на валидационной выборке MELD:  {'uar': 40.784831868245185, 'war': 50.225428313796215, 'mf1': 39.21444906680436, 'wf1': 51.269979936942164}
Метрики на валидационной выборке RESD:  {'uar': 30.817176145820646, 'war': 31.343283582089555, 'mf1': 31.007677683599695, 'wf1': 31.7993200449282}
Метрики на тестовой выборке MELD:  {'uar': 37.81245353927666, 'war': 49.54022988505747, 'mf1': 35.38850010299255, 'wf1': 52.30072423340215}
Метрики на тестовой выборке RESD:  {'uar': 32.402770593560064, 'war': 32.857142857142854, 'mf1': 32.27554987287454, 'wf1': 32.906963289401546}


In [17]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_batch_size.csv"))

In [19]:
columns = ["lr", "batch_size", "optimizer", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_Adam_32_lr.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_AdamW_32_lr.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_SGD_32_lr.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_batch_size.csv"), index_col=0)])
df.columns=columns

In [20]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0

In [23]:
df['avg_test'] = 0.5 * (df['average_test_meld'] + df['average_test_resd'])

In [24]:
df.sort_values('avg_test' , ascending=False)

Unnamed: 0,lr,batch_size,optimizer,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
3,0.0001,32,Adam,41.382602,52.38954,39.443991,52.827297,28.474246,28.656716,28.083259,...,33.801203,33.928571,33.438886,33.732468,TransformerModelWithAttention_jina_10_37.51_ch...,46.510857,28.426474,47.382241,33.725282,40.553761
0,0.0001,16,Adam,44.769514,49.6844,39.405771,51.393273,30.785123,31.343284,29.761715,...,35.03451,35.714286,34.313412,34.630836,TransformerModelWithAttention_jina_10_38.12_ch...,46.313239,30.574392,45.686831,34.923261,40.305046
0,0.001,32,SGD,40.388577,50.405771,38.249957,51.622061,29.169689,28.955224,27.641496,...,32.270598,32.5,30.834202,30.985778,TransformerModelWithAttention_jina_14_36.14_ch...,45.166591,28.414844,46.164585,31.647644,38.906115
1,0.0001,32,Adam,43.502911,48.782687,38.367004,50.51234,31.87673,31.343284,31.286905,...,32.040132,32.142857,32.170436,32.487462,TransformerModelWithAttention_jina_11_37.3_che...,45.291235,31.487727,45.546935,32.210222,38.878578
1,0.0001,64,Adam,40.784832,50.225428,39.214449,51.26998,30.817176,31.343284,31.007678,...,32.402771,32.857143,32.27555,32.906963,TransformerModelWithAttention_jina_14_38.72_ch...,45.373672,31.241864,43.760477,32.610607,38.185542
1,0.0001,32,AdamW,39.219067,51.577998,38.211701,51.937297,33.790102,34.925373,33.806318,...,30.944185,31.785714,30.616526,31.371769,TransformerModelWithAttention_jina_19_39.29_ch...,45.236516,34.321957,44.457817,31.179548,37.818683
2,1e-05,32,Adam,42.553934,51.307484,40.241506,53.268034,28.510823,28.358209,26.896834,...,29.59422,30.0,27.228509,27.736673,TransformerModelWithAttention_jina_18_36.21_ch...,46.84274,27.82575,46.921477,28.63985,37.780664
2,1e-05,32,AdamW,39.122692,49.413886,37.257977,50.082217,26.945327,28.059701,25.515062,...,28.59272,30.0,27.127282,28.049372,TransformerModelWithAttention_jina_6_34.93_che...,43.969193,26.754706,44.753342,28.442343,36.597843
1,0.0001,32,SGD,37.425535,47.790803,35.660212,48.73533,22.81093,22.686567,21.357419,...,27.774324,28.214286,25.253754,25.837215,TransformerModelWithAttention_jina_50_31.64_ch...,42.40297,22.176515,43.562317,26.769895,35.166106
2,1e-05,32,SGD,18.919281,27.772768,18.569101,28.892428,19.619336,19.104478,18.497631,...,20.800739,20.357143,18.702759,18.668624,TransformerModelWithAttention_jina_48_20.13_ch...,23.538395,18.896666,24.57663,19.632316,22.104473


In [25]:
BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=Dataset_MELD_RESD('train'), batch_size=BATCH_SIZE, shuffle=True)
dev_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_meld'), batch_size=BATCH_SIZE, shuffle=False)
dev_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('dev_resd'), batch_size=BATCH_SIZE, shuffle=False)
test_meld_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_meld'), batch_size=BATCH_SIZE, shuffle=False)
test_resd_dataloader = DataLoader(dataset=Dataset_MELD_RESD('test_resd'), batch_size=BATCH_SIZE, shuffle=False)

In [27]:
%%capture --no-stdout
result = []
for dropout in [0, 0.2]:
    print(f"dropout = {dropout}")
    model_transformer = TransformerModelWithAttention(model_name='jina', pooling=None, num_layers=3, input_dim=1024, hidden_dim=256, num_heads=8, num_classes=7, dropout=dropout).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = 1e-4)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"dropout" : dropout}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

dropout = 0
Ранняя остановка на эпохе 21 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 43.760115324244566, 'war': 51.307484220018026, 'mf1': 40.984003349788594, 'wf1': 51.884065874298294}
Метрики на валидационной выборке RESD:  {'uar': 31.163003365597447, 'war': 31.64179104477612, 'mf1': 31.03862160384451, 'wf1': 31.558441206030935}
Метрики на тестовой выборке MELD:  {'uar': 39.40059567269243, 'war': 50.22988505747127, 'mf1': 35.980644119935086, 'wf1': 52.00485504126547}
Метрики на тестовой выборке RESD:  {'uar': 33.605211072316344, 'war': 34.285714285714285, 'mf1': 33.251974525558694, 'wf1': 33.804783984814264}
dropout = 0.2
Ранняя остановка на эпохе 25 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 41.836808702699344, 'war': 52.02885482416592, 'mf1': 40.07473216115678, 'wf1': 52.518264958978264}
Метрики на валидационной выборке RESD:  {'uar': 30.766568463134046, 'war': 3

In [20]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["dropout", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_dropout.csv"))

In [4]:
columns = ["dropout", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.read_csv(os.path.join(PATH_TO_MODEL, "result_dropout.csv"), index_col=0)
df.columns=columns

In [5]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0
df['avg_test'] = 0.5 * (df['average_test_meld'] + df['average_test_resd'])

In [6]:
df.sort_values('avg_test' , ascending=False)

Unnamed: 0,dropout,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,wf1_dev_resd,uar_test_meld,war_test_meld,mf1_test_meld,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
1,0.1,41.382602,52.38954,39.443991,52.827297,28.474246,28.656716,28.083259,28.491675,41.326915,53.94636,38.652223,55.603465,33.801203,33.928571,33.438886,33.732468,TransformerModelWithAttention_jina_10_37.51_ch...,46.510857,28.426474,47.382241,33.725282,40.553761
0,0.0,43.760115,51.307484,40.984003,51.884066,31.163003,31.641791,31.038622,31.558441,39.400596,50.229885,35.980644,52.004855,33.605211,34.285714,33.251975,33.804784,TransformerModelWithAttention_jina_11_38.27_ch...,46.983917,31.350464,44.403995,33.736921,39.070458
2,0.2,41.836809,52.028855,40.074732,52.518265,30.766568,31.044776,30.78986,31.270953,39.068197,52.10728,36.929452,53.800828,29.857294,30.357143,30.023877,30.665311,TransformerModelWithAttention_jina_15_38.37_ch...,46.614665,30.968039,45.476439,30.225906,37.851173


Best training hyperparameters: optimizer=Adam, batch_size=32, lr=1e-4, dropout=0.1

### Transformer + xml-roberta-base

In [7]:
PATH_TO_MODEL = os.path.join(ROOT_DIR, "Models_transformer_xlm-roberta-base")

In [12]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 1
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='xlm-roberta-base', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=1, num_heads=4
Ранняя остановка на эпохе 21 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 35.57863700861466, 'war': 50.85662759242561, 'mf1': 34.98341588705367, 'wf1': 51.07036635479477}
Метрики на валидационной выборке RESD:  {'uar': 24.99504826085023, 'war': 24.776119402985074, 'mf1': 22.831334106072404, 'wf1': 23.234569479680083}
Метрики на тестовой выборке MELD:  {'uar': 38.800852119021656, 'war': 53.83141762452107, 'mf1': 37.42238027492847, 'wf1': 54.96580779558047}
Метрики на тестовой выборке RESD:  {'uar': 27.738256882993724, 'war': 28.214285714285715, 'mf1': 24.194334178303496, 'wf1': 24.823550327253948}
hidden_dim=128, num_layers=1, num_heads=4
Ранняя остановка на эпохе 40 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 38.159594909589146, 'war': 50.225428313796215, 'mf1': 37.01319917447696, 'wf1': 51.31336839597478}
Метрики на валидационн

In [13]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"))

In [14]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 2
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='xlm-roberta-base', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=2, num_heads=4
Ранняя остановка на эпохе 22 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 35.51718705429633, 'war': 50.225428313796215, 'mf1': 34.89277740743557, 'wf1': 51.01179421496702}
Метрики на валидационной выборке RESD:  {'uar': 27.894702808659698, 'war': 28.65671641791045, 'mf1': 25.48576438630688, 'wf1': 25.97702560680078}
Метрики на тестовой выборке MELD:  {'uar': 37.94873338371743, 'war': 50.76628352490421, 'mf1': 35.428671694766976, 'wf1': 52.75879489921468}
Метрики на тестовой выборке RESD:  {'uar': 26.099475244212083, 'war': 27.142857142857142, 'mf1': 23.64981730280041, 'wf1': 24.23756553366297}
hidden_dim=128, num_layers=2, num_heads=4
Ранняя остановка на эпохе 25 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 36.95511215442572, 'war': 50.85662759242561, 'mf1': 36.15691148920713, 'wf1': 51.987036432811266}
Метрики на валидационной в

In [15]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"))

In [16]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 3
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='xlm-roberta-base', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=3, num_heads=4
Ранняя остановка на эпохе 25 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 36.307120818454, 'war': 48.33183047790803, 'mf1': 34.947634030654015, 'wf1': 49.6449482586411}
Метрики на валидационной выборке RESD:  {'uar': 27.884924017916347, 'war': 28.059701492537314, 'mf1': 27.187160548903744, 'wf1': 27.448274476290834}
Метрики на тестовой выборке MELD:  {'uar': 38.42179444931243, 'war': 49.88505747126437, 'mf1': 35.72008304933746, 'wf1': 52.02124880532328}
Метрики на тестовой выборке RESD:  {'uar': 28.233743398217083, 'war': 28.92857142857143, 'mf1': 27.1904834919798, 'wf1': 27.843951252411703}
hidden_dim=128, num_layers=3, num_heads=4
Ранняя остановка на эпохе 20 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 36.763323233842364, 'war': 47.52028854824166, 'mf1': 35.92845861428388, 'wf1': 49.119767649685606}
Метрики на валидационной вы

In [17]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"))

In [18]:
columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"), index_col=0)])
df.columns=columns

In [20]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0
df['avg_test'] = 0.5 * (df['average_test_meld'] + df['average_test_resd'])

In [21]:
df.sort_values('avg_test' , ascending=False)

Unnamed: 0,hidden_dim,num_layers,num_heads,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
2,256,1,4,36.037512,50.405771,35.94391,51.502962,27.523097,27.164179,26.497393,...,32.7437,32.857143,31.530294,31.788006,TransformerModelWithAttention_xlm-roberta-base...,43.472539,26.909116,46.17451,32.229786,39.202148
3,512,3,4,36.548794,50.946799,36.406091,52.089056,29.942136,30.447761,29.526981,...,33.201993,32.857143,32.174148,32.198374,TransformerModelWithAttention_xlm-roberta-base...,43.997685,30.075706,45.786255,32.607914,39.197085
2,256,2,4,37.644416,50.3156,36.658844,51.482092,30.515179,30.149254,30.169543,...,32.384651,32.142857,31.70834,31.899326,TransformerModelWithAttention_xlm-roberta-base...,44.025238,30.307519,45.117499,32.033794,38.575646
1,128,1,4,38.159595,50.225428,37.013199,51.313368,27.529423,26.865672,26.422185,...,30.763181,30.714286,29.482089,29.62053,TransformerModelWithAttention_xlm-roberta-base...,44.177898,26.753304,46.09596,30.145022,38.120491
3,512,1,4,37.68403,51.577998,36.968284,52.704279,28.506916,29.552239,28.30441,...,28.179473,28.214286,28.469256,28.289756,TransformerModelWithAttention_xlm-roberta-base...,44.733648,28.816619,47.76899,28.288193,38.028591
3,512,2,4,36.947095,48.422002,35.610704,50.679881,27.798299,28.059701,27.449219,...,30.165509,30.357143,30.010572,30.297596,TransformerModelWithAttention_xlm-roberta-base...,42.91492,27.817125,45.019847,30.207705,37.613776
1,128,2,4,36.955112,50.856628,36.156911,51.987036,26.428862,26.865672,25.561859,...,28.755974,29.642857,27.851788,28.487755,TransformerModelWithAttention_xlm-roberta-base...,43.988922,26.187268,45.218274,28.684594,36.951434
0,64,1,4,35.578637,50.856628,34.983416,51.070366,24.995048,24.776119,22.831334,...,27.738257,28.214286,24.194334,24.82355,TransformerModelWithAttention_xlm-roberta-base...,43.122262,23.959268,46.255114,26.242607,36.248861
1,128,3,4,36.763323,47.520289,35.928459,49.119768,30.04523,29.850746,28.758729,...,29.055125,29.642857,27.296259,27.880455,TransformerModelWithAttention_xlm-roberta-base...,42.33296,29.476361,43.863336,28.468674,36.166005
0,64,3,4,36.307121,48.33183,34.947634,49.644948,27.884924,28.059701,27.187161,...,28.233743,28.928571,27.190483,27.843951,TransformerModelWithAttention_xlm-roberta-base...,42.307883,27.645015,44.012046,28.049187,36.030617


In [22]:
%%capture --no-stdout
result = []
for num_heads in [2, 8]:
    for (hidden_dim, num_layers) in [(256, 1), (512, 3), (256, 2)]:
        print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
        model_transformer = TransformerModelWithAttention(model_name='xlm-roberta-base', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
        optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
        trainer.train(PATH_TO_MODEL)
        checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
        model_transformer.load_state_dict(checkpoint['model_state_dict'])
        metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
        metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
        print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
        print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
        metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
        metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
        print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
        print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
        result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=256, num_layers=1, num_heads=2
Ранняя остановка на эпохе 25 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 36.51812754659872, 'war': 49.50405770964833, 'mf1': 35.21444243572972, 'wf1': 50.85035326808508}
Метрики на валидационной выборке RESD:  {'uar': 26.865465993160548, 'war': 25.970149253731346, 'mf1': 25.590630751054118, 'wf1': 25.73467020459997}
Метрики на тестовой выборке MELD:  {'uar': 40.030316697494364, 'war': 52.83524904214559, 'mf1': 38.09184817706447, 'wf1': 54.91220438831106}
Метрики на тестовой выборке RESD:  {'uar': 30.1682351353404, 'war': 30.0, 'mf1': 28.5435916838192, 'wf1': 28.936115608067176}
hidden_dim=512, num_layers=3, num_heads=2
Ранняя остановка на эпохе 26 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 38.844139873355836, 'war': 49.77457168620379, 'mf1': 36.92148340523577, 'wf1': 51.577844631499325}
Метрики на валидационной выборке RESD: 

In [23]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_num_heads_hidden_dim.csv"))

In [8]:
columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_num_heads_hidden_dim.csv"), index_col=0)])
df.columns=columns

In [9]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0
df['avg_test'] = 0.5 * (df['average_test_meld'] + df['average_test_resd'])

In [10]:
df.sort_values('avg_test' , ascending=False)

Unnamed: 0,hidden_dim,num_layers,num_heads,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,wf1_dev_resd,uar_test_meld,war_test_meld,mf1_test_meld,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
2,256,1,4,36.037512,50.405771,35.94391,51.502962,27.523097,27.164179,26.497393,26.451794,39.123811,53.48659,37.207217,54.880423,32.7437,32.857143,31.530294,31.788006,TransformerModelWithAttention_xlm-roberta-base...,43.472539,26.909116,46.17451,32.229786,39.202148
3,512,3,4,36.548794,50.946799,36.406091,52.089056,29.942136,30.447761,29.526981,30.385944,38.687542,52.835249,37.082205,54.540024,33.201993,32.857143,32.174148,32.198374,TransformerModelWithAttention_xlm-roberta-base...,43.997685,30.075706,45.786255,32.607914,39.197085
2,256,2,4,37.644416,50.3156,36.658844,51.482092,30.515179,30.149254,30.169543,30.3961,38.724198,51.762452,36.237337,53.746009,32.384651,32.142857,31.70834,31.899326,TransformerModelWithAttention_xlm-roberta-base...,44.025238,30.307519,45.117499,32.033794,38.575646
5,256,2,8,35.952419,50.766456,35.651277,51.466249,29.066247,28.955224,28.545076,28.637861,38.599559,53.333333,37.312028,55.008785,31.29527,31.428571,30.560398,30.859488,TransformerModelWithAttention_xlm-roberta-base...,43.4591,28.801102,46.063426,31.035932,38.549679
1,512,3,2,38.84414,49.774572,36.921483,51.577845,28.086869,27.761194,26.948308,27.101637,40.518984,50.766284,36.616366,53.222744,31.643457,31.428571,30.470961,30.456247,TransformerModelWithAttention_xlm-roberta-base...,44.27951,27.474502,45.281094,30.999809,38.140452
1,128,1,4,38.159595,50.225428,37.013199,51.313368,27.529423,26.865672,26.422185,26.195935,39.665381,52.950192,37.224444,54.543824,30.763181,30.714286,29.482089,29.62053,TransformerModelWithAttention_xlm-roberta-base...,44.177898,26.753304,46.09596,30.145022,38.120491
3,512,1,4,37.68403,51.577998,36.968284,52.704279,28.506916,29.552239,28.30441,28.902912,40.601991,54.827586,38.840416,56.805966,28.179473,28.214286,28.469256,28.289756,TransformerModelWithAttention_xlm-roberta-base...,44.733648,28.816619,47.76899,28.288193,38.028591
0,256,1,2,36.518128,49.504058,35.214442,50.850353,26.865466,25.970149,25.590631,25.73467,40.030317,52.835249,38.091848,54.912204,30.168235,30.0,28.543592,28.936116,TransformerModelWithAttention_xlm-roberta-base...,43.021745,26.040229,46.467405,29.411986,37.939695
3,512,2,4,36.947095,48.422002,35.610704,50.679881,27.798299,28.059701,27.449219,27.961282,40.468902,49.961686,36.762862,52.885938,30.165509,30.357143,30.010572,30.297596,TransformerModelWithAttention_xlm-roberta-base...,42.91492,27.817125,45.019847,30.207705,37.613776
2,256,2,2,37.46194,48.422002,35.988057,50.561836,28.903242,29.850746,27.730544,28.320175,38.417925,48.544061,35.309901,51.226707,31.409009,32.5,29.96252,30.677303,TransformerModelWithAttention_xlm-roberta-base...,43.108459,28.701177,43.374648,31.137208,37.255928


### Transformer + canine-c

In [11]:
PATH_TO_MODEL = os.path.join(ROOT_DIR, "Models_transformer_canine-c")

In [28]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 1
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='canine-c', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=1, num_heads=4
Ранняя остановка на эпохе 14 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 30.261225182100205, 'war': 46.61857529305681, 'mf1': 28.41658280344999, 'wf1': 44.811054496498606}
Метрики на валидационной выборке RESD:  {'uar': 18.130585567560356, 'war': 17.01492537313433, 'mf1': 10.216867004426195, 'wf1': 10.257532497122323}
Метрики на тестовой выборке MELD:  {'uar': 31.086210453767883, 'war': 47.547892720306514, 'mf1': 29.398798043136864, 'wf1': 47.69098028443407}
Метрики на тестовой выборке RESD:  {'uar': 19.515132015132014, 'war': 20.0, 'mf1': 12.972442786107383, 'wf1': 13.423947802829792}
hidden_dim=128, num_layers=1, num_heads=4
Ранняя остановка на эпохе 17 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 30.68210179819784, 'war': 48.151487826871055, 'mf1': 30.53318165309749, 'wf1': 46.78294852741793}
Метрики на валидационной выборке 

In [29]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"))

In [30]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 2
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='canine-c', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=2, num_heads=4
Ранняя остановка на эпохе 32 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 31.720232548856526, 'war': 46.43823264201984, 'mf1': 31.470483252492254, 'wf1': 47.13869858675984}
Метрики на валидационной выборке RESD:  {'uar': 17.804581736076074, 'war': 18.208955223880597, 'mf1': 15.841637414278006, 'wf1': 16.43787646232888}
Метрики на тестовой выборке MELD:  {'uar': 33.02763800511464, 'war': 46.70498084291187, 'mf1': 31.493496109580093, 'wf1': 49.14711255500986}
Метрики на тестовой выборке RESD:  {'uar': 22.195019695019695, 'war': 22.142857142857142, 'mf1': 19.428777714884774, 'wf1': 19.534431556174347}
hidden_dim=128, num_layers=2, num_heads=4
Ранняя остановка на эпохе 19 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 31.90118074744947, 'war': 46.43823264201984, 'mf1': 31.311025084828298, 'wf1': 46.76120817045802}
Метрики на валидацион

In [31]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"))

In [15]:
%%capture --no-stdout
result = []
num_heads = 4
num_layers = 3
for hidden_dim in [64, 128, 256, 512]:
    print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
    model_transformer = TransformerModelWithAttention(model_name='canine-c', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
    optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
    trainer.train(PATH_TO_MODEL)
    checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
    model_transformer.load_state_dict(checkpoint['model_state_dict'])
    metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
    metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
    print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
    print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
    metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
    metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
    print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
    print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
    result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=64, num_layers=3, num_heads=4
Ранняя остановка на эпохе 26 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 31.855969638784813, 'war': 44.183949504057715, 'mf1': 30.36379993811452, 'wf1': 44.67174036157032}
Метрики на валидационной выборке RESD:  {'uar': 20.57549686541283, 'war': 20.0, 'mf1': 16.547244042684774, 'wf1': 16.93580893542786}
Метрики на тестовой выборке MELD:  {'uar': 32.71481248251951, 'war': 44.48275862068966, 'mf1': 29.928973293328813, 'wf1': 46.223495699130936}
Метрики на тестовой выборке RESD:  {'uar': 23.239270114270116, 'war': 23.214285714285715, 'mf1': 18.473091740743815, 'wf1': 18.73557557715718}
hidden_dim=128, num_layers=3, num_heads=4
Ранняя остановка на эпохе 28 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 30.369993877718258, 'war': 42.29035166816952, 'mf1': 29.654051397649123, 'wf1': 44.10163738353653}
Метрики на валидационной выборке RE

In [16]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"))

In [17]:
columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"), index_col=0)])
df.columns=columns

In [18]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0
df['avg_test'] = 0.5 * (df['average_test_meld'] + df['average_test_resd'])

In [19]:
df.sort_values('avg_test' , ascending=False)

Unnamed: 0,hidden_dim,num_layers,num_heads,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,...,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
3,512,3,4,31.992195,46.618575,31.621051,46.54775,19.104448,19.701493,17.868353,...,22.303594,22.857143,21.321595,21.778316,TransformerModelWithAttention_canine-c_8_28.48...,39.194893,18.769139,40.560499,22.065162,31.312831
3,512,2,4,31.303989,46.618575,30.592242,46.301186,19.669503,19.402985,16.714783,...,24.278548,25.0,21.165629,21.847729,TransformerModelWithAttention_canine-c_6_28.08...,38.703998,18.25754,39.313318,23.072977,31.193147
2,256,2,4,30.778421,44.364292,29.876125,44.906983,17.929407,19.402985,16.164591,...,23.402558,24.285714,20.549348,21.158027,TransformerModelWithAttention_canine-c_14_27.2...,37.481455,17.725893,39.81154,22.348912,31.080226
2,256,3,4,30.704383,44.634806,29.820035,44.822653,19.079879,19.701493,17.812722,...,24.616873,24.285714,21.940406,21.995565,TransformerModelWithAttention_canine-c_10_27.6...,37.495469,18.773356,37.81194,23.20964,30.51079
0,64,2,4,31.720233,46.438233,31.470483,47.138699,17.804582,18.208955,15.841637,...,22.19502,22.142857,19.428778,19.534432,TransformerModelWithAttention_canine-c_22_27.4...,39.191912,17.073263,40.093307,20.825272,30.459289
1,128,2,4,31.901181,46.438233,31.311025,46.761208,17.489307,19.104478,13.394692,...,21.869857,23.214286,17.154367,17.929579,TransformerModelWithAttention_canine-c_9_28.12...,39.102912,16.130776,40.509493,20.042022,30.275758
3,512,1,4,31.798159,48.96303,31.389655,47.766115,16.920101,18.208955,12.45123,...,20.924496,22.142857,16.388958,17.132506,TransformerModelWithAttention_canine-c_6_28.05...,39.97924,15.224955,40.493124,19.147204,29.820164
0,64,3,4,31.85597,44.18395,30.3638,44.67174,20.575497,20.0,16.547244,...,23.23927,23.214286,18.473092,18.735576,TransformerModelWithAttention_canine-c_16_28.0...,37.768865,18.514637,38.33751,20.915556,29.626533
1,128,3,4,30.369994,42.290352,29.654051,44.101637,19.919015,19.402985,18.623219,...,21.365196,21.071429,19.435972,19.484852,TransformerModelWithAttention_canine-c_18_27.2...,36.604009,19.206754,38.032174,20.339362,29.185768
1,128,1,4,30.682102,48.151488,30.533182,46.782949,15.348962,15.522388,9.682158,...,19.424262,20.357143,13.209322,13.612233,TransformerModelWithAttention_canine-c_7_26.2_...,39.03743,12.649809,40.532055,16.65074,28.591397


In [20]:
%%capture --no-stdout
result = []
for num_heads in [2, 8]:
    for (hidden_dim, num_layers) in [(512, 3), (512, 2)]:
        print(f"hidden_dim={hidden_dim}, num_layers={num_layers}, num_heads={num_heads}")
        model_transformer = TransformerModelWithAttention(model_name='canine-c', pooling=None,  num_layers=num_layers, input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_classes=7).to(device)
        optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        trainer = ModelTrainer(model_transformer, train_dataloader, dev_meld_dataloader, dev_resd_dataloader, test_meld_dataloader, test_resd_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_ACC, optimizer, loss_fn)
        trainer.train(PATH_TO_MODEL)
        checkpoint = torch.load(os.path.join(PATH_TO_MODEL, trainer._best_model_name))
        model_transformer.load_state_dict(checkpoint['model_state_dict'])
        metrics_dev_meld = evaluate_metrics(model_transformer, dev_meld_dataloader)
        metrics_dev_resd = evaluate_metrics(model_transformer, dev_resd_dataloader)
        print("Метрики на валидационной выборке MELD: ", metrics_dev_meld)
        print("Метрики на валидационной выборке RESD: ", metrics_dev_resd)
        metrics_test_meld = evaluate_metrics(model_transformer, test_meld_dataloader)
        metrics_test_resd = evaluate_metrics(model_transformer, test_resd_dataloader)
        print("Метрики на тестовой выборке MELD: ", metrics_test_meld)
        print("Метрики на тестовой выборке RESD: ", metrics_test_resd)
        result.append([{"hidden_dim" : hidden_dim, "num_layers": num_layers, "num_heads" : num_heads}, metrics_dev_meld, metrics_dev_resd, metrics_test_meld, metrics_test_resd, trainer._best_model_name])

hidden_dim=512, num_layers=3, num_heads=2
Ранняя остановка на эпохе 48 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 29.04946262288646, 'war': 47.339945897204686, 'mf1': 29.466037739844413, 'wf1': 45.13956473392521}
Метрики на валидационной выборке RESD:  {'uar': 21.83748475235505, 'war': 22.388059701492537, 'mf1': 21.10543527380262, 'wf1': 21.880673829714343}
Метрики на тестовой выборке MELD:  {'uar': 28.19094511899659, 'war': 49.272030651341, 'mf1': 28.359440236474747, 'wf1': 47.853216809759665}
Метрики на тестовой выборке RESD:  {'uar': 20.93839097128571, 'war': 20.714285714285715, 'mf1': 20.542902912056316, 'wf1': 20.34920359262509}
hidden_dim=512, num_layers=2, num_heads=2
Ранняя остановка на эпохе 16 из-за отсутствия улучшения точности на тестовой выборке
Метрики на валидационной выборке MELD:  {'uar': 30.130874174026275, 'war': 44.54463480613165, 'mf1': 29.462192331490872, 'wf1': 44.90212173430195}
Метрики на валидационно

In [21]:
df = pd.DataFrame(result, columns=["параметры", "метрики dev meld", "метрики dev resd", "метрики test meld", "метрики test resd", "путь"])
df = pd.concat([df["параметры"].apply(pd.Series), df["метрики dev meld"].apply(pd.Series), df["метрики dev resd"].apply(pd.Series), df["метрики test meld"].apply(pd.Series), df["метрики test resd"].apply(pd.Series), df["путь"]], axis=1)
df.columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df.to_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_num_heads_hidden_dim.csv"))

In [12]:
columns = ["hidden_dim", "num_layers", "num_heads", "uar_dev_meld", "war_dev_meld", "mf1_dev_meld", "wf1_dev_meld", "uar_dev_resd", "war_dev_resd", "mf1_dev_resd", "wf1_dev_resd", "uar_test_meld", "war_test_meld", "mf1_test_meld", "wf1_test_meld", "uar_test_resd", "war_test_resd", "mf1_test_resd", "wf1_test_resd", "путь"]
df = pd.concat([pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_1_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_2_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_3_num_heads_4_hidden_dim.csv"), index_col=0), pd.read_csv(os.path.join(PATH_TO_MODEL, "result_num_layers_num_heads_hidden_dim.csv"), index_col=0)])
df.columns=columns

In [13]:
df['average_dev_meld'] = (df['uar_dev_meld'] + df['war_dev_meld'] + df['mf1_dev_meld'] + df['wf1_dev_meld']) / 4.0
df['average_dev_resd'] = (df['uar_dev_resd'] + df['war_dev_resd'] + df['mf1_dev_resd'] + df['wf1_dev_resd']) / 4.0
df['average_test_meld'] = (df['uar_test_meld'] + df['war_test_meld'] + df['mf1_test_meld'] + df['wf1_test_meld']) / 4.0
df['average_test_resd'] = (df['uar_test_resd'] + df['war_test_resd'] + df['mf1_test_resd'] + df['wf1_test_resd']) / 4.0
df['avg_test'] = 0.5 * (df['average_test_meld'] + df['average_test_resd'])

In [14]:
df.sort_values('avg_test' , ascending=False)

Unnamed: 0,hidden_dim,num_layers,num_heads,uar_dev_meld,war_dev_meld,mf1_dev_meld,wf1_dev_meld,uar_dev_resd,war_dev_resd,mf1_dev_resd,wf1_dev_resd,uar_test_meld,war_test_meld,mf1_test_meld,wf1_test_meld,uar_test_resd,war_test_resd,mf1_test_resd,wf1_test_resd,путь,average_dev_meld,average_dev_resd,average_test_meld,average_test_resd,avg_test
3,512,3,4,31.992195,46.618575,31.621051,46.54775,19.104448,19.701493,17.868353,18.402264,32.406976,48.505747,31.673334,49.655941,22.303594,22.857143,21.321595,21.778316,TransformerModelWithAttention_canine-c_8_28.48...,39.194893,18.769139,40.560499,22.065162,31.312831
1,512,2,2,30.130874,44.544635,29.462192,44.902122,19.035795,18.208955,16.570499,16.968939,32.096795,46.590038,30.525109,47.790487,24.794961,24.285714,21.887845,21.905399,TransformerModelWithAttention_canine-c_6_27.79...,37.259956,17.696047,39.250607,23.21848,31.234544
3,512,2,4,31.303989,46.618575,30.592242,46.301186,19.669503,19.402985,16.714783,17.242887,31.966047,46.934866,30.281637,48.070722,24.278548,25.0,21.165629,21.847729,TransformerModelWithAttention_canine-c_6_28.08...,38.703998,18.25754,39.313318,23.072977,31.193147
2,256,2,4,30.778421,44.364292,29.876125,44.906983,17.929407,19.402985,16.164591,17.406591,32.725019,46.896552,30.88497,48.73962,23.402558,24.285714,20.549348,21.158027,TransformerModelWithAttention_canine-c_14_27.2...,37.481455,17.725893,39.81154,22.348912,31.080226
2,256,3,4,30.704383,44.634806,29.820035,44.822653,19.079879,19.701493,17.812722,18.499329,31.321957,44.636015,29.103495,46.186291,24.616873,24.285714,21.940406,21.995565,TransformerModelWithAttention_canine-c_10_27.6...,37.495469,18.773356,37.81194,23.20964,30.51079
2,512,3,8,31.75554,44.454463,31.217717,45.903711,19.442494,19.104478,18.357098,18.547326,31.880195,44.712644,30.894707,47.202988,23.27404,23.571429,20.970439,21.24612,TransformerModelWithAttention_canine-c_10_27.7...,38.332858,18.862849,38.672633,22.265507,30.46907
0,64,2,4,31.720233,46.438233,31.470483,47.138699,17.804582,18.208955,15.841637,16.437876,33.027638,46.704981,31.493496,49.147113,22.19502,22.142857,19.428778,19.534432,TransformerModelWithAttention_canine-c_22_27.4...,39.191912,17.073263,40.093307,20.825272,30.459289
1,128,2,4,31.901181,46.438233,31.311025,46.761208,17.489307,19.104478,13.394692,14.534625,32.943781,48.199234,31.41383,49.481127,21.869857,23.214286,17.154367,17.929579,TransformerModelWithAttention_canine-c_9_28.12...,39.102912,16.130776,40.509493,20.042022,30.275758
3,512,1,4,31.798159,48.96303,31.389655,47.766115,16.920101,18.208955,12.45123,13.319532,32.042032,49.463602,30.829617,49.637245,20.924496,22.142857,16.388958,17.132506,TransformerModelWithAttention_canine-c_6_28.05...,39.97924,15.224955,40.493124,19.147204,29.820164
0,64,3,4,31.85597,44.18395,30.3638,44.67174,20.575497,20.0,16.547244,16.935809,32.714812,44.482759,29.928973,46.223496,23.23927,23.214286,18.473092,18.735576,TransformerModelWithAttention_canine-c_16_28.0...,37.768865,18.514637,38.33751,20.915556,29.626533
