In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

import torch

import os
import re
import random
import numpy as np

from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn import metrics
from sklearn.metrics import f1_score

2024-12-15 12:34:08.750324: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
else:
    exit()

NVIDIA A100-SXM4-80GB


In [4]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [6]:
# Загрузим датасет
data = load_dataset("csv", data_files={"train": "train.csv", "validation": "validation.csv", "test": "test.csv"})
# Список с метками классов
labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

Generating train split: 112040 examples [00:00, 203962.15 examples/s]
Generating validation split: 28302 examples [00:00, 209281.88 examples/s]
Generating test split: 15962 examples [00:00, 218497.82 examples/s]


In [7]:
model_name = 'ai-forever/ruBert-large'
tokenizer = BertTokenizer.from_pretrained(model_name)
max_len = 64
epochs = 20

In [8]:
# Выполним предобработку данных
def cleaner(example):
    example["text"] = example["text"].lower()
    example["text"] = re.sub(r"[^a-zа-я\d]", " ", example["text"])
    example["text"] = re.sub(r"\s+", " ", example["text"])
    example["text"] = example["text"].strip()
    return example

def one_hot_to_list(example):
    emotions = []
    for emotion in labels:
        emotions.append(example[emotion])
    example["one_hot_labels"] = emotions

    return example

In [12]:
data = data.map(one_hot_to_list)
data = data.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=max_len), batched=True)

Map: 100%|██████████| 112040/112040 [00:12<00:00, 9106.61 examples/s]
Map: 100%|██████████| 28302/28302 [00:03<00:00, 8119.42 examples/s] 
Map: 100%|██████████| 15962/15962 [00:01<00:00, 9014.99 examples/s]
Map: 100%|██████████| 112040/112040 [00:41<00:00, 2682.97 examples/s]
Map: 100%|██████████| 28302/28302 [00:10<00:00, 2710.40 examples/s]
Map: 100%|██████████| 15962/15962 [00:05<00:00, 2723.19 examples/s]


In [14]:
class EmotionDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return {
            'input_ids': torch.tensor(self.dataset[index]["input_ids"], dtype=torch.long),
            'attention_mask': torch.tensor(self.dataset[index]["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(self.dataset[index]["token_type_ids"], dtype=torch.long),
            'labels': torch.tensor(self.dataset[index]["one_hot_labels"], dtype=torch.float)
        }

In [15]:
train_dataset = EmotionDataset(data["train"])
valid_dataset = EmotionDataset(data["validation"])
test_dataset = EmotionDataset(data["test"])

In [16]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [17]:
class CustomBertModel(nn.Module):
    def __init__(self, pretrained_model_name, num_classes):
        super(CustomBertModel, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [18]:
model = CustomBertModel(pretrained_model_name=model_name, num_classes=7)
model = model.to(device)

  return self.fget.__get__(instance, owner)()


In [19]:
# Инициализируем оптимизатор и scheduler
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
total_steps = len(train_dataloader) * epochs  # количество итераций для обучения
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps) # scheduler

In [20]:
def train(model, criterion, optimizer, scheduler, dataloader) -> float:
    """
    Обучение модели.
    :param model: Модель
    :param criterion: Функция потерь
    :param optimizer: Оптимизатор
    :param scheduler: Scheduler
    :param dataloader: DataLoader
    :return: Значение функции потерь
    """
    model.train()
    train_loss = 0.0
    for data in dataloader:
        optimizer.zero_grad()
        ids = data["input_ids"].to(device)
        mask = data["attention_mask"].to(device)
        token_type_ids = data["token_type_ids"].to(device)
        labels = data["labels"].to(device)

        outputs = model(ids, mask, token_type_ids)
        loss = criterion(outputs, labels)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    return train_loss / len(dataloader)

In [21]:
def validation(model, criterion, dataloader) -> float:
    """
    Валидация модели.
    :param model: Модель
    :param criterion: Функция потерь
    :param dataloader: DataLoader
    :return: 
    """
    model.eval()
    val_loss = 0.0
    val_targets, val_outputs = [], []
    with torch.no_grad():
        for data in dataloader:
            ids = data["input_ids"].to(device)
            mask = data["attention_mask"].to(device)
            token_type_ids = data["token_type_ids"].to(device)
            labels = data["labels"].to(device)

            outputs = model(ids, mask, token_type_ids)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            val_targets.extend(labels.cpu().numpy())
            val_outputs.extend(torch.sigmoid(outputs).cpu().numpy())

    return val_loss / len(dataloader), val_outputs, val_targets

In [22]:
# Обучение и валидация
best_f1 = 0 # Лучшее значение F1-score
patience = 2 # Количество эпох без улучшения
patience_counter = 0 # Счетчик эпох без улучшения

In [23]:
for epoch in range(epochs):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model, criterion, optimizer, scheduler, train_dataloader)
    val_loss, val_outputs, val_targets = validation(model, criterion, valid_dataloader)

    # Вычисление F1-score
    val_f1 = f1_score(val_targets, (np.array(val_outputs) > 0.5).astype(int), average='weighted')
    print(f"Train loss: {train_loss}, Valid loss: {val_loss}, Valid F1: {val_f1}")

    # Ранняя остановка
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping")
            break

Epoch: 1
Train loss: 0.28231398447250244, Valid loss: 0.24859218948581688, Valid F1: 0.629651707401147


IOStream.flush timed out


Epoch: 2
Train loss: 0.23522706578317062, Valid loss: 0.24657899750140128, Valid F1: 0.6527617982742269
Epoch: 3
Train loss: 0.20684157962512453, Valid loss: 0.24555651789728727, Valid F1: 0.6682189441597192
Epoch: 4
Train loss: 0.18176642398555778, Valid loss: 0.2582021620351628, Valid F1: 0.6647883593491621
Epoch: 5
Train loss: 0.162016900270105, Valid loss: 0.26827253162457226, Valid F1: 0.664905310295258
Early stopping


In [24]:
loss, outputs, targets = validation(model, criterion, valid_dataloader)
outputs = np.array(outputs) >= 0.5
print(metrics.classification_report(targets, outputs, target_names=labels, zero_division=0))

              precision    recall  f1-score   support

       anger       0.51      0.59      0.55      3594
     disgust       0.59      0.35      0.44      1555
        fear       0.73      0.55      0.63      1080
         joy       0.58      0.70      0.64      4784
     sadness       0.63      0.57      0.60      2825
    surprise       0.50      0.52      0.51      2155
     neutral       0.83      0.70      0.76     15084

   micro avg       0.68      0.64      0.66     31077
   macro avg       0.62      0.57      0.59     31077
weighted avg       0.70      0.64      0.66     31077
 samples avg       0.67      0.66      0.66     31077



In [25]:
# Тестирование
model.load_state_dict(torch.load('best_model.pt'))
test_loss, test_outputs, test_targets = validation(model, criterion, test_dataloader)
test_f1 = f1_score(test_targets, (np.array(test_outputs) > 0.5).astype(int), average='weighted')
print(f"Test F1: {test_f1}")

Test F1: 0.6667327916253339


In [26]:
loss, outputs, targets = validation(model, criterion, test_dataloader)
outputs = np.array(outputs) >= 0.5
print(metrics.classification_report(targets, outputs, target_names=labels, zero_division=0))

              precision    recall  f1-score   support

       anger       0.47      0.61      0.53      1844
     disgust       0.56      0.30      0.39      1001
        fear       0.62      0.55      0.58       701
         joy       0.53      0.71      0.60      1853
     sadness       0.60      0.59      0.60      1553
    surprise       0.54      0.55      0.54      1064
     neutral       0.85      0.70      0.77      9218

   micro avg       0.68      0.64      0.66     17234
   macro avg       0.60      0.57      0.57     17234
weighted avg       0.71      0.64      0.67     17234
 samples avg       0.66      0.66      0.65     17234



In [27]:
import pandas as pd

# Загрузим данные для предсказания и преобразуем их
submit_data =  load_dataset("csv", data_files={"submission": "test_without_answers.csv"})

Generating submission split: 8742 examples [00:00, 226970.35 examples/s]


In [28]:
submit_data = submit_data.map(one_hot_to_list)
submit_data = submit_data.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=max_len), batched=True)

Map: 100%|██████████| 8742/8742 [00:00<00:00, 9165.49 examples/s] 
Map: 100%|██████████| 8742/8742 [00:03<00:00, 2912.00 examples/s]


In [29]:
submit_dataset = EmotionDataset(submit_data["submission"])
submit_dataloader = DataLoader(submit_dataset, batch_size=1, shuffle=False)

In [31]:
# Получим предсказания для submit датасета
loss, outputs, _ = validation(model, criterion, submit_dataloader)
outputs = np.array(outputs) >= 0.5

In [32]:
# Загрузим данные для предсказания и преобразуем их
df = pd.read_csv("test_without_answers.csv")
df[labels] = outputs.astype(int)
df.drop(columns=["text"], inplace=True)
df.insert(0, "id", range(1, 1 + len(df)))

In [33]:
df.head()

Unnamed: 0,id,anger,disgust,fear,joy,sadness,surprise,neutral
0,1,0,0,0,1,0,0,0
1,2,0,0,0,1,0,0,0
2,3,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1


In [None]:
# Сохраним предсказания в файл
df.to_csv("submissions/submission_0_59818.csv", index=False)