In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

import torch

import os
import re
import random
import numpy as np

from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn import metrics
from sklearn.metrics import f1_score

2024-12-18 05:44:00.462896: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
else:
    exit()

NVIDIA A100-SXM4-80GB


In [3]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [4]:
# Загрузим датасет
data = load_dataset("csv", data_files={"train": "train.csv", "validation": "validation.csv", "test": "test.csv"})
# Список с метками классов
labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

In [5]:
model_name = 'ai-forever/ruBert-large'
tokenizer = BertTokenizer.from_pretrained(model_name)
max_len = 256
epochs = 20

In [6]:
# Выполним предобработку данных
def cleaner(example):
    example["deep_"] = example["text"].lower()
    example["text"] = re.sub(r"[^a-zа-я\d]", " ", example["text"])
    example["text"] = re.sub(r"\s+", " ", example["text"])
    example["text"] = example["text"].strip()
    return example

def one_hot_to_list(example):
    emotions = []
    for emotion in labels:
        emotions.append(example[emotion])
    example["one_hot_labels"] = emotions

    return example

In [7]:
#data = data.map(cleaner)
#data = data.map(lemmatize_text)
#data = data.map(remove_stopwords)
data = data.map(one_hot_to_list)
data = data.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=max_len), batched=True)

In [8]:
class EmotionDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return {
            'input_ids': torch.tensor(self.dataset[index]["input_ids"], dtype=torch.long),
            'attention_mask': torch.tensor(self.dataset[index]["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(self.dataset[index]["token_type_ids"], dtype=torch.long),
            'labels': torch.tensor(self.dataset[index]["one_hot_labels"], dtype=torch.float)
        }

In [9]:
train_dataset = EmotionDataset(data["train"])
valid_dataset = EmotionDataset(data["validation"])
test_dataset = EmotionDataset(data["test"])

In [10]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [11]:
class CustomBertModel(nn.Module):
    def __init__(self, pretrained_model_name, num_classes):
        super(CustomBertModel, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [12]:
model = CustomBertModel(pretrained_model_name=model_name, num_classes=7)
model = model.to(device)

  return self.fget.__get__(instance, owner)()


In [13]:
# Инициализируем оптимизатор и scheduler
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [14]:
def train(model, criterion, optimizer, scheduler, dataloader):
    model.train()
    train_loss = 0.0
    for data in dataloader:
        optimizer.zero_grad()
        ids = data["input_ids"].to(device)
        mask = data["attention_mask"].to(device)
        token_type_ids = data["token_type_ids"].to(device)
        labels = data["labels"].to(device)

        outputs = model(ids, mask, token_type_ids)
        loss = criterion(outputs, labels)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    return train_loss / len(dataloader)

In [15]:
def validation(model, criterion, dataloader):
    model.eval()
    val_loss = 0.0
    val_targets, val_outputs = [], []
    with torch.no_grad():
        for data in dataloader:
            ids = data["input_ids"].to(device)
            mask = data["attention_mask"].to(device)
            token_type_ids = data["token_type_ids"].to(device)
            labels = data["labels"].to(device)

            outputs = model(ids, mask, token_type_ids)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            val_targets.extend(labels.cpu().numpy())
            val_outputs.extend(torch.sigmoid(outputs).cpu().numpy())

    return val_loss / len(dataloader), val_outputs, val_targets

In [16]:
# Обучение и валидация
best_f1 = 0
patience = 3
patience_counter = 0

In [17]:
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training_ruBert-large.log"),
        logging.StreamHandler()
    ]
)
logging.info(f"Start model training with parameters:\n\n")
logging.info(f"Model name: {model_name}")
logging.info(f"Tokenization max length: {max_len}")
logging.info(f"DataLoader batch size: {64}")
logging.info(f"Optimizer: AdamW with lr 1e-5")
logging.info(f"Dataset cfg: extended.\n\n")
logging.info(f"GOOG LUCK:)\n\n")

2024-12-18 05:44:20,383 - INFO - Start model training with parameters:


2024-12-18 05:44:20,383 - INFO - Model name: ai-forever/ruBert-large
2024-12-18 05:44:20,384 - INFO - Tokenization max length: 256
2024-12-18 05:44:20,385 - INFO - DataLoader batch size: 64
2024-12-18 05:44:20,385 - INFO - Optimizer: AdamW with lr 1e-5
2024-12-18 05:44:20,386 - INFO - Dataset cfg: extended.


2024-12-18 05:44:20,386 - INFO - GOOG LUCK:)




In [18]:
for epoch in range(epochs):
    logging.info(f"Epoch: {epoch+1}.")
    train_loss = train(model, criterion, optimizer, scheduler, train_dataloader)
    val_loss, val_outputs, val_targets = validation(model, criterion, valid_dataloader)

    # Вычисление F1-score
    val_f1 = f1_score(val_targets, (np.array(val_outputs) > 0.5).astype(int), average='weighted')
    logging.info(f"Train loss: {train_loss}, Valid loss: {val_loss}, Valid F1: {val_f1}.")

    # Ранняя остановка
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        logging.info(f"Model saved.")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            logging.info("Early stopping.")
            break

2024-12-18 07:51:29,411 - INFO - Train loss: 0.23500615805132194, Valid loss: 0.2440314388772973, Valid F1: 0.6539695102583561.
2024-12-18 07:52:11,977 - INFO - Model saved.
2024-12-18 07:52:11,978 - INFO - Epoch: 3.
2024-12-18 08:55:22,770 - INFO - Train loss: 0.20693402633981525, Valid loss: 0.2471252618941981, Valid F1: 0.6675311884516196.
IOStream.flush timed out
2024-12-18 08:56:05,805 - INFO - Model saved.
2024-12-18 08:56:05,805 - INFO - Epoch: 4.
2024-12-18 09:59:17,279 - INFO - Train loss: 0.181764113050335, Valid loss: 0.2623018699426952, Valid F1: 0.6645075617785835.
2024-12-18 09:59:17,281 - INFO - Epoch: 5.
2024-12-18 11:02:31,385 - INFO - Train loss: 0.16235150498519416, Valid loss: 0.26639952744372003, Valid F1: 0.6633012602542193.
2024-12-18 11:02:31,387 - INFO - Epoch: 6.
2024-12-18 12:05:44,207 - INFO - Train loss: 0.1465960784997687, Valid loss: 0.28182867569659686, Valid F1: 0.6599971544976037.
2024-12-18 12:05:44,209 - INFO - Early stopping.


In [19]:
loss, outputs, targets = validation(model, criterion, valid_dataloader)
outputs = np.array(outputs) >= 0.5
logging.info('Validation report on validation dataset.')
logging.info(metrics.classification_report(targets, outputs, target_names=labels, zero_division=0))
val_f1 = f1_score(targets, (np.array(outputs) > 0.5).astype(int), average='weighted')
logging.info(f"Test F1-score (weighted) on validation dataset: {val_f1}.")

2024-12-18 12:11:09,087 - INFO - Validation report on validation dataset.
2024-12-18 12:11:09,141 - INFO -               precision    recall  f1-score   support

       anger       0.53      0.55      0.54      3594
     disgust       0.61      0.36      0.45      1555
        fear       0.66      0.60      0.63      1080
         joy       0.57      0.71      0.63      4784
     sadness       0.65      0.54      0.59      2825
    surprise       0.49      0.51      0.50      2155
     neutral       0.83      0.70      0.76     15084

   micro avg       0.68      0.63      0.66     31077
   macro avg       0.62      0.57      0.58     31077
weighted avg       0.70      0.63      0.66     31077
 samples avg       0.67      0.66      0.65     31077

2024-12-18 12:11:09,194 - INFO - Test F1-score (weighted) on validation dataset: 0.6599971544976037.


In [20]:
# Тестирование
logging.info('Validation report on test dataset.')
model.load_state_dict(torch.load('best_model.pt'))
test_loss, test_outputs, test_targets = validation(model, criterion, test_dataloader)
outputs = np.array(test_outputs) >= 0.5
logging.info(metrics.classification_report(test_targets, outputs, target_names=labels, zero_division=0))
test_f1 = f1_score(test_targets, (np.array(test_outputs) > 0.5).astype(int), average='weighted')
logging.info(f"Test F1-score (weighted) on test dataset: {test_f1}.")

2024-12-18 12:11:09,226 - INFO - Validation report on test dataset.
2024-12-18 12:14:12,880 - INFO -               precision    recall  f1-score   support

       anger       0.49      0.62      0.54      1844
     disgust       0.57      0.31      0.40      1001
        fear       0.62      0.55      0.58       701
         joy       0.51      0.73      0.60      1853
     sadness       0.60      0.58      0.59      1553
    surprise       0.54      0.54      0.54      1064
     neutral       0.85      0.70      0.77      9218

   micro avg       0.68      0.65      0.66     17234
   macro avg       0.60      0.58      0.58     17234
weighted avg       0.71      0.65      0.67     17234
 samples avg       0.66      0.66      0.66     17234

2024-12-18 12:14:12,915 - INFO - Test F1-score (weighted) on test dataset: 0.6678870423176024.


In [21]:
import pandas as pd

submit_data =  load_dataset("csv", data_files={"submission": "test_without_answers.csv"})

In [22]:
submit_data = submit_data.map(one_hot_to_list)
submit_data = submit_data.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=max_len), batched=True)

Map: 100%|██████████| 8742/8742 [00:03<00:00, 2407.57 examples/s]


In [23]:
submit_dataset = EmotionDataset(submit_data["submission"])
submit_dataloader = DataLoader(submit_dataset, batch_size=1, shuffle=False)

In [24]:
# Получим предсказания
loss, outputs, _ = validation(model, criterion, submit_dataloader)
outputs = np.array(outputs) >= 0.5

In [25]:
# Загрузим данные
df = pd.read_csv("test_without_answers.csv")
df[labels] = outputs.astype(int)
df.drop(columns=["text"], inplace=True)
df.insert(0, "id", range(1, 1 + len(df)))

In [26]:
df.head()

Unnamed: 0,id,anger,disgust,fear,joy,sadness,surprise,neutral
0,1,0,0,0,1,0,0,0
1,2,0,0,0,1,0,0,0
2,3,1,0,0,0,0,0,0
3,4,0,0,0,0,1,0,0
4,5,0,0,0,0,0,0,1


In [27]:
df.to_csv("submission_training_ruBert-base_CL_dataset_Fdeep.csv", index=False)

In [None]:
logging.info(f"Submission public score: 0.57910")