## Задание: обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers.
## Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip

### 1. Напишите класс для Dataset/Dataloder и разбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
### 2. Попробуйте обучить одну или несколько из моделей: Bert, Albert, Deberta. Посчитайте метрику Accuracy на train и test. (1 балл). При преодолении порога в Accuracy на test 0.8: (+1 балл), 0.85: (+2 балла), 0.89: (+3 балла).
### Пример конфигурации для deberta: https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/config.json

In [2]:
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git

Cloning into 'character-tokenizer'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 20 (delta 5), reused 10 (delta 3), pack-reused 0[K
Receiving objects: 100% (20/20), 5.89 KiB | 1.96 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [3]:
!pip install transformers



In [4]:
import string
import sys
sys.path.append("/kaggle/working/character-tokenizer")
from charactertokenizer import CharacterTokenizer

device = 'cuda'


chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
model_max_length = 64
tokenizer = CharacterTokenizer(chars, model_max_length)

In [5]:
example = "Привет"
tokens = tokenizer(example)
print(tokens)
print(tokenizer.decode(tokens['input_ids']))

{'input_ids': [0, 39, 42, 26, 12, 18, 46, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}




[CLS]Привет[SEP]


## Импортируем датасет

In [6]:
import pandas as pd
data = pd.read_csv('/kaggle/input/accents/all_accents.tsv', sep='\t')
data

Unnamed: 0,-де,-д^е
0,-ка,-к^а
1,-либо,-л^ибо
2,-нибудь,-ниб^удь
3,-с,-с
4,-таки,-так^и
...,...,...
1680529,ӂюль-верновский,ӂюль-в^ерновский
1680530,ӂюрить,ӂюр^ить
1680531,ӂӂение,ӂӂ^ение
1680532,ӂӂенный,ӂӂенный


In [7]:
data.rename(columns={"-де": "words", "-д^е": "answers"}, inplace=True)

In [8]:
data

Unnamed: 0,words,answers
0,-ка,-к^а
1,-либо,-л^ибо
2,-нибудь,-ниб^удь
3,-с,-с
4,-таки,-так^и
...,...,...
1680529,ӂюль-верновский,ӂюль-в^ерновский
1680530,ӂюрить,ӂюр^ить
1680531,ӂӂение,ӂӂ^ение
1680532,ӂӂенный,ӂӂенный


In [9]:
data.shape

(1680534, 2)

In [10]:
# проверка, все ли строки валидные

data[(data['answers'].str.contains('^')==True)].shape

(1680534, 2)

In [11]:
from sklearn.utils import shuffle
data = shuffle(data)

In [12]:
def get_emphasis_number(word: str):
    vowel_list = 'еуыаоэюия'
    order = 0
    for idx, letter in enumerate(word):
        if letter in vowel_list:
            order += 1
        
        if '^' not in word[idx:]:
            if (order == 0):
                return 1
            return order
        
    
print(get_emphasis_number('ӂӂ^ение'))
print(get_emphasis_number('ӂӂени^е'))

1
3


In [13]:
data["labels"] = data["answers"].apply(get_emphasis_number)

In [14]:
data.labels.unique()

array([ 4,  3,  2,  5,  1,  6,  7,  9,  8, 13, 10, 11, 12, 15, 21])

In [15]:
data.labels.value_counts()

labels
3     586477
2     563967
1     234078
4     209830
5      64004
6      16992
7       4056
8        900
9        171
10        38
11        12
13         4
12         3
15         1
21         1
Name: count, dtype: int64

In [16]:
print(data[data['labels']==21].words)
print(data[data['labels']==15].words)

624999    лланвайрпуллгуингиллгогерихуирндробуллллантиси...
Name: words, dtype: object
790570    никотинамидадениндинуклеотидфосфат
Name: words, dtype: object


In [17]:
data = data[data['labels']<9]

In [18]:
number_of_classes = len(data.labels.unique())
print(number_of_classes)

8


In [19]:
new_data = data.copy()

In [20]:
classes = []
n_samples = 4000

for label in range(1, number_of_classes + 1):
    classes.append(new_data[new_data['labels']==label])
    if len(classes[-1]) > n_samples:
        classes[-1] = classes[-1].sample(n=n_samples)

In [21]:
for cl in classes:
    print(cl.shape)

(4000, 3)
(4000, 3)
(4000, 3)
(4000, 3)
(4000, 3)
(4000, 3)
(4000, 3)
(900, 3)


In [22]:
prepared_data = pd.concat(classes)

In [23]:
prepared_data.labels.value_counts()

labels
1    4000
2    4000
3    4000
4    4000
5    4000
6    4000
7    4000
8     900
Name: count, dtype: int64

In [24]:
prepared_data.reset_index(drop=True, inplace=True)

## Напишем кастомный датасет

In [25]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [26]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
#         elem = self.words.iloc[idx]
#         word = self.tokenizer(elem["words"], padding=True, return_tensors='pt')
# #         ans = self.tokenizer(elem["answers"])
#         label = elem["label"]
#         return {"word": word, "label": label}
        return (self.x[idx], self.y[idx])
    
def data_collator(batch):
#     y = torch.Tensor([p[1] for p in batch]).to(model.device)
#     print("Before")
#     print("Batch:", batch)
    y = [torch.zeros(number_of_classes) for i in range(len(batch))]
    for vec, p in zip(y, batch):
        label = p[1]
        vec[p[1] - 1] = 1
    y = torch.stack(y, dim=0).to(device)
        
#     print("After")
#     x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
#     x = tokenizer([p['word']['input_ids'] for p in batch], return_tensors='pt', padding=True)
#     x = [p['word'] for p in batch]
    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(device)
    return (x, torch.Tensor(y))
        

In [27]:
dataset = CustomDataset(prepared_data.words.values, prepared_data.labels)
train_size = int(len(dataset) * 0.5)
test_size = int(len(dataset) - train_size)
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
val_size = int(len(train_dataset) * 0.1)
train_dataset, val_dataset = random_split(train_dataset, [len(train_dataset) - val_size, val_size])

In [28]:
len(train_dataset)

13005

In [29]:
BATCH_SIZE = 128
N = 0

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,num_workers=N,
                          collate_fn=data_collator, drop_last = True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=N, 
                          collate_fn=data_collator, drop_last = True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=N,
                         collate_fn=data_collator, drop_last = False)

In [30]:
batch = next(iter(train_loader))

In [31]:
batch[0][:2]

{'input_ids': tensor([[ 0, 40,  8, 42, 46, 14, 38, 44, 36, 38, 34, 18, 36, 30, 32,  8, 46, 48,
          42,  8,  1,  4,  4,  4,  4],
         [ 0, 40, 42, 38, 14, 48, 16, 26,  1,  4,  4,  4,  4,  4,  4,  4,  4,  4,
           4,  4,  4,  4,  4,  4,  4]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
          0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0]], device='cuda:0')}

In [32]:
batch[1][:2]

tensor([[0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.]], device='cuda:0')

## Тренировочный цикл

In [33]:
from tqdm import tqdm
import torch
from torch.optim.lr_scheduler import StepLR
from torch.nn.functional import softmax

SAVE_PATH = "model_params"


def train_loop(
    model, train_dataloader, val_dataloader, 
    max_epochs=10, 
    lr=1e-5,
    eval_steps = 10000
):
    optimizer = torch.optim.Adam(params = model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size = 3, gamma=0.5)
    best_f1 = float('-inf')
    
    for epoch in range(max_epochs):
        print('EPOCH', epoch)
        losses = list()
        for i, (x, y) in tqdm(enumerate(train_dataloader)):
#             inputs = [elem['input_ids'] for elem in x]
#             attention_mask = [elem['attention_mask'] for elem in x]
#             token_type_ids = [elem['token_type_ids'] for elem in x]
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                token_type_ids=x.token_type_ids,
                labels=y,
                return_dict=True
            )
#             print("labels", labels)
#             print("output", output)
            loss = output.loss
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            losses.append(loss.item())
            
            if i % eval_steps == 0:
                model.eval()
                train_loss = np.mean(losses[-eval_steps:])
                eval_loss, eval_f1, eval_accuracy, _ = evaluate_model(model, val_dataloader)
                if eval_f1 > best_f1:
                    best_f1 = eval_f1
                    torch.save(model.state_dict(), SAVE_PATH)
                print(f'step {i} train_loss: {train_loss:.3} eval_loss: {eval_loss:.3} eval_f1: {eval_f1:.3} eval_accuracy: {eval_accuracy:.3}')
                model.train()
        scheduler.step()

In [34]:
from sklearn.metrics import f1_score, accuracy_score


def evaluate_model(model, test_dataloader):
    num = 0
    den = 0
    y_true = list()
    y_pred = list()
    y_pred_prob = list()
    f1_valid = .0
    for x, y in test_dataloader:
        with torch.no_grad():
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                token_type_ids=x.token_type_ids,
                labels=y,
                return_dict=True
            )
            loss = output.loss
            
            num += len(x) * loss.item()
            den += len(x)
            
            y_pred.extend(torch.argmax(output.logits, 1).tolist())
            y_pred_prob.extend(softmax(output.logits, dim = 1)[:, 1].tolist())
            y_true.extend(torch.argmax(y, 1).tolist())
            
    val_loss = num / den
    f1_valid = f1_score(y_true, y_pred, average = 'micro')
    accuracy_valid = accuracy_score(y_true, y_pred)
    return val_loss, f1_valid, accuracy_valid, y_pred_prob

## Подгрузим модель

In [35]:
from transformers import BertForSequenceClassification, DebertaV2ForTokenClassification, DebertaV2Config


# encoder =  BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=0, eos_token_id=1)
# decoder = BertGenerationDecoder.from_pretrained("bert-l-uncased", add_cross_attention=True, is_decoder=True,
#                                                bos_token_id=0, eos_token_id=1)

# configuration = DebertaV2Config()

model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-uncased", problem_type="multi_label_classification", num_labels=number_of_classes).cuda()

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Цикл обучения

In [36]:
EPOCHS = 40

train_loop(model, train_loader, val_loader, max_epochs=EPOCHS, lr=8e-5, eval_steps=40)

EPOCH 0


1it [00:04,  4.68s/it]

step 0 train_loss: 0.674 eval_loss: 0.617 eval_f1: 0.0831 eval_accuracy: 0.0831


41it [00:20,  1.17s/it]

step 40 train_loss: 0.379 eval_loss: 0.319 eval_f1: 0.262 eval_accuracy: 0.262


81it [00:37,  1.19s/it]

step 80 train_loss: 0.314 eval_loss: 0.288 eval_f1: 0.396 eval_accuracy: 0.396


101it [00:43,  2.30it/s]


EPOCH 1


1it [00:01,  1.59s/it]

step 0 train_loss: 0.282 eval_loss: 0.295 eval_f1: 0.368 eval_accuracy: 0.368


41it [00:17,  1.13s/it]

step 40 train_loss: 0.273 eval_loss: 0.272 eval_f1: 0.428 eval_accuracy: 0.428


81it [00:33,  1.14s/it]

step 80 train_loss: 0.257 eval_loss: 0.25 eval_f1: 0.469 eval_accuracy: 0.469


101it [00:40,  2.48it/s]


EPOCH 2


1it [00:03,  3.04s/it]

step 0 train_loss: 0.244 eval_loss: 0.251 eval_f1: 0.491 eval_accuracy: 0.491


41it [00:19,  1.13s/it]

step 40 train_loss: 0.24 eval_loss: 0.222 eval_f1: 0.569 eval_accuracy: 0.569


81it [00:33,  1.40it/s]

step 80 train_loss: 0.225 eval_loss: 0.217 eval_f1: 0.551 eval_accuracy: 0.551


101it [00:40,  2.50it/s]


EPOCH 3


1it [00:02,  2.96s/it]

step 0 train_loss: 0.238 eval_loss: 0.211 eval_f1: 0.575 eval_accuracy: 0.575


41it [00:19,  1.15s/it]

step 40 train_loss: 0.211 eval_loss: 0.194 eval_f1: 0.629 eval_accuracy: 0.629


81it [00:35,  1.11s/it]

step 80 train_loss: 0.202 eval_loss: 0.189 eval_f1: 0.638 eval_accuracy: 0.638


101it [00:42,  2.40it/s]


EPOCH 4


1it [00:03,  3.00s/it]

step 0 train_loss: 0.196 eval_loss: 0.191 eval_f1: 0.654 eval_accuracy: 0.654


41it [00:19,  1.13s/it]

step 40 train_loss: 0.198 eval_loss: 0.178 eval_f1: 0.679 eval_accuracy: 0.679


81it [00:33,  1.42it/s]

step 80 train_loss: 0.194 eval_loss: 0.191 eval_f1: 0.656 eval_accuracy: 0.656


101it [00:40,  2.49it/s]


EPOCH 5


1it [00:01,  1.52s/it]

step 0 train_loss: 0.181 eval_loss: 0.197 eval_f1: 0.631 eval_accuracy: 0.631


41it [00:17,  1.11s/it]

step 40 train_loss: 0.185 eval_loss: 0.177 eval_f1: 0.691 eval_accuracy: 0.691


81it [00:32,  1.42it/s]

step 80 train_loss: 0.177 eval_loss: 0.186 eval_f1: 0.665 eval_accuracy: 0.665


101it [00:39,  2.58it/s]


EPOCH 6


1it [00:02,  2.98s/it]

step 0 train_loss: 0.166 eval_loss: 0.163 eval_f1: 0.712 eval_accuracy: 0.712


41it [00:19,  1.13s/it]

step 40 train_loss: 0.167 eval_loss: 0.157 eval_f1: 0.736 eval_accuracy: 0.736


81it [00:33,  1.39it/s]

step 80 train_loss: 0.162 eval_loss: 0.163 eval_f1: 0.726 eval_accuracy: 0.726


101it [00:40,  2.51it/s]


EPOCH 7


1it [00:01,  1.54s/it]

step 0 train_loss: 0.163 eval_loss: 0.162 eval_f1: 0.728 eval_accuracy: 0.728


41it [00:17,  1.13s/it]

step 40 train_loss: 0.155 eval_loss: 0.155 eval_f1: 0.754 eval_accuracy: 0.754


81it [00:33,  1.14s/it]

step 80 train_loss: 0.156 eval_loss: 0.143 eval_f1: 0.756 eval_accuracy: 0.756


101it [00:40,  2.51it/s]


EPOCH 8


1it [00:02,  2.96s/it]

step 0 train_loss: 0.155 eval_loss: 0.146 eval_f1: 0.761 eval_accuracy: 0.761


41it [00:17,  1.42it/s]

step 40 train_loss: 0.154 eval_loss: 0.154 eval_f1: 0.746 eval_accuracy: 0.746


81it [00:32,  1.43it/s]

step 80 train_loss: 0.155 eval_loss: 0.148 eval_f1: 0.758 eval_accuracy: 0.758


101it [00:39,  2.59it/s]


EPOCH 9


1it [00:01,  1.62s/it]

step 0 train_loss: 0.159 eval_loss: 0.145 eval_f1: 0.757 eval_accuracy: 0.757


41it [00:16,  1.42it/s]

step 40 train_loss: 0.144 eval_loss: 0.145 eval_f1: 0.759 eval_accuracy: 0.759


81it [00:32,  1.14s/it]

step 80 train_loss: 0.142 eval_loss: 0.139 eval_f1: 0.774 eval_accuracy: 0.774


101it [00:39,  2.58it/s]


EPOCH 10


1it [00:03,  3.07s/it]

step 0 train_loss: 0.133 eval_loss: 0.137 eval_f1: 0.775 eval_accuracy: 0.775


41it [00:19,  1.13s/it]

step 40 train_loss: 0.136 eval_loss: 0.138 eval_f1: 0.776 eval_accuracy: 0.776


81it [00:35,  1.11s/it]

step 80 train_loss: 0.141 eval_loss: 0.135 eval_f1: 0.778 eval_accuracy: 0.778


101it [00:41,  2.41it/s]


EPOCH 11


1it [00:01,  1.59s/it]

step 0 train_loss: 0.145 eval_loss: 0.136 eval_f1: 0.778 eval_accuracy: 0.778


41it [00:17,  1.12s/it]

step 40 train_loss: 0.134 eval_loss: 0.132 eval_f1: 0.785 eval_accuracy: 0.785


81it [00:32,  1.41it/s]

step 80 train_loss: 0.133 eval_loss: 0.136 eval_f1: 0.778 eval_accuracy: 0.778


101it [00:39,  2.58it/s]


EPOCH 12


1it [00:01,  1.57s/it]

step 0 train_loss: 0.144 eval_loss: 0.136 eval_f1: 0.776 eval_accuracy: 0.776


41it [00:16,  1.43it/s]

step 40 train_loss: 0.131 eval_loss: 0.132 eval_f1: 0.785 eval_accuracy: 0.785


81it [00:32,  1.14s/it]

step 80 train_loss: 0.127 eval_loss: 0.131 eval_f1: 0.792 eval_accuracy: 0.792


101it [00:39,  2.56it/s]


EPOCH 13


1it [00:01,  1.53s/it]

step 0 train_loss: 0.12 eval_loss: 0.133 eval_f1: 0.786 eval_accuracy: 0.786


41it [00:16,  1.41it/s]

step 40 train_loss: 0.127 eval_loss: 0.131 eval_f1: 0.792 eval_accuracy: 0.792


81it [00:30,  1.39it/s]

step 80 train_loss: 0.127 eval_loss: 0.13 eval_f1: 0.786 eval_accuracy: 0.786


101it [00:37,  2.69it/s]


EPOCH 14


1it [00:01,  1.71s/it]

step 0 train_loss: 0.109 eval_loss: 0.13 eval_f1: 0.789 eval_accuracy: 0.789


41it [00:17,  1.13s/it]

step 40 train_loss: 0.125 eval_loss: 0.127 eval_f1: 0.803 eval_accuracy: 0.803


81it [00:32,  1.42it/s]

step 80 train_loss: 0.125 eval_loss: 0.129 eval_f1: 0.789 eval_accuracy: 0.789


101it [00:39,  2.58it/s]


EPOCH 15


1it [00:01,  1.59s/it]

step 0 train_loss: 0.106 eval_loss: 0.128 eval_f1: 0.795 eval_accuracy: 0.795


41it [00:16,  1.43it/s]

step 40 train_loss: 0.121 eval_loss: 0.127 eval_f1: 0.796 eval_accuracy: 0.796


81it [00:30,  1.35it/s]

step 80 train_loss: 0.122 eval_loss: 0.126 eval_f1: 0.801 eval_accuracy: 0.801


101it [00:37,  2.68it/s]


EPOCH 16


1it [00:01,  1.57s/it]

step 0 train_loss: 0.122 eval_loss: 0.13 eval_f1: 0.791 eval_accuracy: 0.791


41it [00:16,  1.43it/s]

step 40 train_loss: 0.119 eval_loss: 0.126 eval_f1: 0.794 eval_accuracy: 0.794


81it [00:30,  1.42it/s]

step 80 train_loss: 0.121 eval_loss: 0.126 eval_f1: 0.802 eval_accuracy: 0.802


101it [00:37,  2.68it/s]


EPOCH 17


1it [00:01,  1.57s/it]

step 0 train_loss: 0.115 eval_loss: 0.126 eval_f1: 0.8 eval_accuracy: 0.8


41it [00:16,  1.40it/s]

step 40 train_loss: 0.118 eval_loss: 0.127 eval_f1: 0.795 eval_accuracy: 0.795


81it [00:31,  1.44it/s]

step 80 train_loss: 0.117 eval_loss: 0.128 eval_f1: 0.793 eval_accuracy: 0.793


101it [00:37,  2.67it/s]


EPOCH 18


1it [00:03,  3.02s/it]

step 0 train_loss: 0.116 eval_loss: 0.123 eval_f1: 0.804 eval_accuracy: 0.804


41it [00:17,  1.38it/s]

step 40 train_loss: 0.119 eval_loss: 0.125 eval_f1: 0.799 eval_accuracy: 0.799


81it [00:32,  1.42it/s]

step 80 train_loss: 0.117 eval_loss: 0.124 eval_f1: 0.801 eval_accuracy: 0.801


101it [00:39,  2.57it/s]


EPOCH 19


1it [00:01,  1.57s/it]

step 0 train_loss: 0.118 eval_loss: 0.125 eval_f1: 0.8 eval_accuracy: 0.8


41it [00:16,  1.39it/s]

step 40 train_loss: 0.119 eval_loss: 0.124 eval_f1: 0.8 eval_accuracy: 0.8


81it [00:30,  1.42it/s]

step 80 train_loss: 0.116 eval_loss: 0.126 eval_f1: 0.795 eval_accuracy: 0.795


101it [00:37,  2.67it/s]


EPOCH 20


1it [00:01,  1.54s/it]

step 0 train_loss: 0.107 eval_loss: 0.126 eval_f1: 0.799 eval_accuracy: 0.799


41it [00:16,  1.42it/s]

step 40 train_loss: 0.116 eval_loss: 0.123 eval_f1: 0.803 eval_accuracy: 0.803


81it [00:32,  1.12s/it]

step 80 train_loss: 0.118 eval_loss: 0.124 eval_f1: 0.806 eval_accuracy: 0.806


101it [00:39,  2.58it/s]


EPOCH 21


1it [00:01,  1.56s/it]

step 0 train_loss: 0.0989 eval_loss: 0.125 eval_f1: 0.803 eval_accuracy: 0.803


41it [00:16,  1.42it/s]

step 40 train_loss: 0.118 eval_loss: 0.126 eval_f1: 0.793 eval_accuracy: 0.793


81it [00:30,  1.41it/s]

step 80 train_loss: 0.118 eval_loss: 0.125 eval_f1: 0.802 eval_accuracy: 0.802


101it [00:37,  2.67it/s]


EPOCH 22


1it [00:01,  1.57s/it]

step 0 train_loss: 0.106 eval_loss: 0.124 eval_f1: 0.804 eval_accuracy: 0.804


41it [00:16,  1.42it/s]

step 40 train_loss: 0.115 eval_loss: 0.124 eval_f1: 0.804 eval_accuracy: 0.804


81it [00:30,  1.42it/s]

step 80 train_loss: 0.116 eval_loss: 0.124 eval_f1: 0.799 eval_accuracy: 0.799


101it [00:37,  2.69it/s]


EPOCH 23


1it [00:01,  1.59s/it]

step 0 train_loss: 0.107 eval_loss: 0.123 eval_f1: 0.802 eval_accuracy: 0.802


41it [00:16,  1.42it/s]

step 40 train_loss: 0.114 eval_loss: 0.123 eval_f1: 0.802 eval_accuracy: 0.802


81it [00:31,  1.42it/s]

step 80 train_loss: 0.117 eval_loss: 0.124 eval_f1: 0.795 eval_accuracy: 0.795


101it [00:37,  2.66it/s]


EPOCH 24


1it [00:01,  1.56s/it]

step 0 train_loss: 0.11 eval_loss: 0.124 eval_f1: 0.797 eval_accuracy: 0.797


41it [00:16,  1.39it/s]

step 40 train_loss: 0.113 eval_loss: 0.124 eval_f1: 0.8 eval_accuracy: 0.8


81it [00:31,  1.40it/s]

step 80 train_loss: 0.115 eval_loss: 0.124 eval_f1: 0.8 eval_accuracy: 0.8


101it [00:37,  2.67it/s]


EPOCH 25


1it [00:01,  1.53s/it]

step 0 train_loss: 0.119 eval_loss: 0.126 eval_f1: 0.795 eval_accuracy: 0.795


81it [00:31,  1.40it/s]

step 80 train_loss: 0.114 eval_loss: 0.124 eval_f1: 0.799 eval_accuracy: 0.799


101it [00:37,  2.66it/s]


EPOCH 26


1it [00:01,  1.55s/it]

step 0 train_loss: 0.125 eval_loss: 0.123 eval_f1: 0.8 eval_accuracy: 0.8


41it [00:16,  1.41it/s]

step 40 train_loss: 0.114 eval_loss: 0.125 eval_f1: 0.798 eval_accuracy: 0.798


81it [00:31,  1.42it/s]

step 80 train_loss: 0.113 eval_loss: 0.124 eval_f1: 0.799 eval_accuracy: 0.799


101it [00:37,  2.68it/s]


EPOCH 27


1it [00:01,  1.55s/it]

step 0 train_loss: 0.0879 eval_loss: 0.124 eval_f1: 0.797 eval_accuracy: 0.797


41it [00:16,  1.43it/s]

step 40 train_loss: 0.117 eval_loss: 0.123 eval_f1: 0.801 eval_accuracy: 0.801


81it [00:31,  1.40it/s]

step 80 train_loss: 0.112 eval_loss: 0.124 eval_f1: 0.801 eval_accuracy: 0.801


101it [00:37,  2.67it/s]


EPOCH 28


1it [00:01,  1.55s/it]

step 0 train_loss: 0.0994 eval_loss: 0.123 eval_f1: 0.801 eval_accuracy: 0.801


41it [00:16,  1.42it/s]

step 40 train_loss: 0.117 eval_loss: 0.121 eval_f1: 0.806 eval_accuracy: 0.806


81it [00:31,  1.44it/s]

step 80 train_loss: 0.114 eval_loss: 0.123 eval_f1: 0.8 eval_accuracy: 0.8


101it [00:38,  2.66it/s]


EPOCH 29


1it [00:01,  1.54s/it]

step 0 train_loss: 0.138 eval_loss: 0.125 eval_f1: 0.799 eval_accuracy: 0.799


41it [00:16,  1.42it/s]

step 40 train_loss: 0.115 eval_loss: 0.124 eval_f1: 0.798 eval_accuracy: 0.798


81it [00:31,  1.44it/s]

step 80 train_loss: 0.114 eval_loss: 0.125 eval_f1: 0.796 eval_accuracy: 0.796


101it [00:37,  2.68it/s]


EPOCH 30


1it [00:01,  1.58s/it]

step 0 train_loss: 0.0942 eval_loss: 0.123 eval_f1: 0.8 eval_accuracy: 0.8


41it [00:16,  1.39it/s]

step 40 train_loss: 0.116 eval_loss: 0.124 eval_f1: 0.803 eval_accuracy: 0.803


81it [00:30,  1.39it/s]

step 80 train_loss: 0.114 eval_loss: 0.124 eval_f1: 0.798 eval_accuracy: 0.798


101it [00:37,  2.67it/s]


EPOCH 31


1it [00:01,  1.57s/it]

step 0 train_loss: 0.106 eval_loss: 0.125 eval_f1: 0.8 eval_accuracy: 0.8


41it [00:16,  1.41it/s]

step 40 train_loss: 0.115 eval_loss: 0.125 eval_f1: 0.799 eval_accuracy: 0.799


81it [00:30,  1.43it/s]

step 80 train_loss: 0.113 eval_loss: 0.123 eval_f1: 0.799 eval_accuracy: 0.799


101it [00:37,  2.69it/s]


EPOCH 32


1it [00:01,  1.60s/it]

step 0 train_loss: 0.105 eval_loss: 0.125 eval_f1: 0.795 eval_accuracy: 0.795


41it [00:16,  1.43it/s]

step 40 train_loss: 0.118 eval_loss: 0.124 eval_f1: 0.799 eval_accuracy: 0.799


81it [00:31,  1.43it/s]

step 80 train_loss: 0.11 eval_loss: 0.124 eval_f1: 0.798 eval_accuracy: 0.798


101it [00:37,  2.67it/s]


EPOCH 33


1it [00:01,  1.55s/it]

step 0 train_loss: 0.139 eval_loss: 0.124 eval_f1: 0.799 eval_accuracy: 0.799


41it [00:16,  1.38it/s]

step 40 train_loss: 0.113 eval_loss: 0.124 eval_f1: 0.798 eval_accuracy: 0.798


81it [00:31,  1.41it/s]

step 80 train_loss: 0.113 eval_loss: 0.124 eval_f1: 0.8 eval_accuracy: 0.8


101it [00:37,  2.67it/s]


EPOCH 34


1it [00:01,  1.57s/it]

step 0 train_loss: 0.0923 eval_loss: 0.122 eval_f1: 0.799 eval_accuracy: 0.799


41it [00:16,  1.42it/s]

step 40 train_loss: 0.115 eval_loss: 0.123 eval_f1: 0.8 eval_accuracy: 0.8


81it [00:31,  1.42it/s]

step 80 train_loss: 0.115 eval_loss: 0.124 eval_f1: 0.797 eval_accuracy: 0.797


101it [00:37,  2.67it/s]


EPOCH 35


1it [00:01,  1.69s/it]

step 0 train_loss: 0.111 eval_loss: 0.123 eval_f1: 0.798 eval_accuracy: 0.798


41it [00:16,  1.41it/s]

step 40 train_loss: 0.112 eval_loss: 0.124 eval_f1: 0.797 eval_accuracy: 0.797


81it [00:31,  1.44it/s]

step 80 train_loss: 0.111 eval_loss: 0.125 eval_f1: 0.797 eval_accuracy: 0.797


101it [00:37,  2.67it/s]


EPOCH 36


1it [00:01,  1.58s/it]

step 0 train_loss: 0.125 eval_loss: 0.124 eval_f1: 0.798 eval_accuracy: 0.798


41it [00:16,  1.41it/s]

step 40 train_loss: 0.115 eval_loss: 0.124 eval_f1: 0.798 eval_accuracy: 0.798


81it [00:31,  1.44it/s]

step 80 train_loss: 0.112 eval_loss: 0.125 eval_f1: 0.795 eval_accuracy: 0.795


101it [00:37,  2.67it/s]


EPOCH 37


1it [00:01,  1.57s/it]

step 0 train_loss: 0.13 eval_loss: 0.124 eval_f1: 0.799 eval_accuracy: 0.799


41it [00:16,  1.41it/s]

step 40 train_loss: 0.115 eval_loss: 0.126 eval_f1: 0.793 eval_accuracy: 0.793


81it [00:30,  1.42it/s]

step 80 train_loss: 0.115 eval_loss: 0.124 eval_f1: 0.795 eval_accuracy: 0.795


101it [00:37,  2.68it/s]


EPOCH 38


1it [00:01,  1.54s/it]

step 0 train_loss: 0.117 eval_loss: 0.123 eval_f1: 0.799 eval_accuracy: 0.799


41it [00:16,  1.42it/s]

step 40 train_loss: 0.111 eval_loss: 0.124 eval_f1: 0.8 eval_accuracy: 0.8


81it [00:31,  1.41it/s]

step 80 train_loss: 0.117 eval_loss: 0.124 eval_f1: 0.797 eval_accuracy: 0.797


101it [00:37,  2.67it/s]


EPOCH 39


1it [00:01,  1.55s/it]

step 0 train_loss: 0.114 eval_loss: 0.124 eval_f1: 0.798 eval_accuracy: 0.798


41it [00:16,  1.34it/s]

step 40 train_loss: 0.113 eval_loss: 0.125 eval_f1: 0.795 eval_accuracy: 0.795


81it [00:30,  1.42it/s]

step 80 train_loss: 0.113 eval_loss: 0.125 eval_f1: 0.796 eval_accuracy: 0.796


101it [00:37,  2.68it/s]


In [37]:
num = 0
den = 0
y_true = list()
y_pred = list()
y_pred_prob = list()
f1_valid = .0


for x, y in val_loader:
        with torch.no_grad():
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                token_type_ids=x.token_type_ids,
                labels=y,
                return_dict=True
            )
            loss = output.loss
            
            y_pred.extend(torch.argmax(output.logits, 1).tolist())
            y_pred_prob.extend(softmax(output.logits, dim = 1)[:, 1].tolist())
            y_true.extend(torch.argmax(y, 1).tolist())
            
            print(y_pred)
            print(y_true)
            break

[6, 5, 5, 3, 2, 3, 2, 3, 5, 0, 3, 6, 6, 3, 4, 6, 3, 6, 2, 2, 3, 3, 3, 6, 1, 5, 6, 5, 4, 5, 5, 1, 5, 4, 2, 4, 0, 4, 3, 0, 2, 5, 5, 0, 6, 1, 6, 6, 3, 1, 0, 5, 4, 2, 0, 3, 5, 1, 7, 2, 3, 6, 1, 6, 5, 4, 1, 6, 5, 4, 2, 6, 3, 6, 2, 6, 1, 6, 1, 6, 6, 5, 4, 0, 4, 5, 3, 1, 5, 0, 4, 1, 0, 4, 4, 6, 4, 5, 4, 6, 6, 2, 5, 3, 2, 6, 5, 4, 7, 5, 5, 2, 4, 1, 3, 2, 0, 5, 3, 4, 6, 6, 2, 1, 3, 2, 4, 1]
[6, 4, 5, 3, 2, 3, 2, 3, 5, 0, 3, 6, 6, 3, 4, 6, 2, 6, 2, 2, 3, 2, 3, 6, 1, 5, 5, 5, 4, 5, 5, 1, 5, 4, 2, 4, 2, 4, 3, 0, 2, 5, 5, 0, 6, 1, 6, 5, 3, 0, 0, 5, 5, 2, 3, 3, 5, 0, 7, 1, 3, 6, 1, 6, 5, 4, 1, 4, 5, 4, 2, 6, 4, 6, 2, 6, 1, 6, 2, 6, 6, 5, 4, 1, 4, 5, 3, 2, 5, 0, 4, 2, 1, 4, 4, 6, 4, 5, 4, 6, 6, 2, 4, 3, 2, 6, 5, 4, 7, 5, 5, 1, 4, 1, 2, 1, 0, 5, 3, 4, 6, 6, 2, 2, 3, 1, 4, 0]


# Проверка на тестовой выборке

In [38]:
y_true = list()
y_pred = list()
y_pred_prob = list()
f1_valid = .0


for x, y in val_loader:
        with torch.no_grad():
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                token_type_ids=x.token_type_ids,
                labels=y,
                return_dict=True
            )
            loss = output.loss
            
            y_pred.extend(torch.argmax(output.logits, 1).tolist())
            y_pred_prob.extend(softmax(output.logits, dim = 1)[:, 1].tolist())
            y_true.extend(torch.argmax(y, 1).tolist())
            
accuracy_test = accuracy_score(y_true, y_pred)
            

In [39]:
accuracy_test

0.7904829545454546

# Если загрузить веса с лучшей эпохи

In [45]:
model_best = model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-uncased", problem_type="multi_label_classification", num_labels=number_of_classes).cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
model_best.load_state_dict(torch.load(SAVE_PATH))

<All keys matched successfully>

In [48]:
model_best.eval()


y_true = list()
y_pred = list()
y_pred_prob = list()
f1_valid = .0


for x, y in val_loader:
        with torch.no_grad():
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                token_type_ids=x.token_type_ids,
                labels=y,
                return_dict=True
            )
            loss = output.loss
            
            y_pred.extend(torch.argmax(output.logits, 1).tolist())
            y_pred_prob.extend(softmax(output.logits, dim = 1)[:, 1].tolist())
            y_true.extend(torch.argmax(y, 1).tolist())
            
accuracy_test = accuracy_score(y_true, y_pred)
            

In [49]:
accuracy_test

0.8082386363636364

# Итог: с помощью BERT получилось добиться accuracy = 0.808