In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from transformers import BertTokenizer, BertModel, AdamW
from datasets import load_dataset

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Tokenizer for the uncased base BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

# The pretrained BERT model itself, loaded from the same checkpoint.
model = BertModel.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [None]:
# Show the tokenizer configuration (vocabulary size, max length, special tokens).
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Show the layer structure of the bare BertModel loaded above.
# NOTE: the name `model` is rebound to the BertClassifier further down the notebook.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

# Инфо по слоям
```python
    (word_embeddings): Embedding(30522, 768, padding_idx=0): число "30522" - это размер словаря модели
    
    (token_type_embeddings): Embedding(2, 768): слой для того, чтоб разделить две последовательности входных данных (обычно это "запрос" и "ответ")
    
    (query), (key), (value): линейные проекции механизма self-attention; по запросам и ключам вычисляются веса внимания (насколько каждый токен учитывает остальные токены последовательности), а значения взвешиваются этими весами
    
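    (pooler) - слой, который берёт скрытое состояние первого токена [CLS] и пропускает его через Linear + Tanh; результат (pooler_output) используется как представление всей последовательности для вывода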
    
```

In [None]:
# Load the "split" configuration of the dair-ai/emotion dataset and show its splits.
ds = load_dataset(path="dair-ai/emotion", name="split")
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Convert each split to a pandas DataFrame, then render the three frames in order.
df_train, df_val, df_test = (
    ds[split_name].to_pandas() for split_name in ('train', 'validation', 'test')
)
display(df_train, df_val, df_test)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


Unnamed: 0,text,label
0,im feeling quite sad and sorry for myself but ...,0
1,i feel like i am still looking at a blank canv...,0
2,i feel like a faithful servant,2
3,i am just feeling cranky and blue,3
4,i can have for a treat or if i am feeling festive,1
...,...,...
1995,im having ssa examination tomorrow in the morn...,0
1996,i constantly worry about their fight against n...,1
1997,i feel its important to share this info for th...,1
1998,i truly feel that if you are passionate enough...,1


Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0
...,...,...
1995,i just keep feeling like someone is being unki...,3
1996,im feeling a little cranky negative after this...,3
1997,i feel that i am useful to my people and that ...,1
1998,im feeling more comfortable with derby i feel ...,1


In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for BERT classification.

    Args:
        dataframe: Table with a ``'text'`` column (raw strings) and a ``'label'``
            column (integer class ids). Both columns are copied into plain lists.
        tokenizer: Tokenizer object that is called directly on each text and
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Fixed sequence length. Shorter texts are padded up to it and
            longer texts are truncated down to it.
    """

    def __init__(self, dataframe, tokenizer, max_len=64):
        self.tokenizer = tokenizer
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with 1-D tensors ``'input_ids'`` and ``'attention_mask'`` and a
            scalar ``torch.long`` tensor ``'labels'``.
        """
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments.
        encoded = self.tokenizer(
            text=self.texts[idx],
            add_special_tokens=True,       # wrap the text in [CLS] ... [SEP]
            max_length=self.max_len,       # target sequence length
            padding='max_length',          # pad short texts up to max_length
            truncation=True,               # cut long texts down to max_length
            return_token_type_ids=False,   # segment ids are only needed for text pairs
            return_attention_mask=True,    # mask separating real tokens from padding
            return_tensors='pt',           # return torch tensors
        )

        # The tokenizer returns a batch of one; drop that leading dimension so the
        # DataLoader can stack items into (batch, max_len).
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [None]:
# Number of examples per batch, shared by all three loaders.
batch_size = 16

# Wrap every split in a TextDataset (default max_len).
train_dataset, val_dataset, test_dataset = (
    TextDataset(frame, tokenizer) for frame in (df_train, df_val, df_test)
)

# Only the training loader shuffles; validation and test keep the original row order.
train_data_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_data_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_data_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Pretrained BERT has no classification head, so dropout and a linear layer are added on top.
class BertClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalized logits. ``torch.nn.CrossEntropyLoss``
    applies log-softmax internally, so feeding it probabilities (as the previous
    version did) applies softmax twice, which flattens the gradients and puts a
    floor on the achievable loss. The arg-max class is the same for logits and
    probabilities, so prediction code is unaffected. Use ``predict_proba`` when
    class probabilities are needed.

    Args:
        n_classes: Number of output classes.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = torch.nn.Dropout(p=0.3)
        # Maps the pooled BERT representation (hidden_size wide) to one score per class.
        self.out = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
        # Kept as an attribute for backward compatibility; only used by predict_proba.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return class logits with one row per input sequence and one column per class."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout on the pooled output to reduce overfitting.
        pooled = self.drop(encoded.pooler_output)
        return self.out(pooled)

    def predict_proba(self, input_ids, attention_mask):
        """Return class probabilities (softmax over the logits of ``forward``)."""
        return self.softmax(self.forward(input_ids, attention_mask))

# Build the 6-class classifier, move it to the selected device, and show its structure.
model = BertClassifier(n_classes=6).to(device)
model

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one row of class scores per example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss tensor.
        optimizer: Optimizer over the model parameters.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim double tensor on ``device``,
        computed over the examples actually processed. ``mean_loss`` is the mean of
        the per-batch losses. For an empty loader the result is ``(0.0, nan)``.
    """
    model.train()
    batch_losses = []
    # Tensor accumulator so the return type stays a tensor even if the loader is
    # empty (an int start value used to crash on `.double()`).
    n_correct = torch.zeros((), dtype=torch.long, device=device)
    # Count the examples really seen; this differs from len(dataset) when the
    # loader drops or subsamples examples.
    n_seen = 0

    for batch in tqdm(data_loader):
        optimizer.zero_grad()  # clear gradients left over from the previous step

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.argmax(dim=1)  # predicted class id per example

        loss = loss_fn(outputs, labels)
        loss.backward()   # backpropagation: compute gradients
        optimizer.step()  # apply the parameter update

        batch_losses.append(loss.item())
        n_correct += (preds == labels).sum()
        n_seen += labels.size(0)

    accuracy = n_correct.double() / max(n_seen, 1)
    mean_loss = np.mean(batch_losses) if batch_losses else float('nan')
    return accuracy, mean_loss

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one row of class scores per example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim double tensor on ``device``,
        computed over the examples actually processed. ``mean_loss`` is the mean of
        the per-batch losses. For an empty loader the result is ``(0.0, nan)``.
    """
    model.eval()
    batch_losses = []
    # Tensor accumulator so the return type stays a tensor even if the loader is
    # empty (an int start value used to crash on `.double()`).
    n_correct = torch.zeros((), dtype=torch.long, device=device)
    # Count the examples really seen; this differs from len(dataset) when the
    # loader drops or subsamples examples.
    n_seen = 0

    with torch.no_grad():  # gradients are not needed for evaluation
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.argmax(dim=1)  # predicted class id per example

            batch_losses.append(loss_fn(outputs, labels).item())
            n_correct += (preds == labels).sum()
            n_seen += labels.size(0)

    accuracy = n_correct.double() / max(n_seen, 1)
    mean_loss = np.mean(batch_losses) if batch_losses else float('nan')
    return accuracy, mean_loss

In [None]:
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is deprecated.
# eps and weight_decay are set explicitly to what are believed to be the defaults of
# the deprecated implementation, so the optimisation setup stays the same
# (NOTE(review): confirm against the installed transformers version).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss().to(device)
num_epochs = 3

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    # One optimisation pass over the training split.
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Evaluate on the validation split after every epoch.
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')




Epoch 1/3
----------


100%|██████████| 1000/1000 [01:52<00:00,  8.88it/s]


Train loss 1.3473567136526108 accuracy 0.7020000000000001
Val   loss 1.227040020942688 accuracy 0.8160000000000001
Epoch 2/3
----------


100%|██████████| 1000/1000 [01:53<00:00,  8.85it/s]


Train loss 1.1724600353240966 accuracy 0.8726875000000001
Val   loss 1.12622372341156 accuracy 0.9185
Epoch 3/3
----------


100%|██████████| 1000/1000 [01:53<00:00,  8.85it/s]


Train loss 1.1247173202037812 accuracy 0.919125
Val   loss 1.1188422813415528 accuracy 0.925


In [None]:
model.eval()

# Sanity check on the first test batch only: compare predicted and true class ids.
data = next(iter(test_data_loader))

with torch.no_grad():
    input_ids = data['input_ids'].to(device)            # token ids
    attention_mask = data['attention_mask'].to(device)  # attention masks
    labels = data['labels'].to(device)                  # true class ids

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    preds = outputs.argmax(dim=1)                       # predicted class ids

print(preds)
print(labels)

tensor([0, 0, 0, 1, 0, 4, 3, 1, 1, 3, 4, 0, 4, 1, 2, 0], device='cuda:0')
tensor([0, 0, 0, 1, 0, 4, 3, 1, 1, 3, 4, 0, 4, 1, 2, 0], device='cuda:0')


In [None]:
# Predict a class id for every test example, batch by batch, in loader order.
model.eval()
predictions = []

with torch.no_grad():
    for data in test_data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.argmax(dim=1)

        predictions.extend(preds.tolist())

# Store the predictions next to the true labels and show the result.
df_test['bert_pred'] = predictions
df_test

Unnamed: 0,text,label,bert_pred
0,im feeling rather rotten so im not very ambiti...,0,0
1,im updating my blog because i feel shitty,0,0
2,i never make her separate from me because i do...,0,0
3,i left with my bouquet of red and yellow tulip...,1,1
4,i was feeling a little vain when i did this one,0,0
...,...,...,...
1995,i just keep feeling like someone is being unki...,3,0
1996,im feeling a little cranky negative after this...,3,3
1997,i feel that i am useful to my people and that ...,1,1
1998,im feeling more comfortable with derby i feel ...,1,1


In [None]:
from sklearn.metrics import f1_score


# Weighted F1 of the predicted class ids against the true test labels.
f1_score(
    y_true=df_test['label'],
    y_pred=df_test['bert_pred'],
    average='weighted',
)

0.9255464734920573