# Обучение

In [None]:
!pip install accelerate -U

In [1]:
import pandas as pd
import re
import torch

In [5]:
!pip install evaluate --quiet
import torch
from transformers import TrainingArguments, Trainer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import evaluate

In [6]:
# Load the "accuracy" metric; compute_metrics calls metric.compute() on it during evaluation.
metric = evaluate.load("accuracy")

## Preprocessing

In [24]:
# Read the tab-separated train and dev splits.
train_data = pd.read_csv('ats_input_train.csv', sep='\t')
test_data = pd.read_csv('ats_input_dev.csv', sep='\t')

# English category label -> Russian phrase that x_data places into the model input text.
dict_map = {
    'Whole': 'впечатление вцелом',
    'Service': 'сервис',
    'Food': 'еда',
    'Interior': 'интерьер',
    'Price': 'цена',
}

# Replace the category column in both frames; values absent from dict_map become NaN.
train_data['category'] = train_data['category'].map(dict_map)
test_data['category'] = test_data['category'].map(dict_map)

In [45]:
def x_data(df):
  """Build one model-input string per row of ``df``.

  Each string has the form ``"<sent> [SEP] <aspect>, <category>"`` and is built
  from the ``sent``, ``aspect`` and ``category`` columns.

  Args:
    df: DataFrame providing the ``sent``, ``aspect`` and ``category`` columns.

  Returns:
    A list with one formatted string per row, in row order.
  """
  # Zipping the three columns avoids materialising a Series for every row,
  # which is what makes DataFrame.iterrows() slow.
  return [
      f'{sent} [SEP] {aspect}, {category}'
      for sent, aspect, category in zip(df['sent'], df['aspect'], df['category'])
  ]

In [47]:
# Build the "<sent> [SEP] <aspect>, <category>" input strings for the train and dev frames.
res = x_data(train_data)
res_test = x_data(test_data)

In [None]:
# One-hot encode the sentiment column as floats (one column per distinct sentiment value).
# NOTE(review): the index -> label dict used at inference (`maping`) presumably relies on the
# column order produced here — verify against ls.columns.
ls = pd.get_dummies(train_data.sentiment).astype('float')

## Fine-Tuning

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pretrained ruBert tokenizer and a sequence-classification model with 4 output labels.
# NOTE(review): .to('cuda') is hard-coded, so this cell requires a CUDA device.
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/ruBert-base')
model = AutoModelForSequenceClassification.from_pretrained('sberbank-ai/ruBert-base', num_labels=4).to('cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruBert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Turn the one-hot label frame into a tensor.
# NOTE(review): these are float one-hot rows; with num_labels=4 transformers presumably treats
# that as multi-label classification (BCE-with-logits loss) — confirm this is intended for a
# single-label task.
vals = ls.values
labels = torch.tensor(vals)

# Hold out 25% of the training strings for validation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(res, labels, test_size = 0.25, random_state=666, shuffle=True )

In [9]:
def prep(text, tokenizer=None, max_length=128):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    Args:
        text: text(s) to encode; passed straight to the tokenizer.
        tokenizer: callable tokenizer to use. When omitted, the notebook-level
            ``tokenizer`` is looked up at call time, so a tokenizer reloaded in
            a later cell is picked up (a ``tokenizer=tokenizer`` default would
            be frozen at the moment this cell was run).
        max_length: length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for these arguments.
    """
    if tokenizer is None:
        # The parameter shadows the global of the same name, so fetch it explicitly.
        tokenizer = globals()['tokenizer']
    return tokenizer(
        text,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )

In [15]:
# Tokenize both splits.
# NOTE: this rebinds X_train / X_test from raw strings to tokenizer output, so re-running the
# cell without re-running the split cell first would pass already-tokenized data to prep().
X_train = prep(X_train)
X_test = prep(X_test)

In [16]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from feature name to an indexable container holding
            one entry per example (for instance the output of a tokenizer).
        labels: per-example labels, either a tensor or anything
            ``torch.tensor`` accepts.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = self._as_new_tensor(labels)

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as a tensor that does not share memory with it."""
        if isinstance(value, torch.Tensor):
            # Same result as torch.tensor(value), but without the
            # "copy construct from a tensor" UserWarning it raises.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for example ``idx`` plus its ``labels``."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# Wrap the tokenized train and validation splits together with their one-hot labels.
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_test, y_test)


  self.labels = torch.tensor(labels)


In [11]:
def compute_metrics(eval_pred):
    """Compute the notebook-level ``metric`` for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. Assumes ``labels`` is 2-D with
            one one-hot row per example, matching the labels built earlier in
            this notebook.

    Returns:
        The result of ``metric.compute`` on predicted vs. reference class ids.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # One vectorized argmax over the class axis instead of a Python loop over rows.
    references = np.argmax(labels, axis=-1)
    return metric.compute(predictions=predictions, references=references)

In [17]:
# Checkpoints are written to ./my_bert_asp; evaluation and checkpoint saving both run once per epoch.
training_args = TrainingArguments(
    output_dir="./my_bert_asp",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
)

In [18]:
# Wire the model, training arguments, both datasets and the accuracy callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [19]:
# Run fine-tuning; with the arguments above this evaluates and saves a checkpoint after each epoch.
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.277153,0.783613
2,0.275600,0.239181,0.802521
3,0.151600,0.277653,0.82563
4,0.151600,0.283385,0.839286
5,0.074300,0.29782,0.839286


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


TrainOutput(global_step=1785, training_loss=0.1474945832367371, metrics={'train_runtime': 517.143, 'train_samples_per_second': 27.604, 'train_steps_per_second': 3.452, 'total_flos': 938994440217600.0, 'train_loss': 0.1474945832367371, 'epoch': 5.0})

### Использование чекпоинта

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the base tokenizer and the fine-tuned weights from the last saved checkpoint
# (checkpoint-1785 matches the final global_step reported by trainer.train()).
# NOTE(review): the absolute /content/... path and .to('cuda') tie this cell to a GPU machine
# whose working directory is /content — adjust when running elsewhere.
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/ruBert-base')
model = AutoModelForSequenceClassification.from_pretrained('/content/my_bert_asp/checkpoint-1785').to('cuda')

# Загрузка данных модели на HuggingFace

In [34]:
# Interactive Hugging Face login; the token is typed at the prompt, not stored in the notebook.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [35]:
# Upload the fine-tuned model weights to the "absa-bert-model" repository on the Hub.
model.push_to_hub("absa-bert-model")

model.safetensors:   0%|          | 0.00/713M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Bareubara/absa-bert-model/commit/1927cdb9d6f207618ae92b8db21a5888f82f1927', commit_message='Upload BertForSequenceClassification', commit_description='', oid='1927cdb9d6f207618ae92b8db21a5888f82f1927', pr_url=None, pr_revision=None, pr_num=None)

In [37]:
# Upload the tokenizer to the same repository so the model can be loaded from the Hub on its own.
tokenizer.push_to_hub("absa-bert-model")

CommitInfo(commit_url='https://huggingface.co/Bareubara/absa-bert-model/commit/ad2d5e941b01cab5d7f0d92b68abfbecff050293', commit_message='Upload tokenizer', commit_description='', oid='ad2d5e941b01cab5d7f0d92b68abfbecff050293', pr_url=None, pr_revision=None, pr_num=None)

# Загрузка созданной fine-tuned model

In [40]:
# Load the published tokenizer and fine-tuned model back from the Hub.
# NOTE(review): .to('cuda') is hard-coded, so this cell requires a CUDA device.
tokenizer = AutoTokenizer.from_pretrained('Bareubara/absa-bert-model')
model = AutoModelForSequenceClassification.from_pretrained("Bareubara/absa-bert-model").to('cuda')

In [41]:
# Tokenize the dev-set inputs with the same padding/truncation settings used for training
# and place them on whatever device the model lives on, instead of assuming 'cuda'.
inp = tokenizer(
    res_test,
    padding='max_length',
    max_length=128,
    truncation=True,
    return_tensors='pt',
).to(model.device)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(**inp).logits

In [42]:
# Bring the logits back to the CPU as a float DataFrame.
out_df = pd.DataFrame(logits.cpu()).astype('float')

# Class index -> sentiment label.
maping = dict(enumerate(['both', 'negative', 'neutral', 'positive']))

In [43]:
# Highest-scoring class of every row, translated to its sentiment label.
pred = [maping[best] for best in out_df.values.argmax(axis=1)]

# Gold sentiments of the dev set.
# NOTE: this rebinds y_test, which earlier held the validation-split labels.
y_test = test_data['sentiment'].tolist()

In [50]:
from sklearn.metrics import accuracy_score
# scikit-learn's signature is accuracy_score(y_true, y_pred), so the gold labels go first.
# Accuracy is symmetric in its arguments, so the reported value is unchanged.
acc = accuracy_score(y_test, pred)
print(f'Accuracy of aspect sentiment: {acc}')

Accuracy of aspect sentiment: 0.8200836820083682
