# Fine-tune Tutorial ruBERT-tiny2

Загружаем библиотеки

In [64]:
import pandas as pd
import numpy as np
import nltk
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, AutoModel
from datasets import Dataset
import evaluate
from io import StringIO

Загружаем отобранные статьи: cтатьи из https://ria.ru/export/rss2/archive/index.xml, которые в течение трех суток были изменены и затем проверены дополнительно через Яндекс Speller

In [2]:
with open('datanews.json', encoding="utf-8") as f:
    read_data = f.read()
read_data = read_data.replace('\n][\n', ',\n')
articles = pd.read_json(StringIO(read_data), orient='records')

Формируем предложения из отобранных статей. Второе предложение типа "МОСКВА, 12 янв – РИА Новости" убираем.
В новый DataFrame записываем предложение и признак, что предложения корректные (правильные)

In [3]:
pst = nltk.PunktSentenceTokenizer()
sentences = pd.DataFrame({"text": [], "label": []})
for ind in articles.index:
    sentArticle = pst.tokenize(articles['Article'][ind])
    m = 0
    for s in sentArticle:
        if m != 1:
            sentences.loc[len(sentences.index)] = [s, 1]
print("Number of correct sentences: ", len(sentences.index))
sentences.head(3)

Number of correct sentences:  4036


Unnamed: 0,text,label
0,В Госдуме предложили сделать старый Новый год ...,1
1,Председатель союза дачников Подмосковья и депу...,1
2,"""После длинных новогодних праздников людям тяж...",1


Добавляем предложения с ошибками. При этом устанавливаем признак, что предложения неправильные

In [4]:
error_list = []
with open('errorsents.txt', encoding="utf-8") as fe:
    error_list = fe.readlines()
for s in error_list:
    sentences.loc[len(sentences.index)] = [s, 0]
print("Number of incorrect sentences: ", len(error_list))
sentences.tail(3)

Number of incorrect sentences:  57


Unnamed: 0,text,label
4090,В октябре прошлого года Евтухов в интервью РИА...,0
4091,Москва прорабатывает вопрос о возращении выжив...,0
4092,"Все пассажи про ""политические репрессии и депо...",0


In [18]:
raw_ds = Dataset.from_pandas(sentences, preserve_index=False)
raw_ds = raw_ds.class_encode_column('label')
raw_ds.features

Stringifying the column:   0%|          | 0/4093 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/4093 [00:00<?, ? examples/s]

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['0', '1'], id=None)}

Datasets:

In [19]:
raw_ds = raw_ds.train_test_split(test_size=0.2, shuffle=True)

In [65]:
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
# model = AutoModelForMaskedLM.from_pretrained("cointegrated/rubert-tiny2", num_labels=2)
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("cointegrated/rubert-tiny2", num_labels=2, is_decoder=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
def tokenize_function(data):
    return tokenizer(data["text"], truncation=True)
tokenized_ds = raw_ds.map(tokenize_function, batched=True)
tokenized_ds

Map:   0%|          | 0/3274 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3274
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 819
    })
})

In [67]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [75]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [76]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")

In [77]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [78]:
trainer.train()

  0%|          | 0/1230 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

{'eval_loss': 0.09038888663053513, 'eval_accuracy': 0.9841269841269841, 'eval_f1': 0.9919999999999999, 'eval_runtime': 29.1574, 'eval_samples_per_second': 28.089, 'eval_steps_per_second': 3.533, 'epoch': 1.0}


Checkpoint destination directory test-trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0718, 'learning_rate': 2.9674796747967482e-05, 'epoch': 1.22}


  0%|          | 0/103 [00:00<?, ?it/s]

{'eval_loss': 0.08517743647098541, 'eval_accuracy': 0.9841269841269841, 'eval_f1': 0.9919999999999999, 'eval_runtime': 30.0588, 'eval_samples_per_second': 27.247, 'eval_steps_per_second': 3.427, 'epoch': 2.0}


Checkpoint destination directory test-trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0824, 'learning_rate': 9.34959349593496e-06, 'epoch': 2.44}


  0%|          | 0/103 [00:00<?, ?it/s]

{'eval_loss': 0.08639433979988098, 'eval_accuracy': 0.9841269841269841, 'eval_f1': 0.9919999999999999, 'eval_runtime': 33.948, 'eval_samples_per_second': 24.125, 'eval_steps_per_second': 3.034, 'epoch': 3.0}
{'train_runtime': 1375.8338, 'train_samples_per_second': 7.139, 'train_steps_per_second': 0.894, 'train_loss': 0.07505875641737526, 'epoch': 3.0}


TrainOutput(global_step=1230, training_loss=0.07505875641737526, metrics={'train_runtime': 1375.8338, 'train_samples_per_second': 7.139, 'train_steps_per_second': 0.894, 'train_loss': 0.07505875641737526, 'epoch': 3.0})