In [1]:
import logging
from transformers.trainer import logger as noisy_logger
noisy_logger.setLevel(logging.WARNING)

In [2]:
import numpy as np
import pandas as pd

import os

from datasets import load_dataset, load_metric

In [3]:
corpus = load_dataset('merionum/ru_paraphraser')

Using custom data configuration merionum--ru_paraphraser-e39dafb2b050eb83
Found cached dataset json (C:/Users/nikit/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-e39dafb2b050eb83/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
corpus

DatasetDict({
    train: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 7227
    })
    test: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 1924
    })
})

In [5]:
model_name = "IlyaGusev/xlm_roberta_large_headline_cause_simple"
batch_size = 16

In [6]:
from transformers import AutoTokenizer, BertTokenizerFast

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
label_list = sorted(set(corpus['train']['class']))
label_list

['-1', '0', '1']

In [8]:
labels2id = { key:id for id, key in enumerate(label_list)}
labels2id

{'-1': 0, '0': 1, '1': 2}

In [9]:
id2labels = { id:key for id, key in enumerate(label_list)}
id2labels

{0: '-1', 1: '0', 2: '1'}

In [10]:
def tokenize_and_align_labels(tokenizer, labels2id):
    def tokenize_and_align_labels_(examples):
        tokenized_inputs = tokenizer(examples['text_1'],examples['text_2'], truncation=True)
        tokenized_inputs["labels"] = [labels2id[label] for label in examples['class']]
        return tokenized_inputs
    return tokenize_and_align_labels_

In [11]:
tokenized_input = tokenize_and_align_labels(tokenizer, labels2id)(corpus['train'][20:24])
tokenized_input

{'input_ids': [[0, 18449, 1068, 1951, 169066, 59, 42136, 59392, 529, 68056, 49869, 159288, 6, 32345, 14539, 25455, 5, 2, 2, 6, 144048, 145158, 184643, 18449, 28196, 529, 68056, 49869, 159288, 6, 32345, 14539, 25455, 5, 2], [0, 417, 33372, 526, 70847, 132653, 326, 175857, 529, 861, 29034, 7329, 155333, 24863, 5, 2, 2, 417, 33372, 526, 70847, 132653, 326, 175857, 529, 861, 29034, 534, 11591, 149673, 5, 2], [0, 417, 14100, 695, 1200, 49, 30037, 1269, 326, 3699, 108854, 1857, 56869, 90229, 546, 209, 9561, 5, 2, 2, 417, 68642, 69, 53177, 29, 108854, 1857, 47745, 49, 14100, 695, 1200, 90229, 546, 209, 9561, 5, 2], [0, 72085, 138109, 87830, 29, 23262, 2294, 90569, 11974, 24744, 59, 161679, 8568, 5, 2, 2, 69242, 64393, 41976, 4560, 1560, 182076, 29, 29230, 23262, 49, 72085, 138109, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [12]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])
print(tokens)

['<s>', '‚ñÅ–ß–µ', '—á', '–Ω—è', '‚ñÅ–ø–æ–ø—Ä–æ—Å–∏–ª', '–∞', '‚ñÅ–Ω–∞—Ü–∏–æ–Ω–∞–ª', '–∏—Å—Ç–æ–≤', '‚ñÅ—Å–æ', '—Å—Ç–∞–≤–∏—Ç—å', '‚ñÅ–∫–æ–¥–µ–∫—Å', '‚ñÅ–ø–æ–≤–µ–¥–µ–Ω–∏—è', '‚ñÅ', '–∫–∞–≤', '–∫–∞–∑', '—Ü–µ–≤', '.', '</s>', '</s>', '‚ñÅ', '–ù–∞—Ü–∏–æ–Ω–∞–ª', '–∏—Å—Ç—ã', '‚ñÅ–ø–æ–º–æ–≥—É—Ç', '‚ñÅ–ß–µ', '—á–Ω–µ', '‚ñÅ—Å–æ', '—Å—Ç–∞–≤–∏—Ç—å', '‚ñÅ–∫–æ–¥–µ–∫—Å', '‚ñÅ–ø–æ–≤–µ–¥–µ–Ω–∏—è', '‚ñÅ', '–∫–∞–≤', '–∫–∞–∑', '—Ü–µ–≤', '.', '</s>']


In [13]:
corpus['train'][20]

{'id': '21',
 'id_1': '3',
 'id_2': '4116',
 'text_1': '–ß–µ—á–Ω—è –ø–æ–ø—Ä–æ—Å–∏–ª–∞ –Ω–∞—Ü–∏–æ–Ω–∞–ª–∏—Å—Ç–æ–≤ —Å–æ—Å—Ç–∞–≤–∏—Ç—å –∫–æ–¥–µ–∫—Å –ø–æ–≤–µ–¥–µ–Ω–∏—è –∫–∞–≤–∫–∞–∑—Ü–µ–≤.',
 'text_2': '–ù–∞—Ü–∏–æ–Ω–∞–ª–∏—Å—Ç—ã –ø–æ–º–æ–≥—É—Ç –ß–µ—á–Ω–µ —Å–æ—Å—Ç–∞–≤–∏—Ç—å –∫–æ–¥–µ–∫—Å –ø–æ–≤–µ–¥–µ–Ω–∏—è –∫–∞–≤–∫–∞–∑—Ü–µ–≤.',
 'class': '0'}

In [14]:
tokenized_datasets = corpus.map(tokenize_and_align_labels(tokenizer, labels2id), batched=True)

Loading cached processed dataset at C:/Users/nikit/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-e39dafb2b050eb83/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-7d1970e02f698707.arrow
Loading cached processed dataset at C:/Users/nikit/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-e39dafb2b050eb83/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-c816f4ded8d8e856.arrow


In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7227
    })
    test: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1924
    })
})

In [16]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list))
model.config.id2label = id2labels
model.config.label2id = labels2id

In [18]:
from datasets import load_metric

metric = load_metric("accuracy")

  metric = load_metric("accuracy")


In [19]:
def compute_metrics(eval_pred):

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

In [20]:
args = TrainingArguments(
    "paraphras",
    evaluation_strategy = "epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.05,
    save_strategy='no',
    report_to='none',
)

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [22]:
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=2260, training_loss=0.5833041233299053, metrics={'train_runtime': 29037.5976, 'train_samples_per_second': 1.244, 'train_steps_per_second': 0.078, 'total_flos': 3034369670565798.0, 'train_loss': 0.5833041233299053, 'epoch': 5.0})

In [23]:
example = tokenized_datasets["test"][40:80]

In [25]:
import torch
tokens = tokenizer(example['text_1'], example['text_1'], padding=True, truncation=True, return_tensors='pt')
#tokens = tokens.to('cuda:0')

with torch.no_grad():
    outputs = model(**tokens)

predicted = outputs.logits.argmax(dim=-1).cpu().numpy()
classes = [id2labels[id_label] for id_label in predicted]

In [26]:
df_example =pd.DataFrame({'text_1':example['text_1'], 'text_2':example['text_2'], 'class':example['class'], 'predict':classes})
df_example

Unnamed: 0,text_1,text_2,class,predict
0,–í–ª–∞–¥–∏–º–∏—Ä –ü—É—Ç–∏–Ω –æ—Å–≤–æ–±–æ–¥–∏–ª –æ—Ç –¥–æ–ª–∂–Ω–æ—Å—Ç–∏ –ø–æ—á—Ç–∏ 20...,–ü—É—Ç–∏–Ω —Å–Ω—è–ª —Å –¥–æ–ª–∂–Ω–æ—Å—Ç–µ–π –±–æ–ª–µ–µ 20 —Ä—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª–µ–π...,0,1
1,–ò—Å—Ç–æ—á–Ω–∏–∫: –Ø–∫—É–Ω–∏–Ω –æ–∑–≤—É—á–∏–ª —Å–≤–æ—é –º–µ—Å—è—á–Ω—É—é –∑–∞—Ä–ø–ª–∞—Ç...,–Ø–∫—É–Ω–∏–Ω –Ω–∞–∑–≤–∞–ª —Ä–∞–∑–º–µ—Ä —Å–≤–æ–µ–π –∑–∞—Ä–ø–ª–∞—Ç—ã,0,1
2,Apple –ø–æ–ª—É—á–∏–ª–∞ –ø–∞—Ç–µ–Ω—Ç –Ω–∞ —Å–æ—Ü–∏–∞–ª—å–Ω—É—é —Å–µ—Ç—å,Apple –ø–æ–ª—É—á–∏–ª–∞ –ø–∞—Ç–µ–Ω—Ç –Ω–∞ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—É—é —Å–æ—Ü—Å–µ—Ç—å,1,1
3,–ü–æ–¥–≤–µ—Å–Ω–æ–π –ø–æ—Ç–æ–ª–æ–∫ –æ–±—Ä—É—à–∏–ª—Å—è –≤ –∞—ç—Ä–æ–ø–æ—Ä—Ç—É –°—Ç–∞–º–±—É...,–ß–µ—Ç–≤–µ—Ä–æ –ø–æ—Å—Ç—Ä–∞–¥–∞–ª–∏ –ø—Ä–∏ –æ–±—Ä—É—à–µ–Ω–∏–∏ –ø–æ–¥–≤–µ—Å–Ω–æ–≥–æ –ø–æ...,1,1
4,–†–æ—Å—Å–∏—è –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è–µ—Ç –ø—Ä–∏–Ω—Ü–∏–ø–∏–∞–ª—å–Ω–æ –Ω–æ–≤—ã–π —Ç–∞–Ω–∫,–í–∞—à–∏–Ω–≥—Ç–æ–Ω –ø—Ä–∏–≥—Ä–æ–∑–∏–ª –†–æ—Å—Å–∏–∏ –Ω–æ–≤—ã–º–∏ —Å–∞–Ω–∫—Ü–∏—è–º–∏,-1,1
5,–ì–µ–Ω—Å–µ–∫ –û–û–ù –∏ –ø—Ä–µ–º—å–µ—Ä –£–∫—Ä–∞–∏–Ω—ã –æ–±—Å—É–¥–∏–ª–∏ –º–∏–Ω—Å–∫–∏–µ ...,–ì–µ–Ω—Å–µ–∫ –û–û–ù –∏ –ø—Ä–µ–º—å–µ—Ä-–º–∏–Ω–∏—Å—Ç—Ä –£–∫—Ä–∞–∏–Ω—ã –æ–±—Å—É–¥–∏–ª–∏ ...,1,1
6,–†–µ–∑—É–ª—å—Ç–∞—Ç—ã –≤—ã–±–æ—Ä–æ–≤ –≤ –í–µ–ª–∏–∫–æ–±—Ä–∏—Ç–∞–Ω–∏–∏ —É–∫—Ä–µ–ø–∏–ª–∏ —Ñ—É–Ω—Ç,–°—Ç–∞–ª–∏ –∏–∑–≤–µ—Å—Ç–Ω—ã –æ–∫–æ–Ω—á–∞—Ç–µ–ª—å–Ω—ã–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –≤—ã–±–æ—Ä–æ...,-1,1
7,–ù–∏–∫–æ–ª–∏—á: –°–µ—Ä–±–∏—è –Ω–µ –≥–æ–≤–æ—Ä–∏–ª–∞ –æ –Ω–µ–∂–µ–ª–∞–Ω–∏–∏ —Å—Ç—Ä–æ–∏—Ç...,–ü—Ä–µ–∑–∏–¥–µ–Ω—Ç –°–µ—Ä–±–∏–∏ –æ–ø—Ä–æ–≤–µ—Ä–≥ —Å–≤–æ–∏ —Å–ª–æ–≤–∞ –æ–± –æ—Ç–∫–∞–∑–µ...,0,1
8,–°–ú–ò —Å–æ–æ–±—â–∞—é—Ç –æ–± –æ–±—ã—Å–∫–∞—Ö –≤ –ú–∏–Ω–∫—É–ª—å—Ç–µ –ø–æ –¥–µ–ª—É –æ ...,–ú–∏–Ω–∫—É–ª—å—Ç—É—Ä—ã –æ—Ç—á–∏—Ç–∞–ª–æ—Å—å –ø–µ—Ä–µ–¥ —Å–ª–µ–¥—Å—Ç–≤–∏–µ–º –ø–æ –¥–µ–ª...,-1,1
9,–õ—é–±–ª—è–Ω–∞ –æ—Ç–ø—Ä–∞–∑–¥–Ω—É–µ—Ç –î–µ–Ω—å –ü–æ–±–µ–¥—ã –≤–º–µ—Å—Ç–µ —Å –ú–æ—Å–∫–≤–æ–π,–ü—É—Ç–µ–≤–æ–¥–∏—Ç–µ–ª—å –ø–æ –î–Ω—é –ü–æ–±–µ–¥—ã: –∫–∞–∫ –ø—Ä–æ–≤–µ—Å—Ç–∏ 9 –º–∞—è...,-1,1
