# Урок 13. Модель BERT и GPT

### Задание
#### Взять датасет https://huggingface.co/datasets/merionum/ru_paraphraser решить задачу парафраза

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from datasets import load_dataset

corpus = load_dataset('merionum/ru_paraphraser')

Using custom data configuration merionum--ru_paraphraser-e39dafb2b050eb83
Found cached dataset json (C:/Users/Mn_Specter/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-e39dafb2b050eb83/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
corpus

DatasetDict({
    train: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 7227
    })
    test: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 1924
    })
})

In [7]:
label_list = sorted(set(corpus['train']['class']))
labels2id = { key:id for id, key in enumerate(label_list)}
id2labels = { id:key for id, key in enumerate(label_list)}

In [8]:
from transformers import AutoTokenizer, BertTokenizerFast
model_name = 'IlyaGusev/xlm_roberta_large_headline_cause_simple'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
def tokenize_and_align_labels(tokenizer, labels2id):    
    def tokenize_and_align_labels_(examples):
        tokenized_inputs = tokenizer(examples['text_1'],examples['text_2'], truncation=True)
        tokenized_inputs["labels"] = [labels2id[label] for label in examples['class']]
        return tokenized_inputs
    return tokenize_and_align_labels_

In [10]:
tokenized_datasets = corpus.map(tokenize_and_align_labels(tokenizer, labels2id), batched=True)

Loading cached processed dataset at C:\Users\Mn_Specter\.cache\huggingface\datasets\merionum___json\merionum--ru_paraphraser-e39dafb2b050eb83\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-8e96b3b831061d39.arrow
Loading cached processed dataset at C:\Users\Mn_Specter\.cache\huggingface\datasets\merionum___json\merionum--ru_paraphraser-e39dafb2b050eb83\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-8f5b8107551e462c.arrow


In [11]:
import torch
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, TrainingArguments, Trainer

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list), ignore_mismatched_sizes=True)
model.config.id2label = id2labels 
model.config.label2id = labels2id 

In [12]:
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)    
    return metric.compute(predictions=predictions, references=labels)

In [13]:
batch_size = 16
args = TrainingArguments(
    "paraphras",
    evaluation_strategy = "epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.05,
    save_strategy='no',
    report_to='none',
)

In [14]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: id_2, class, id_1, text_1, text_2, id. If id_2, class, id_1, text_1, text_2, id are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7227
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1356
  Number of trainable parameters = 559893507
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.716056,0.661123
2,0.755500,0.707279,0.689189
3,0.595800,0.694048,0.704262


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: id_2, class, id_1, text_1, text_2, id. If id_2, class, id_1, text_1, text_2, id are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1924
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: id_2, class, id_1, text_1, text_2, id. If id_2, class, id_1, text_1, text_2, id are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1924
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: id_2, class, id_1, text_1, text_2,

TrainOutput(global_step=1356, training_loss=0.6449485936347714, metrics={'train_runtime': 17351.254, 'train_samples_per_second': 1.25, 'train_steps_per_second': 0.078, 'total_flos': 1819183492473300.0, 'train_loss': 0.6449485936347714, 'epoch': 3.0})

In [19]:
import pandas as pd
example = tokenized_datasets["test"][:100]
tokens = tokenizer(example['text_1'], example['text_1'], padding=True, truncation=True, return_tensors='pt')
tokens = tokens.to('cpu')
with torch.no_grad():
    outputs = model(**tokens)
predicted = outputs.logits.argmax(dim=-1).cpu().numpy()
classes = [id2labels[id_label] for id_label in predicted]
df_example =pd.DataFrame({'text_1':example['text_1'], 'text_2':example['text_2'], 'class':example['class'], 'predict':classes})
df_example

Unnamed: 0,text_1,text_2,class,predict
0,Цены на нефть восстанавливаются,Парламент Словакии поблагодарил народы бывшего...,-1,1
1,"""Гоголь-центр"" покажет видеозапись скандальног...",Кехман запретил «Гоголь-центру» показывать вид...,-1,1
2,Агент: РФС вновь задерживает зарплату Фабио Ка...,СМИ: Агент Фабио Капелло грозится подать в суд...,-1,1
3,День Победы в Москве обещает выдаться облачным,Любляна отпразднует День Победы вместе с Москвой,-1,1
4,Посол РФ в США: Россия будет бороться с попытк...,Правительство запланировало заработать на лоте...,-1,1
...,...,...,...,...
95,День Победы в режиме онлайн,Парад в честь Дня Победы проходит в Донецке,-1,-1
96,В Индии боевики-маоисты захватили 200 человек,В Индии маоисты захватили в заложники 500 человек,-1,1
97,Боевики-маоисты взяли в заложники 200 человек ...,В Индии боевики-маоисты захватили 200 человек,1,1
98,Боевики-маоисты взяли в заложники 200 человек ...,В Индии маоисты захватили в заложники 500 человек,0,1
