# Лабораторная работа 3. NLP

**Выполнил**: Подцепко И.С., уч. группа M4138.

In [2]:
import torch
import evaluate
import numpy as np

from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    IntervalStrategy,
)

In [3]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

mps


### opus-mt-en-ru

* source languages: en
* target languages: ru
*  OPUS readme: [en-ru](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-ru/README.md)

*  dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download original weights: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.eval.txt)

## Benchmarks

| testset               | BLEU  |
|-----------------------|-------|
| newstest2012.en.ru 	| 31.1 	|
| newstest2013.en.ru 	| 23.5 	|
| newstest2015-enru.en.ru 	| 27.5 	|
| newstest2016-enru.en.ru 	| 26.4 	|
| newstest2017-enru.en.ru 	| 29.1 	|
| newstest2018-enru.en.ru 	| 25.4 	|
| newstest2019-enru.en.ru 	| 27.1 	|
| Tatoeba.en.ru 	| 48.4 	|

In [4]:
checkpoint = "Helsinki-NLP/opus-mt-en-ru"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

In [14]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

Выбранный набор данных: https://huggingface.co/datasets/gmnlp/tico19

In [10]:
dataset = load_dataset("gmnlp/tico19", "en-ru")  # COVID-19
dataset = concatenate_datasets([dataset["test"], dataset["validation"]])

In [30]:
dataset.shape

(3071, 8)

In [31]:
dataset[0]

{'sourceLang': 'en',
 'targetlang': 'ru',
 'sourceString': ' about how long have these symptoms been going on? ',
 'targetString': 'о том, как долго присутствуют эти симптомы?',
 'stringID': 'CMU_1:1',
 'url': 'http://www.speech.cs.cmu.edu/haitian/text/1600_medical_domain_sentences.en',
 'license': 'public',
 'translator_id': 'User 03'}

In [9]:
MAX_LENGTH = 384

In [11]:
def preprocess_function(examples):
    return tokenizer(
        examples["sourceString"],
        text_target=examples["targetString"],
        max_length=MAX_LENGTH,
        truncation=True,
    )


tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=dataset.column_names
)

In [33]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3071
})

In [34]:
train_test_split = tokenized_dataset.train_test_split(
    test_size=0.05, seed=42, shuffle=True
)

train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

In [35]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2917
})

In [36]:
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 154
})

In [37]:
EXAMPLE_ID = 0

print(tokenizer.decode(test_dataset[EXAMPLE_ID]["input_ids"], skip_special_tokens=True))
print(tokenizer.decode(test_dataset[EXAMPLE_ID]["labels"], skip_special_tokens=True))

The Czech Republic and Slovakia banned going out in public without wearing a mask or covering one's nose and mouth.
В Чешской Республике и Словакии гражданам запрещено выходить на улицу без масок, закрывающих нос и рот.


In [38]:
metric = evaluate.load("sacrebleu")

In [39]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [40]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [5]:
def translate(input: str):
    tokenized = tokenizer(
        input,
        max_length=MAX_LENGTH,
        return_tensors="pt",
        truncation=True,
    )

    generated_ids = model.generate(
        input_ids=tokenized.input_ids.to(device),
        attention_mask=tokenized.attention_mask.to(device),
        max_length=MAX_LENGTH,
        num_beams=4,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [42]:
translate("I love you")

'Я люблю тебя.'

In [None]:
translate("Alcohol-based sanitizer")  # Дезинфицирующее средство на спиртовой основе

'Санитарный спирт на основе алкоголя'

In [44]:
predictions = []
references = []


def compute_metrics(eval_predictions, compute_result=False):
    global predictions, references

    predicted_ids, label_ids = eval_predictions
    if isinstance(predicted_ids, tuple):
        predicted_ids = predicted_ids[0]

    label_ids = label_ids.cpu().numpy()
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    batch_predictions, batch_references = postprocess_text(
        tokenizer.batch_decode(predicted_ids, skip_special_tokens=True),
        tokenizer.batch_decode(label_ids, skip_special_tokens=True),
    )

    if not compute_result:
        predictions.extend(batch_predictions)
        references.extend(batch_references)
        return {}

    bleu = metric.compute(predictions=predictions, references=references)["score"]

    predictions, references = [], []
    return {"BLEU": bleu}

In [45]:
import logging


class RemoveTrainerTokenizerSpam(logging.Filter):
    def filter(self, record):
        return not record.getMessage().startswith("Trainer.tokenizer")


logging.getLogger("transformers.trainer").addFilter(RemoveTrainerTokenizerSpam())

In [46]:
training_args = Seq2SeqTrainingArguments(
    output_dir="fine-tuning",
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    logging_strategy=IntervalStrategy.EPOCH,
    learning_rate=2e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    save_total_limit=3,
    predict_with_generate=True,
    lr_scheduler_type="linear",
    num_train_epochs=5,
    batch_eval_metrics=True,
    eval_on_start=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
0,No log,1.457265,24.237839
1,1.428200,1.232249,28.449169
2,1.197000,1.186939,30.21753
3,1.082700,1.171463,31.213094
4,1.010400,1.163533,31.325812
5,0.965500,1.161022,31.152773




TrainOutput(global_step=610, training_loss=1.1367431015264793, metrics={'train_runtime': 387.6228, 'train_samples_per_second': 37.627, 'train_steps_per_second': 1.574, 'total_flos': 315975663747072.0, 'train_loss': 1.1367431015264793, 'epoch': 5.0})

In [54]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
torch.save(
    model.state_dict(), "./drive/MyDrive/Colab Notebooks/fine-tuned-opus-mt-en-ru.pth"
)

In [6]:
model.load_state_dict(
    torch.load("./fine-tuned-opus-mt-en-ru.pth", weights_only=True, map_location=device)
)

<All keys matched successfully>

In [12]:
translate("I love you")

'Я люблю тебя'

In [13]:
translate("Alcohol-based sanitizer")  # Дезинфицирующее средство на спиртовой основе

'Дезинфицирующее средство на спиртовой основе'

**Выводы**:
1. Исходная модель `Helsinki-NLP/opus-mt-en-ru` для перевода с русского языка на английский имеет высокую метрику BLEU на наборе данных OPUS en-ru, однако недостаточно хорошо подходит для специфичных задач.
2. Набор данных `gmnlp/tico19` содержит небольшое количество хороших данных, собранных из медицинских журналов в период эпидемии Covid-19.
3. Fine tuning модели `Helsinki-NLP/opus-mt-en-ru` на наборе данных `gmnlp/tico19` позволяет получить значение метрики BLEU = 31.15, что является высоким результатом.