# Подход 2
Тут будет использован finetuning готовой [модели](https://huggingface.co/Helsinki-NLP/opus-mt-ru-en)
В качестве примера был использован [ноутбук](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) от Hugging Face
* Была переделана обработка датасета, чтобы был нужный формат
* Были настроены гиперпараметры обучения (warmup, scheduler, смягчение лейблов — label smoothing)
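* Был настроен паддинг внутри батча (по умолчанию лейблы паддятся значением -100 — это индекс, который игнорируется при подсчёте лосса; здесь вместо него используется pad_token_id токенайзера)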

In [1]:
import os
# Prefer the copy of the dataset that lives in the local course checkout.
path_do_data = '../../datasets/Machine_translation_EN_RU/data.txt'
if not os.path.exists(path_do_data):
    # Fall back to fetching the file into the current working directory.
    print("Dataset not found locally. Downloading from github.")
    # -nc (no-clobber): wget skips the download when data.txt is already present.
    !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/Machine_translation_EN_RU/data.txt -nc
    path_do_data = './data.txt'

Dataset not found locally. Downloading from github.
File ‘data.txt’ already there; not retrieving.



In [2]:
# Hub identifier of the pretrained checkpoint; the tokenizer and the model are both loaded from it.
model_checkpoint = "Helsinki-NLP/opus-mt-ru-en"

STEP 1: Prepare_dataset

In [3]:
from typing import List
import pandas as pd
from datasets import Dataset, DatasetDict


def _split_borders(n_rows, ratio):
    """Return the row indices that delimit consecutive splits of `n_rows` rows."""
    borders = [0]
    cumulative = 0.0
    for share in ratio:
        cumulative += share
        # Summing floats drifts (0.7 + 0.2 == 0.8999999999999999), and a bare
        # int() would then cut a border one row short. Round the tiny error
        # away before truncating, and never run past the last row.
        borders.append(min(n_rows, int(round(n_rows * cumulative, 6))))
    return borders


def prepare_dataset(path_to_data: str, ratio: List[float] = (0.8, 0.15, 0.05)):
    """Read a tab-separated EN/RU corpus and cut it into train/validation/test.

    The file is expected to have no header and two tab-separated columns:
    the English sentence first, the Russian sentence second. Rows are split
    in file order (no shuffling) into consecutive slices.

    Args:
        path_to_data: Path to the tab-separated text file.
        ratio: Three non-negative fractions (train, validation, test) whose
            sum does not exceed 1. Any sequence of three numbers works.

    Returns:
        A DatasetDict with the keys 'train', 'validation' and 'test'; every
        dataset has a single 'translation' column holding
        {'en': ..., 'ru': ...} dicts.

    Raises:
        ValueError: If `ratio` does not contain exactly three fractions, has
            a negative entry, or sums to more than 1.
    """
    split_names = ('train', 'validation', 'test')
    if len(ratio) != len(split_names):
        raise ValueError(f"ratio must contain exactly {len(split_names)} fractions, got {len(ratio)}")
    if any(share < 0 for share in ratio) or sum(ratio) > 1 + 1e-9:
        raise ValueError(f"ratio must be non-negative fractions summing to at most 1, got {tuple(ratio)}")

    data = pd.read_csv(path_to_data, header=None, sep='\t')
    data.columns = ['en', 'ru']

    borders = _split_borders(data.shape[0], ratio)
    res = {}
    for name, start, stop in zip(split_names, borders, borders[1:]):
        split_data = data.iloc[start:stop]
        # Zipping the two columns avoids the per-row Series objects iterrows() builds.
        pairs = [{'en': en, 'ru': ru} for en, ru in zip(split_data['en'], split_data['ru'])]
        res[name] = Dataset.from_dict({'translation': pairs})
    return DatasetDict(res)


In [4]:
from datasets import load_dataset, load_metric


# Build the train/validation/test splits from the tab-separated corpus.
raw_datasets = prepare_dataset(path_do_data)
# SacreBLEU scorer; compute_metrics calls metric.compute() on decoded text.
metric = load_metric("sacrebleu")

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2500
    })
})

In [6]:
raw_datasets["train"][0]

{'translation': {'en': 'Cordelia Hotel is situated in Tbilisi, a 3-minute walk away from Saint Trinity Church.',
  'ru': 'Отель Cordelia расположен в Тбилиси, в 3 минутах ходьбы от Свято-Троицкого собора.'}}

In [7]:
import datasets
import random
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting len(), indexing with a list of row numbers,
            and a `features` mapping of column name to feature type.
        num_examples: How many different rows to show.

    Raises:
        AssertionError: If more rows are requested than the dataset holds.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no manual retry loop is needed
    # to keep the picked row numbers unique.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show readable class names instead of integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [8]:
show_random_elements(raw_datasets["train"])

Unnamed: 0,translation
0,"{'en': 'The property offers free parking.', 'ru': 'На территории гостевого дома обустроена бесплатная парковка.'}"
1,"{'en': 'Offering an outdoor pool and a fitness centre, Guest House Albatros is located in Dagomys. Free WiFi access is available.', 'ru': 'Гостевой дом «Альбатрос» находится в поселке Дагомыс. К услугам гостей открытый бассейн, фитнес-центр и бесплатный WiFi.'}"
2,"{'en': 'Parking near the property is free.', 'ru': 'Неподалеку от апартаментов есть бесплатная парковка.'}"
3,"{'en': 'Berggasthof Haldenhof offers a sauna and regional cuisine.', 'ru': 'В гостевом доме Berggasthof Haldenhof вы сможете воспользоваться сауной и отведать блюда региональной кухни.'}"
4,"{'en': 'The 3-star Hotel Ladenmühle offers spacious, comfortably furnished rooms with modern amenities, including free wireless internet access and lovely views.', 'ru': 'К услугам гостей 3-звездочного отеля Ladenmühle просторные комфортабельные номера с живописным видом и современными удобствами, включая бесплатный Wi-Fi.'}"


Используется стандартный токенайзер и модель

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer published together with the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[902, 7680, 573, 2, 21, 537, 2674, 25, 171, 144, 4008, 6287, 56, 0], [1089, 471, 2674, 21, 2674, 13, 3876, 537, 402, 144, 4008, 6287, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [11]:
# Token-count caps that preprocess_function passes as max_length (with truncation=True).
max_input_length = 128
max_target_length = 128
# Translation direction: keys into each {'en': ..., 'ru': ...} sentence pair.
source_lang = "ru"
target_lang = "en"

In [12]:
# Text prepended to every source sentence; left empty here.
prefix = ""


def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs plus labels.

    `examples["translation"]` is a list of dicts keyed by language code.
    Source sentences are encoded with the tokenizer as-is; target sentences
    are encoded inside `tokenizer.as_target_tokenizer()`, and their token ids
    are stored under the "labels" key of the returned encoding.
    """
    source_texts = []
    target_texts = []
    for pair in examples["translation"]:
        source_texts.append(prefix + pair[source_lang])
        target_texts.append(pair[target_lang])

    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Switch the tokenizer into target mode only while encoding the references.
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(target_texts, max_length=max_target_length, truncation=True)["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [13]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[1110, 2390, 367, 29508, 3345, 3934, 19488, 6, 47980, 2, 6, 165, 3732, 1004, 6706, 426, 3118, 58, 17808, 70, 11, 29595, 70, 53, 14700, 57, 193, 3952, 41, 3, 0], [49, 608, 95, 260, 1304, 15412, 4290, 7033, 1089, 918, 1798, 14450, 9656, 6405, 818, 17330, 20088, 21, 20603, 41, 4324, 7, 16, 141, 12215, 11, 15071, 3, 324, 1001, 388, 6987, 1002, 84, 12780, 2045, 2799, 7, 43070, 6, 5054, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[19713, 20981, 3934, 23312, 34, 25144, 10, 45870, 2, 13, 165, 11, 32721, 6441, 1449, 65, 15697, 53405, 16788, 3, 0], [552, 18492, 20148, 27540, 68, 10485, 136, 7923, 33, 85, 1052, 13, 598, 11, 19543, 4438, 14556, 2, 2318, 1405, 2, 8, 13, 43729, 6399, 3, 0]]}

In [14]:
# Tokenize every split; batched=True hands preprocess_function a batch of examples per call.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/40 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [15]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq weights that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [16]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1

In [17]:
def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the number of trainable parameters, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 76,147,712 trainable parameters


In [18]:
# Per-device batch size, shared by training and evaluation.
batch_size = 30
# Last path component of the hub id, e.g. "opus-mt-ru-en".
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    # Where checkpoints are written.
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    # Optimisation schedule.
    num_train_epochs=15,
    learning_rate=5e-5,
    warmup_steps=1000,
    lr_scheduler_type="cosine_with_restarts",
    weight_decay=0.01,
    label_smoothing_factor=0.01,
    # Batching and hardware.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=6,
    fp16=True,
    # Evaluate and checkpoint once per epoch, keeping at most five checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # No metric_for_best_model is given, so the Trainer's default criterion
    # decides which checkpoint counts as "best" — TODO confirm that is intended.
    load_best_model_at_end=True,
    # Run generate() during evaluation so text metrics can be computed.
    predict_with_generate=True,
)

In [19]:
# NOTE(review): labels are padded with the tokenizer's real pad id instead of
# the collator's default of -100. -100 is the value the loss is expected to
# skip, so with this override padded label positions presumably count towards
# the training/eval loss — confirm this is intended. Reverting to the default
# would also require masking -100 before preds.label_ids is decoded later in
# the notebook.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model, 
    label_pad_token_id=tokenizer.pad_token_id
) # pads inputs and labels within each batch

In [20]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Each reference is additionally wrapped in a one-element list.
    Returns the pair (stripped_predictions, wrapped_references).
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Score generated token ids against reference ids.

    Args:
        eval_preds: Pair (predictions, label_ids). Predictions may arrive
            wrapped in a tuple, in which case the first element is used.

    Returns:
        Dict with "bleu" (the "score" field of the metric result) and
        "gen_len" (mean number of non-pad tokens per prediction), both
        rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id and cannot be decoded. Labels may contain it,
    # and so may predictions when batches of different lengths are concatenated,
    # so map it to the pad id in both arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length is counted after the -100 cleanup so filler never inflates it.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [21]:
# Wire the model, training arguments, tokenized splits, collator and metric callback together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [22]:
# Run the fine-tuning loop; the logged run below took about two hours (train_runtime ≈ 7391 s).
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40000
  Num Epochs = 15
  Instantaneous batch size per device = 30
  Total train batch size (w. parallel, distributed & accumulation) = 30
  Gradient Accumulation steps = 1
  Total optimization steps = 20010


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7500
  Batch size = 30
Saving model checkpoint to opus-mt-ru-en-finetuned-ru-to-en/checkpoint-1334
Configuration saved in opus-mt-ru-en-finetuned-ru-to-en/checkpoint-1334/config.json
Model weights saved in opus-mt-ru-en-finetuned-ru-to-en/checkpoint-1334/pytorch_model.bin
tokenizer config file saved in opus-mt-ru-en-finetuned-ru-to-en/checkpoint-1334/tokenizer_config.json
Special tokens file saved in opus-mt-ru-en-finetuned-ru-to-en/checkpoint-1334/special_tokens_map.json
Deleting older checkpoint [opus-mt-ru-en-finetuned-ru-to-en/checkpoint-6670] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored:

TrainOutput(global_step=20010, training_loss=0.4568776904672816, metrics={'train_runtime': 7391.2026, 'train_samples_per_second': 81.178, 'train_steps_per_second': 2.707, 'total_flos': 1.063842450112512e+16, 'train_loss': 0.4568776904672816, 'epoch': 15.0})

In [23]:
# Generate translations for the held-out test split; the result carries predictions, label_ids and metrics.
preds = trainer.predict(tokenized_datasets["test"])

The following columns in the test set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2500
  Batch size = 30


In [24]:
preds.metrics

{'test_loss': 0.6019531488418579,
 'test_bleu': 39.3273,
 'test_gen_len': 21.0808,
 'test_runtime': 95.8882,
 'test_samples_per_second': 26.072,
 'test_steps_per_second': 0.876}

In [25]:
# Trainer outputs can carry -100 (the ignore index) as filler. It is not a real
# token id, so map it to the pad id before decoding, as compute_metrics does for labels.
generated_text = tokenizer.batch_decode(
    np.where(preds.predictions != -100, preds.predictions, tokenizer.pad_token_id),
    skip_special_tokens=True,
)
original_text = tokenizer.batch_decode(
    np.where(preds.label_ids != -100, preds.label_ids, tokenizer.pad_token_id),
    skip_special_tokens=True,
)

In [26]:
# Sample without replacement so the same translation is never printed twice,
# and cap the sample size for test sets smaller than ten sentences.
random_idx = np.random.choice(
    len(generated_text), size=min(10, len(generated_text)), replace=False
)
for idx in random_idx:
    print(f"generated: {generated_text[idx]}\noriginal: {original_text[idx]}")

generated: Set in the centre of Kovachevitsa, Pension Ristevata offers a garden with free barbecue facilities and free luggage storage.
original: Ristevata Guest House enjoys a central location in Kovachevitsa and offers a garden with free barbecue facilities and free luggage storage.
generated: Guests can visit the on-site restaurant bar with a private beach, just 300 metres away.
original: A bar restaurant, featuring its own private beach area, is just 300 metres away.
generated: There is a 24-hour front desk at the property.
original: The front desk is available 24/7.
generated: Set in Brasov, this apartment features a balcony with mountain views.
original: Set in Braşov, this apartment features a balcony with mountains views.
generated: Apartment Volguntes Street is located in a quiet residential green area of Riga, 5 km from the city centre.
original: Apartment Volguntes Street is housed in a quiet and green residential district of Riga, within 5 km from the city centre.
generated

По семплам видно, что перевод прекрасный. Текст хорошо читается. Из явных минусов — поскольку модель работает на уровне subword-токенов (SentencePiece), в именах собственных встречаются искажения (например, Braşov → Brasov).
Что касается метрик — они великолепные (BLEU ≈ 39.3 на тестовой выборке). Впрочем, это объясняется тем, что сама по себе предобученная модель уже с первой же итерации показывает отличный скор.