# **Fine-tuning mBART50 for En-Vi Machine Translation**

In [3]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
from tqdm import tqdm
from datasets import load_dataset
import numpy as np
import evaluate
import matplotlib.pyplot as plt

import torch
from transformers import (
    AutoTokenizer,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    pipeline,
    DefaultFlowCallback
)
import sacrebleu
import warnings

warnings.filterwarnings("ignore")

## **Dataset**

In [13]:
from datasets import load_dataset

ds = load_dataset("thainq107/iwslt2015-en-vi")

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [14]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [15]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## **Tokenizer**

In [16]:
from transformers import AutoTokenizer

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
len(tokenizer)

250054

## **Encoding**

In [18]:
import torch

MAX_LEN = 75

def preprocess_function(examples):
    input_ids = tokenizer(
        examples["en"], padding='max_length', truncation=True, max_length=MAX_LEN
        )['input_ids']

    labels = tokenizer(
        examples["vi"], padding='max_length', truncation=True, max_length=MAX_LEN
        )['input_ids']
    labels = [
        [-100 if item == tokenizer.pad_token_id else item for item in label]
        for label in labels]

    return {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels)
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [19]:
preprocessed_ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu',
 'input_ids': [250004,
  127055,
  66937,
  13,
  152,
  581,
  41664,
  50155,
  10,
  153552,
  10336,
  2256,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [250004,
  67766,
  2546,
  218877,
  858,
  889,
  10037,
  6248,
  1893,
  17964,
  42254,
  2,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -10

## **Model**

In [11]:
from transformers import AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [20]:
model

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

## **Evaluate**

In [21]:
import numpy as np
import evaluate
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds= np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )

    labels= np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )

    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds, decoded_labels
    )

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## **Trainer**

In [None]:
# Disable wandb
import os
os.environ['WANDB_DISABLED'] = 'true'

# # Use wandb
# import wandb
# wandb.init(
#     project="en-vi-machine-translation",
#     name="mbart50" #
# )

In [None]:
output_dir = "./en-vi-mbart50"

training_args = Seq2SeqTrainingArguments(
    output_dir="./en-vi-mbart50",
    logging_strategy="steps",
    logging_steps=100,
    predict_with_generate=True,
    eval_strategy="steps",
    eval_steps=5000,
    save_steps=5000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=1,
    num_train_epochs=1,
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=4,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Bleu
5000,1.2695,1.369765,31.797854
10000,1.1996,1.298862,33.623493
15000,1.1621,1.237235,34.600268


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=16665, training_loss=1.2288790639012632, metrics={'train_runtime': 9598.7033, 'train_samples_per_second': 13.889, 'train_steps_per_second': 1.736, 'total_flos': 2.11607841263616e+16, 'train_loss': 1.2288790639012632, 'epoch': 1.0})

In [22]:
tokenizer.save_pretrained('nhutan410/en-vi-mbart50')
model.save_pretrained('nhutan410/en-vi-mbart50')

In [None]:
# model.push_to_hub("nhutan410/en-vi-mbart50")
# tokenizer.push_to_hub("nhutan410/en-vi-mbart50")

In [None]:
# trainer.push_to_hub(token="...")

## **Inference**

In [23]:
model_name = "nhutan410/en-vi-mbart50"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [24]:
translator = pipeline("translation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


### **Greedy Search**

In [25]:
translated_text = translator("I go to school", src_lang="en_XX", tgt_lang="vi_VN", num_beams=1)
translated_text

[{'translation_text': 'Tôi đi học'}]

In [26]:
pred_sentences = []
for text in ds["test"]["en"]:
    output = translator(
        text,
        max_length=MAX_LEN,
        num_beams=1,
        do_sample=False,
        src_lang="en_XX",
        tgt_lang="vi_VN"
    )
    pred_sentences.append(output[0]["translation_text"])

references = [[ref] for ref in ds["test"]["vi"]]
bleu_score = sacrebleu.corpus_bleu(pred_sentences, references, force=True)
print(f"BLEU Score (greedy): {bleu_score.score:.2f}")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Your input_length: 71 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 77 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 78 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 74 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 72 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 70 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. t

BLEU Score (greedy): 47.26


### **Beam search**

In [27]:
translated_text = translator("In the next step, we consider the next possible tokens for each of the three branches we created in the previous step.", src_lang="en_XX", tgt_lang="vi_VN", num_beams=2)
translated_text

[{'translation_text': 'Trong bước tiếp theo, chúng tôi xem xét các token có thể tiếp theo cho mỗi một trong ba nhánh mà chúng tôi tạo ra trong bước trước.'}]

In [28]:
pred_sentences_beam = []

pred_sentences_beam = []
for text in ds["test"]["en"]:
    output = translator(
        text,
        max_length=MAX_LEN,
        num_beams=5,
        src_lang="en_XX",
        tgt_lang="vi_VN"
    )
    pred_sentences_beam.append(output[0]["translation_text"])


bleu_score_beam = sacrebleu.corpus_bleu(pred_sentences_beam, references, force=True)
print(f"BLEU Score (beam search): {bleu_score_beam.score:.2f}")

Your input_length: 71 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 77 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 78 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 74 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 72 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 70 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 105 is bigger than 0.9 * max_length: 75. You might consid

BLEU Score (beam search): 52.71


In [29]:
print(f"BLEU Score (greedy): {bleu_score.score:.2f}")
print(f"BLEU Score (beam search): {bleu_score_beam.score:.2f}")

BLEU Score (greedy): 47.26
BLEU Score (beam search): 52.71
