In [44]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import numpy as np
import evaluate

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [45]:
data = {
    "en": [
        "Hello, how are you?",
        "Good morning!",
        "I love machine learning.",
        "Where is the library?",
        "This is a test.",
        "What is your name?",
        "Have a nice day.",
        "I am learning French.",
        "The weather is nice today.",
        "See you tomorrow."
    ],
    "fr": [
        "Bonjour, comment ça va?",
        "Bonjour!",
        "J'adore l'apprentissage automatique.",
        "Où est la bibliothèque?",
        "Ceci est un test.",
        "Quel est ton nom?",
        "Bonne journée.",
        "J'apprends le français.",
        "Il fait beau aujourd'hui.",
        "À demain."
    ]
}

dataset = Dataset.from_dict(data)

In [46]:
def preprocess(batch):
    inputs = tokenizer(batch["en"], truncation=True)
    labels = tokenizer(batch["fr"], truncation=True)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map: 100%|█████████████████████████████| 10/10 [00:00<00:00, 1019.79 examples/s]


In [47]:
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    print("pred",preds)
    print("labels",labels)
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    return {"bleu": result["score"]}


# BASELINE

In [48]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=1,
    save_strategy="no",
    learning_rate=5e-5,
    fp16=False,
    predict_with_generate=True
)

trainerb = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainerb.evaluate()

  trainerb = Seq2SeqTrainer(


pred [[59513  8703     2 ... 59513 59513 59513]
 [59513  8703   291 ... 59513 59513 59513]
 [59513   234     6 ... 59513 59513 59513]
 ...
 [59513   234     6 ... 59513 59513 59513]
 [59513    60   398 ... 59513 59513 59513]
 [59513    84  7097 ... 59513 59513 59513]]
labels [[ 3982 10243     2 ... 59513 59513 59513]
 [ 3982 10243   145 ... 59513 59513 59513]
 [  234     6  5916 ... 59513 59513 59513]
 ...
 [  234     6  4786 ... 59513 59513 59513]
 [  104  5901   567 ... 59513 59513 59513]
 [   49 14188     5 ... 59513 59513 59513]]


{'eval_loss': 4.388981819152832,
 'eval_model_preparation_time': 0.0012,
 'eval_bleu': 45.45958956990602,
 'eval_runtime': 0.7765,
 'eval_samples_per_second': 12.878,
 'eval_steps_per_second': 2.576}

# Train

In [49]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=1,
    save_strategy="no",
    learning_rate=5e-5,
    fp16=False,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
1,5.7858
2,0.0
3,0.0
4,0.0
5,0.0


TrainOutput(global_step=5, training_loss=1.157162857055664, metrics={'train_runtime': 1.3244, 'train_samples_per_second': 7.551, 'train_steps_per_second': 3.775, 'total_flos': 16949182464.0, 'train_loss': 1.157162857055664, 'epoch': 1.0})

In [50]:
trainer.evaluate()

pred [[59513     0 59513 ... 59513 59513 59513]
 [59513     0 59513 ... 59513 59513 59513]
 [59513     0 59513 ... 59513 59513 59513]
 ...
 [59513     0 59513 ... 59513 59513 59513]
 [59513     0 59513 ... 59513 59513 59513]
 [59513     0 59513 ... 59513 59513 59513]]
labels [[ 3982 10243     2 ... 59513 59513 59513]
 [ 3982 10243   145 ... 59513 59513 59513]
 [  234     6  5916 ... 59513 59513 59513]
 ...
 [  234     6  4786 ... 59513 59513 59513]
 [  104  5901   567 ... 59513 59513 59513]
 [   49 14188     5 ... 59513 59513 59513]]


{'eval_loss': nan,
 'eval_bleu': 0.0,
 'eval_runtime': 0.2621,
 'eval_samples_per_second': 38.153,
 'eval_steps_per_second': 7.631,
 'epoch': 1.0}