In [1]:
from datasets import load_dataset

books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.05)
books

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 120730
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 6355
    })
})

In [2]:
from transformers import AutoTokenizer

checkpoint = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [3]:
source_lang = "fr"
target_lang = "en"
prefix = "translate french to english: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [4]:
tokenized_books = books.map(preprocess_function, batched=True)

Map: 100%|██████████| 120730/120730 [00:37<00:00, 3204.29 examples/s]
Map: 100%|██████████| 6355/6355 [00:01<00:00, 3198.34 examples/s]


In [5]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

2023-08-08 20:36:54.095851: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-08 20:36:54.152806: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
import evaluate

metric = evaluate.load("sacrebleu")

In [7]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [8]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_opus_books_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=55,
    per_device_eval_batch_size=55,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

preds = trainer.predict(tokenized_books["test"])

In [9]:
preds

PredictionOutput(predictions=array([[59513,  1149,  8130, ..., 59513, 59513, 59513],
       [59513, 18696, 27651, ..., 59513, 59513, 59513],
       [59513,  1149,  8130, ..., 59513, 59513, 59513],
       ...,
       [59513, 18696, 27651, ..., 59513, 59513, 59513],
       [59513,  1149,  8130, ..., 59513, 59513, 59513],
       [59513,  1149,  8130, ..., 59513, 59513, 59513]]), label_ids=array([[22572,     9,     4, ..., 59513, 59513, 59513],
       [  488,    21, 22866, ..., 59513, 59513, 59513],
       [   58, 43835,     2, ..., 59513, 59513, 59513],
       ...,
       [   47,  3406,  2895, ..., 59513, 59513, 59513],
       [ 2314,  8254,   954, ..., 59513, 59513, 59513],
       [  995,  4630,   444, ..., 59513, 59513, 59513]]), metrics={'test_loss': 2.624154806137085, 'test_bleu': 19.8333, 'test_gen_len': 33.0647, 'test_runtime': 404.483, 'test_samples_per_second': 15.711, 'test_steps_per_second': 0.287})

In [10]:
tokenizer.batch_decode(preds[0], skip_special_tokens=True)

['English translation to english: Towards the end of the intermission, our friend left us, and, to return to the entrance of the trailer, was forced to cross a group that had invaded the track and in the middle of which we suddenly saw Jasmin Delouche.',
 'translate french to english: Tonight we can only perform our last duties to our poor friend."',
 'English translation to english: -- Two, Monsignor.',
 'French translation to english: Was it alone to represent France in this mysterious association, obviously composed of individuals of various nationalities?',
 'English translation to english: He ends a very touching little speech with simple words, but whose effect was only better assured.',
 'English translation to english: The distance still prevented us from distinguishing the colors of his flame, which floated like a thin ribbon.',
 'English translation to english: Mason will neither brave me nor harm me voluntarily; but, without wanting it, he can, by a word said too lightly, de