In [None]:
!pip install evaluate

In [None]:
!pip install datasets

In [None]:
from transformers import TFAutoModelForSeq2SeqLM

In [None]:
from transformers import AdamWeightDecay

In [None]:
import numpy as np

In [None]:
import evaluate

In [None]:
from datasets import load_dataset

In [None]:
from transformers import AutoTokenizer

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
# Load the "arinzeo/indo-eng" dataset through datasets.load_dataset.
data = load_dataset("arinzeo/indo-eng")

In [None]:
# Keep only the "train" split; it is re-split into train/test below.
train_dataset = data["train"]

In [None]:
# Preview the first five rows.
train_dataset[:5]

In [None]:
# Total number of examples before the train/test split.
len(train_dataset)

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts so metrics are comparable between runs.
data_load = train_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Pretrained checkpoint name; used for the tokenizer here and for the model
# loaded later in the notebook.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Dataset column holding the source / target sentences, and the task prefix
# that preprocess_function prepends to every source sentence.
source_lang, target_lang = "indo", "eng"
prefix = "translate Indonesia to English: "


def preprocess_function(examples):
    """Tokenize one batch of source/target sentence pairs.

    Every sentence in the ``source_lang`` column gets ``prefix`` prepended and
    is tokenized as the model input; the sentences in the ``target_lang``
    column are passed as ``text_target``. Both sides are truncated to at most
    128 tokens.

    Args:
        examples: Batch mapping column names to lists of sentences.

    Returns:
        The tokenizer's output for the batch.
    """
    prefixed_sources = [prefix + sentence for sentence in examples[source_lang]]
    references = list(examples[target_lang])
    return tokenizer(
        prefixed_sources,
        text_target=references,
        max_length=128,
        truncation=True,
    )

In [None]:
# Tokenize both splits; batched=True hands preprocess_function a batch of
# examples per call instead of a single row.
tokenized_data = data_load.map(preprocess_function, batched=True)

Map:   0%|          | 0/28989 [00:00<?, ? examples/s]

Map:   0%|          | 0/7248 [00:00<?, ? examples/s]

In [None]:
# Collator used when batching the tokenized examples; returns TensorFlow tensors.
# NOTE(review): `model` receives the checkpoint name (a string), not a model
# object — confirm this is what the collator expects.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=checkpoint,
    return_tensors="tf",
)

In [None]:
# SacreBLEU metric object; compute_metrics calls metric.compute on decoded text.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of one-element lists, each holding the stripped
        reference for the prediction at the same position.
    """
    return (
        [hypothesis.strip() for hypothesis in preds],
        [[reference.strip()] for reference in labels],
    )

In [None]:
def compute_metrics(eval_preds):
    """Compute SacreBLEU and the mean generated length for a set of predictions.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. ``preds`` may
            also be a tuple, in which case only its first element is used.

    Returns:
        dict with ``"bleu"`` (the ``score`` field of the SacreBLEU result) and
        ``"gen_len"`` (mean number of non-pad tokens per prediction), both
        rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id. It appears in the
    # labels and can also appear in predictions when batches of different
    # lengths are concatenated, so replace it with the pad id on both sides
    # before decoding and before counting generated tokens.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length of each prediction, ignoring padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Optimizer handed to model.compile below.
optimizer = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.01,
)

In [None]:
# Load the TensorFlow seq2seq model for the same checkpoint as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# Build the TensorFlow datasets. Training data is shuffled, evaluation data is
# kept in order; both share the same batch size and collator.
shared_dataset_args = {"batch_size": 8, "collate_fn": data_collator}

tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"], shuffle=True, **shared_dataset_args
)
tf_test_set = model.prepare_tf_dataset(
    tokenized_data["test"], shuffle=False, **shared_dataset_args
)

In [None]:
# Compile for Keras training with the optimizer defined above and an accuracy metric.
# NOTE(review): no `loss` is passed — presumably the model falls back to its
# built-in loss; confirm.
model.compile(optimizer=optimizer, metrics=["accuracy"])

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

In [None]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)

In [None]:
callbacks = [metric_callback]

In [None]:
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=5)

Epoch 1/10
Epoch 2/10
Epoch 3/10