In [2]:
!pip install datasets evaluate sacrebleu transformers



In [3]:
from datasets import load_dataset

dataset = load_dataset("kde4", lang1="ar", lang2="en")
dataset

Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 116239
    })
})

In [4]:
test_dataset = dataset['train'].train_test_split(test_size=.2)
val_dataset = test_dataset['test'].train_test_split(test_size=.5)
dataset['validation'] = val_dataset['train']
dataset['test'] = val_dataset['test']
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 116239
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 11624
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 11624
    })
})

In [5]:
from transformers import TFAutoModel, AutoTokenizer, AutoConfig

checkpoint = 't5-base'

old_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
AutoConfig.from_pretrained(checkpoint)

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
   

In [7]:
def create_inputs_labels(examples):
  inputs = [ex['ar'] for ex in examples['translation']]
  labels = [ex['en'] for ex in examples['translation']]

  return {'inputs': inputs, 'targets': labels}

cleaned_dataset = dataset.map(create_inputs_labels, batched=True, remove_columns=dataset['train'].column_names)
cleaned_dataset

Map:   0%|          | 0/116239 [00:00<?, ? examples/s]

Map:   0%|          | 0/11624 [00:00<?, ? examples/s]

Map:   0%|          | 0/11624 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 116239
    })
    validation: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 11624
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 11624
    })
})

In [8]:
tokenizer = old_tokenizer.train_new_from_iterator(cleaned_dataset['train']['inputs'] + cleaned_dataset['train']['targets'], 32128)

In [9]:
tokenizer.tokenize(dataset['train']['translation'][13333]['en'])

['▁<', '▁pattern', '▁>']

In [10]:
tokenizer.tokenize(dataset['train']['translation'][13333]['ar'])

['▁التشكيلة']

In [11]:
max_len = 128
def tokenize(examples):
  return tokenizer(examples['inputs'], text_target=examples['targets'], max_length=max_len, truncation=True)

In [12]:
tokenized_dataset = cleaned_dataset.map(tokenize, batched=True, remove_columns=cleaned_dataset['train'].column_names)
tokenized_dataset

Map:   0%|          | 0/116239 [00:00<?, ? examples/s]

Map:   0%|          | 0/11624 [00:00<?, ? examples/s]

Map:   0%|          | 0/11624 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 116239
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11624
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11624
    })
})

In [26]:
config = {
    'batch_size': 16,
    'epochs': 4
}

In [14]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [15]:
from transformers import DataCollatorForSeq2Seq

datacollator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors='tf')

In [27]:
train_ds = tokenized_dataset['train'].to_tf_dataset(
                                          shuffle=True,
                                          batch_size=config['batch_size'],
                                          collate_fn=datacollator,
                                          columns=tokenized_dataset['train'].column_names)
val_ds = tokenized_dataset['validation'].to_tf_dataset(
                                          shuffle=True,
                                          batch_size=config['batch_size'],
                                          collate_fn=datacollator,
                                          columns=tokenized_dataset['validation'].column_names)
test_ds = tokenized_dataset['test'].to_tf_dataset(
                                          shuffle=True,
                                          batch_size=config['batch_size'],
                                          collate_fn=datacollator,
                                          columns=tokenized_dataset['test'].column_names)

In [31]:
for i in test_ds.take(1):
  print(i)

{'input_ids': <tf.Tensor: shape=(16, 109), dtype=int64, numpy=
array([[ 4691,     1,     0, ...,     0,     0,     0],
       [26827,   142,   114, ...,     0,     0,     0],
       [  144,   789,  3212, ...,     0,     0,     0],
       ...,
       [19054,     1,     0, ...,     0,     0,     0],
       [  486,     1,     0, ...,     0,     0,     0],
       [ 3784,   320,  5744, ...,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(16, 109), dtype=int64, numpy=
array([[1, 1, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(16, 109), dtype=int64, numpy=
array([[ 1427,     1,  -100, ...,  -100,  -100,  -100],
       [25188,  5930,     1, ...,  -100,  -100,  -100],
       [ 3734,   449,  1154, ...,  -100,  -100,  -100],
       ...,
       [ 4761,   106,     1, ...,  -100,  -100,  -100],
       [  786,   

In [18]:
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [34]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

@tf.function(jit_compile=True)
def generate_with_xla(batch, mask):
    return model.generate(
        input_ids=batch,
        attention_mask=mask,
        max_new_tokens=128,
    )


def compute_metrics():
    all_preds = []
    all_labels = []

    for t in tqdm(test_ds):

        batch, mask, labels = t['input_ids'], t['attention_mask'], t['labels']
        predictions = generate_with_xla(batch, mask)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = labels.numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]
        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    result = metric.compute(predictions=all_preds, references=all_labels)
    return {"bleu": result["score"]}

In [20]:
from transformers import create_optimizer
import tensorflow as tf

num_train_steps = len(train_ds) * config['epochs']

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

In [28]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=config['epochs']
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x7d7991414910>

In [29]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=2
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7d7a0f526260>

In [None]:
compute_metrics()

  7%|▋         | 49/727 [08:40<52:17,  4.63s/it]  