In [1]:
!pip install transformers datasets sacrebleu sentencepiece evaluate sacremoses matplotlib accelerate torch



In [2]:
import torch
import numpy as np
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    MT5Tokenizer,
    MT5ForConditionalGeneration
)
from datasets import load_dataset, Dataset as HFDataset, concatenate_datasets
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [3]:
from itertools import islice

LANGS_NLLB = {
    "en": "eng_Latn",
    "fr": "fra_Latn",
    "it": "ita_Latn",
    "de": "deu_Latn",
    "es": "spa_Latn",
    "ro": "ron_Latn"
}

def load_and_limit_dataset(dataset_name, config, limit=25_000, streaming=True, field="translation"):
    if isinstance(config, dict):
        dataset = load_dataset(
            dataset_name,
            streaming=streaming,
            split="train",
            trust_remote_code=True,
            **config,
        )
    else:
        dataset = load_dataset(
            dataset_name,
            config,
            streaming=streaming,
            split="train",
            trust_remote_code=True
        )

    dataset = dataset.remove_columns([c for c in dataset.column_names if c != field])
    limited = list(islice(dataset, limit))
    return HFDataset.from_list(limited)

def load_and_limit_tatoeba(src_lang, tgt_lang="ro", limit=25_000):
    return load_and_limit_dataset(
        "tatoeba",
        {
            "lang1": src_lang,
            "lang2": tgt_lang
        },
        limit=limit,
        streaming=False,
    )

def load_and_limit_open_subtitles(src_lang, tgt_lang="ro", limit=25_000):
    return load_and_limit_dataset(
        "open_subtitles",
        {
            "lang1": src_lang,
            "lang2": tgt_lang
        },
        limit=limit,
        streaming=True,
    )

def load_and_limit_nllb(src_lang, tgt_lang="ro", limit=25_000):
    src_lang = LANGS_NLLB[src_lang]
    tgt_lang = LANGS_NLLB[tgt_lang]

    return load_and_limit_dataset(
        "allenai/nllb",
        f"{src_lang}-{tgt_lang}",
        limit=limit,
        streaming=True,
    )

def load_and_limit_ccmatrix(src_lang, tgt_lang="ro", limit=25_000):
    return load_and_limit_dataset(
        "yhavinga/ccmatrix",
        f"{src_lang}-{tgt_lang}",
        limit=limit,
        streaming=True,
    )

In [4]:
def reformat(dataset, src_lang, is_nllb=False):
    tgt_lang = "ro"
    if is_nllb:
        src_lang = LANGS_NLLB[src_lang]
        tgt_lang = LANGS_NLLB[tgt_lang]

    def map_fn(example):
        return {
            "src": f'<{src_lang}> {example["translation"][src_lang]}',
            "trg": example["translation"][tgt_lang]
        }
    return dataset.map(map_fn, remove_columns=dataset.column_names)

src_langs = ["en", "fr", "it", "de", "es"]
limit = 5_000

processed_ds = []
for lg in src_langs:
    ccmatrix = load_and_limit_ccmatrix(lg, limit=limit)
    tatoeba = load_and_limit_tatoeba(lg, limit=limit)
    open_subtitles = load_and_limit_open_subtitles(lg, limit=limit)
    nllb = load_and_limit_nllb(lg, limit=limit)

    processed_ds.append(reformat(ccmatrix, lg))
    processed_ds.append(reformat(tatoeba, lg))
    processed_ds.append(reformat(open_subtitles, lg))
    processed_ds.append(reformat(nllb, lg, True))

combined_dataset = concatenate_datasets(processed_ds)

Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 5000/5000 [00:00<00:00, 63004.78 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 63412.47 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 62959.20 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 61699.45 examples/s]
Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 5000/5000 [00:00<00:00, 60485.29 examples/s]
Map: 100%|██████████| 1965/1965 [00:00<00:00, 62276.11 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 63543.32 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 61621.68 examples/s]
Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 5000/5000 [00:00<00:00, 60589.44 examples/s]
Map: 100%|██████████| 1018/1018 [00:00<00:00, 54130.34 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 64421.51 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 65248.90 example

In [5]:
print(len(combined_dataset))
print(combined_dataset[0])

87113
{'src': '<en> We might call them the words of "unforgiveness."', 'trg': 'Le putem numi cuvintele „ne-iertării”.'}


In [6]:
shuffled_dataset = combined_dataset.shuffle(seed=42)

train_val_split = combined_dataset.train_test_split(test_size=0.25, seed=42)
train_val = train_val_split["train"]
test_hf_dataset = train_val_split["test"]

train_val_split_2 = train_val.train_test_split(test_size=0.25, seed=42)
train_hf_dataset = train_val_split_2["train"]
val_hf_dataset = train_val_split_2["test"]

print(f"Train size: {len(train_hf_dataset)}")
print(f"Validation size: {len(val_hf_dataset)}")
print(f"Test size: {len(test_hf_dataset)}")


Train size: 49000
Validation size: 16334
Test size: 21779


In [7]:
model_name = "google/mt5-small"

model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = MT5Tokenizer.from_pretrained(model_name)

model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [8]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["src"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["trg"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset = train_hf_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_hf_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_hf_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 49000/49000 [00:06<00:00, 7314.44 examples/s]
Map: 100%|██████████| 16334/16334 [00:02<00:00, 7637.10 examples/s]
Map: 100%|██████████| 21779/21779 [00:03<00:00, 7178.15 examples/s]


In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_translation",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    predict_with_generate=True,
    generation_max_length=128,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

metric = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels (default ignore index)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )
    return {"bleu": result["bleu"]}

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [12]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.82 GiB. GPU 0 has a total capacity of 11.99 GiB of which 0 bytes is free. Of the allocated memory 25.02 GiB is allocated by PyTorch, and 123.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.evaluate()