# Translation using a fine-tuned Hugging Face transformer

Dataset -> [KDE4](https://huggingface.co/datasets/kde4)

English to French translation

In [None]:
!pip install datasets

# Load Dataset

In [71]:
from datasets import load_dataset
# "en-fr" selects the English–French configuration of the KDE4 corpus.
raw_datasets = load_dataset("kde4", "en-fr")

In [72]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [73]:
raw_datasets['train'][0]

{'id': '0', 'translation': {'en': 'Lauri Watts', 'fr': 'Lauri Watts'}}

In [74]:
# Keep 70% of the corpus as "train"; the remaining 30% lands in "test".
# The seed is fixed so the split is the same on every run.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.7, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 147121
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 63052
    })
})

In [75]:
# Rebuild the 70% subset from `raw_datasets` instead of reading it back out of
# `split_datasets`: the original `split_datasets = split_datasets[...]` form
# shrank the data a little more every time this cell was re-run. Both seeds
# are fixed, so the resulting 60/40 split is the same as before.
train_subset = raw_datasets["train"].train_test_split(train_size=0.7, seed=20)["train"]
split_datasets = train_subset.train_test_split(train_size=0.60, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 88272
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 58849
    })
})

Rename the 'test' key to 'validation'.

In [76]:
# Move the held-out split under the "validation" key. The guard makes the cell
# safe to re-run: once "test" has been popped, a second run is a no-op rather
# than a KeyError.
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")

# Preprocessing the text

In [77]:
!pip install sentencepiece



In [78]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# `return_tensors` belongs to the tokenizer *call*, not to `from_pretrained`;
# passing it here had no effect on the encodings (the sample encoded further
# down comes back as plain Python lists), so it is dropped.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Let's process one sample

In [79]:
# Pull one English/French pair and tokenize it the way training will:
# English as the input, French passed through `text_target` to become labels.
sample_pair = split_datasets["train"][1]["translation"]
en_sentence = sample_pair["en"]
fr_sentence = sample_pair["fr"]

inputs = tokenizer(en_sentence, text_target=fr_sentence)
inputs

{'input_ids': [28944, 19335, 21300, 12, 267, 23104, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'labels': [24226, 8918, 28155, 16572, 51, 17, 8, 5759, 226, 13536, 0]}

In [80]:
# Tokenize the French sentence *without* `text_target`, then show its tokens
# first and the tokens of the properly produced labels second.
wrong_targets = tokenizer(fr_sentence)
for token_ids in (wrong_targets["input_ids"], inputs["labels"]):
    print(tokenizer.convert_ids_to_tokens(token_ids))

['▁Supp', 'r', '▁É', 'dition', '▁Met', 't', 're', '▁à', '▁la', '▁cor', 'b', 'eille', '</s>']
['▁Sup', 'pr', '▁Édition', '▁Mettr', 'e', '▁à', '▁la', '▁cor', 'b', 'eille', '</s>']


Note: why is the first tokenization, i.e. ['▁Supp', 'r', '▁É', 'dition', '▁Met', 't', 're', '▁à', '▁la', '▁cor', 'b', 'eille', '</s>'], wrong? Because without `text_target` the tokenizer processes the sentence as English (source-language) input, so the French words are split as if they were English words.

Using the English tokenizer to preprocess a French sentence results in more tokens (13 instead of 11 in the example above), since the tokenizer doesn’t know any French words (except those that also appear in the English language, like “discussion”).

In [81]:
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/French translation pairs.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of dicts
            with ``"en"`` and ``"fr"`` keys.

    Returns:
        The tokenizer's output for the batch: English sentences are the
        inputs, French sentences are passed as ``text_target``, and both are
        truncated to ``max_length``.
    """
    source_texts, target_texts = [], []
    for pair in examples["translation"]:
        source_texts.append(pair["en"])
        target_texts.append(pair["fr"])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

# Drop the original columns so only the tokenizer's outputs remain.
original_columns = split_datasets["train"].column_names
tokenized_datasets = split_datasets.map(
    preprocess_function,
    remove_columns=original_columns,
    batched=True,
)

Map:   0%|          | 0/88272 [00:00<?, ? examples/s]

Map:   0%|          | 0/58849 [00:00<?, ? examples/s]

In [82]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 88272
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 58849
    })
})

In [83]:
print(tokenized_datasets['train'][0])

{'input_ids': [1436, 3196, 2939, 0], 'attention_mask': [1, 1, 1, 1], 'labels': [1436, 3196, 2939, 0]}


# Fine tuning using trainer API

In [84]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [85]:
from transformers import DataCollatorForSeq2Seq

# The model is handed to the collator as well as the tokenizer; the collated
# batch shown further down contains `decoder_input_ids` in addition to the
# tokenized fields.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [86]:
batch = data_collator([tokenized_datasets["train"][0]])

In [87]:
batch

{'input_ids': tensor([[1436, 3196, 2939,    0]]), 'attention_mask': tensor([[1, 1, 1, 1]]), 'labels': tensor([[1436, 3196, 2939,    0]]), 'decoder_input_ids': tensor([[59513,  1436,  3196,  2939]])}

The data collator is also responsible for preparing the decoder input IDs, which are shifted versions of the labels with a special token at the beginning.

SacreBLEU score

In [None]:
!pip install sacrebleu

In [None]:
!pip install evaluate

In [90]:
import evaluate

metric = evaluate.load("sacrebleu")

In [91]:
# One hypothesis scored against a single reference. `references` holds one
# list of reference strings per prediction, hence the nested list.
hypothesis = "This plugin lets you translate web pages between several languages automatically."
reference = "This plugin allows you to automatically translate web pages between several languages."

predictions = [hypothesis]
references = [[reference]]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

This gets a BLEU score of 46.75, which is rather good — for reference, the original Transformer model in the “Attention Is All You Need” paper achieved a BLEU score of 41.8 on a similar translation task between English and French!

In [92]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for a batch of predicted translations.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. The
            predictions may arrive as a tuple, in which case only its first
            element is used.

    Returns:
        ``{"bleu": score}``, where ``score`` is the ``"score"`` field of the
        metric result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. Labels always use it; predictions can contain it too when
    # generated sequences are padded to a common length, so sanitize both.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace and wrap every label in its
    # own list, since the metric takes a list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [93]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

In [96]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    # Plain string: the original f-string had no placeholders.
    "marian-finetuned-kde4-en-to-fr",
    # No evaluation inside the training loop; `trainer.evaluate` is called
    # explicitly before and after training instead.
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Evaluate on generated sequences so `compute_metrics` can decode them.
    predict_with_generate=True,
    push_to_hub=True,
)

In [97]:
from transformers import Seq2SeqTrainer

# Every argument is passed by keyword so the wiring is explicit.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [98]:
# Baseline: score the pretrained checkpoint on the validation split before
# any fine-tuning (slow: the recorded run took about 4,477 s).
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.693158507347107,
 'eval_bleu': 39.87447785467704,
 'eval_runtime': 4477.1512,
 'eval_samples_per_second': 13.144,
 'eval_steps_per_second': 0.205}

In [99]:
trainer.train()

Step,Training Loss
500,1.4098
1000,1.2339
1500,1.1853
2000,1.1306
2500,1.1042
3000,1.0472
3500,0.9951
4000,0.9767
4500,0.9592
5000,0.9494


TrainOutput(global_step=8277, training_loss=1.0191470776777185, metrics={'train_runtime': 3299.3491, 'train_samples_per_second': 80.263, 'train_steps_per_second': 2.509, 'total_flos': 5262106747207680.0, 'train_loss': 1.0191470776777185, 'epoch': 3.0})

In [100]:
# Same evaluation again, now on the fine-tuned weights, for a before/after
# comparison of loss and BLEU.
trainer.evaluate(max_length=max_length)

{'eval_loss': 0.9387052655220032,
 'eval_bleu': 51.149111127651565,
 'eval_runtime': 4445.3014,
 'eval_samples_per_second': 13.238,
 'eval_steps_per_second': 0.207,
 'epoch': 3.0}

In [101]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")

events.out.tfevents.1703953963.881584b5ebbb.1603.3:   0%|          | 0.00/407 [00:00<?, ?B/s]

'https://huggingface.co/neural-net-rahul/marian-finetuned-kde4-en-to-fr/tree/main/'

In [102]:
from transformers import pipeline

# Replace this with your own checkpoint.
# Stored under its own name: rebinding `model_checkpoint` here would make any
# re-run of the earlier tokenizer/model cells silently load the fine-tuned
# repo instead of the base "Helsinki-NLP/opus-mt-en-fr" checkpoint.
finetuned_checkpoint = "neural-net-rahul/marian-finetuned-kde4-en-to-fr"
translator = pipeline("translation", model=finetuned_checkpoint)

model.safetensors:   0%|          | 0.00/299M [00:00<?, ?B/s]



# Validation

In [108]:
translator('A member of the wealthy South African Musk family, Elon was born in Pretoria and briefly attended the University of Pretoria.')
# Expected -> Membre de la riche famille sud-africaine Musk, Elon est né à Pretoria et a brièvement fréquenté l'Université de Pretoria.

[{'translation_text': "Membre de la riche famille Musk sud-africaine, Elon est né à Pretoria et a brièvement étudié à l'université de Pretoria."}]

In [107]:
translator('In April 2019, Musk, through Emo G Records, released a rap track, RIP Harambe, on SoundCloud.')
# Expected -> En avril 2019, Musk, via Emo G Records, a sorti un morceau de rap, "RIP Harambe", sur SoundCloud.

[{'translation_text': 'En avril 2019, Musk, via Emo G Records, a lancé une piste de rap, RIP Harambe, sur SoundCloud.'}]