<a href="https://colab.research.google.com/github/Mahdi-Golizadeh/Natural-Language-Processing/blob/main/transformers/translators/translation_en_fr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building an EN-FR Translator with Transformers
In this notebook I will build a transformer baes translator by fine-tuning an already existed model

## Install & Import Necessary libraries
Since I will use huggingface transformers library it is not installed by default in google colab so I first install them 

In [24]:
!pip install -q datasets
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q sacrebleu
!pip install -q evaluate
!pip install -q accelerate
!pip install -q sacremoses

In [25]:
import datasets
import transformers
import evaluate
import numpy as np
import accelerate
import torch
from tqdm.auto import tqdm

## Dataset

kde4 dataset is used which contains many more different language pairs of sentences

In [26]:
raw_datasets = datasets.load_dataset("kde4", lang1= "en", lang2= "fr")



  0%|          | 0/1 [00:00<?, ?it/s]

the dataset is a little big and it doesn't have validation split

In [27]:
split_datasets = raw_datasets["train"].select(range(10000)).train_test_split(train_size= .9, seed= 49)

In [28]:
split_datasets["validation"] = split_datasets.pop("test")

##Preprocessin Data
to feed the data into the model we first need to tokenize it

In [29]:
checkpoint = "Helsinki-NLP/opus-mt-en-fr"

In [30]:
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint, src_lang= "en", tgt_lang= "fr", return_tensors= "pt")

In [31]:
max_length= 128
def preprocess(example):
    inputs = [ex["en"] for ex in example["translation"]]
    targets = [ex["fr"] for ex in example["translation"]]
    model_inputs = tokenizer(inputs, text_target= targets, max_length= max_length, truncation= True)

    return model_inputs

In [32]:
tokenized_datasets = split_datasets.map(preprocess, batched= True, remove_columns= split_datasets["train"].column_names)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [33]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

## Model Selection

In [34]:
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

## Data Collation
for the purpose of dynamic padding and taking care of required process in training

In [35]:
data_collator = transformers.DataCollatorForSeq2Seq(
    model= model,
    tokenizer= tokenizer,
)

In [36]:
tokenized_datasets.set_format("torch")

## Preparing Dataset For Custom Loop

In [37]:
train_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["train"],
    shuffle= True,
    collate_fn= data_collator,
    batch_size= 8
)

In [38]:
eval_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["validation"],
    collate_fn= data_collator,
    batch_size= 8
)

## Preparing Model for Custom Loop

In [39]:
optimizer = torch.optim.AdamW(model.parameters(), lr= 2e-5)

In [40]:
accl = accelerate.Accelerator()

In [41]:
model, optimizer, train_dataloader, eval_dataloader = accl.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [42]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_train_steps = num_train_epochs * num_update_steps_per_epoch

In [43]:
lr_sch = transformers.get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps= 0,
    num_training_steps= num_train_steps,
)

In [44]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens= True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens= True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    return decoded_preds, decoded_labels

## Evaluation Metric

In [45]:
metric = evaluate.load("sacrebleu")
def compute_metric(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(
        preds, skip_special_tokens= True,
    )
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens= True,)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    result = metric.compute(
        predictions= decoded_preds,
        references= decoded_labels,
    )
    return {"BLEU": result["score"]}

## Training Loop

In [46]:
progress_bar = tqdm(range(num_train_steps))
for epoch in range(num_train_epochs):
    # train loop
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accl.backward(loss)
        optimizer.step()
        lr_sch.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # validation loop
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accl.unwrap_model(model).generate(batch["input_ids"], attention_mask= batch["attention_mask"], max_length= 128)
        labels = batch["labels"]
        generated_tokens = accl.pad_across_processes(
            generated_tokens, dim= 1, pad_index= tokenizer.pad_token_id
        )
        labels = accl.pad_across_processes(labels, dim= 1, pad_index= -100)
        predictions_gathered = accl.gather(generated_tokens)
        labels_gathered = accl.gather(labels)
        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(
            predictions= decoded_preds,
            references= decoded_labels
        )
    #evaluating and saving the model
    result = metric.compute()
    print(f"epoch {epoch}, BLEU score: {result['score']:.2f}")
    accl.wait_for_everyone()
    unwrapped_model = accl.unwrap_model(model)
    unwrapped_model.save_pretrained("fine-tuned-en-fr",
                                    save_function= accl.save)
    if accl.is_main_process:
        tokenizer.save_pretrained("fine-tuned-en-fr")

  0%|          | 0/3375 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

epoch 0, BLEU score: 52.06


  0%|          | 0/125 [00:00<?, ?it/s]

epoch 1, BLEU score: 53.24


  0%|          | 0/125 [00:00<?, ?it/s]

epoch 2, BLEU score: 53.69


## Using the Fine-Tuned Model

In [47]:
translator = transformers.pipeline("translation_en_to_fr", model= "/content/fine-tuned-en-fr")
translator("I'm done!")

[{'translation_text': "J'ai fini."}]