<a href="https://colab.research.google.com/github/HaywhyCoder/english-yoruba-translator/blob/main/English_Yoruba_translator_mT5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### English → Yoruba Translator

#### Import Libraries

In [None]:
# ML Libraries
import pandas as pd
import numpy
import numpy as np
from sklearn.model_selection import train_test_split

# Huggingface, NLP
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from sacrebleu import corpus_bleu
import torch


#### Load Datasets

In [None]:
# Read the Yoruba/English sentence pairs from the Kaggle input directory
pairs_csv_path = "/kaggle/input/yoruba-english-pair/JW300_en-yo"
data = pd.read_csv(pairs_csv_path)
data.head()

Unnamed: 0,yoruba,english
0,Lílo Àkàbà — Ǹjẹ́ O Máa Ń Ṣe Àyẹ̀wò Wọ̀nyí Tó ...,Using Ladders — Do You Make These Safety Checks?
1,Látọwọ́ akọ̀ròyìn Jí!,By Awake!
2,ní Ireland,correspondent in Ireland
3,PAUL fẹ́ pààrọ̀ gílóòbù iná tó wà lóde ilé ẹ̀.,PAUL needed to change a bulb in an outside lig...
4,Ó tún fẹ́ nu àwọn fèrèsé pẹ̀tẹ́ẹ̀sì lọ́wọ́ ita...,He also needed to clean the outside upstairs w...


In [None]:
# Size of the raw frame as (rows, columns)
data.shape

(474988, 2)

In [None]:
# Count the rows whose English (source) text is missing
missing_english = data['english'].isna().sum()
print(missing_english)

12502


In [None]:
# Remove every pair in which either the source or the target text is missing
data.dropna(axis=0, how="any", inplace=True)
data.shape

(459871, 2)

In [None]:
# Force the Yoruba column to strings so that len() can be applied to every value in the next cell
data['yoruba'] = data["yoruba"].astype('str')

In [None]:
# Keep only Yoruba sentences longer than 10 characters, then draw a
# reproducible 5000-row sample with a fresh 0..n-1 index.
new_data = (
    data.loc[data['yoruba'].str.len() > 10]
    .sample(n=5000, random_state=12, ignore_index=True)
)
new_data.head()

Unnamed: 0,yoruba,english
0,[ Àpótí tó wà ní ojú ìwé 6 ],[ Box on page 6 ]
1,"Fọ́tò tí èmi, Eddie àti Bobby yà láìpẹ́ yìí",With Eddie and Bobby recently
2,"Bí àpẹẹrẹ, Ọ̀bọ ń ṣiṣẹ́, ìnàkí ń jẹ ẹ́, èyí tó...","For example, Monkey works, baboon eats, means ..."
3,[ Àwòrán tó wà ní ojú ìwé 7 ],[ Picture on page 7 ]
4,"Bí ọ̀rọ̀ ṣe máa ń rí lára wa yàtọ̀ síra, a sì ...",People have different temperaments and levels ...


In [None]:
# Rename the columns by position: first column -> 'yor', second column -> 'en'
new_data.columns = ['yor', 'en']

In [None]:
# Hold out 20% of the sample for validation, then carve 10% of what is left off as the test set
remaining_data, val_data = train_test_split(new_data, test_size=0.2, random_state=42)
train_data, test_data = train_test_split(remaining_data, test_size=0.1, random_state=42)
print(*(frame.shape for frame in (train_data, val_data, test_data)))

(3600, 2) (1000, 2) (400, 2)


In [None]:
# Wrap the three pandas splits in a Hugging Face DatasetDict (the pandas index is not kept)
dataset1 = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (('train', train_data), ('val', val_data), ('test', test_data))
})
dataset1

DatasetDict({
    train: Dataset({
        features: ['yor', 'en'],
        num_rows: 3600
    })
    val: Dataset({
        features: ['yor', 'en'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['yor', 'en'],
        num_rows: 400
    })
})

In [None]:
# Load en-yor dataset from huggingface
# The loaded DatasetDict is bound to the name `datasets`; the following cell reads its 'train' split.
datasets = load_dataset("jonathansuru/en_yor")
datasets

README.md:   0%|          | 0.00/553 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/304k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/290k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6644 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1544 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1558 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 6644
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1544
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1558
    })
})

In [None]:
# Convert the hub dataset's 'train' split to a pandas dataframe for processing.
# Only this split is converted here; it is re-split by position in the cells below.
b = datasets['train'].to_pandas()

In [None]:
# Positional boundaries: the first 80% of rows form the train set,
# and 80% of the remaining rows form the validation set (the rest is test).
total_rows = len(b)
train_index = int(0.8 * total_rows)
val_index = train_index + int(0.8 * (total_rows - train_index))

In [None]:
# Split the dataset
# Convert the 'translation' column to a Python list once and reuse it for all
# three splits, rather than once per comprehension.
translation_pairs = b['translation'].tolist()


def _pairs_to_columns(pairs):
    """Turn a list of {'en': ..., 'yor': ...} records into a dict of column lists."""
    return {'en': [item['en'] for item in pairs], 'yor': [item['yor'] for item in pairs]}


# Slices follow row order: [0, train_index) -> train, [train_index, val_index) -> val, rest -> test
train_dict = _pairs_to_columns(translation_pairs[:train_index])
val_dict = _pairs_to_columns(translation_pairs[train_index:val_index])
test_dict = _pairs_to_columns(translation_pairs[val_index:])
datasets2 = DatasetDict({"train": Dataset.from_dict(train_dict),
                         "val": Dataset.from_dict(val_dict),
                         "test": Dataset.from_dict(test_dict)})
datasets2

DatasetDict({
    train: Dataset({
        features: ['en', 'yor'],
        num_rows: 5315
    })
    val: Dataset({
        features: ['en', 'yor'],
        num_rows: 1063
    })
    test: Dataset({
        features: ['en', 'yor'],
        num_rows: 266
    })
})

In [None]:
# Empty DatasetDict that will receive the concatenated train/val/test splits
new_datasets = DatasetDict({})

In [None]:
# Merge the two corpora split by split (CSV-derived dataset1 first, hub-derived datasets2 second)
for split_name in ('train', 'val', 'test'):
    new_datasets[split_name] = concatenate_datasets([dataset1[split_name], datasets2[split_name]])
new_datasets

DatasetDict({
    train: Dataset({
        features: ['yor', 'en'],
        num_rows: 8915
    })
    val: Dataset({
        features: ['yor', 'en'],
        num_rows: 2063
    })
    test: Dataset({
        features: ['yor', 'en'],
        num_rows: 666
    })
})

In [None]:
# Load the mT5 model and tokenizer
# Both objects are loaded from the same checkpoint name.
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Pick CUDA when it is available, otherwise stay on the CPU, and move the model there
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [None]:
def preprocess_function(examples):
  """Tokenize a batch of English/Yoruba pairs for seq2seq fine-tuning.

  Args:
    examples: batch mapping with an 'en' list (source sentences) and a
      'yor' list (target sentences) of equal length.

  Returns:
    The tokenizer output for the English sentences ('input_ids',
    'attention_mask'), plus a 'labels' entry holding the Yoruba token ids.
    Every sequence is truncated/padded to 128 tokens. Padding positions in
    'labels' are set to -100 so they are ignored by the loss rather than
    being treated as targets the model must predict.
  """
  # The empty prefix is a no-op (kept from the original code).
  inputs = ["" + example for example in examples['en']]
  model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
  target_ids = tokenizer(examples['yor'], max_length=128, truncation=True, padding="max_length").input_ids

  # Mask the padding in the targets: -100 is the ignore index of the loss.
  pad_id = tokenizer.pad_token_id
  model_inputs["labels"] = [
      [token_id if token_id != pad_id else -100 for token_id in sequence]
      for sequence in target_ids
  ]
  return model_inputs

In [None]:
# Tokenize every split in batches and drop the raw text columns, keeping only the model inputs
raw_text_columns = new_datasets['train'].column_names
preprocessed_dataset = new_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_text_columns,
)
preprocessed_dataset

Map:   0%|          | 0/8915 [00:00<?, ? examples/s]

Map:   0%|          | 0/2063 [00:00<?, ? examples/s]

Map:   0%|          | 0/666 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8915
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2063
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 666
    })
})

In [None]:
# Batch collator for seq2seq training.
# - padding=True pads each batch to its longest sequence, so inputs of
#   different lengths can be batched; sequences that already share one length
#   pass through unchanged.
# - label_pad_token_id=-100 makes any label padding added by the collator
#   invisible to the loss instead of counting as a target token.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                       model=model,
                                       padding=True,
                                       label_pad_token_id=-100)

In [None]:
def postprocess_text(preds, labels):
  """Strip surrounding whitespace from decoded predictions and references.

  Args:
    preds: list of decoded prediction strings.
    labels: list of decoded reference strings.

  Returns:
    A tuple ``(preds, labels)``: the stripped predictions as a flat list, and
    the stripped references wrapped in one single-element list each.
  """
  cleaned_preds = []
  for text in preds:
    cleaned_preds.append(text.strip())

  wrapped_labels = []
  for text in labels:
    wrapped_labels.append([text.strip()])

  return cleaned_preds, wrapped_labels

In [None]:
# AdamW over all model parameters; passed to the trainer in place of its default optimizer
custom_optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-4, eps=1e-8, weight_decay=0.01)

In [None]:
def compute_metrics(eval_preds):
  """Compute corpus-level BLEU for generated translations.

  Args:
    eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
      itself be a tuple, in which case its first element is used.

  Returns:
    ``{"bleu": score}`` where ``score`` is the sacreBLEU corpus score.
  """
  preds, labels = eval_preds

  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is a masking value, not a vocabulary id, so it cannot be decoded.
  # Replace it with the pad token in both arrays before decoding.
  pad_id = tokenizer.pad_token_id
  preds = np.where(preds != -100, preds, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  # sacrebleu.corpus_bleu expects references as a list of reference *streams*,
  # each stream holding one reference per hypothesis: [[ref_1, ..., ref_N]].
  # postprocess_text returns one single-item list per hypothesis
  # ([[ref_1], ..., [ref_N]]), so transpose it; passing it unchanged would not
  # score the hypotheses one-to-one against their own references.
  if decoded_labels and isinstance(decoded_labels[0], str):
    reference_streams = [list(decoded_labels)]
  else:
    reference_streams = [list(stream) for stream in zip(*decoded_labels)]

  # Calculate BLEU score
  bleu = corpus_bleu(decoded_preds, reference_streams).score

  return {"bleu": bleu}

In [None]:
# Fine-tuning configuration: evaluate once per epoch and score with generated
# text (predict_with_generate=True) so that compute_metrics receives token ids.
training_args = Seq2SeqTrainingArguments(
    output_dir='./my_yor_translation_model',
    eval_strategy='epoch',
    learning_rate=3e-4,
    weight_decay=.01,
    num_train_epochs=10,
    save_total_limit=1,
    predict_with_generate=True,
    # Let evaluation generate up to the same 128-token limit used when
    # tokenizing the references; without this the generation length falls back
    # to the model's default setting, which can cut predictions short.
    generation_max_length=128,
    lr_scheduler_type='cosine',
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=10,
    report_to='none'
)

# The optimizer is supplied explicitly; the scheduler slot is None, leaving
# scheduler creation to the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_dataset['train'],
    eval_dataset=preprocessed_dataset['val'],
    data_collator=data_collator,
    processing_class=tokenizer,
    optimizers=(custom_optimizer, None),
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,1.2453,1.164943,30.213754
2,1.1906,1.008712,35.930411
3,1.0188,0.931542,52.331757
4,0.8729,0.87814,46.713798
5,0.8589,0.848673,56.234133
6,0.6889,0.827404,79.527073
7,0.7427,0.81027,43.472087
8,0.689,0.805836,28.117066
9,0.6821,0.806482,79.527073
10,0.6486,0.80722,79.527073


TrainOutput(global_step=11150, training_loss=0.900098719062292, metrics={'train_runtime': 3976.6176, 'train_samples_per_second': 22.419, 'train_steps_per_second': 2.804, 'total_flos': 1.1784508735488e+16, 'train_loss': 0.900098719062292, 'epoch': 10.0})

In [None]:
# Run a final evaluation pass with the trained model and keep the returned metrics
eval_results = trainer.evaluate()

In [None]:
# Show the final evaluation metrics
print(eval_results)

{'eval_loss': 0.8072203397750854, 'eval_bleu': 79.52707287670508, 'eval_runtime': 89.2929, 'eval_samples_per_second': 23.104, 'eval_steps_per_second': 2.889, 'epoch': 10.0}


In [None]:
# Write the fine-tuned model to a local directory
trainer.save_model("./mt5-finetuned-en-yor")

In [None]:
# Take the first five raw test pairs and show each English source next to its Yoruba reference
first_five_rows = new_datasets['test'][:5]
sample_text = first_five_rows['en']
sample_text_target = first_five_rows['yor']
for source, target in zip(sample_text, sample_text_target):
  print(f"Source: {source} \nTarget: {target}\n")

Source: I had to make changes, to try to manifest my feelings more, in order for my friends to understand what I was feeling and to feel closer to me.” 
Target: Mo ní láti ṣe àwọn ìyípadà, kí n lè túbọ̀ máa sọ tinú mi jáde, káwọn ọ̀rẹ́ mi bàa lè mọ ohun tó wà lọ́kàn mi kí ọwọ́ wa sì wọ ọwọ́.”

Source: Explain how different scriptures can help a young person to get a balanced view of employment. 
Target: Ṣàlàyé bí onírúurú ẹsẹ Ìwé Mímọ́ ṣe lè ran ọ̀dọ́ kan lọ́wọ́ láti ní èrò tó yẹ nípa iṣẹ́ oúnjẹ òòjọ́?

Source: Nevertheless, it favors the rich and the powerful over the poor and the needy. 
Target: Àmọ́ àwọn tó lówó àtàwọn tó nípò ló ń jàǹfààní ẹ̀, kò dé ọ̀dọ̀ àwọn mẹ̀kúnnù àtàwọn akúṣẹ̀ẹ́.

Source: And what a powerful reason that is for displaying loving - kindness in our speech as well as in other aspects of our daily life! 
Target: Ìdí pàtàkì sì lèyí jẹ́ fún wa láti máa fi inú - rere - onífẹ̀ẹ́ hàn nínú ọ̀rọ̀ tó ń tẹnu wa jáde àti nínú bá a ṣe ń gbé ìgbé ayé wa lójoojúmọ́!

Source: F

#### Test the model

In [None]:
# Tokenize the sample sentences with the same max_length/padding settings used in preprocessing
inputs = tokenizer(
    ["" + example for example in sample_text],
    max_length=128,
    return_tensors='pt',
    truncation=True,
    padding="max_length",
).to(device)

# Decoding settings for this qualitative check
generation_options = dict(
    max_length=128,
    num_beams=3,
    penalty_alpha=0.6,
    length_penalty=-2.0,
    early_stopping=True,
    repetition_penalty=2.0,
)
translated = model.generate(**inputs, **generation_options)
translations = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Print each source sentence next to the model's translation
for source, target in zip(sample_text, translations):
  print(f"Source: {source} \nTranslation: {target}\n")

Source: I had to make changes, to try to manifest my feelings more, in order for my friends to understand what I was feeling and to feel closer to me.” 
Translation: Mo ní láti máa ṣe àtúnṣe sí i, kí n lè mọ ohun tí mo bá ń sọ̀rọ̀ yìí, kí n sì túbọ̀ mọ ohun tí mo wà lọ́kàn mi.”

Source: Explain how different scriptures can help a young person to get a balanced view of employment. 
Translation: Ṣàlàyé bí àwọn Ìwé Mímọ́ ṣe lè ràn wá lọ́wọ́ láti jẹ́ kí ọ̀dọ́ kan nínú iṣẹ́ ìwàásù.

Source: Nevertheless, it favors the rich and the powerful over the poor and the needy. 
Translation: Síbẹ̀síbẹ̀, ó jẹ́ kí ọ̀pọ̀lọpọ̀ èèyàn àti òṣùwọ̀n tó nílò.

Source: And what a powerful reason that is for displaying loving - kindness in our speech as well as in other aspects of our daily life! 
Translation: Àti pé ìdí tí ó jẹ́ kí nǹkan ṣe pàtàkì jù lọ fún wa nínú ọ̀rọ̀ Ọlọ́run, bí a ṣe ń fi ìfẹ́ tó wà nínú ayé wa!

Source: For areas under direct royal administration, tax brokers or chief tax collectors ​ — we

In [None]:
def translate_en(text):
    """Translate one English text with the fine-tuned model.

    Args:
        text: the input to translate; it is converted with ``str()`` first.

    Returns:
        The list produced by ``tokenizer.batch_decode`` for the generated
        ids (one decoded string per generated sequence).
    """
    prompt = "" + str(text)
    encoded = tokenizer(
        [prompt],
        max_length=128,
        return_tensors='pt',
        truncation=True,
        padding="max_length",
    ).to(device)
    generated_ids = model.generate(**encoded, max_length=100, penalty_alpha=0.5, length_penalty=-2.0)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
# Quick single-sentence check; translate_en returns a list of decoded strings
translation = translate_en("A global epidemic of hate")
translation

['Àrùn elétò ìkórìíra']