# Imports

In [None]:
import os
import random

import keras
import keras_nlp
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
from transformers import AdamWeightDecay, AutoTokenizer, DataCollatorForSeq2Seq, TFAutoModelForSeq2SeqLM

# Preprocess Dataframe

In [2]:
# Load the parallel English/Arabic corpus from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/transla/translation_train.csv', encoding ='utf-8')
# Preview the first ten rows to sanity-check the two text columns.
df.head(10)

Unnamed: 0,English,Arabic
0,I have been dealt four aces.,لقد رميت اربع اوراق من نوع أس.
1,rejoice over the verdict.,ابتهاج نتيجة اصدار الحكم
2,today the people of south africa marched in su...,اليوم شعب جنوب افريقيا يدعم الفلسطينيين الذين ...
3,earlier this month the ministry of health decl...,وقد صرحت وزارة الصحة باكرا هذا الشهر ان اكثر م...
4,sharek posted this video titled sharek partici...,نشر هذا الفيديو تحت عنوان شارك داعيا الشعب للا...
5,When was the castle built?,متى بُنيت القلعة؟
6,ha it s nice to be known as cool and calm it s...,اه من الجيد ان اكون معروفة باني هادية ومرتاحة ...
7,egypt muslim brotherhood trial postponed globa...,مصر تاجيل محاكمة اعضاء في الاخوان المسلمين الا...
8,I love rock.,أعشق موسيقى الروك.
9,This happened for a reason.,حدث هذا لسبب.


In [3]:
# (rows, columns) of the full corpus before splitting.
df.shape

(23406, 2)

Define a `create_text_pairs` function that returns a list of text pairs, where each pair contains an English sentence and its Arabic translation.

In [4]:
def create_text_pairs(dataframe):
    """Build a list of (English, Arabic) sentence pairs from a DataFrame.

    Rows are paired by position, so the result does not depend on the
    DataFrame's index labels. This makes the function safe to call on
    shuffled or sliced frames (for example the output of
    ``train_test_split``) without resetting the index first.

    Args:
        dataframe: DataFrame with an 'English' and an 'Arabic' column.

    Returns:
        A list of ``(english, arabic)`` tuples, one per row, in row order.
        An empty DataFrame yields an empty list.
    """
    # zip walks both columns positionally. The earlier ``column[i]`` lookup
    # was label-based and raised KeyError whenever the index was not 0..n-1.
    return list(zip(dataframe['English'].tolist(), dataframe['Arabic'].tolist()))

In [5]:
# Preview only the first few pairs; displaying the whole list would dump
# every row of the corpus into the cell output.
create_text_pairs(df)[:5]

[('I have been dealt four aces.', 'لقد رميت اربع اوراق من نوع أس.'),
 ('rejoice over the verdict.', 'ابتهاج نتيجة اصدار الحكم'),
 ('today the people of south africa marched in support of the palestinians that are suffering under the occupation nelson mandela once said we know too well that our freedom is incomplete without the freedom of the palestinians so today we stood up to be the voice of the voiceless july.',
  'اليوم شعب جنوب افريقيا يدعم الفلسطينيين الذين يعانون في ظل الاحتلال قال مانديلا مرة نحن نعلم جيدا ان حريتنا ناقصة من دون حرية الفلسطينيين نقفن اليوم لنعطي صوت لمن لا صوت لهم يوليو تموز'),
 ('earlier this month the ministry of health declared that last year more than people lost their lives because of air pollution in tehran iran s capital.',
  'وقد صرحت وزارة الصحة باكرا هذا الشهر ان اكثر من شخصا قد لقى حتفه بسبب تلوث الهواء في العاصمة الايرانية طهران'),
 ('sharek posted this video titled sharek participate calling upon people to join the marchers on friday.',
  'نشر هذا 

# Preprocess Model

Load the tokenizer of the pre-trained `Helsinki-NLP/opus-mt-en-ar` (MarianMT) English-to-Arabic translation model.

In [6]:
# Tokenizer matching the Helsinki-NLP/opus-mt-en-ar checkpoint loaded below.
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-ar')



Create a data collator for sequence-to-sequence training, based on the `Helsinki-NLP/opus-mt-en-ar` tokenizer.

In [7]:
# Pads inputs and labels per batch and returns TensorFlow tensors.
# The `model` argument expects a model *object*, not a checkpoint name; the
# string that used to be passed here was ignored by the collator, so it is
# omitted (the model itself is only loaded in a later cell).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, return_tensors="tf")

We use Adam with weight decay (`AdamWeightDecay`) instead of regular Adam here.

In [8]:
# Adam with decoupled weight decay; learning rate 5e-4, decay rate 0.01.
optimizer = AdamWeightDecay(learning_rate=5e-4, weight_decay_rate=0.01)

Load the pre-trained `Helsinki-NLP/opus-mt-en-ar` model with `TFAutoModelForSeq2SeqLM`.

In [9]:
# Load the pre-trained English-to-Arabic checkpoint as a TensorFlow seq2seq model.
model = TFAutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-ar')

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-ar.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


To train and fine-tune a T5 model, we need a task prefix, because the model is multi-task and each task has its own unique prefix. For translation, the prefix is 'translate x to y', where x and y stand for the names of the source and target languages.

# Prepare Dataset

In [10]:
# Task prefix prepended to every English source sentence, both when building
# the training data and when translating.
# NOTE(review): this is a T5-style prefix, but the checkpoint loaded in this
# notebook is a MarianMT model — confirm the prefix is actually wanted.
prefix = 'translate English to Arabic: '

Create a `preprocess_dataset` function that prepares the TensorFlow dataset used as model input.

In [11]:
def preprocess_dataset(text_pairs, model, data_collator, shuffle=True, batch_size=64, max_length=128):
    """Tokenize (English, Arabic) pairs and wrap them in a batched TF dataset.

    Uses the notebook-level ``tokenizer`` and ``prefix`` globals.

    Args:
        text_pairs: Iterable of ``(english, arabic)`` pairs.
        model: Model object providing ``prepare_tf_dataset``.
        data_collator: Collate function handed to ``prepare_tf_dataset``.
        shuffle: Forwarded to ``prepare_tf_dataset``. Defaults to True, which
            matches the previous hard-coded behaviour.
        batch_size: Number of examples per batch (default 64).
        max_length: Token limit for both inputs and targets; longer
            sequences are truncated (default 128).

    Returns:
        The dataset returned by ``model.prepare_tf_dataset``.

    Raises:
        ValueError: If ``text_pairs`` is empty.
    """
    text_pairs = list(text_pairs)
    if not text_pairs:
        raise ValueError("text_pairs is empty; nothing to preprocess")

    # Separate English and Arabic texts
    en_texts, ar_texts = zip(*text_pairs)
    # Prepend prefix to English texts
    inputs = [prefix + str(text) for text in en_texts]
    # Coerce targets to str as well: a non-string cell (e.g. NaN from the CSV)
    # would otherwise make the tokenizer fail, and inputs already get the same
    # treatment.
    targets = [str(text) for text in ar_texts]

    tokenized = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)
    # Create TensorFlow dataset
    tf_dataset = model.prepare_tf_dataset(
        Dataset.from_dict(tokenized),
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=data_collator
    )
    return tf_dataset

In [12]:
# Hold out 20% of the corpus for validation (fixed seed for a repeatable
# split) and give each part a fresh 0..n-1 index.
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

In [13]:
# (rows, columns) of the held-out split.
test_df.shape

(4682, 2)

In [14]:

# Build the batched TensorFlow datasets for training and validation.
# NOTE(review): both calls use preprocess_dataset's default shuffling, so the
# validation batches are shuffled too — confirm that is intended.
train_set = preprocess_dataset(create_text_pairs(train_df),model,data_collator)
validation_set = preprocess_dataset(create_text_pairs(test_df),model,data_collator)

In [None]:
# Display the training dataset object to inspect its structure.
train_set

# Fine-tune Model

In [15]:
# Compile with the weight-decay optimizer only; no loss argument is passed.
# NOTE(review): presumably this relies on the model's built-in loss — confirm.
model.compile(optimizer = optimizer)

In [16]:
# Fine-tune for 4 epochs, evaluating on validation_set during training.
model.fit(x=train_set, validation_data = validation_set, epochs=4)

Epoch 1/4
Cause: for/else statement not yet supported
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tf_keras.src.callbacks.History at 0x7af21dc44760>

# Test Model

In [28]:
def translate_text(text, max_new_tokens=64):
    """Translate one English text to Arabic with the notebook's model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``prefix`` globals.

    Args:
        text: English source text. Non-string values are converted with
            ``str()`` so that cells read from a DataFrame cannot break the
            prefix concatenation.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 64, the previous hard-coded value).

    Returns:
        The decoded translation with special tokens removed.
    """
    # Encode input text with tokenizer (same prefix as used for training data)
    inputs = tokenizer(prefix + str(text), return_tensors="tf").input_ids
    # Generate translation output
    outputs = model.generate(inputs, max_new_tokens=max_new_tokens)
    # Decode the first (only) generated sequence and remove special tokens
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

In [29]:
# First sample sentence for a quick manual check.
text = 'I’m not thirsty'

In [30]:
# Translate the sample sentence and show the Arabic output.
print(translate_text(text))

أنا لست عطشاناً.


In [22]:
# Second sample sentence for a quick manual check.
text = 'This castle is amazing.'

In [23]:
# Translate the second sample sentence and show the Arabic output.
print(translate_text(text))

هذه القلعة مذهلة.


# Save Model

In [24]:
# Write the fine-tuned model to the 'en_ar_model' directory.
model.save_pretrained('en_ar_model')

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}


Also save tokenizer.

In [25]:
# Write the tokenizer files to the 'en_ar_model_tok' directory.
tokenizer.save_pretrained('en_ar_model_tok')

('en_ar_model_tok/tokenizer_config.json',
 'en_ar_model_tok/special_tokens_map.json',
 'en_ar_model_tok/vocab.json',
 'en_ar_model_tok/source.spm',
 'en_ar_model_tok/target.spm',
 'en_ar_model_tok/added_tokens.json')

# Load fine-tuned Model from directory

If you want to load the fine-tuned model to train it for more epochs, or just to translate some text, here is the process.

In [26]:
# Reload the tokenizer from the directory it was saved to.
tokenizer = AutoTokenizer.from_pretrained('en_ar_model_tok')



In [27]:
# Reload the fine-tuned model from the directory it was saved to.
model = TFAutoModelForSeq2SeqLM.from_pretrained('en_ar_model')

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at en_ar_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


# Calculate Bleu Score

Evaluate translation quality using the BLEU score.

BLEU (Bilingual Evaluation Understudy) is a commonly used metric for assessing the quality of machine translations. It measures the similarity between the candidate translation and one or more reference translations based on n-gram overlap. While the BLEU score provides an automated evaluation method, it's important to note that it may not fully capture aspects like fluency and semantic equivalence.

In [None]:
sample_size = 10  # Number of samples to take
# Score held-out rows only: `df` also contains the sentences the model was
# fine-tuned on, which would inflate the BLEU score. The seed keeps the
# reported number reproducible; min() guards against a split smaller than
# the requested sample.
random_sample = test_df.sample(n=min(sample_size, len(test_df)), random_state=42)

# Calculate BLEU score for each sample
total_bleu_score = 0
for index, row in random_sample.iterrows():
    # str() guards against non-string cells (e.g. NaN read from the CSV),
    # which would otherwise break .split() below.
    source_text = str(row['English'])
    target_text = str(row['Arabic'])

    translated_text = translate_text(source_text)
    # Whitespace-tokenized sentence-level BLEU against the single reference.
    bleu_score = sentence_bleu([target_text.split()], translated_text.split())
    print("Source Text:", source_text)
    print("Target Text:", target_text)
    print("Translated Text:", translated_text)
    print("BLEU Score:", bleu_score)
    print("--------------------------")
    total_bleu_score += bleu_score

# Average over the rows actually scored, not the requested sample size.
average_bleu_score = total_bleu_score / len(random_sample) if len(random_sample) else 0.0
print("Average BLEU Score:", average_bleu_score)