Machine Translation Project

This project focuses on building a Neural Machine Translation (NMT) system that translates text from English to Arabic using the pre-trained mBART-50 transformer model (`facebook/mbart-large-50`) from Hugging Face. The model was fine-tuned on a parallel English-Arabic dataset to enhance translation quality and fluency.


In [None]:
!pip install datasets

In [None]:
# Import all the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from datasets import load_dataset

# **Data** **Preprocessing**

In [None]:
# Download the English-Arabic parallel corpus from the Hugging Face Hub.
DataMT = load_dataset("akbargherbal/10K_english_to_arabic_dataset_for_FT")

# Rename the columns so their role is explicit: "input" becomes the English
# source text and "output" becomes the Arabic target text.
DataMT = DataMT.rename_column("output", "Target_Text_Arabic")
DataMT = DataMT.rename_column("input", "Source_Text_English")

# Remove the "instruction" column; only the sentence pairs are used below.
DataMT = DataMT.remove_columns("instruction")

# Convert the train split to a pandas DataFrame for easy viewing.
# Dataset.to_pandas() converts the underlying table directly instead of
# building the frame by iterating over the dataset one row at a time.
df = DataMT["train"].to_pandas()

# Display the first few rows
print(df[["Source_Text_English", "Target_Text_Arabic"]].head())

In [None]:
# Show the DataFrame dimensions as (number of rows, number of columns).
df.shape

In [None]:
# Draw a reproducible random subset of 6000 sentence pairs (fixed seed), then
# renumber the rows from 0 and discard the old index.
df = df.sample(n=6000, random_state=42)
df = df.reset_index(drop=True)
df.shape

In [None]:
# Cast every entry of both text columns to a string
for text_column in ("Source_Text_English", "Target_Text_Arabic"):
    df[text_column] = df[text_column].astype(str)

In [None]:
# Split the data into training (80%) and test (20%) sets; the fixed seed keeps
# the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Convert each split to the Hugging Face Dataset format.
# preserve_index=False: after the shuffled split the DataFrame index is no
# longer a plain 0..n-1 range, and from_pandas would otherwise store it as an
# extra "__index_level_0__" column next to the two text columns.
train_data = Dataset.from_pandas(train_data, preserve_index=False)
test_data = Dataset.from_pandas(test_data, preserve_index=False)


# **Tokenization**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the chosen model. The mBART-50 tokenizer adds a
# language-code token to every sequence, so declare the translation direction
# (English source, Arabic target) when loading it.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ar_AR"
)

# Tokenize the first English sentence (encoded as source text)
sentence = train_data[0]["Source_Text_English"]
encoded = tokenizer(sentence, padding='max_length', truncation=True, max_length=120)
print(encoded)

# Tokenize the first Arabic sentence. It is passed as text_target so that it
# is encoded with the target-language (ar_AR) code instead of the
# source-language one.
sentence = train_data[0]["Target_Text_Arabic"]
encoded = tokenizer(text_target=sentence, padding='max_length', truncation=True, max_length=120)
print(encoded)

def preprocess_function(examples):
    """Tokenize a batch of English/Arabic sentence pairs for seq2seq training.

    Args:
        examples: Batch with "Source_Text_English" and "Target_Text_Arabic"
            entries, each holding the texts of the batch.

    Returns:
        The tokenizer output for the source sentences (padded/truncated to
        128 tokens) with an added "labels" entry containing the target token
        ids, in which padding positions are replaced by -100.
    """
    # Set source and target language codes
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "ar_AR"

    inputs = examples["Source_Text_English"]
    targets = examples["Target_Text_Arabic"]

    # Tokenize source sentences
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenize targets. Passing them as text_target encodes them in target
    # mode and replaces the deprecated as_target_tokenizer() context manager.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")

    # Labels are padded to a fixed length; replace the padding ids with -100
    # (the index the cross-entropy loss ignores) so padding positions do not
    # contribute to the training loss.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


# Tokenize both splits in batches (train first, then test)
train_data, test_data = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

# Return only the model inputs, formatted as PyTorch tensors
tensor_columns = ["input_ids", "attention_mask", "labels"]
for split in (train_data, test_data):
    split.set_format(type="torch", columns=tensor_columns)

In [None]:
import os

from google.colab import drive

# Mount Google Drive so that training outputs can be written to it
drive.mount('/content/drive')

results_dir = '/content/drive/MyDrive/results'
model_dir = '/content/drive/MyDrive/MachineTranslation_model'

# Create both output directories if they do not exist yet
for directory in (results_dir, model_dir):
    os.makedirs(directory, exist_ok=True)

# **Load** **pretrained** **sequence** **to** **sequence** **model**

In [None]:
from transformers import Seq2SeqTrainingArguments

from transformers import AutoModelForSeq2SeqLM

# AutoModelForSeq2SeqLM is a class in the Hugging Face transformers library that
# automatically loads a pre-trained sequence-to-sequence model.
# It is used for tasks such as machine translation, text summarization and text
# generation, where the model has both an encoder and a decoder.

# Load a pretrained sequence to sequence model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

# Set language specific tokens
# NOTE(review): the mBART-50 documentation selects the output language at
# generation time via forced_bos_token_id; confirm that overriding
# decoder_start_token_id with the Arabic language code is intended here.
model.config.decoder_start_token_id = tokenizer.lang_code_to_id['ar_AR']
tokenizer.src_lang = "en_XX"
# The target language belongs in tgt_lang; assigning it to src_lang again
# would overwrite the English source language set on the previous line.
tokenizer.tgt_lang = "ar_AR"

# Seq2SeqTrainingArguments is the Hugging Face class that bundles the
# hyperparameters and settings controlling the training process; the resulting
# object is handed to the trainer to define how the model is trained.
training_args = Seq2SeqTrainingArguments(
    # Output locations and checkpointing
    output_dir=results_dir,
    logging_dir='./logs',
    save_strategy="epoch",
    # evaluation_strategy="epoch"  (left disabled)
    # Optimization settings
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Generation settings used for prediction
    predict_with_generate=True,
    generation_max_length=120,
    # Do not let the trainer drop dataset columns on its own
    remove_unused_columns=False,
)

from transformers import Seq2SeqTrainer

# Wire the model, the training arguments, both data splits and the tokenizer
# into a sequence-to-sequence trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Run the fine-tuning loop
trainer.train()

## **Save** **The** **Model**

In [None]:
# Save the fine-tuned model.
# trainer.save_model() already writes the model to model_dir, so a separate
# model.save_pretrained(model_dir) call would only write the same checkpoint
# to the same directory a second time.
trainer.save_model(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

# Reload the model and tokenizer from model_dir; the translation code below
# uses these reloaded objects.
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)

print("Done!")

## **Translation System**

In [None]:
def translate_text(text):
    """Translate English text to Arabic with the reloaded fine-tuned model.

    Args:
        text: The English text to translate.

    Returns:
        The decoded translation of the first generated sequence, with special
        tokens removed.
    """
    # Encode the input explicitly as English source text
    loaded_tokenizer.src_lang = "en_XX"

    # Tokenize input and place the tensors on the same device as the model
    inputs = loaded_tokenizer(text, return_tensors="pt", max_length=120, truncation=True)
    inputs = inputs.to(loaded_model.device)

    # Generate the translation with beam search. Unpacking the whole encoding
    # passes the attention mask along with the input ids.
    outputs = loaded_model.generate(**inputs, max_length=120, num_beams=4, early_stopping=True)

    # Decode the translation
    translated_text = loaded_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return translated_text

# Try the translation helper on a sample sentence and print both texts
Text_to_translate = "hello, How are you ?"
translated_text = translate_text(Text_to_translate)
print(f"Text_to_translate: {Text_to_translate} \nTranslated Text: {translated_text}")