In [2]:
!pip install datasets transformers sacrebleu sentencepiece torch torchvision torchaudio streamlit




In [3]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('Helsinki-NLP/tatoeba_mt', 'ara-eng', trust_remote_code=True)

# Check available splits
print(dataset.keys())  # Expected: ['test', 'validation']

# Assign dataset splits
train_data = dataset['validation'] if 'validation' in dataset.keys() else dataset['test']
valid_data = dataset['test']

# Check dataset columns
print(train_data.column_names)  # Expected: ['sourceLang', 'targetlang', 'sourceString', 'targetString']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

tatoeba_mt.py:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tatoeba-test.ara-eng.tsv:   0%|          | 0.00/938k [00:00<?, ?B/s]

tatoeba-dev.ara-eng.tsv:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10304 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19528 [00:00<?, ? examples/s]

dict_keys(['test', 'validation'])
['sourceLang', 'targetlang', 'sourceString', 'targetString']


In [4]:
from transformers import AutoTokenizer

# Load tokenizer from pretrained model
model_checkpoint = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization function
def tokenize_data(batch):
    inputs = tokenizer(batch['sourceString'], padding="longest", truncation=True, max_length=128)
    targets = tokenizer(batch['targetString'], padding="longest", truncation=True, max_length=128)

    # Assign labels properly
    inputs['labels'] = targets['input_ids'].copy()
    return inputs

# Apply tokenization
train_data = train_data.map(tokenize_data, batched=True)
valid_data = valid_data.map(tokenize_data, batched=True)

# Verify tokenization
print(train_data[0])  # Check if tokenization works correctly


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



Map:   0%|          | 0/19528 [00:00<?, ? examples/s]

Map:   0%|          | 0/10304 [00:00<?, ? examples/s]

{'sourceLang': 'acm', 'targetlang': 'eng', 'sourceString': 'عمرك رايح المكسيك؟', 'targetString': 'Have you ever been to Mexico?', 'input_ids': [7326, 57, 10372, 376, 4624, 55, 0, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62833, 62

In [5]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq


In [6]:
# Load Pretrained Transformer Model
model_checkpoint = "Helsinki-NLP/opus-mt-ar-en"  # Ensure this is defined in your previous cells
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
# Define Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt_model",
    evaluation_strategy="epoch",  # Fixed deprecated parameter
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,  # Use Mixed Precision for GPU optimization
    load_best_model_at_end=True,  # Load the best checkpoint automatically
    save_strategy="epoch",  # Save the best model per epoch
)




In [8]:
# Define Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [9]:
# Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    data_collator=data_collator,
    tokenizer=tokenizer,  # Corrected: Removed `processing_class`
)


  trainer = Seq2SeqTrainer(


In [10]:
import os
os.environ["WANDB_MODE"] = "offline"  # Fully disables WandB
os.environ["WANDB_DISABLED"] = "true"


In [11]:

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.2085,0.173716
2,0.1536,0.151667
3,0.1325,0.146998


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=3663, training_loss=0.22989022611367582, metrics={'train_runtime': 734.2791, 'train_samples_per_second': 79.784, 'train_steps_per_second': 4.989, 'total_flos': 1980910276706304.0, 'train_loss': 0.22989022611367582, 'epoch': 3.0})

In [12]:
import sacrebleu

def calculate_bleu_score(model, dataset, num_samples=1000):
    model.eval()
    references = []
    hypotheses = []

    for sample in dataset.select(range(num_samples)):
        input_text = sample['sourceString']
        reference = sample['targetString']

        # Tokenize and generate output
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()
        output_ids = model.generate(input_ids)

        # Decode outputs
        hypothesis = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        references.append(reference)
        hypotheses.append(hypothesis)

    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    return bleu.score

bleu_score = calculate_bleu_score(model, valid_data)
print(f"BLEU Score: {bleu_score}")


BLEU Score: 47.8545247149495


In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load Model and Tokenizer
MODEL_NAME = "Helsinki-NLP/opus-mt-ar-en"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# Function to Translate Text
def translate_text(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
    output_ids = model.generate(input_ids, num_beams=15, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Take input from user
while True:
    arabic_text = input("\nEnter Arabic text (or type 'exit' to quit): ").strip()

    if arabic_text.lower() == "exit":
        print("Exiting the translator. Goodbye! 👋")
        break

    english_translation = translate_text(arabic_text)
    predicted_translation = translate_text(arabic_text)
    print(f"\n🔹 Actual Translation: {english_translation}\n")
    print(f"\n Model's Predicted Translation: {predicted_translation}")
    print(f"-----------------------------------")



Enter Arabic text (or type 'exit' to quit): exit
Exiting the translator. Goodbye! 👋


In [15]:
from transformers import AutoTokenizer

# Define the path to save the model
model_save_path = "/content/arabic_to_english_model"

# Save the model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to {model_save_path}")




Model saved to /content/arabic_to_english_model


In [30]:
!zip -r arabic_to_english_model.zip /content/arabic_to_english_model
from google.colab import files
files.download("arabic_to_english_model.zip")


  adding: content/arabic_to_english_model/ (stored 0%)
  adding: content/arabic_to_english_model/generation_config.json (deflated 43%)
  adding: content/arabic_to_english_model/tokenizer_config.json (deflated 68%)
  adding: content/arabic_to_english_model/vocab.json (deflated 77%)
  adding: content/arabic_to_english_model/model.safetensors (deflated 7%)
  adding: content/arabic_to_english_model/config.json (deflated 61%)
  adding: content/arabic_to_english_model/target.spm (deflated 49%)
  adding: content/arabic_to_english_model/source.spm (deflated 55%)
  adding: content/arabic_to_english_model/special_tokens_map.json (deflated 35%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
!pip install gradio
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load Model and Tokenizer
MODEL_NAME = "Helsinki-NLP/opus-mt-ar-en"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# Define Translation Function
def translate(arabic_text):
    if not arabic_text.strip():
        return "Please enter Arabic text.", ""

    # Tokenize input
    inputs = tokenizer(arabic_text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Generate translation
    with torch.no_grad():
        output_tokens = model.generate(**inputs)

    # Decode output
    english_translation = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    return english_translation, english_translation  # Returning twice for "Translation" & "Predicted"

# Create Gradio Interface
iface = gr.Interface(
    fn=translate,
    inputs=gr.Textbox(lines=5, placeholder="Enter Arabic text here..."),
    outputs=[gr.Textbox(label="Translation"), gr.Textbox(label="Predicted")],
    title="Arabic to English Translator",
    description="Enter Arabic text, and this model will translate it into English.",
)

# Launch Gradio App
iface.launch(share=True)


Collecting gradio
  Downloading gradio-5.20.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bb0b01bfb8cc75abfe.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


