In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.4 MB/s[0m eta [36m0:00:0

In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895239 sha256=bc7c2b9fa4523b933fc52602d9445bddaec2f11ddc18727a78525546d6e3cf8a
  Stored in directory: /root/.cache/pip/wheels/00/24/97/a2ea5324f36bc626e1ea0267f33db6aa80d157ee977e9e42fb
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53


In [None]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer, AdamW
from torch.utils.data import DataLoader, Dataset
import sacremoses
import sentencepiece


In [None]:
# Read the parallel corpus (one sentence pair per row) from disk into a DataFrame.
# Change the path below if your corpus file has a different name or layout.
data = pd.read_csv(filepath_or_buffer="ALL_Data.csv")

# Define your fine-tuning dataset class
class TranslationDataset(Dataset):
    """Arabic -> English sentence pairs encoded for seq2seq fine-tuning.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the
    Arabic source sentence and ``labels`` for the English target sentence.

    Args:
        dataframe: DataFrame with an ``"Arabic"`` and an ``"English"`` column.
        tokenizer: Callable tokenizer that accepts ``text_target=`` and exposes
            ``pad_token_id``.
        max_length: Optional length every sequence is padded/truncated to.
            ``None`` (the default) leaves the choice to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, max_length=None):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the sentence pair stored at positional index ``idx``."""
        row = self.data.iloc[idx]

        # Arabic is the model input and English the target. `text_target`
        # routes the English sentence through the tokenizer's target-side
        # processing and returns it under "labels".
        encoded = self.tokenizer(
            row["Arabic"],
            text_target=row["English"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Use the tokenizer's own mask; comparing ids against 0 is only valid
        # when the pad id happens to be 0.
        labels = encoded["labels"].squeeze(0).clone()
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            # -100 is the index ignored by the cross-entropy loss, so padding
            # positions do not contribute to training.
            labels[labels == pad_id] = -100

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [None]:
# Show the loaded corpus DataFrame as the cell output for a quick visual check.
data

Unnamed: 0.1,Unnamed: 0,Arabic,English
0,0,تفريج الكروب فى تدبير الحروب,TAFRIJ AL-KURUB FI TADBIR AL-HURUBA Muslim Man...
1,1,مقدمة,INTRODUCTION
2,2,بسمِ الله الرحمنِ الرحيم.. مؤيد الإسلام من سلط...,"IN THE NAME OF GOD, THE MERCIFUL, THE COMPASSI..."
3,3,ومسعد جده العالى بإبادة أعدائه الطغاة المارقين...,And [he is] the cause of his noble sire’s happ...
4,4,وأشهد أن لا إله إلا الله وحده لا شريك له، شهاد...,"I declare that there is no god but God alone, ..."
...,...,...,...
102090,102090,حدثنا عبد الرحمن بن إبراهيم، حدثنا الوليد بن م...,It was narrated from ‘Awf bin Malik Al-Ashja’...
102091,102091,حدثنا أبو بكر بن أبي شيبة، حدثنا سفيان بن عيين...,It was narrated from Abu Hurairah conveying i...
102092,102092,حدثنا أبو بكر بن أبي شيبة، حدثنا سفيان بن عيين...,It was narrated from Abu Hurairah that the Me...
102093,102093,حدثنا أبو بكر بن أبي شيبة، حدثنا أسود بن عامر،...,It was narrated that ‘Amr bin Taghlib said: ...


In [None]:
# Load pretrained model and tokenizer for Arabic to English translation
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Device the model's weights currently live on. Any tensor passed to the model
# has to be on this device, so expose it under a module-level name.
device = model.device

# Create fine-tuning dataset and dataloader
train_dataset = TranslationDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Define optimizer and loss function.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the deprecated class's defaults so the
# effective hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
# The model returns its own loss when called with `labels`; this object is
# kept only for cells that may still refer to it.
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
# Fine-tuning loop
num_epochs = 5
# The DataLoader yields CPU tensors; they must be moved to wherever the
# model's weights are before the forward pass.
model_device = model.device
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(model_device)
        attention_mask = batch["attention_mask"].to(model_device)
        labels = batch["labels"].to(model_device)

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    average_loss = total_loss / max(len(train_dataloader), 1)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Average Loss: {average_loss:.4f}")

# Leave training mode so dropout is disabled for any generation run afterwards.
model.eval()

# Save the fine-tuned model together with its tokenizer so the directory can
# be reloaded with from_pretrained on its own.
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

In [None]:
# Inference function
def translate_arabic_to_english(input_text):
    """Translate a single Arabic string to English.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: The Arabic sentence to translate.

    Returns:
        The decoded English translation with special tokens removed.
    """
    # Generation must run in eval mode; otherwise dropout stays active after
    # a training run and perturbs the output.
    model.eval()

    # Tokenize as a batch of one and move every tensor (ids and mask) to the
    # device the model's weights are on.
    encoded = tokenizer(
        [input_text], return_tensors="pt", padding=True, truncation=True
    ).to(model.device)

    with torch.no_grad():
        translated_ids = model.generate(**encoded)

    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)

In [None]:
# Example usage: translate one short Arabic phrase and print it next to its source.
input_text = "مرحبًا بالعالم"
translated_text = translate_arabic_to_english(input_text=input_text)
print("Input: ", input_text, sep=" ")
print("Translated: ", translated_text, sep=" ")