In [12]:
!pip install transformers 



In [2]:
!pip install sentencepiece





In [3]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=8a6ec4e7c025e61015bca0528350f79f1376b3f891d9de635e76480ddbc7cff4
  Stored in directory: c:\users\nasser\appdata\local\pip\cache\wheels\12\1c\3d\46cf06718d63a32ff798a89594b61e7f345ab6b36d909ce033
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53




In [1]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer, AdamW
from torch.utils.data import DataLoader, Dataset
import sacremoses
import sentencepiece


In [2]:
# Load your parallel corpus data into a pandas DataFrame
data = pd.read_csv("https://raw.githubusercontent.com/NK-Z/HAT/main/ALL_Data.csv", index_col=0)  # Adapt the file name and structure

# Define your fine-tuning dataset class
class TranslationDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = self.data.iloc[idx]["English"]
        target_text = self.data.iloc[idx]["Arabic"]

        # Tokenize source and target sentences
        source_tokens = self.tokenizer.encode(source_text, padding="max_length", truncation=True, return_tensors="pt")
        target_tokens = self.tokenizer.encode(target_text, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": source_tokens.squeeze(),
            "attention_mask": source_tokens.squeeze().gt(0),  # Create attention mask
            "labels": target_tokens.squeeze(),
        }

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [184]:
data.iloc[4880]

Arabic     ألا ترى الأصم كيف ينحصر الوجود عنده في المحسوس...
English    One knows that a deaf person feels that the (w...
Name: 4941, dtype: object

In [185]:
data

Unnamed: 0,Arabic,English
0,تفريج الكروب فى تدبير الحروب,TAFRIJ AL-KURUB FI TADBIR AL-HURUBA Muslim Man...
1,مقدمة,INTRODUCTION
2,بسمِ الله الرحمنِ الرحيم.. مؤيد الإسلام من سلط...,"IN THE NAME OF GOD, THE MERCIFUL, THE COMPASSI..."
3,ومسعد جده العالى بإبادة أعدائه الطغاة المارقين...,And [he is] the cause of his noble sire’s happ...
4,وأشهد أن لا إله إلا الله وحده لا شريك له، شهاد...,"I declare that there is no god but God alone, ..."
...,...,...
102090,حدثنا عبد الرحمن بن إبراهيم، حدثنا الوليد بن م...,It was narrated from ‘Awf bin Malik Al-Ashja’...
102091,حدثنا أبو بكر بن أبي شيبة، حدثنا سفيان بن عيين...,It was narrated from Abu Hurairah conveying i...
102092,حدثنا أبو بكر بن أبي شيبة، حدثنا سفيان بن عيين...,It was narrated from Abu Hurairah that the Me...
102093,حدثنا أبو بكر بن أبي شيبة، حدثنا أسود بن عامر،...,It was narrated that ‘Amr bin Taghlib said: ...


In [180]:
data = data[data["English"].str.contains("سقطمنالنسخة") == False]


In [182]:
data.to_csv('Full_Data.csv')

In [9]:
# Load pretrained model and tokenizer for Arabic to English translation
model_name = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

In [11]:
# Load pretrained model and tokenizer for Arabic to English translation
model_name = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Create fine-tuning dataset and dataloader
train_dataset = TranslationDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss() 

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]



In [None]:

num_epochs = 10
best_loss = float('inf')  # Initialize with a high value

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}] - Batch Loss: {loss.item():.4f}')

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Average Loss: {average_loss:.4f}")

    # Save the best model based on training loss
    if average_loss < best_loss:
        best_loss = average_loss
        model.save_pretrained("best_model_2")

print("Training finished.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch [1/10] - Batch Loss: 0.4456
Epoch [1/10] - Batch Loss: 0.1345
Epoch [1/10] - Batch Loss: 0.3589
Epoch [1/10] - Batch Loss: 0.4374
Epoch [1/10] - Batch Loss: 0.2923
Epoch [1/10] - Batch Loss: 0.3808
Epoch [1/10] - Batch Loss: 0.1383
Epoch [1/10] - Batch Loss: 0.1840
Epoch [1/10] - Batch Loss: 0.2468
Epoch [1/10] - Batch Loss: 0.2551
Epoch [1/10] - Batch Loss: 0.4207
Epoch [1/10] - Batch Loss: 0.2333
Epoch [1/10] - Batch Loss: 0.4775
Epoch [1/10] - Batch Loss: 0.3320
Epoch [1/10] - Batch Loss: 0.3505
Epoch [1/10] - Batch Loss: 0.2134
Epoch [1/10] - Batch Loss: 0.4028
Epoch [1/10] - Batch Loss: 0.5341
Epoch [1/10] - Batch Loss: 0.3900
Epoch [1/10] - Batch Loss: 0.5085
Epoch [1/10] - Batch Loss: 0.1845
Epoch [1/10] - Batch Loss: 0.4438
Epoch [1/10] - Batch Loss: 0.4892
Epoch [1/10] - Batch Loss: 0.2085
Epoch [1/10] - Batch Loss: 0.4597
Epoch [1/10] - Batch Loss: 0.4984
Epoch [1/10] - Batch Loss: 0.2464
Epoch [1/10] - Ba

In [5]:
my_model = MarianMTModel.from_pretrained('model')

In [6]:
my_model_2 = MarianMTModel.from_pretrained('Model_2')

In [172]:
# Inference function
def translate_english__to_arabic(input_text):
    input_text = [input_text]
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    translated_ids = my_model_2.generate(input_ids, max_length=len(str(input_text))+20, num_beams=100, early_stopping=True, no_repeat_ngram_size=2).to(device)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

In [174]:
# Example usage
input_text = "They fought until they gave up"
translated_text = translate_english__to_arabic(input_text)
print("Input: ", input_text)
print("Translated: ", translated_text)

Input:  They fought until they gave up
Translated:  فقاتلوا حت إلى أَن يَتَرَكَهُم


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save your model
model.save_pretrained('/content/drive/My Drive/best_model_2.h5')