In [1]:
import torch
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import Dataset, DataLoader


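# Cell output: ModuleNotFoundError: No module named 'transformers'
# (install the dependency before running, e.g. `pip install transformers sentencepiece`)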

In [None]:
# Pretrained checkpoint; the tokenizer and the model are both loaded from the
# same name so that their vocabularies agree.
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Training data: the dataset class below reads the "Arabic" and "English"
# columns of this frame.
data = pd.read_csv(
    "your_dataset.csv",  # Replace with the path to your CSV file
)

# Define a custom dataset class
class CustomTranslationDataset(Dataset):
    """Map-style dataset of tokenized (source, target) sentence pairs.

    Source sentences are read from the ``"Arabic"`` column of ``data`` and
    target sentences from the ``"English"`` column.

    Args:
        data: Table exposing ``data[column].iloc[index]`` access (e.g. a
            pandas DataFrame) with ``"Arabic"`` and ``"English"`` columns.
        tokenizer: Callable tokenizer accepting ``text=`` / ``text_target=``
            and returning a mapping of name -> tensor of shape
            ``(1, seq_len)`` when called with ``return_tensors="pt"``.
        max_length: Optional length every example is padded/truncated to.
            ``None`` (the default) defers to the tokenizer's own maximum.
    """

    def __init__(self, data, tokenizer, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.data)

    def _encode(self, **text_kwargs):
        """Tokenize one sentence and return per-example 1-D tensors."""
        encoded = self.tokenizer(
            **text_kwargs,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        # return_tensors="pt" on a single string yields (1, seq_len) tensors.
        # Drop that leading dimension so the DataLoader's default collation
        # stacks examples into (batch, seq_len) rather than (batch, 1, seq_len).
        return {key: tensor.squeeze(0) for key, tensor in encoded.items()}

    def __getitem__(self, index):
        """Return ``(inputs, targets)`` dicts of 1-D tensors for row ``index``."""
        source_text = self.data["Arabic"].iloc[index]
        target_text = self.data["English"].iloc[index]
        inputs = self._encode(text=source_text)
        # Targets go through ``text_target`` so the tokenizer applies its
        # target-language processing instead of treating them as source text.
        targets = self._encode(text_target=target_text)
        return inputs, targets

# Create an instance of the custom dataset
custom_dataset = CustomTranslationDataset(data, tokenizer)

# Create a data loader
batch_size = 8
train_dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for inputs, targets in train_dataloader:
        # Collapse every tensor to (batch, seq_len). Encodings produced one
        # example at a time with return_tensors="pt" can carry an extra
        # singleton dimension after collation; this is a no-op otherwise.
        inputs = {k: v.reshape(-1, v.shape[-1]).to(device) for k, v in inputs.items()}
        targets = {k: v.reshape(-1, v.shape[-1]).to(device) for k, v in targets.items()}

        # Padding positions must not contribute to the loss: the model's
        # cross-entropy ignores label positions set to -100.
        labels = targets["input_ids"].masked_fill(targets["attention_mask"] == 0, -100)

        optimizer.zero_grad()
        loss = model(**inputs, labels=labels).loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than whichever batch happened
    # to come last, and do not fail when the dataloader yielded nothing.
    if num_batches:
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss / num_batches}")
    else:
        print(f"Epoch {epoch+1}/{num_epochs} - no batches to train on")

# Inference function
def translate_arabic_to_english(input_text):
    """Translate a single sentence with the module-level model and tokenizer.

    Args:
        input_text: The source sentence as a string.

    Returns:
        The decoded translation string with special tokens removed.
    """
    # Tokenize as a batch of one; truncate so over-long inputs cannot exceed
    # the length the model supports.
    encoded = tokenizer([input_text], return_tensors="pt", truncation=True).to(device)

    # Generation must run in eval mode: after the training loop the model is
    # still in train mode, which keeps dropout active and degrades output.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Pass the whole encoding so the attention mask accompanies the ids.
            translated_ids = model.generate(**encoded)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)

# Demo: translate one sample sentence and show it next to its translation.
input_text = "مرحبًا بالعالم"
translated_text = translate_arabic_to_english(input_text)
for caption, shown in (("Input: ", input_text), ("Translated: ", translated_text)):
    print(caption, shown)
