In [None]:
import re
import pandas as pd

In [None]:
# Read the tab-separated sentence pairs; the file has no header row, so the
# two columns are named explicitly.
df = pd.read_csv(
    "ara_eng.txt",
    sep="\t",
    header=None,
    names=["English", "Arabic"],
)

In [None]:
# A row with a missing side cannot form a translation pair, and NaN would make
# the regex clean-up below raise TypeError, so drop incomplete rows first.
df = df.dropna(subset=['English', 'Arabic']).reset_index(drop=True)

# Compile the patterns once instead of re-parsing them for every row.
_non_english_chars = re.compile(r'[^a-zA-Z\s]')
_non_arabic_chars = re.compile(r'[^\u0621-\u064A\s]')

# English: lower-case, then keep only ASCII letters and whitespace.
# astype(str) guards against cells that were parsed as numbers.
df['English'] = (
    df['English']
    .astype(str)
    .str.lower()
    .map(lambda text: _non_english_chars.sub('', text))
)

# Arabic: keep only code points U+0621..U+064A and whitespace.
df['Arabic'] = (
    df['Arabic']
    .astype(str)
    .map(lambda text: _non_arabic_chars.sub('', text))
)

In [None]:
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Pretrained multilingual cased BERT tokenizer. The same instance encodes both
# the English and the Arabic column below, and its vocab_size is used for both
# the source and the target vocabulary of the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
def _encode_texts(texts, max_length=128):
    """Tokenize a column of sentences into fixed-length id / mask lists.

    Args:
        texts: Iterable of strings (one sentence per element).
        max_length: Length every sequence is padded or truncated to,
            special tokens included.

    Returns:
        Tuple ``(input_ids, attention_masks)``, each a list with one
        ``max_length``-long list of ints per sentence.
    """
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        # Explicit replacements for the deprecated `pad_to_max_length=True`;
        # `truncation=True` makes the cut at max_length intentional instead of
        # relying on the tokenizer's warning-emitting fallback.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    return encoded['input_ids'], encoded['attention_mask']


# Both languages go through the same helper so their settings cannot drift apart.
input_ids_en, attention_masks_en = _encode_texts(df['English'])
input_ids_ar, attention_masks_ar = _encode_texts(df['Arabic'])

In [None]:
# Turn the per-sentence id / mask lists into tensors (one row per sentence).
input_ids_en, attention_masks_en = map(
    torch.tensor, (input_ids_en, attention_masks_en)
)
input_ids_ar, attention_masks_ar = map(
    torch.tensor, (input_ids_ar, attention_masks_ar)
)

In [None]:
# Bundle the four row-aligned tensors so each sample carries the English ids and
# mask together with the matching Arabic ids and mask.
dataset = TensorDataset(
    input_ids_en,
    attention_masks_en,
    input_ids_ar,
    attention_masks_ar,
)
# Reshuffled mini-batches of 32 sentence pairs.
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm
from Translation_Transformers import *

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm

# Hyperparameters of the translation Transformer. Source and target share the
# tokenizer's vocabulary, and max_len equals the tokenization length (128).
transformer_kwargs = dict(
    src_vocab_size=tokenizer.vocab_size,
    tgt_vocab_size=tokenizer.vocab_size,
    d_model=512,
    num_layers=6,
    num_heads=8,
    d_ff=2048,
    max_len=128,
    dropout=0.1,
)
model = Transformer(**transformer_kwargs)

# Adam over all model parameters; the loss skips target positions that hold
# the tokenizer's padding id.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training function
def train(model, dataloader, num_epochs, device=None):
    """Train ``model`` with teacher forcing on (English, Arabic) batches.

    Uses the notebook-level ``optimizer`` and ``criterion`` objects.

    Args:
        model: Module called as
            ``model(src_ids, tgt_ids, src_mask=..., tgt_mask=...)``; its output
            is flattened to ``(-1, vocab)`` and fed to ``criterion``.
        dataloader: Iterable yielding ``(input_ids_en, attention_masks_en,
            input_ids_ar, attention_masks_ar)`` tensor batches.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the batches are moved to. Defaults to the device of the
            model's parameters so inputs always land where the model lives.

    Returns:
        None. Progress and the current batch loss are shown through tqdm.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        loop = tqdm(dataloader, leave=True, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for batch in loop:
            # The DataLoader hands out CPU tensors; without this move the
            # forward pass fails as soon as the model sits on a GPU.
            input_ids_en, attention_masks_en, input_ids_ar, attention_masks_ar = (
                tensor.to(device) for tensor in batch
            )

            # Teacher forcing: the decoder reads tokens [0, T-1) and is trained
            # to predict tokens [1, T).
            decoder_input = input_ids_ar[:, :-1]
            labels = input_ids_ar[:, 1:].reshape(-1)

            # Masks keep the tokenizer's convention: 1 = attend, 0 = ignore.
            src_mask = attention_masks_en.unsqueeze(1).unsqueeze(2)
            tgt_padding_mask = attention_masks_ar[:, :-1].unsqueeze(1).unsqueeze(2)

            # A padding-only mask lets every target position attend to the
            # future tokens it is supposed to predict. Multiplying by a
            # lower-triangular matrix also hides positions j > i from query i.
            # NOTE(review): assumes Transformer broadcasts a
            # (batch, 1, tgt_len, tgt_len) mask against its attention scores;
            # confirm against Translation_Transformers.
            tgt_len = decoder_input.size(1)
            causal_mask = torch.tril(
                torch.ones(tgt_len, tgt_len, dtype=tgt_padding_mask.dtype, device=device)
            )
            tgt_mask = tgt_padding_mask * causal_mask

            optimizer.zero_grad()
            output = model(input_ids_en, decoder_input,
                           src_mask=src_mask,
                           tgt_mask=tgt_mask)

            # Flatten to (batch * tgt_len, vocab) to match the flattened labels.
            output = output.reshape(-1, output.size(-1))
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            loop.set_postfix(loss=loss.item())
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# Run 10 epochs over the shuffled sentence-pair batches.
train(model, dataloader, num_epochs=10)

In [None]:
            
def generate_square_subsequent_mask(sz):
    """Build an additive causal mask for a target sequence of length ``sz``.

    Returns:
        A ``(sz, sz)`` float32 tensor holding ``0.0`` on and below the main
        diagonal and ``-inf`` strictly above it, so that adding it to attention
        scores blocks every position from looking at later positions.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
    # diagonal=1 keeps only the strictly-upper triangle; everything else is zeroed.
    return torch.triu(blocked, diagonal=1)