<a href="https://colab.research.google.com/github/Marwan19930/My-project/blob/main/Copy_of_arabic_english_translteration_model_team_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

First, we import the dependencies

In [None]:
import pandas as pd

Get the URLs of the dataset files, load each one, and combine them into a single DataFrame


In [None]:
# Raw-file URLs of the four transliteration corpora (all served from the same GitHub repository)
url1 = 'https://raw.githubusercontent.com/Marwan19930/arabic-transliteration-dataset/main/15%2C898_ar2en.csv'
url2 = 'https://raw.githubusercontent.com/Marwan19930/arabic-transliteration-dataset/main/80%2C000_EN-AR_Named-entities.csv'
url3 = 'https://raw.githubusercontent.com/Marwan19930/arabic-transliteration-dataset/main/NETransliteration.csv'
url4 = 'https://raw.githubusercontent.com/Marwan19930/arabic-transliteration-dataset/main/arabic_names_dataset.csv'

# Download every corpus into its own DataFrame (requires network access)
df1, df2, df3, df4 = (pd.read_csv(source) for source in (url1, url2, url3, url4))

# Stack the four frames vertically and renumber the rows from 0
combined_df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [None]:
# Snapshot of the raw concatenated data, taken so the effect of the cleaning
# steps below can be quantified afterwards.
# NOTE: this cell already addresses the columns as 'arabic' / 'english', i.e. it
# runs before the column-renaming cell and relies on the CSV headers using those names.
rule = "=" * 50
print(rule)
print("BEFORE CLEANING: DATASET SUMMARY")
print(rule)

# Size and schema
print("\n=== Dataset Shape ===")
print(f"Total rows: {combined_df.shape[0]}")
print(f"Columns: {list(combined_df.columns)}")

# Missing values per column
print("\n=== Null Values ===")
print(combined_df.isna().sum())

# Values that are present but empty
print("\n=== Empty Strings ===")
print(f"Arabic empty: {combined_df['arabic'].eq('').sum()}")
print(f"English empty: {combined_df['english'].eq('').sum()}")

# Rows whose (arabic, english) pair occurs more than once; keep=False flags every copy
print("\n=== Duplicates ===")
duplicates = combined_df.duplicated(subset=['arabic', 'english'], keep=False)
print(f"Total duplicate rows: {duplicates.sum()}")
print("\nSample duplicates (if any):")
if duplicates.any():
    print(combined_df[duplicates].head(3))
else:
    print("No duplicates found")

# Remembered for the "rows removed" figure in the after-cleaning summary
original_size = len(combined_df)

BEFORE CLEANING: DATASET SUMMARY

=== Dataset Shape ===
Total rows: 242203
Columns: ['arabic', 'english']

=== Null Values ===
arabic     7
english    0
dtype: int64

=== Empty Strings ===
Arabic empty: 0
English empty: 0

=== Duplicates ===
Total duplicate rows: 29445

Sample duplicates (if any):
      arabic english
3792      دى      de
3793      دى      de
16173   لانا    Lana


In [None]:
# Map alternative header spellings onto the two canonical column names
# (extend the mapping if a source file uses other headers).
# With the current data this changes nothing: the summary above already lists
# the columns as ['arabic', 'english'].
column_aliases = {
    'Arabic': 'arabic',
    'English': 'english',
    'AR_NAME': 'arabic',
    'EN_NAME': 'english',
}
combined_df = combined_df.rename(columns=column_aliases)

In [None]:
# Keep only rows where both sides carry text:
# first discard NaN entries, then discard empty strings.
combined_df = (
    combined_df
    .dropna(subset=['arabic', 'english'])
    .loc[lambda frame: frame['arabic'].ne('') & frame['english'].ne('')]
)

In [None]:
# Two de-duplication passes, each keeping the first occurrence:
#   1. exact (arabic, english) duplicates;
#   2. pairs that differ only by letter case, compared through temporary
#      lower-cased helper columns that are dropped again at the end.
combined_df = (
    combined_df
    .drop_duplicates(subset=['arabic', 'english'])
    .assign(
        arabic_lower=lambda frame: frame['arabic'].str.lower(),
        english_lower=lambda frame: frame['english'].str.lower(),
    )
    .drop_duplicates(subset=['arabic_lower', 'english_lower'])
    .drop(columns=['arabic_lower', 'english_lower'])
)

In [None]:
import re

def remove_tashkeel(text):
    """Return ``str(text)`` with Arabic diacritical marks deleted.

    Every code point in U+064B..U+065F, plus U+0670, is removed; all other
    characters are left as they are. Non-string input is converted with
    ``str()`` before filtering.
    """
    return re.sub(r'[\u064b-\u065f\u0670]', '', str(text))

def clean_arabic(text):
    """Normalise an Arabic-script string for the transliteration corpus.

    Steps:
      1. strip diacritics via ``remove_tashkeel`` (which also converts
         non-string input with ``str()``);
      2. delete every character outside the Arabic Unicode blocks
         (U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDFF, U+FE70-FEFC),
         ASCII digits, the space character and the punctuation ``! ? . , -``;
      3. trim leading/trailing whitespace.

    Returns the cleaned string, which may be empty.
    """
    text = remove_tashkeel(text)
    # The last range deliberately stops at U+FEFC. U+FEFF is the byte-order
    # mark / zero-width no-break space, not an Arabic letter, and str.strip()
    # does not treat it as whitespace, so it must be filtered out here.
    arabic_pattern = re.compile(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC 0-9!?.,-]')
    return arabic_pattern.sub('', str(text)).strip()

def clean_english(text):
    """Normalise a Latin-script string for the transliteration corpus.

    Accented letters are first folded to their base letter (``é`` -> ``e``)
    so that names such as "José" become "Jose" instead of losing the letter
    altogether. After that, everything except ASCII letters, ASCII digits,
    the space character and the punctuation ``! ? . , -`` is deleted and
    surrounding whitespace is trimmed.

    Non-string input is converted with ``str()`` first. Returns the cleaned
    string, which may be empty.
    """
    import unicodedata  # local import: stdlib module not imported at file level

    # NFKD splits an accented letter into base letter + combining mark;
    # the combining mark is then removed by the filter below.
    decomposed = unicodedata.normalize('NFKD', str(text))
    return re.sub(r'[^A-Za-z0-9 !?.,-]', '', decomposed).strip()

# Run each column through its cleaning helper
combined_df['arabic'] = combined_df['arabic'].map(clean_arabic)
combined_df['english'] = combined_df['english'].map(clean_english)

# Cleaning can reduce a value to the empty string; discard rows where that happened on either side
combined_df = combined_df.loc[combined_df['arabic'].ne('') & combined_df['english'].ne('')]

In [None]:
# Row count after the first cleaning pass, plus three random rows as a spot check.
# The next cell de-duplicates once more, so this count is not yet the final one.
print(f"Final dataset size: {len(combined_df)}", "Sample rows:", sep="\n")
print(combined_df.sample(3))

Final dataset size: 217734
Sample rows:
              arabic        english
200872  روبن بورياني  Ruben Buriani
56264         كريبتو         Crypto
120151   مانويل فالس   Manuel Valls


In [None]:
# Second cleaning pass. The cleaning helpers and the empty-row filter were
# already applied in an earlier cell; the step that matters here is the
# final drop_duplicates, because cleaning can make previously distinct
# pairs identical.
for column, cleaner in (('arabic', clean_arabic), ('english', clean_english)):
    combined_df[column] = combined_df[column].apply(cleaner)
combined_df = (
    combined_df[combined_df['arabic'].ne('') & combined_df['english'].ne('')]
    .drop_duplicates(subset=['arabic', 'english'])
)

rule = "=" * 50
print("\n\n" + rule)
print("AFTER CLEANING: DATASET SUMMARY")
print(rule)

# Size, compared with the row count recorded before cleaning
rows_remaining = len(combined_df)
print("\n=== Dataset Shape ===")
print(f"Total rows remaining: {rows_remaining}")
print(f"Rows removed: {original_size - rows_remaining}")

# Neither NaN nor empty strings should be left
print("\n=== Null/Empty Check ===")
print("Nulls after cleaning:")
print(combined_df.isna().sum())
print(f"\nArabic empty: {combined_df['arabic'].eq('').sum()}")
print(f"English empty: {combined_df['english'].eq('').sum()}")

# No (arabic, english) pair should occur twice
print("\n=== Duplicates Check ===")
duplicates = combined_df.duplicated(subset=['arabic', 'english'], keep=False)
print(f"Remaining duplicates: {duplicates.sum()}")



AFTER CLEANING: DATASET SUMMARY

=== Dataset Shape ===
Total rows remaining: 217654
Rows removed: 24549

=== Null/Empty Check ===
Nulls after cleaning:
arabic     0
english    0
dtype: int64

Arabic empty: 0
English empty: 0

=== Duplicates Check ===
Remaining duplicates: 0


In [None]:
# Persist the cleaned pairs to the working directory; the row index is not written
combined_df.to_csv(path_or_buf='cleaned_arabic_english_transliteration.csv', index=False)

In [None]:
# Preview the first five cleaned rows
combined_df.head(n=5)

Unnamed: 0,arabic,english
0,العالي,aal
1,أعشى,asha
2,أعثم,atham
3,آا,aa
4,أدلاند,aadland


##### Prepare the dataset from `combined_df` for training a Seq2Seq LSTM or GRU model for Arabic-to-English transliteration.

In [None]:
# Sanity check: confirm which columns are present before the model inputs are derived
column_labels = combined_df.columns
print(column_labels)

Index(['arabic', 'english'], dtype='object')


In [None]:
# Model input is the Arabic string as-is.
combined_df['input'] = combined_df['arabic']

# Model target is the English string wrapped in a start marker (tab) and an
# end marker (newline); neither character can occur in the cleaned English text.
combined_df['target'] = '\t' + combined_df['english'] + '\n'

In [None]:
# Preview the first five rows including the new input/target columns
combined_df.head(n=5)

Unnamed: 0,arabic,english,input,target
0,العالي,aal,العالي,\taal\n
1,أعشى,asha,أعشى,\tasha\n
2,أعثم,atham,أعثم,\tatham\n
3,آا,aa,آا,\taa\n
4,أدلاند,aadland,أدلاند,\taadland\n


Extract character sets from the actual data

In [None]:
# Parallel lists: source strings and their start/end-wrapped targets
input_texts = list(combined_df['input'])
target_texts = list(combined_df['target'])

# Character vocabularies: every distinct character that occurs, in sorted order
input_characters = sorted({ch for text in input_texts for ch in text})
target_characters = sorted({ch for text in target_texts for ch in text})

# Character -> integer id, numbered from 0 in vocabulary order.
# NOTE: the vectorisation cell pads its arrays with zeros, so id 0 is shared
# between padding and the first character of each vocabulary.
input_token_index = dict(zip(input_characters, range(len(input_characters))))
target_token_index = dict(zip(target_characters, range(len(target_characters))))

Define vocab sizes and max sequence lengths

In [None]:
# Number of distinct characters on each side (sizes of the embedding / softmax layers)
encoder_vocab_size, decoder_vocab_size = len(input_characters), len(target_characters)

# Longest sequence on each side, in characters; the target length includes the start and end markers
max_encoder_seq_length = len(max(input_texts, key=len))
max_decoder_seq_length = len(max(target_texts, key=len))

Vectorize your sequences (convert to model input format)

In [None]:
import numpy as np

num_samples = len(input_texts)

# Zero-initialised id matrices, one row per sample, padded on the right to the longest sequence
encoder_input_data = np.zeros((num_samples, max_encoder_seq_length), dtype='int32')
decoder_input_data = np.zeros((num_samples, max_decoder_seq_length), dtype='int32')
decoder_target_data = np.zeros((num_samples, max_decoder_seq_length), dtype='int32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    source_ids = [input_token_index[ch] for ch in input_text]
    target_ids = [target_token_index[ch] for ch in target_text]

    encoder_input_data[i, :len(source_ids)] = source_ids
    decoder_input_data[i, :len(target_ids)] = target_ids

    # The decoder target is the decoder input shifted one step to the left:
    # it omits the start marker, so position t holds the character that
    # follows decoder_input_data[i, t].
    shifted_ids = target_ids[1:]
    decoder_target_data[i, :len(shifted_ids)] = shifted_ids

In [None]:
# Report the (samples, timesteps) shape of each id matrix
print(f"Encoder input shape: {encoder_input_data.shape}")
print(f"Decoder input shape: {decoder_input_data.shape}")
print(f"Decoder target shape (before expand): {decoder_target_data.shape}")

Encoder input shape: (217654, 44)
Decoder input shape: (217654, 47)
Decoder target shape (before expand): (217654, 47)


Building a GRU-based Seq2Seq model architecture for Arabic-to-English transliteration

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import time # For session keeping


# 2. Define model parameters
embedding_dim = 128  # width of each character embedding vector
hidden_dim = 256     # number of units in the encoder and decoder GRUs

# 3. Build the Encoder
# Embeds the Arabic character ids and keeps only the final GRU state,
# which is handed to the decoder as its initial state.
encoder_inputs = Input(shape=(None,), name='encoder_input')
encoder_embedding = Embedding(input_dim=encoder_vocab_size, output_dim=embedding_dim, name='encoder_embedding')(encoder_inputs)
encoder_gru = GRU(hidden_dim, return_state=True, name='encoder_gru')
encoder_outputs, encoder_state = encoder_gru(encoder_embedding)

# 4. Build the Decoder
# Teacher forcing: the decoder reads the target sequence (starting with the
# start marker) and emits, at every timestep, a softmax over the target
# vocabulary for the next character.
decoder_inputs = Input(shape=(None,), name='decoder_input')
decoder_embedding = Embedding(input_dim=decoder_vocab_size, output_dim=embedding_dim, name='decoder_embedding')(decoder_inputs)
decoder_gru = GRU(hidden_dim, return_sequences=True, return_state=True, name='decoder_gru')
decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=encoder_state)
decoder_dense = Dense(decoder_vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# 5. Define and compile the model
# The sparse loss consumes integer class ids directly. It optimises the same
# objective as categorical cross-entropy on one-hot targets, but avoids
# materialising a (samples, timesteps, vocab) tensor in memory.
# NOTE: no masking is applied, so zero-padded positions count towards both
# the loss and the reported accuracy.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 6. Print model summary
model.summary()

# 7. Prepare data for training
# encoder_input_data / decoder_input_data / decoder_target_data are the
# integer id matrices built in the vectorisation cell above; they are reused
# as-is instead of being rebuilt here.
if decoder_target_data.ndim != 2:
    raise ValueError(
        "decoder_target_data must hold integer class ids with shape "
        "(samples, timesteps); re-run the vectorisation cell before this one."
    )

# 80/20 train/validation split; random_state makes the split reproducible
(
    encoder_input_data_train, encoder_input_data_val,
    decoder_input_data_train, decoder_input_data_val,
    decoder_target_data_train, decoder_target_data_val,
) = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data,
    test_size=0.2, random_state=42,
)


# Define DataGenerator class
class DataGenerator(tf.keras.utils.Sequence):
    """Serve (encoder_input, decoder_input) -> target batches from in-memory arrays.

    Args:
        x_data: pair ``[encoder_array, decoder_array]``; both arrays share
            their first (sample) dimension with ``y_data``.
        y_data: target array.
        batch_size: number of samples per batch.
        shuffle: when True (default) the sample order is reshuffled at the
            end of every epoch.
    """

    def __init__(self, x_data, y_data, batch_size, shuffle=True):
        super().__init__()
        self.x_data = x_data
        self.y_data = y_data
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Positions into the arrays; on_epoch_end permutes these, not the data
        self.indexes = np.arange(len(self.x_data[0]))

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.indexes) / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as ``((encoder_batch, decoder_batch), target_batch)``."""
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        x_batch_encoder = self.x_data[0][indexes]
        x_batch_decoder = self.x_data[1][indexes]
        y_batch = self.y_data[indexes]
        # The inputs are returned as a tuple, not a list: tf.data would try
        # to pack a list into a single tensor.
        return (x_batch_encoder, x_batch_decoder), y_batch

    def on_epoch_end(self):
        """Reshuffle the sample order for the next epoch (if enabled)."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def to_tf_dataset(self):
        """Wrap this generator in a prefetching ``tf.data.Dataset``.

        The element signature is derived from the stored arrays (batch
        dimension left open), so it stays correct for any sequence length
        or target encoding.
        """
        def spec_for(array):
            return tf.TensorSpec(shape=(None,) + tuple(array.shape[1:]), dtype=array.dtype)

        output_signature = (
            (spec_for(self.x_data[0]), spec_for(self.x_data[1])),
            spec_for(self.y_data),
        )

        def batches():
            # from_generator needs a callable that returns a fresh iterator;
            # it is invoked once per pass over the dataset, i.e. once per epoch.
            for batch_index in range(len(self)):
                yield self[batch_index]
            # Keras never sees the Sequence itself, so trigger the reshuffle here.
            self.on_epoch_end()

        return tf.data.Dataset.from_generator(
            batches, output_signature=output_signature
        ).prefetch(tf.data.AUTOTUNE)


# Create data generators (validation order does not need reshuffling)
train_generator = DataGenerator([encoder_input_data_train, decoder_input_data_train], decoder_target_data_train, batch_size=32)
validation_generator = DataGenerator([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val, batch_size=32, shuffle=False)

# Create and use the tf.data.Dataset
train_dataset = train_generator.to_tf_dataset()
validation_dataset = validation_generator.to_tf_dataset()

# 8. Train the model with tf.data.Dataset
# The History object is kept so the loss/accuracy curves can be inspected later.
history = model.fit(train_dataset, epochs=20, validation_data=validation_dataset)
