In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense , Dropout


from sklearn.model_selection import train_test_split
import pickle

In [1]:
from datasets import load_dataset

dataset = load_dataset("opus100", "ar-en")

print(dataset)
print(dataset["train"][0])


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})
{'translation': {'ar': 'و هذه؟', 'en': 'And this?'}}


In [2]:
import pandas as pd

train_data = dataset["train"]

# حساب الإحصائيات الأساسية
def avg_length(data, lang):
    lengths = [len(x["translation"][lang].split()) for x in data]
    return sum(lengths)/len(lengths)

print("Train samples:", len(train_data))
print("Average EN length:", avg_length(train_data, "en"))
print("Average AR length:", avg_length(train_data, "ar"))


Train samples: 1000000
Average EN length: 10.161813
Average AR length: 8.585617


In [4]:
import re, unicodedata, csv

# Arabic normalization
arabic_diacritics = re.compile(r"[ًٌٍَُِّْـ]")
def normalize_arabic(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(arabic_diacritics, "", text)
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ـ+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# basic clean
def clean_text_general(text, lang):
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    # remove control chars
    text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    # remove weird unicode (keep Arabic/Latin/numbers/punctuation)
    if lang == "ar":
        text = re.sub(r"[^؀-ۿ0-9\s\.,;:\-؟؛!\?()\"'٪]", "", text)
        text = normalize_arabic(text)
    else:
        text = re.sub(r"[^a-zA-Z0-9\s\.,;:\-!\?()\"'/%@#]", "", text)
    return text

def preprocess_generator(dataset_split, max_len=64, min_len=3):
    seen = set()
    for item in dataset_split:
        en = clean_text_general(item["translation"]["en"], "en")
        ar = clean_text_general(item["translation"]["ar"], "ar")
        if not en or not ar: 
            continue
        if len(en.split()) < min_len or len(ar.split()) < min_len:
            continue
        if len(en.split()) > max_len or len(ar.split()) > max_len:
            continue
        key = (en, ar)
        if key in seen:
            continue
        seen.add(key)
        yield {"en": en, "ar": ar}

# save a subset for tokenization
max_save = 200_000
with open("train_clean_sample.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["en", "ar"])
    writer.writeheader()
    i = 0
    for pair in preprocess_generator(train_data, max_len=64):
        writer.writerow(pair)
        i += 1
        if i >= max_save:
            break
print("Saved", i)


Saved 200000


In [31]:
csv_path = "train_clean_sample.csv"

all_english_sentences = []
all_arabic_sentences = []

print("Reading CSV file...")
with open(csv_path, newline='', encoding='utf-8') as f_in:
    reader = csv.DictReader(f_in)
    for row in reader:
        all_english_sentences.append(row['en'].strip())
        all_arabic_sentences.append(row['ar'].strip())
num_words=20000
print(f"Read {len(all_english_sentences)} sentence pairs.")

START_TOKEN = "<start>"
END_TOKEN = "<end>"
OOV_TOKEN = "<oov>" 

preprocessed_arabic_sentences = [f"{START_TOKEN} {s} {END_TOKEN}" for s in all_arabic_sentences]

print("Fitting Arabic tokenizer")
arabic_tokenizer = Tokenizer(oov_token=OOV_TOKEN,num_words=20000) 
arabic_tokenizer.fit_on_texts(preprocessed_arabic_sentences)

print("Fitting English tokenizer")
english_tokenizer = Tokenizer(num_words=num_words,oov_token="<OOV>") 
english_tokenizer.fit_on_texts(all_english_sentences)


max_len_english = max(len(s.split()) for s in all_english_sentences)
max_len_arabic = max(len(s.split()) for s in preprocessed_arabic_sentences)
MAX_LEN = max(max_len_english, max_len_arabic)

encoder_input_data = english_tokenizer.texts_to_sequences(all_english_sentences)
encoder_input_data = pad_sequences(encoder_input_data, maxlen=MAX_LEN, padding='post')

decoder_data = arabic_tokenizer.texts_to_sequences(preprocessed_arabic_sentences)
decoder_input_data = pad_sequences(decoder_data, maxlen=MAX_LEN, padding='post')

decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

print(f"Shape of encoder_input_data: {encoder_input_data.shape}")
print(f"Shape of decoder_input_data: {decoder_input_data.shape}")
print(f"Shape of decoder_target_data: {decoder_target_data.shape}")

Reading CSV file...
Read 200000 sentence pairs.
Fitting Arabic tokenizer
Fitting English tokenizer
Shape of encoder_input_data: (200000, 66)
Shape of decoder_input_data: (200000, 66)
Shape of decoder_target_data: (200000, 66)


In [24]:
learning_rate = 0.001

In [38]:
EMBEDDING_DIM=128
LSTM_DIM=128
encoder_inputs = Input(shape=(MAX_LEN,), name="encoder_input")
decoder_inputs = Input(shape=(MAX_LEN,), name="decoder_input")


enc_embedding_layer = Embedding(num_words, EMBEDDING_DIM, name="encoder_embedding")
enc_embedding_output = enc_embedding_layer(encoder_inputs)

encoder_lstm = LSTM(LSTM_DIM, return_state=True, name="encoder_lstm")
_, state_h, state_c = encoder_lstm(enc_embedding_output)

encoder_states = [state_h, state_c]


dec_embedding_layer = Embedding(num_words, EMBEDDING_DIM, name="decoder_embedding")
dec_embedding_output = dec_embedding_layer(decoder_inputs)

decoder_lstm = LSTM(LSTM_DIM, return_sequences=True, return_state=False, name="decoder_lstm")

decoder_outputs = decoder_lstm(dec_embedding_output, initial_state=encoder_states)

decoder_dense = Dense(arabic_vocab_size, activation='softmax', name="decoder_output_dense")
decoder_outputs = decoder_dense(decoder_outputs)


model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])
model.summary()

In [43]:
X_train_enc, X_val_enc, \
X_train_dec, X_val_dec, \
y_train_dec, y_val_dec = train_test_split(encoder_input_data, 
                                          decoder_input_data, 
                                          decoder_target_data, 
                                          test_size=0.2, 
                                          random_state=42)

history = model.fit(
    [X_train_enc, X_train_dec], 
    y_train_dec,           
    batch_size=4,
    epochs=10,                  
    validation_data=(
        [X_val_enc, X_val_dec], 
        y_val_dec               
    )
)

Epoch 1/10
[1m   25/40000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:56:00[0m 174ms/step - accuracy: 0.8663 - loss: 1.1732


KeyboardInterrupt

