# Libraries 

In [7]:
pip install -r requirements.txt --upgrade

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


In [49]:
import nltk
#to open csv file
import csv
import pandas as pd
import numpy as np
#sentences & words tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
#regular expression 
import re
#for stopwords
from nltk.corpus import stopwords
import string
# from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem import ISRIStemmer
from nltk.stem import WordNetLemmatizer

In [50]:
#for preprocessing
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANTER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANTER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANTER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# csv path

In [51]:
dataset_path = r'C:\Users\ANTER\Downloads\arabic_english.csv'
dataset = pd.read_csv(dataset_path, encoding="utf-8")
dataset = dataset.head(5000) 
max_len = 30 
dataset.head()

Unnamed: 0,arabic,english
0,متى أنشئت هذه الجامعة؟,When was this university founded?
1,أراها نادراً,I see it rarely.
2,يعزف على البيانو بشكل جيد جداً,He plays the piano very well.
3,مع كل احترامي.,With all due respect.
4,نظف أسنانك,Brush your teeth clean.


# preprocessing function

### arabic preprocessing

In [52]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import re

def preprocess_text(text):
    # Apply lowercase
    text = text.lower()
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove Arabic stopwords
    stop_words = set(stopwords.words('arabic'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Remove Arabic punctuation and other non-alphanumeric characters
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]
    
    # Remove empty tokens
    tokens = [token for token in tokens if token]
    
    # Apply stemming (no need for lemmatization for Arabic)
    stemmer = ISRIStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens


In [53]:
dataset['arabic_preprocessed'] = dataset['arabic'].apply(preprocess_text)

In [54]:
dataset['arabic_preprocessed']

0                      [شئت, جمع]
1                      [ارا, ندر]
2       [عزف, ينو, شكل, جيد, جدا]
3                           [حرم]
4                      [نظف, سنن]
                  ...            
4995         [سفر, ابي, خرج, احا]
4996         [ابي, غضب, مني, جدا]
4997         [يحب, ابي, يتز, كثر]
4998         [سبق, لأب, سفر, خرج]
4999              [سفر, ابي, خرج]
Name: arabic_preprocessed, Length: 5000, dtype: object

### english preprocessing

In [55]:
def preprocess_english_text(text):
    # Apply lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Remove English punctuation and other non-alphanumeric characters
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]
    # Remove empty tokens
    tokens = [token for token in tokens if token]
    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [56]:
dataset['english_preprocessed'] = dataset['english'].apply(preprocess_english_text)

In [57]:
dataset['english_preprocessed']

0                 [university, founded]
1                         [see, rarely]
2                   [play, piano, well]
3                        [due, respect]
4                 [brush, teeth, clean]
                     ...               
4995    [father, sometimes, go, abroad]
4996                    [father, angry]
4997        [father, like, pizza, much]
4998            [father, never, abroad]
4999            [father, never, abroad]
Name: english_preprocessed, Length: 5000, dtype: object

# Feature Extraction

### word embedding

In [58]:
from gensim.models import Word2Vec

# Assuming you have lists of tokenized Arabic and English sentences
arabic_corpus = [sentence for sentence in dataset['arabic_preprocessed']]
english_corpus = [sentence for sentence in dataset['english_preprocessed']]

In [59]:
# Train Arabic Word2Vec model
arabic_model = Word2Vec(sentences=arabic_corpus, vector_size=100, window=5, min_count=5, workers=4)

# Train English Word2Vec model
english_model = Word2Vec(sentences=english_corpus, vector_size=100, window=5, min_count=5, workers=4)

In [60]:
arabic_model.save('arabic_word2vec.model')
english_model.save('english_word2vec.model')

In [61]:
from gensim.models import Word2Vec

arabic_model = Word2Vec.load('arabic_word2vec.model')
english_model = Word2Vec.load('english_word2vec.model')

In [62]:
import numpy as np

def text_to_embeddings(text, model):
    embeddings = []
    for word in text:
        if word in model.wv:
            embeddings.append(model.wv[word])
        else:
            # Handle out-of-vocabulary words
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

In [63]:
dataset['arabic_embeddings'] = dataset['arabic_preprocessed'].apply(lambda x: text_to_embeddings(x, arabic_model))
dataset['english_embeddings'] = dataset['english_preprocessed'].apply(lambda x: text_to_embeddings(x, english_model))

In [64]:
dataset['arabic_embeddings']

0       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
2       [[-0.005348652, 0.015084333, 0.0045133135, 0.0...
3       [[-0.009835003, 0.001242528, -0.008418427, -0....
4       [[-0.008891041, 0.010265926, 0.0072207046, -0....
                              ...                        
4995    [[-0.0007908391, 0.0063931425, 0.010211208, 0....
4996    [[-0.013162706, 0.02190433, 0.007337241, -0.00...
4997    [[-0.007895629853010178, 0.01972855255007744, ...
4998    [[-0.0068212547339499, 0.01469762809574604, -4...
4999    [[-0.0007908391, 0.0063931425, 0.010211208, 0....
Name: arabic_embeddings, Length: 5000, dtype: object

In [65]:
dataset['english_embeddings']

0       [[-0.005334263, 0.012371778, -0.0060608396, -0...
1       [[0.005007278174161911, 0.00018593885761220008...
2       [[0.0078026424, 0.00641275, -0.011493867, 0.00...
3       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
4       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
                              ...                        
4995    [[-0.009947495, 0.008167277, 0.0039029298, 0.0...
4996    [[-0.009947495, 0.008167277, 0.0039029298, 0.0...
4997    [[-0.009947494603693485, 0.008167277090251446,...
4998    [[-0.009947495, 0.008167277, 0.0039029298, 0.0...
4999    [[-0.009947495, 0.008167277, 0.0039029298, 0.0...
Name: english_embeddings, Length: 5000, dtype: object

# Model architecture

### LSTM (seq2seq)

In [66]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

In [67]:
# Define the maximum sequence length for both input and output sequences
max_encoder_seq_length = max(len(seq) for seq in dataset['arabic_preprocessed'])
max_decoder_seq_length = max(len(seq) for seq in dataset['english_preprocessed'])

# Define the input sequence
encoder_inputs = Input(shape=(max_encoder_seq_length, arabic_model.vector_size))

# Define the LSTM encoder
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Define the input sequence for the decoder
decoder_inputs = Input(shape=(max_decoder_seq_length, english_model.vector_size))

# Define the LSTM decoder
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Define the Dense layer for output
decoder_dense = Dense(len(english_model.wv.key_to_index), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Print the model summary
print(model.summary())

None


# Data Preparation

In [68]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to pad or truncate embedding sequences to a fixed length
def pad_embedding_sequence(embeddings, max_len, vector_size):
    padded = np.zeros((max_len, vector_size))
    for i in range(min(max_len, len(embeddings))):
        padded[i] = embeddings[i]
    return padded

# Prepare encoder input data (Arabic embeddings)
encoder_input_data = np.array([
    pad_embedding_sequence(seq, max_encoder_seq_length, arabic_model.vector_size)
    for seq in dataset['arabic_embeddings']
])

# Prepare decoder input data (English embeddings)
decoder_input_data = np.array([
    pad_embedding_sequence(seq, max_decoder_seq_length, english_model.vector_size)
    for seq in dataset['english_embeddings']
])

# Tokenize English preprocessed text
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(dataset['english_preprocessed'])

# Convert English sentences to integer sequences
decoder_input_sequences = tokenizer_eng.texts_to_sequences(dataset['english_preprocessed'])

# Prepare decoder target sequences using teacher forcing (shifted by one)
decoder_target_sequences = [seq[1:] + [0] for seq in decoder_input_sequences]

# Get vocabulary size for decoder output layer
eng_vocab_size = len(tokenizer_eng.word_index) + 1

# Pad decoder input and target sequences to uniform length
decoder_input_sequences = pad_sequences(decoder_input_sequences, maxlen=max_decoder_seq_length, padding='post')
decoder_target_sequences = pad_sequences(decoder_target_sequences, maxlen=max_decoder_seq_length, padding='post')

# One-hot encode decoder target data
decoder_target_data = np.zeros(
    (len(decoder_target_sequences), max_decoder_seq_length, eng_vocab_size),
    dtype='float32'
)

for i, seq in enumerate(decoder_target_sequences):
    for t, word_id in enumerate(seq):
        if word_id > 0:
            decoder_target_data[i, t, word_id] = 1.0



#  Model Training

In [71]:
# -----------------------------
#  Model Training (Seq2Seq with LSTM)
# -----------------------------

# Import necessary modules from Keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Define encoder input layer (Arabic input embeddings)
encoder_inputs = Input(shape=(max_encoder_seq_length, arabic_model.vector_size))

# Encoder LSTM: returns hidden and cell states
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

# Store the encoder states to initialize the decoder
encoder_states = [state_h, state_c]

# Define decoder input layer (English input embeddings)
decoder_inputs = Input(shape=(max_decoder_seq_length, english_model.vector_size))

# Decoder LSTM: takes encoder states as initial state
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Dense layer to predict the next word from decoder outputs
decoder_dense = Dense(eng_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the full Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model using categorical crossentropy loss
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Display the model architecture
print(model.summary())

# -----------------------------
#  Train the model
# -----------------------------
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=20,
    validation_split=0.2
)

# -----------------------------
#  Save the trained model and tokenizer
# -----------------------------
model.save("arabic_to_english_translation_model.keras")

import pickle
with open("english_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer_eng, f)

print(" Training complete. Model and tokenizer saved.")


None
Epoch 1/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 216ms/step - loss: 0.3533 - val_loss: 0.3460
Epoch 2/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 204ms/step - loss: 0.3409 - val_loss: 0.3439
Epoch 3/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 205ms/step - loss: 0.3436 - val_loss: 0.3432
Epoch 4/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 205ms/step - loss: 0.3370 - val_loss: 0.3435
Epoch 5/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 203ms/step - loss: 0.3408 - val_loss: 0.3430
Epoch 6/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 207ms/step - loss: 0.3431 - val_loss: 0.3435
Epoch 7/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 207ms/step - loss: 0.3393 - val_loss: 0.3436
Epoch 8/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 203ms/step - loss: 0.3549 - val_loss: 0.3432
Epoch 9/20
[1m63/63[0m [

# Model Evaluation


In [None]:
# -----------------------------
#  Model Evaluation Section
# -----------------------------

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tqdm import tqdm
import numpy as np

# Create reverse tokenizer to map indices back to words
reverse_tokenizer_eng = {v: k for k, v in tokenizer_eng.word_index.items()}
smoothie = SmoothingFunction().method1

# Function to decode a sequence using the trained encoder and decoder
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, english_model.vector_size))
    decoded_sentence = []

    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_tokenizer_eng.get(sampled_token_index, '')

        if sampled_word == '' or sampled_word == '<end>':
            break

        decoded_sentence.append(sampled_word)

        # Update the target sequence with the new word embedding
        if sampled_word in english_model.wv:
            target_seq[0, 0, :] = english_model.wv[sampled_word]
        else:
            target_seq[0, 0, :] = np.zeros((english_model.vector_size,))

        states_value = [h, c]

    return ' '.join(decoded_sentence)

# Function to compute BLEU score for evaluation
def evaluate_bleu_score(dataset, sample_size=100):
    bleu_scores = []

    for i in tqdm(range(sample_size)):
        input_embedding = encoder_input_data[i:i+1]
        predicted = decode_sequence(input_embedding)
        reference = [dataset['english_preprocessed'][i]]
        candidate = predicted.split()

        score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
        bleu_scores.append(score)

        # Print every 10 samples for visual reference
        if i % 10 == 0:
            print(f"\nArabic Input    : {' '.join(dataset['arabic_preprocessed'][i])}")
            print(f"Reference Output: {' '.join(reference[0])}")
            print(f"Model Prediction: {' '.join(candidate)}")
            print(f"BLEU Score      : {score:.4f}")

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"\n Average BLEU Score on {sample_size} samples: {avg_bleu:.4f}")
    return avg_bleu

# Function to compute classification metrics: accuracy, precision, recall, F1-score
def classification_metrics(dataset, sample_size=100):
    y_true = []
    y_pred = []

    for i in tqdm(range(sample_size)):
        input_embedding = encoder_input_data[i:i+1]
        predicted = decode_sequence(input_embedding).split()
        reference = dataset['english_preprocessed'][i]

        # Pad sequences to equal length
        max_len = max(len(predicted), len(reference))
        pred_padded = predicted + [''] * (max_len - len(predicted))
        ref_padded = reference + [''] * (max_len - len(reference))

        for ref_word, pred_word in zip(ref_padded, pred_padded):
            y_true.append(ref_word)
            y_pred.append(pred_word)

    # Convert words to numerical labels
    all_words = list(set(y_true + y_pred))
    word2idx = {word: i for i, word in enumerate(all_words)}
    y_true_ids = [word2idx[word] for word in y_true]
    y_pred_ids = [word2idx[word] for word in y_pred]

    # Compute metrics
    accuracy = accuracy_score(y_true_ids, y_pred_ids)
    precision = precision_score(y_true_ids, y_pred_ids, average='macro', zero_division=0)
    recall = recall_score(y_true_ids, y_pred_ids, average='macro', zero_division=0)
    f1 = f1_score(y_true_ids, y_pred_ids, average='macro', zero_division=0)

    print("\n Classification Metrics:")
    print(f" Accuracy : {accuracy:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" Recall   : {recall:.4f}")
    print(f" F1-score : {f1:.4f}")

    return accuracy, precision, recall, f1

# --------- Run Evaluation ---------
print(" Evaluating model...\n")
bleu = evaluate_bleu_score(dataset, sample_size=100)
accuracy, precision, recall, f1 = classification_metrics(dataset, sample_size=100)